Spaces:

Cyberlace
/

api-swara-audio-analysis

Sleeping

App Files Files Community

api-swara-audio-analysis / app /services /articulation.py

fariedalfarizi

Fix FillerWordsDetector duplicate and unpacking error

82e47b6 16 days ago

raw

history blame

12.1 kB

	"""
	Articulation Analysis Service
	Analisis artikulasi/pronunciation dengan BERT-based alignment
	"""

	import torch
	import numpy as np
	import string
	from typing import Dict, List, Tuple, Optional
	from dataclasses import dataclass, asdict
	import re
	import warnings
	from difflib import SequenceMatcher
	warnings.filterwarnings('ignore')


	@dataclass
	class WordScore:
	    """Outcome of comparing one aligned word pair (reference vs. transcript)."""

	    # Position of this entry within the alignment.
	    index: int
	    # Word taken from the reference text.
	    expected: str
	    # Word taken from the transcription.
	    detected: str
	    # Whether the pair is accepted as correct.
	    is_correct: bool
	    # Similarity score of the two words.
	    similarity: float
	    # Whether the detected word is a filler word.
	    is_filler: bool = False
	    # Alignment outcome label for this pair.
	    match_type: str = "match"


	class FillerWordsDetector:
	    """Detect filler words in Indonesian text.

	    A word is a filler when, after lower-casing and removing surrounding
	    whitespace and trailing punctuation, it is either listed in
	    ``FILLER_WORDS`` or is an elongated hesitation sound such as
	    "ummmm" or "hmmmmm".
	    """

	    # NOTE: the multi-word entries below can only match when ``is_filler`` is
	    # called with the whole phrase; ``count_fillers`` splits on whitespace and
	    # therefore tests one token at a time.
	    FILLER_WORDS = {
	        'um', 'umm', 'ummm', 'em', 'emm', 'emmm',
	        'eh', 'ehh', 'ehhh', 'ehm', 'ehmm', 'ehmmm',
	        'ah', 'ahh', 'ahhh', 'ahm', 'ahmm', 'ahmmm',
	        'hmm', 'hmmm', 'hmmmm',
	        'uh', 'uhh', 'uhhh', 'uhm', 'uhmm',
	        'anu', 'ano', 'gitu', 'gituloh', 'gitu loh',
	        'kayak', 'kayaknya', 'kayak gini', 'kayak gitu',
	        'apa', 'apa ya', 'apa namanya',
	        'maksudnya', 'maksud saya', 'jadi', 'jadinya',
	        'nah', 'terus', 'lalu', 'kemudian',
	        'gini', 'begini', 'begitu',
	        'semacam', 'semisal', 'ibaratnya',
	        'ya kan', 'kan', 'ya', 'yah',
	        'sepertinya', 'mungkin',
	        'toh', 'sih', 'deh', 'dong', 'lah',
	    }

	    # Hesitation sounds of arbitrary length. The alternation uses a plain
	    # ``|``: an escaped ``\|`` would only match a literal pipe character and
	    # the pattern would never match a spoken word.
	    _HESITATION_PATTERN = re.compile(r'(um+|em+|eh+m|ah+m|uh+m*|hmm+)')

	    @classmethod
	    def is_filler(cls, word: str) -> bool:
	        """Return True if ``word`` is a filler word.

	        Args:
	            word: A single token (or phrase); case, surrounding whitespace
	                and trailing punctuation are ignored.

	        Returns:
	            True when the cleaned word is in ``FILLER_WORDS`` or consists
	            entirely of an elongated hesitation sound, otherwise False.
	        """
	        word_clean = word.lower().strip().rstrip(string.punctuation)

	        if word_clean in cls.FILLER_WORDS:
	            return True

	        return cls._HESITATION_PATTERN.fullmatch(word_clean) is not None

	    @classmethod
	    def count_fillers(cls, text: str) -> Tuple[int, List[str]]:
	        """Count the filler words in ``text``.

	        Args:
	            text: Text to scan; it is lower-cased and split on whitespace.

	        Returns:
	            A ``(count, fillers)`` tuple where ``fillers`` lists every
	            filler token in order of appearance (duplicates included,
	            attached punctuation kept) and ``count`` is its length.
	        """
	        fillers = [token for token in text.lower().split() if cls.is_filler(token)]
	        return len(fillers), fillers


	class ProfanityDetector:
	    """Detect profane words and phrases in Indonesian and English text."""

	    PROFANITY_WORDS = {
	        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'shit', 'fuck',
	        'tolol', 'oon', 'bego', 'gak ada otak', 'goblok', 'bodoh', 'anjim',
	        'anjing', 'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci',
	        'bangke', 'bangor', 'bangsat', 'bejad', 'bencong', 'bodat', 'bugil',
	        'bundir', 'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium',
	        'colai', 'coli', 'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu',
	        'dancuk', 'dewasa', 'dick', 'dildo', 'encuk', 'gay', 'gei', 'gembel',
	        'gey', 'gigolo', 'gila', 'goblog', 'haram', 'hencet', 'hentai', 'idiot',
	        'jablai', 'jablay', 'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat',
	        'jingan', 'kampang', 'keparat', 'kimak', 'kirik', 'klentit', 'klitoris',
	        'konthol', 'kontol', 'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol',
	        'lonte', 'maho', 'masturbasi', 'matane', 'mati', 'memek', 'mesum',
	        'modar', 'modyar', 'mokad', 'najis', 'nazi', 'ndhasmu', 'nenen',
	        'ngentot', 'ngolom', 'ngulum', 'nigga', 'nigger', 'onani', 'orgasme',
	        'paksa', 'pantat', 'pantek', 'pecun', 'peli', 'penis', 'pentil', 'pepek',
	        'perek', 'perkosa', 'piatu', 'porno', 'pukimak', 'qontol', 'selangkang',
	        'sempak', 'senggama', 'setan', 'setubuh', 'silet', 'silit', 'sinting',
	        'sodomi', 'stres', 'telanjang', 'telaso', 'tete', 'tewas', 'titit',
	        'togel', 'toket', 'tusbol', 'urin', 'vagina'
	    }

	    @classmethod
	    def detect_profanity(cls, text: str) -> Dict:
	        """Find profane words and phrases in ``text``.

	        The text is lower-cased and split into ``\\w+`` tokens. Every single
	        token, and every run of two or three consecutive tokens joined by a
	        space, is looked up in ``PROFANITY_WORDS``.

	        Args:
	            text: Text to scan.

	        Returns:
	            Dict with keys:
	            - has_profanity: bool, True when at least one hit was found
	            - profanity_count: int, number of hits (repeats are counted)
	            - profanity_words: List[str], distinct hits in the order they
	              were first found (single words first, then 2-word phrases,
	              then 3-word phrases)
	        """
	        words = re.findall(r'\b\w+\b', text.lower())

	        # Single-word hits, in order of appearance.
	        found_profanity = [word for word in words if word in cls.PROFANITY_WORDS]

	        # Phrase hits: sliding windows of 2 and then 3 consecutive words.
	        for size in (2, 3):
	            for start in range(len(words) - size + 1):
	                phrase = ' '.join(words[start:start + size])
	                if phrase in cls.PROFANITY_WORDS:
	                    found_profanity.append(phrase)

	        return {
	            'has_profanity': bool(found_profanity),
	            'profanity_count': len(found_profanity),
	            # dict.fromkeys removes duplicates while keeping first-seen order,
	            # so the result is deterministic (a plain set is not ordered).
	            'profanity_words': list(dict.fromkeys(found_profanity)),
	        }

	    @classmethod
	    def count_fillers(cls, text: str) -> Tuple[int, List[str]]:
	        """Count filler words in ``text`` (kept for backward compatibility).

	        This class has no ``is_filler`` of its own, so the call is forwarded
	        to ``FillerWordsDetector.count_fillers``.

	        Returns:
	            The ``(count, fillers)`` tuple produced by
	            ``FillerWordsDetector.count_fillers``.
	        """
	        return FillerWordsDetector.count_fillers(text)


	class SequenceAligner:
	    """Word-level alignment of a reference word list against detected words."""

	    @staticmethod
	    def calculate_similarity(word1: str, word2: str) -> float:
	        """Return the case-insensitive ``SequenceMatcher`` ratio of two words."""
	        matcher = SequenceMatcher(None, word1.lower(), word2.lower())
	        return matcher.ratio()

	    @staticmethod
	    def align_sequences(
	        reference: List[str],
	        detected: List[str],
	        match_threshold: float = 0.7
	    ) -> List[Tuple[Optional[str], Optional[str], str]]:
	        """Align two word lists with dynamic programming.

	        Pairing two words scores +2 when their similarity reaches
	        ``match_threshold`` and -1 otherwise; skipping a word on either
	        side costs -1. On equal scores a pairing is preferred over skipping
	        a reference word, which is preferred over skipping a detected word.

	        Args:
	            reference: Expected words.
	            detected: Words actually observed.
	            match_threshold: Minimum similarity for a pair to count as a match.

	        Returns:
	            ``(reference_word, detected_word, kind)`` tuples in reading
	            order, where ``kind`` is "match", "substitution", "deletion"
	            (``detected_word`` is None) or "insertion"
	            (``reference_word`` is None).
	        """
	        reward, mismatch_cost, gap_cost = 2, -1, -1
	        ref_len = len(reference)
	        det_len = len(detected)

	        # Best score for each prefix pair, and the move that produced it.
	        scores = [[0] * (det_len + 1) for _ in range(ref_len + 1)]
	        moves = [['done'] * (det_len + 1) for _ in range(ref_len + 1)]

	        # Borders: only gaps are possible against an empty prefix.
	        for row in range(1, ref_len + 1):
	            scores[row][0] = row * gap_cost
	            moves[row][0] = 'up'
	        for col in range(1, det_len + 1):
	            scores[0][col] = col * gap_cost
	            moves[0][col] = 'left'

	        for row, expected_word in enumerate(reference, start=1):
	            for col, heard_word in enumerate(detected, start=1):
	                ratio = SequenceAligner.calculate_similarity(expected_word, heard_word)
	                pair_score = reward if ratio >= match_threshold else mismatch_cost

	                via_diagonal = scores[row - 1][col - 1] + pair_score
	                via_up = scores[row - 1][col] + gap_cost
	                via_left = scores[row][col - 1] + gap_cost

	                # Strict comparisons keep the diagonal > up > left priority
	                # when several moves reach the same score.
	                best_score, best_move = via_diagonal, 'diagonal'
	                if via_up > best_score:
	                    best_score, best_move = via_up, 'up'
	                if via_left > best_score:
	                    best_score, best_move = via_left, 'left'

	                scores[row][col] = best_score
	                moves[row][col] = best_move

	        # Walk the recorded moves back from the bottom-right corner.
	        pairs = []
	        row, col = ref_len, det_len
	        while row or col:
	            move = moves[row][col]
	            if move == 'diagonal':
	                expected_word = reference[row - 1]
	                heard_word = detected[col - 1]
	                ratio = SequenceAligner.calculate_similarity(expected_word, heard_word)
	                kind = "match" if ratio >= match_threshold else "substitution"
	                pairs.append((expected_word, heard_word, kind))
	                row -= 1
	                col -= 1
	            elif move == 'up':
	                pairs.append((reference[row - 1], None, "deletion"))
	                row -= 1
	            else:
	                pairs.append((None, detected[col - 1], "insertion"))
	                col -= 1

	        # The walk produced the pairs last-to-first.
	        return pairs[::-1]


	class ArticulationService:
	    """Articulation assessment: compare a transcript with a reference text."""

	    # (minimum accuracy percentage, category label, points), best band first.
	    # Anything below the last band falls into the lowest category.
	    _CATEGORY_BANDS = (
	        (81, "Sangat Baik", 5),
	        (61, "Baik", 4),
	        (41, "Cukup", 3),
	        (21, "Buruk", 2),
	    )

	    def __init__(self):
	        """Create the filler detector and the sequence aligner used by analyze."""
	        print("🗣️ Initializing Articulation Service")
	        self.filler_detector = FillerWordsDetector()
	        self.aligner = SequenceAligner()
	        print("✅ Articulation Service ready!\n")

	    def normalize_text(self, text: str) -> str:
	        """Lower-case ``text``, replace ``, . ! ? ; :`` with spaces and collapse whitespace."""
	        text = text.lower()
	        text = re.sub(r'[,\.!?;:]+', ' ', text)
	        text = re.sub(r'\s+', ' ', text)
	        return text.strip()

	    def tokenize_words(self, text: str) -> List[str]:
	        """Return the whitespace-separated words of the normalized ``text``."""
	        # str.split() without arguments never yields empty strings.
	        return self.normalize_text(text).split()

	    @classmethod
	    def _categorize(cls, accuracy_percentage: float) -> Tuple[str, int]:
	        """Map an accuracy percentage to its ``(category, points)`` pair."""
	        for minimum, category, points in cls._CATEGORY_BANDS:
	            if accuracy_percentage >= minimum:
	                return category, points
	        return "Perlu Ditingkatkan", 1

	    def analyze(self, transcribed_text: str, reference_text: str) -> Dict:
	        """Analyze articulation by aligning the transcript with the reference.

	        Args:
	            transcribed_text: Text produced by transcription.
	            reference_text: Text the speaker was expected to say.

	        Returns:
	            Dict with keys:
	            - score: int points from 1 to 5
	            - category: str label matching the score
	            - accuracy_percentage: matched reference words as a percentage
	              of all reference words, rounded to one decimal (0 when the
	              reference has no words)
	            - correct_words: int, number of alignment entries of type "match"
	            - total_words: int, number of reference words
	            - filler_count: int, filler tokens found in the raw transcript
	            - filler_words: up to 10 distinct filler tokens, first seen first
	        """
	        print("🗣️ Analyzing articulation...")

	        reference_words = self.tokenize_words(reference_text)
	        detected_words = self.tokenize_words(transcribed_text)

	        # Fillers are counted on the raw transcript, independent of alignment.
	        filler_count, filler_list = self.filler_detector.count_fillers(transcribed_text)

	        alignment = self.aligner.align_sequences(
	            reference_words,
	            detected_words,
	            match_threshold=0.7
	        )

	        word_scores = []
	        correct_words = 0

	        for idx, (ref_word, det_word, match_type) in enumerate(alignment):
	            is_filler = bool(det_word) and self.filler_detector.is_filler(det_word)
	            is_correct = match_type == "match"

	            if is_correct:
	                # A matched word was expected by the reference, so it counts
	                # as correct even when it also appears in the filler list;
	                # otherwise a perfect reading of a reference containing such
	                # words could never reach 100%.
	                correct_words += 1

	            if is_correct or (ref_word and det_word):
	                similarity = self.aligner.calculate_similarity(ref_word or "", det_word or "")
	            else:
	                # Insertions and deletions have nothing to compare against.
	                similarity = 0.0

	            word_scores.append(WordScore(
	                index=idx,
	                expected=ref_word or "[INSERTION]",
	                detected=det_word or "[DELETION]",
	                is_correct=is_correct,
	                similarity=similarity,
	                is_filler=is_filler,
	                match_type=match_type
	            ))

	        total_words = len(reference_words)
	        accuracy_percentage = (correct_words / total_words * 100) if total_words > 0 else 0

	        category, points = self._categorize(accuracy_percentage)

	        print("✅ Articulation analysis complete!\n")

	        return {
	            'score': points,
	            'category': category,
	            'accuracy_percentage': round(accuracy_percentage, 1),
	            'correct_words': correct_words,
	            'total_words': total_words,
	            'filler_count': filler_count,
	            # dict.fromkeys de-duplicates in first-seen order, so the ten
	            # words reported are deterministic (a plain set is not ordered).
	            'filler_words': list(dict.fromkeys(filler_list))[:10],
	            # 'word_scores': [asdict(ws) for ws in word_scores[:50]] # Limit to first 50 words
	        }