Spaces:
Sleeping
Sleeping
| """ | |
| Articulation Analysis Service | |
| Articulation/pronunciation analysis with BERT-based alignment | |
| """ | |
| import torch | |
| import numpy as np | |
| import string | |
| from typing import Dict, List, Tuple, Optional | |
| from dataclasses import dataclass, asdict | |
| import re | |
| import warnings | |
| from difflib import SequenceMatcher | |
| warnings.filterwarnings('ignore') | |
@dataclass
class WordScore:
    """Score for a single aligned word pair (reference vs. transcription).

    Instances are built with keyword arguments and serialised with
    ``dataclasses.asdict``, so the class must be a dataclass.
    """

    index: int  # position of this entry within the alignment
    expected: str  # reference word, or the "[INSERTION]" placeholder
    detected: str  # transcribed word, or the "[DELETION]" placeholder
    is_correct: bool  # True when the pair was aligned as a match
    similarity: float  # difflib ratio in [0.0, 1.0] between the two words
    is_filler: bool = False  # True when the detected word is a filler word
    # One of "match", "substitution", "deletion", "insertion".
    match_type: str = "match"
class FillerWordsDetector:
    """Detect filler (hesitation / discourse) words in Indonesian text.

    Both methods are classmethods, so they can be called on the class or on
    an instance.
    """

    # NOTE: ``count_fillers`` tests whitespace-separated tokens one at a time,
    # so the multi-word entries below only match when ``is_filler`` is called
    # directly with the whole phrase.
    FILLER_WORDS = {
        'um', 'umm', 'ummm', 'em', 'emm', 'emmm',
        'eh', 'ehh', 'ehhh', 'ehm', 'ehmm', 'ehmmm',
        'ah', 'ahh', 'ahhh', 'ahm', 'ahmm', 'ahmmm',
        'hmm', 'hmmm', 'hmmmm',
        'uh', 'uhh', 'uhhh', 'uhm', 'uhmm',
        'anu', 'ano', 'gitu', 'gituloh', 'gitu loh',
        'kayak', 'kayaknya', 'kayak gini', 'kayak gitu',
        'apa', 'apa ya', 'apa namanya',
        'maksudnya', 'maksud saya', 'jadi', 'jadinya',
        'nah', 'terus', 'lalu', 'kemudian',
        'gini', 'begini', 'begitu',
        'semacam', 'semisal', 'ibaratnya',
        'ya kan', 'kan', 'ya', 'yah',
        'sepertinya', 'mungkin',
        'toh', 'sih', 'deh', 'dong', 'lah',
    }

    # Hesitation sounds with an arbitrary number of repeated letters
    # ("ummmmm", "ehhhm", ...), which a finite word list cannot enumerate.
    # Compiled once instead of on every call.
    _HESITATION_RE = re.compile(r'(um+|em+|eh+m*|ah+m*|uh+m*|hmm+)')

    @classmethod
    def is_filler(cls, word: str) -> bool:
        """Return True if ``word`` is a filler word.

        The word is lower-cased and surrounding whitespace and punctuation
        are removed on BOTH sides before the lookup, so tokens such as
        ``'"um'`` or ``'(eh),'`` are recognised as well.

        Args:
            word: A single token (or a phrase listed in ``FILLER_WORDS``).

        Returns:
            True when the cleaned word is listed in ``FILLER_WORDS`` or is an
            elongated hesitation sound; False otherwise (including for an
            empty string).
        """
        word_clean = word.lower().strip().strip(string.punctuation).strip()
        if word_clean in cls.FILLER_WORDS:
            return True
        return cls._HESITATION_RE.fullmatch(word_clean) is not None

    @classmethod
    def count_fillers(cls, text: str) -> Tuple[int, List[str]]:
        """Count the filler tokens in ``text``.

        Args:
            text: Free text; it is lower-cased and split on whitespace.

        Returns:
            ``(count, fillers)`` where ``fillers`` holds every filler token
            in order of appearance. Tokens are returned as they appear after
            lower-casing, i.e. with any attached punctuation and with
            duplicates kept.
        """
        fillers = [token for token in text.lower().split() if cls.is_filler(token)]
        return len(fillers), fillers
class ProfanityDetector:
    """Detect profane words and phrases in Indonesian and English text."""

    PROFANITY_WORDS = {
        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'shit', 'fuck',
        'tolol', 'oon', 'bego', 'gak ada otak', 'goblok', 'bodoh', 'anjim',
        'anjing', 'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci',
        'bangke', 'bangor', 'bangsat', 'bejad', 'bencong', 'bodat', 'bugil',
        'bundir', 'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium',
        'colai', 'coli', 'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu',
        'dancuk', 'dewasa', 'dick', 'dildo', 'encuk', 'gay', 'gei', 'gembel',
        'gey', 'gigolo', 'gila', 'goblog', 'haram', 'hencet', 'hentai', 'idiot',
        'jablai', 'jablay', 'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat',
        'jingan', 'kampang', 'keparat', 'kimak', 'kirik', 'klentit', 'klitoris',
        'konthol', 'kontol', 'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol',
        'lonte', 'maho', 'masturbasi', 'matane', 'mati', 'memek', 'mesum',
        'modar', 'modyar', 'mokad', 'najis', 'nazi', 'ndhasmu', 'nenen',
        'ngentot', 'ngolom', 'ngulum', 'nigga', 'nigger', 'onani', 'orgasme',
        'paksa', 'pantat', 'pantek', 'pecun', 'peli', 'penis', 'pentil', 'pepek',
        'perek', 'perkosa', 'piatu', 'porno', 'pukimak', 'qontol', 'selangkang',
        'sempak', 'senggama', 'setan', 'setubuh', 'silet', 'silit', 'sinting',
        'sodomi', 'stres', 'telanjang', 'telaso', 'tete', 'tewas', 'titit',
        'togel', 'toket', 'tusbol', 'urin', 'vagina'
    }

    @classmethod
    def detect_profanity(cls, text: str) -> Dict:
        """Find profane words and 2-3 word phrases in ``text``.

        The text is lower-cased and tokenised with ``\\b\\w+\\b``; single
        tokens are checked first, then every run of two and of three
        consecutive tokens (joined by a single space) is checked against
        ``PROFANITY_WORDS``.

        Args:
            text: Free text to scan.

        Returns:
            Dict with keys:
                - has_profanity: bool, True if anything was found
                - profanity_count: int, number of hits (repeats counted)
                - profanity_words: List[str], distinct hits in the order
                  they were first found (single words before phrases)
        """
        words = re.findall(r'\b\w+\b', text.lower())

        # Single-word hits, in text order.
        found_profanity = [word for word in words if word in cls.PROFANITY_WORDS]

        # Phrase hits: sliding windows of 2 and then 3 consecutive tokens.
        for size in (2, 3):
            for start in range(len(words) - size + 1):
                phrase = " ".join(words[start:start + size])
                if phrase in cls.PROFANITY_WORDS:
                    found_profanity.append(phrase)

        return {
            'has_profanity': bool(found_profanity),
            'profanity_count': len(found_profanity),
            # dict.fromkeys de-duplicates while keeping a deterministic,
            # first-seen order (a plain set() would return arbitrary order).
            'profanity_words': list(dict.fromkeys(found_profanity)),
        }
class SequenceAligner:
    """Word-level global sequence alignment used for word matching.

    Both methods are static: they take no ``self`` and can be called on the
    class or on an instance.
    """

    @staticmethod
    def calculate_similarity(word1: str, word2: str) -> float:
        """Return the case-insensitive difflib similarity ratio of two words.

        Returns:
            A float in [0.0, 1.0]; 1.0 means the lower-cased words are equal.
        """
        return SequenceMatcher(None, word1.lower(), word2.lower()).ratio()

    @staticmethod
    def align_sequences(
        reference: List[str],
        detected: List[str],
        match_threshold: float = 0.7
    ) -> List[Tuple[Optional[str], Optional[str], str]]:
        """Align two word sequences with dynamic programming.

        A global alignment is computed with a score of +2 for a match, -1 for
        a mismatch and -1 per gap. Two words count as a match when their
        similarity is at least ``match_threshold``.

        Args:
            reference: Expected words.
            detected: Transcribed words.
            match_threshold: Minimum similarity for two words to match.

        Returns:
            The alignment in reading order as ``(ref_word, det_word, type)``
            tuples, where ``type`` is one of:
                - "match":        both words present and similar enough
                - "substitution": both words present but too different
                - "deletion":     reference word with no counterpart
                                  (``det_word`` is None)
                - "insertion":    detected word with no counterpart
                                  (``ref_word`` is None)
        """
        MATCH_SCORE = 2
        MISMATCH_PENALTY = -1
        GAP_PENALTY = -1

        rows, cols = len(reference), len(detected)

        # Compare every word pair once; the result is needed both while
        # filling the table and again during the traceback.
        is_match = [
            [
                SequenceAligner.calculate_similarity(ref_word, det_word) >= match_threshold
                for det_word in detected
            ]
            for ref_word in reference
        ]

        # score[i][j]: best score aligning reference[:i] with detected[:j].
        # move[i][j]:  step that produced it ('diagonal', 'up' or 'left').
        score = [[0] * (cols + 1) for _ in range(rows + 1)]
        move = [['done'] * (cols + 1) for _ in range(rows + 1)]
        for i in range(1, rows + 1):
            score[i][0] = i * GAP_PENALTY
            move[i][0] = 'up'
        for j in range(1, cols + 1):
            score[0][j] = j * GAP_PENALTY
            move[0][j] = 'left'

        for i in range(1, rows + 1):
            for j in range(1, cols + 1):
                pair_score = MATCH_SCORE if is_match[i - 1][j - 1] else MISMATCH_PENALTY
                diagonal = score[i - 1][j - 1] + pair_score
                up = score[i - 1][j] + GAP_PENALTY
                left = score[i][j - 1] + GAP_PENALTY
                best = max(diagonal, up, left)
                score[i][j] = best
                # Ties are resolved in the order diagonal, up, left.
                if best == diagonal:
                    move[i][j] = 'diagonal'
                elif best == up:
                    move[i][j] = 'up'
                else:
                    move[i][j] = 'left'

        # Trace back from the bottom-right corner to the origin.
        alignment = []
        i, j = rows, cols
        while i > 0 or j > 0:
            step = move[i][j]
            if step == 'diagonal':
                match_type = "match" if is_match[i - 1][j - 1] else "substitution"
                alignment.append((reference[i - 1], detected[j - 1], match_type))
                i -= 1
                j -= 1
            elif step == 'up':
                alignment.append((reference[i - 1], None, "deletion"))
                i -= 1
            else:
                alignment.append((None, detected[j - 1], "insertion"))
                j -= 1

        alignment.reverse()
        return alignment
class ArticulationService:
    """Articulation assessment service.

    Compares a transcription with a reference text word by word and turns
    the share of correctly reproduced reference words into a 1-5 score.
    """

    # (minimum accuracy in percent, category label, points), best band first.
    _SCORE_BANDS = (
        (81, "Sangat Baik", 5),
        (61, "Baik", 4),
        (41, "Cukup", 3),
        (21, "Buruk", 2),
    )
    # Result for anything below the lowest band above.
    _LOWEST_BAND = ("Perlu Ditingkatkan", 1)
    _MAX_FILLER_WORDS = 10  # cap on distinct filler words reported
    _MAX_WORD_SCORES = 50  # cap on per-word entries reported

    def __init__(self):
        """Create the filler detector and the aligner used by ``analyze``."""
        print("🗣️ Initializing Articulation Service")
        self.filler_detector = FillerWordsDetector()
        self.aligner = SequenceAligner()
        print("✅ Articulation Service ready!\n")

    def normalize_text(self, text: str) -> str:
        """Lower-case ``text``, turn ``, . ! ? ; :`` into spaces and collapse whitespace."""
        text = text.lower()
        text = re.sub(r'[,\.!?;:]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def tokenize_words(self, text: str) -> List[str]:
        """Normalize ``text`` and split it into a list of words."""
        return [word for word in self.normalize_text(text).split() if word]

    @staticmethod
    def _count_correct_words(
        alignment: List[Tuple[Optional[str], Optional[str], str]]
    ) -> int:
        """Count the alignment entries whose type is "match".

        Every matched reference word counts, including words that also
        appear in the filler list ("jadi", "kemudian", ...): the reference
        expected them, and they are part of the word total the count is
        divided by.
        """
        return sum(1 for _, _, match_type in alignment if match_type == "match")

    @classmethod
    def _categorize(cls, accuracy_percentage: float) -> Tuple[str, int]:
        """Map an accuracy percentage to ``(category, points)``."""
        for minimum, category, points in cls._SCORE_BANDS:
            if accuracy_percentage >= minimum:
                return category, points
        return cls._LOWEST_BAND

    def _build_word_scores(
        self, alignment: List[Tuple[Optional[str], Optional[str], str]]
    ) -> List["WordScore"]:
        """Convert alignment entries into ``WordScore`` records."""
        word_scores = []
        for idx, (ref_word, det_word, match_type) in enumerate(alignment):
            # Similarity only makes sense when both sides are present.
            if ref_word and det_word:
                similarity = self.aligner.calculate_similarity(ref_word, det_word)
            else:
                similarity = 0.0
            word_scores.append(WordScore(
                index=idx,
                expected=ref_word or "[INSERTION]",
                detected=det_word or "[DELETION]",
                is_correct=(match_type == "match"),
                similarity=similarity,
                is_filler=bool(det_word) and self.filler_detector.is_filler(det_word),
                match_type=match_type,
            ))
        return word_scores

    def analyze(
        self,
        transcribed_text: str,
        reference_text: str,
        *,
        include_word_scores: bool = False
    ) -> Dict:
        """Analyse the articulation of a transcription against a reference.

        Args:
            transcribed_text: Text produced by transcription.
            reference_text: Text the speaker was expected to say.
            include_word_scores: When True, add a ``word_scores`` key with
                the per-word alignment details (first 50 entries) as dicts.

        Returns:
            Dict with keys:
                - score: int points from 1 to 5
                - category: label of the score band
                - accuracy_percentage: matched reference words divided by
                  the number of reference words, rounded to one decimal
                  (0 when the reference is empty)
                - correct_words: number of matched reference words
                - total_words: number of reference words
                - filler_count: filler tokens found in the raw transcription
                - filler_words: up to 10 distinct filler tokens, in order of
                  first appearance
                - word_scores: only when ``include_word_scores`` is True
        """
        print("🗣️ Analyzing articulation...")

        reference_words = self.tokenize_words(reference_text)
        detected_words = self.tokenize_words(transcribed_text)

        # Fillers are counted on the raw transcription, independently of the
        # alignment with the reference.
        filler_count, filler_list = self.filler_detector.count_fillers(transcribed_text)

        alignment = self.aligner.align_sequences(
            reference_words,
            detected_words,
            match_threshold=0.7
        )

        correct_words = self._count_correct_words(alignment)
        total_words = len(reference_words)
        accuracy_percentage = (correct_words / total_words * 100) if total_words > 0 else 0
        category, points = self._categorize(accuracy_percentage)

        print("✅ Articulation analysis complete!\n")

        result = {
            'score': points,
            'category': category,
            'accuracy_percentage': round(accuracy_percentage, 1),
            'correct_words': correct_words,
            'total_words': total_words,
            'filler_count': filler_count,
            # De-duplicate in first-seen order so the truncated list is
            # deterministic (a set would keep an arbitrary ten).
            'filler_words': list(dict.fromkeys(filler_list))[:self._MAX_FILLER_WORDS],
        }
        if include_word_scores:
            limited = alignment[:self._MAX_WORD_SCORES]
            result['word_scores'] = [asdict(ws) for ws in self._build_word_scores(limited)]
        return result