fariedalfarizi's picture
Fix FillerWordsDetector duplicate and unpacking error
82e47b6
raw
history blame
12.1 kB
"""
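Articulation Analysis Service

Articulation/pronunciation analysis: aligns the words of a transcription
against a reference text and scores how many reference words were matched.
NOTE(review): the original header described this as "BERT-based alignment";
the alignment code in this file uses difflib similarity with dynamic
programming -- confirm whether a BERT component exists elsewhere.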
"""
import torch
import numpy as np
import string
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, asdict
import re
import warnings
from difflib import SequenceMatcher
warnings.filterwarnings('ignore')
@dataclass
class WordScore:
    """Score for a single aligned word position.

    One instance describes one entry of the alignment between the reference
    text and the transcribed text.
    """
    # Position of this entry within the alignment (0-based, from enumerate()).
    index: int
    # Reference word; ArticulationService.analyze stores "[INSERTION]" here
    # when the alignment entry has no reference word.
    expected: str
    # Transcribed word; ArticulationService.analyze stores "[DELETION]" here
    # when the alignment entry has no detected word.
    detected: str
    # True when the alignment entry is of type "match".
    is_correct: bool
    # Similarity ratio between expected and detected (0.0 when one is missing).
    similarity: float
    # True when the detected word is recognised as a filler word.
    is_filler: bool = False
    # Alignment entry type; SequenceAligner.align_sequences produces "match",
    # "substitution", "deletion" or "insertion".
    match_type: str = "match"
class FillerWordsDetector:
    """Detect filler words (hesitations, discourse particles) in Indonesian text.

    A token is a filler when, after lower-casing and trimming surrounding
    whitespace/punctuation, it is listed in ``FILLER_WORDS`` or is an
    elongated hesitation sound such as ``ummmm`` or ``ehhhm``.
    """

    # NOTE: the multi-word entries ('gitu loh', 'apa ya', ...) can only match
    # when is_filler() is handed the whole phrase; count_fillers() tokenises
    # on whitespace and therefore only ever tests single words.
    FILLER_WORDS = {
        'um', 'umm', 'ummm', 'em', 'emm', 'emmm',
        'eh', 'ehh', 'ehhh', 'ehm', 'ehmm', 'ehmmm',
        'ah', 'ahh', 'ahhh', 'ahm', 'ahmm', 'ahmmm',
        'hmm', 'hmmm', 'hmmmm',
        'uh', 'uhh', 'uhhh', 'uhm', 'uhmm',
        'anu', 'ano', 'gitu', 'gituloh', 'gitu loh',
        'kayak', 'kayaknya', 'kayak gini', 'kayak gitu',
        'apa', 'apa ya', 'apa namanya',
        'maksudnya', 'maksud saya', 'jadi', 'jadinya',
        'nah', 'terus', 'lalu', 'kemudian',
        'gini', 'begini', 'begitu',
        'semacam', 'semisal', 'ibaratnya',
        'ya kan', 'kan', 'ya', 'yah',
        'sepertinya', 'mungkin',
        'toh', 'sih', 'deh', 'dong', 'lah',
    }

    # Elongated hesitation sounds of arbitrary length; compiled once instead
    # of on every is_filler() call.
    _ELONGATED_FILLER_RE = re.compile(r'^(um+|em+|eh+m*|ah+m*|uh+m*|hmm+)$')

    # Characters trimmed from BOTH ends of a token before lookup, so quoted or
    # parenthesised tokens such as '"um' or '(eh)' are recognised too.
    _TRIM_CHARS = string.whitespace + string.punctuation

    @classmethod
    def is_filler(cls, word: str) -> bool:
        """Return True if ``word`` is a filler word.

        Args:
            word: A single token (or phrase); case and surrounding
                whitespace/punctuation are ignored.

        Returns:
            True when the cleaned token is in ``FILLER_WORDS`` or matches the
            elongated-hesitation pattern, False otherwise (including when
            nothing is left after trimming).
        """
        word_clean = word.lower().strip(cls._TRIM_CHARS)
        if not word_clean:
            return False
        if word_clean in cls.FILLER_WORDS:
            return True
        return cls._ELONGATED_FILLER_RE.match(word_clean) is not None

    @classmethod
    def count_fillers(cls, text: str) -> Tuple[int, List[str]]:
        """Count filler words in ``text``.

        The text is lower-cased and split on whitespace; each token is tested
        with :meth:`is_filler`.

        Returns:
            A ``(count, fillers)`` tuple where ``fillers`` holds the matching
            lower-cased tokens exactly as split (punctuation attached to a
            token is kept), in order of appearance and with repeats.
        """
        fillers = [token for token in text.lower().split() if cls.is_filler(token)]
        return len(fillers), fillers
class ProfanityDetector:
    """Detect profane words and phrases in Indonesian and English text."""

    PROFANITY_WORDS = {
        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'shit', 'fuck',
        'tolol', 'oon', 'bego', 'gak ada otak', 'goblok', 'bodoh', 'anjim',
        'anjing', 'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci',
        'bangke', 'bangor', 'bangsat', 'bejad', 'bencong', 'bodat', 'bugil',
        'bundir', 'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium',
        'colai', 'coli', 'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu',
        'dancuk', 'dewasa', 'dick', 'dildo', 'encuk', 'gay', 'gei', 'gembel',
        'gey', 'gigolo', 'gila', 'goblog', 'haram', 'hencet', 'hentai', 'idiot',
        'jablai', 'jablay', 'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat',
        'jingan', 'kampang', 'keparat', 'kimak', 'kirik', 'klentit', 'klitoris',
        'konthol', 'kontol', 'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol',
        'lonte', 'maho', 'masturbasi', 'matane', 'mati', 'memek', 'mesum',
        'modar', 'modyar', 'mokad', 'najis', 'nazi', 'ndhasmu', 'nenen',
        'ngentot', 'ngolom', 'ngulum', 'nigga', 'nigger', 'onani', 'orgasme',
        'paksa', 'pantat', 'pantek', 'pecun', 'peli', 'penis', 'pentil', 'pepek',
        'perek', 'perkosa', 'piatu', 'porno', 'pukimak', 'qontol', 'selangkang',
        'sempak', 'senggama', 'setan', 'setubuh', 'silet', 'silit', 'sinting',
        'sodomi', 'stres', 'telanjang', 'telaso', 'tete', 'tewas', 'titit',
        'togel', 'toket', 'tusbol', 'urin', 'vagina'
    }

    @classmethod
    def detect_profanity(cls, text: str) -> Dict:
        """Detect profane words in ``text``.

        The text is lower-cased and split into ``\\w+`` tokens; single tokens
        and every run of 2 or 3 consecutive tokens are looked up in
        ``PROFANITY_WORDS``.

        Returns:
            Dict with keys:
                - has_profanity: bool
                - profanity_count: int, number of hits (repeats included)
                - profanity_words: List[str], the distinct hits in order of
                  first detection (single words first, then 2-word phrases,
                  then 3-word phrases)
        """
        words = re.findall(r'\b\w+\b', text.lower())

        # Single-word hits.
        found_profanity = [word for word in words if word in cls.PROFANITY_WORDS]

        # Phrase hits over sliding windows of 2 and 3 consecutive words; the
        # range is empty when the text is shorter than the window.
        for size in (2, 3):
            for start in range(len(words) - size + 1):
                phrase = ' '.join(words[start:start + size])
                if phrase in cls.PROFANITY_WORDS:
                    found_profanity.append(phrase)

        return {
            'has_profanity': len(found_profanity) > 0,
            'profanity_count': len(found_profanity),
            # dict.fromkeys de-duplicates while keeping a deterministic order
            # (a plain set would return the words in arbitrary order).
            'profanity_words': list(dict.fromkeys(found_profanity)),
        }

    @classmethod
    def count_fillers(cls, text: str) -> Tuple[int, List[str]]:
        """Count filler words in ``text`` (compatibility shim).

        This method is a leftover duplicate of
        ``FillerWordsDetector.count_fillers``; it used to call
        ``cls.is_filler``, which this class does not define, and therefore
        failed on any non-empty text. It is kept so existing callers do not
        break and simply delegates to ``FillerWordsDetector``.
        """
        return FillerWordsDetector.count_fillers(text)
class SequenceAligner:
    """Word-level global alignment of a reference sequence and a detected one."""

    @staticmethod
    def calculate_similarity(word1: str, word2: str) -> float:
        """Return the case-insensitive difflib similarity ratio of two words."""
        matcher = SequenceMatcher(None, word1.lower(), word2.lower())
        return matcher.ratio()

    @staticmethod
    def align_sequences(
        reference: List[str],
        detected: List[str],
        match_threshold: float = 0.7
    ) -> List[Tuple[Optional[str], Optional[str], str]]:
        """Align two word sequences with dynamic programming.

        Two words count as a match when their similarity ratio is at least
        ``match_threshold``. Scoring: +2 for a match, -1 for a mismatch and
        -1 for a gap. On equal scores the diagonal move is preferred, then
        'up' (skip a reference word), then 'left' (skip a detected word).

        Returns:
            A list of ``(reference_word, detected_word, kind)`` tuples in
            sequence order, where ``kind`` is "match", "substitution",
            "deletion" (detected word is None) or "insertion" (reference
            word is None).
        """
        ref_len = len(reference)
        det_len = len(detected)
        match_reward, mismatch_cost, gap_cost = 2, -1, -1

        # Separate tables for the best score and the move that produced it.
        scores = [[0] * (det_len + 1) for _ in range(ref_len + 1)]
        moves = [['done'] * (det_len + 1) for _ in range(ref_len + 1)]

        # Borders: only gaps are possible along the first column / first row.
        for row in range(1, ref_len + 1):
            scores[row][0] = row * gap_cost
            moves[row][0] = 'up'
        for col in range(1, det_len + 1):
            scores[0][col] = col * gap_cost
            moves[0][col] = 'left'

        for row in range(1, ref_len + 1):
            for col in range(1, det_len + 1):
                ratio = SequenceAligner.calculate_similarity(
                    reference[row - 1], detected[col - 1]
                )
                pair_score = match_reward if ratio >= match_threshold else mismatch_cost

                via_diagonal = scores[row - 1][col - 1] + pair_score
                via_up = scores[row - 1][col] + gap_cost
                via_left = scores[row][col - 1] + gap_cost

                best = max(via_diagonal, via_up, via_left)
                if best == via_diagonal:
                    step = 'diagonal'
                elif best == via_up:
                    step = 'up'
                else:
                    step = 'left'
                scores[row][col] = best
                moves[row][col] = step

        # Walk back from the bottom-right corner, collecting entries in
        # reverse order.
        backwards = []
        row, col = ref_len, det_len
        while row > 0 or col > 0:
            step = moves[row][col]
            if step == 'diagonal':
                expected_word = reference[row - 1]
                spoken_word = detected[col - 1]
                ratio = SequenceAligner.calculate_similarity(expected_word, spoken_word)
                kind = "match" if ratio >= match_threshold else "substitution"
                backwards.append((expected_word, spoken_word, kind))
                row -= 1
                col -= 1
            elif step == 'up':
                backwards.append((reference[row - 1], None, "deletion"))
                row -= 1
            else:
                backwards.append((None, detected[col - 1], "insertion"))
                col -= 1

        return backwards[::-1]
class ArticulationService:
    """Articulation assessment service.

    Compares a transcription against a reference text word by word and turns
    the share of correctly matched reference words into a 1-5 score.
    """

    # (minimum accuracy %, category label, points), highest band first.
    _SCORE_BANDS = (
        (81, "Sangat Baik", 5),
        (61, "Baik", 4),
        (41, "Cukup", 3),
        (21, "Buruk", 2),
    )
    # Band used when the accuracy is below every threshold above.
    _LOWEST_BAND = ("Perlu Ditingkatkan", 1)

    def __init__(self):
        """Initialize service"""
        print("🗣️ Initializing Articulation Service")
        self.filler_detector = FillerWordsDetector()
        self.aligner = SequenceAligner()
        print("✅ Articulation Service ready!\n")

    def normalize_text(self, text: str) -> str:
        """Lower-case ``text``, replace runs of ``, . ! ? ; :`` with a space
        and collapse whitespace."""
        text = text.lower()
        text = re.sub(r'[,\.!?;:]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def tokenize_words(self, text: str) -> List[str]:
        """Normalize ``text`` and split it into a list of non-empty words."""
        return [w for w in self.normalize_text(text).split() if w]

    @classmethod
    def _categorize(cls, accuracy_percentage: float) -> Tuple[str, int]:
        """Map an accuracy percentage to its ``(category, points)`` band."""
        for minimum, category, points in cls._SCORE_BANDS:
            if accuracy_percentage >= minimum:
                return category, points
        return cls._LOWEST_BAND

    def analyze(
        self,
        transcribed_text: str,
        reference_text: str,
        include_word_scores: bool = False,
    ) -> Dict:
        """Analyze articulation of a transcription against a reference text.

        Args:
            transcribed_text: Text produced by transcription.
            reference_text: Text the speaker was expected to say.
            include_word_scores: When True, also return per-word details
                under ``'word_scores'`` (first 50 alignment entries, each a
                ``WordScore`` converted to a dict). Off by default, so the
                default result has the same keys as before.

        Returns:
            Dict with ``score`` (1-5), ``category``, ``accuracy_percentage``
            (matched reference words / reference words, rounded to 1
            decimal), ``correct_words``, ``total_words``, ``filler_count``
            and ``filler_words`` (up to 10 distinct fillers in order of
            first appearance).
        """
        print(f"🗣️ Analyzing articulation...")

        reference_words = self.tokenize_words(reference_text)
        detected_words = self.tokenize_words(transcribed_text)

        # Fillers are counted on the raw transcription, independent of the
        # alignment below.
        filler_count, filler_list = self.filler_detector.count_fillers(transcribed_text)

        alignment = self.aligner.align_sequences(
            reference_words,
            detected_words,
            match_threshold=0.7
        )

        word_scores = []
        correct_words = 0
        for idx, (ref_word, det_word, match_type) in enumerate(alignment):
            is_match = match_type == "match"
            # A word matched to the reference was expected, so it counts as
            # correct even if it also appears in the filler vocabulary
            # (e.g. "jadi", "ya"). Previously such words were left out of
            # correct_words while still counted in total_words, so a perfect
            # reading could score below 100%.
            if is_match:
                correct_words += 1

            if include_word_scores:
                both_present = bool(ref_word) and bool(det_word)
                similarity = (
                    self.aligner.calculate_similarity(ref_word, det_word)
                    if both_present else 0.0
                )
                # Only unmatched detected words are flagged as fillers.
                is_filler = (
                    not is_match
                    and bool(det_word)
                    and self.filler_detector.is_filler(det_word)
                )
                word_scores.append(WordScore(
                    index=idx,
                    expected=ref_word or "[INSERTION]",
                    detected=det_word or "[DELETION]",
                    is_correct=is_match,
                    similarity=similarity,
                    is_filler=is_filler,
                    match_type=match_type
                ))

        total_words = len(reference_words)
        # An empty reference yields 0% rather than a division by zero.
        accuracy_percentage = (correct_words / total_words * 100) if total_words > 0 else 0.0
        category, points = self._categorize(accuracy_percentage)

        print(f"✅ Articulation analysis complete!\n")

        result = {
            'score': points,
            'category': category,
            'accuracy_percentage': round(accuracy_percentage, 1),
            'correct_words': correct_words,
            'total_words': total_words,
            'filler_count': filler_count,
            # Order-preserving de-duplication so the reported fillers (and
            # which ones survive the cut at 10) are deterministic.
            'filler_words': list(dict.fromkeys(filler_list))[:10],
        }
        if include_word_scores:
            # Limit to the first 50 alignment entries.
            result['word_scores'] = [asdict(ws) for ws in word_scores[:50]]
        return result