Spaces:
Sleeping
Sleeping
Commit
·
9abd5a8
1
Parent(s):
6dc05e5
feat: enhance filler detection & profanity detection with fuzzy matching
- Filler words: hybrid exact/fuzzy matching (90% threshold for long words)
- Remove 'ini' from filler list (legitimate word, not filler)
- Profanity: hybrid detection (exact + fuzzy + leet speak normalization)
- Add 130+ Indonesian profanity words and multi-word phrases
- Character substitution detection (t0l0l → tolol, f*ck → fuck)
- Fuzzy matching 85% threshold for typo variations
- Add rapidfuzz dependency for advanced string matching
- app/services/articulation.py +127 -25
- requirements.txt +1 -0
app/services/articulation.py
CHANGED
|
@@ -10,6 +10,7 @@ import numpy as np
|
|
| 10 |
from typing import Dict, List, Tuple, Optional
|
| 11 |
import re
|
| 12 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class ArticulationService:
|
|
@@ -43,7 +44,7 @@ class ArticulationService:
|
|
| 43 |
# Filler words bahasa Indonesia
|
| 44 |
self.filler_words = [
|
| 45 |
'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
|
| 46 |
-
'anu', '
|
| 47 |
'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
|
| 48 |
]
|
| 49 |
|
|
@@ -148,39 +149,55 @@ class ArticulationService:
|
|
| 148 |
"""Deteksi kata-kata pengisi (filler words)"""
|
| 149 |
print("🔎 Detecting filler words...")
|
| 150 |
|
| 151 |
-
|
|
|
|
| 152 |
total_words = len(words)
|
| 153 |
|
| 154 |
if total_words == 0:
|
| 155 |
return {
|
| 156 |
'filler_count': 0,
|
| 157 |
-
'filler_ratio': 0,
|
| 158 |
'filler_words_found': []
|
| 159 |
}
|
| 160 |
|
| 161 |
-
# Count filler words
|
| 162 |
filler_found = []
|
| 163 |
filler_count = 0
|
| 164 |
|
| 165 |
for word in words:
|
| 166 |
-
# Clean word
|
| 167 |
-
clean_word = re.sub(r'[^\w\s]', '', word)
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
filler_count += 1
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
filler_ratio = filler_count / total_words
|
| 174 |
|
| 175 |
-
print(f" Filler Words: {filler_count}/{total_words}
|
| 176 |
if filler_found:
|
| 177 |
print(f" Found: {', '.join(filler_found)}")
|
| 178 |
|
| 179 |
return {
|
| 180 |
'filler_count': filler_count,
|
| 181 |
-
'
|
| 182 |
-
'filler_words_found': filler_found,
|
| 183 |
-
'total_words': total_words
|
| 184 |
}
|
| 185 |
|
| 186 |
def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
|
|
@@ -451,11 +468,8 @@ class ArticulationService:
|
|
| 451 |
'consistency': round(clarity['consistency'], 3)
|
| 452 |
}
|
| 453 |
|
| 454 |
-
result['
|
| 455 |
-
|
| 456 |
-
'ratio': round(filler['filler_ratio'], 3),
|
| 457 |
-
'words_found': filler['filler_words_found']
|
| 458 |
-
}
|
| 459 |
|
| 460 |
result['stability_metrics'] = {
|
| 461 |
'syllable_rate': round(stability['avg_syllable_rate'], 2),
|
|
@@ -477,18 +491,106 @@ class ArticulationService:
|
|
| 477 |
\
|
| 478 |
|
| 479 |
class ProfanityDetector:
|
|
|
|
|
|
|
|
|
|
| 480 |
PROFANITY_WORDS = {
|
| 481 |
-
'anjir', 'anjay', '
|
| 482 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
}
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
@classmethod
|
| 486 |
def detect_profanity(cls, text: str) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
text_lower = text.lower()
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
return {
|
| 491 |
'has_profanity': len(found_profanity) > 0,
|
| 492 |
-
'profanity_count':
|
| 493 |
'profanity_words': list(set(found_profanity))
|
| 494 |
}
|
|
|
|
| 10 |
from typing import Dict, List, Tuple, Optional
|
| 11 |
import re
|
| 12 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
| 13 |
+
from rapidfuzz import fuzz
|
| 14 |
|
| 15 |
|
| 16 |
class ArticulationService:
|
|
|
|
| 44 |
# Filler words bahasa Indonesia
|
| 45 |
self.filler_words = [
|
| 46 |
'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
|
| 47 |
+
'anu', 'itu', 'gitu', 'kayak', 'seperti',
|
| 48 |
'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
|
| 49 |
]
|
| 50 |
|
|
|
|
| 149 |
"""Deteksi kata-kata pengisi (filler words)"""
|
| 150 |
print("🔎 Detecting filler words...")
|
| 151 |
|
| 152 |
+
# Split by whitespace to preserve original form
|
| 153 |
+
words = transcript.split()
|
| 154 |
total_words = len(words)
|
| 155 |
|
| 156 |
if total_words == 0:
|
| 157 |
return {
|
| 158 |
'filler_count': 0,
|
|
|
|
| 159 |
'filler_words_found': []
|
| 160 |
}
|
| 161 |
|
| 162 |
+
# Count filler words using fuzzy matching + exact match for short words
|
| 163 |
filler_found = []
|
| 164 |
filler_count = 0
|
| 165 |
|
| 166 |
for word in words:
|
| 167 |
+
# Clean word for checking (lowercase, remove punctuation)
|
| 168 |
+
clean_word = re.sub(r'[^\w\s]', '', word.lower())
|
| 169 |
+
|
| 170 |
+
# Skip empty words
|
| 171 |
+
if not clean_word:
|
| 172 |
+
continue
|
| 173 |
+
|
| 174 |
+
is_filler = False
|
| 175 |
+
|
| 176 |
+
# For short words (2-3 chars), use exact match to avoid false positives
|
| 177 |
+
if len(clean_word) <= 3:
|
| 178 |
+
if clean_word in self.filler_words:
|
| 179 |
+
is_filler = True
|
| 180 |
+
else:
|
| 181 |
+
# For longer words, use fuzzy matching with 90% threshold
|
| 182 |
+
for filler_word in self.filler_words:
|
| 183 |
+
similarity = fuzz.ratio(clean_word, filler_word)
|
| 184 |
+
if similarity >= 90: # 90% threshold untuk presisi lebih tinggi
|
| 185 |
+
is_filler = True
|
| 186 |
+
break
|
| 187 |
+
|
| 188 |
+
if is_filler:
|
| 189 |
filler_count += 1
|
| 190 |
+
# Keep original word form (with punctuation like 'ehm...')
|
| 191 |
+
if word not in filler_found:
|
| 192 |
+
filler_found.append(word)
|
|
|
|
| 193 |
|
| 194 |
+
print(f" Filler Words: {filler_count}/{total_words}")
|
| 195 |
if filler_found:
|
| 196 |
print(f" Found: {', '.join(filler_found)}")
|
| 197 |
|
| 198 |
return {
|
| 199 |
'filler_count': filler_count,
|
| 200 |
+
'filler_words_found': filler_found
|
|
|
|
|
|
|
| 201 |
}
|
| 202 |
|
| 203 |
def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
|
|
|
|
| 468 |
'consistency': round(clarity['consistency'], 3)
|
| 469 |
}
|
| 470 |
|
| 471 |
+
result['filler_count'] = filler['filler_count']
|
| 472 |
+
result['filler_words'] = filler['filler_words_found']
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
result['stability_metrics'] = {
|
| 475 |
'syllable_rate': round(stability['avg_syllable_rate'], 2),
|
|
|
|
| 491 |
\
|
| 492 |
|
| 493 |
class ProfanityDetector:
    """Detect profane/offensive words using a hybrid approach.

    Detection pipeline (see :meth:`detect_profanity`):
      1. exact match against ``PROFANITY_WORDS`` (raw and leet-normalized form)
      2. leet-speak normalization via ``CHAR_SUBSTITUTIONS`` (t0l0l -> tolol)
      3. fuzzy matching (rapidfuzz) for typo variations (anjiir -> anjir)
    """

    # Base profanity words (Indonesian root forms + common English)
    PROFANITY_WORDS = {
        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'anjim', 'anjing',
        'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci', 'bangke', 'bangor',
        'bangsat', 'bego', 'bejad', 'bencong', 'bodat', 'bodoh', 'bugil', 'bundir',
        'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium', 'colai', 'coli',
        'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu', 'dancuk', 'dewasa', 'dick',
        'dildo', 'encuk', 'fuck', 'gay', 'gei', 'gembel', 'gey', 'gigolo', 'gila',
        'goblog', 'goblok', 'haram', 'hencet', 'hentai', 'idiot', 'jablai', 'jablay',
        'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat', 'jingan', 'kampang',
        'keparat', 'kimak', 'kirik', 'klentit', 'klitoris', 'konthol', 'kontol',
        'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol', 'lonte', 'maho',
        'masturbasi', 'matane', 'mati', 'memek', 'mesum', 'modar', 'modyar', 'mokad',
        'najis', 'nazi', 'ndhasmu', 'nenen', 'ngentot', 'ngolom', 'ngulum', 'nigga',
        'nigger', 'onani', 'oon', 'orgasme', 'paksa', 'pantat', 'pantek', 'pecun',
        'peli', 'penis', 'pentil', 'pepek', 'perek', 'perkosa', 'piatu', 'porno',
        'pukimak', 'qontol', 'selangkang', 'sempak', 'senggama', 'setan', 'setubuh',
        'shit', 'silet', 'silit', 'sinting', 'sodomi', 'stres', 'telanjang', 'telaso',
        'tete', 'tewas', 'titit', 'togel', 'toket', 'tolol', 'tusbol', 'urin', 'vagina'
    }

    # Multi-word profanity phrases, matched as substrings of the full text
    PROFANITY_PHRASES = {
        'gak ada otak', 'tidak ada otak', 'ga ada otak'
    }

    # Character substitution map (leet speak / censoring); '*' is treated as
    # a censor character and simply removed.
    CHAR_SUBSTITUTIONS = {
        '0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's',
        '7': 't', '8': 'b', '@': 'a', '$': 's', '*': ''
    }

    @classmethod
    def normalize_word(cls, word: str) -> str:
        """Lowercase *word* and undo common leet-speak character substitutions."""
        normalized = word.lower()
        for char, replacement in cls.CHAR_SUBSTITUTIONS.items():
            normalized = normalized.replace(char, replacement)
        return normalized

    @classmethod
    def detect_profanity(cls, text: str) -> dict:
        """
        Detect profanity using hybrid approach:
        1. Exact match for quick detection
        2. Leet-speak normalization for character substitution (t0l0l -> tolol)
        3. Fuzzy match for typo variations

        Returns a dict with 'has_profanity' (bool), 'profanity_count' (int,
        phrase hits + word hits) and 'profanity_words' (unique matches,
        in their original spelling).
        """
        text_lower = text.lower()

        # Tokenize keeping the leet/censor characters ('@', '$', '*') so that
        # censored forms like 'f*ck' or '$hit' survive as a single token.
        # A plain \w+ split would break them apart before normalization,
        # defeating CHAR_SUBSTITUTIONS entirely.
        raw_words = re.findall(r'[\w@$*]+', text_lower)

        found_profanity = []
        profanity_count = 0

        # Step 1: Check multi-word phrases first (substring match on full text)
        for phrase in cls.PROFANITY_PHRASES:
            if phrase in text_lower:
                profanity_count += 1
                if phrase not in found_profanity:
                    found_profanity.append(phrase)

        # Step 2: Check individual words
        for word in raw_words:
            normalized = cls.normalize_word(word)

            # A. Exact match on the raw or leet-normalized form (fastest)
            is_profane = (word in cls.PROFANITY_WORDS
                          or normalized in cls.PROFANITY_WORDS)

            # B. Fuzzy match for typo variations (anjiir, anjiirr, etc.).
            # Compare the NORMALIZED form so leet-speak typos also match;
            # short words are skipped to avoid false positives.
            if not is_profane and len(word) > 3:
                for profane_word in cls.PROFANITY_WORDS:
                    # Only compare words with similar length (±3 chars) to
                    # keep the scan over the lexicon cheap and precise
                    if abs(len(normalized) - len(profane_word)) <= 3:
                        if fuzz.ratio(normalized, profane_word) >= 85:  # 85% threshold for profanity
                            is_profane = True
                            break

            if is_profane:
                profanity_count += 1
                # Report the original (possibly censored/leet) spelling
                if word not in found_profanity:
                    found_profanity.append(word)

        return {
            'has_profanity': len(found_profanity) > 0,
            'profanity_count': profanity_count,
            'profanity_words': list(set(found_profanity))
        }
|
requirements.txt
CHANGED
|
@@ -20,6 +20,7 @@ protobuf==3.20.3
|
|
| 20 |
# NLP
|
| 21 |
sentence-transformers==2.2.2
|
| 22 |
scikit-learn==1.3.2
|
|
|
|
| 23 |
|
| 24 |
# Audio processing
|
| 25 |
librosa==0.10.1
|
|
|
|
| 20 |
# NLP
|
| 21 |
sentence-transformers==2.2.2
|
| 22 |
scikit-learn==1.3.2
|
| 23 |
+
rapidfuzz==3.5.2
|
| 24 |
|
| 25 |
# Audio processing
|
| 26 |
librosa==0.10.1
|