fariedalfarizi commited on
Commit
9abd5a8
·
1 Parent(s): 6dc05e5

feat: enhance filler detection & profanity detection with fuzzy matching

Browse files

- Filler words: hybrid exact/fuzzy matching (90% threshold for long words)
- Remove 'ini' from filler list (legitimate word, not filler)
- Profanity: hybrid detection (exact + fuzzy + leet speak normalization)
- Add 130+ Indonesian profanity words and multi-word phrases
- Character substitution detection (t0l0l → tolol, f*ck → fuck)
- Fuzzy matching 85% threshold for typo variations
- Add rapidfuzz dependency for advanced string matching

Files changed (2) hide show
  1. app/services/articulation.py +127 -25
  2. requirements.txt +1 -0
app/services/articulation.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
10
  from typing import Dict, List, Tuple, Optional
11
  import re
12
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
13
 
14
 
15
  class ArticulationService:
@@ -43,7 +44,7 @@ class ArticulationService:
43
  # Filler words bahasa Indonesia
44
  self.filler_words = [
45
  'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
46
- 'anu', 'ini', 'itu', 'gitu', 'kayak', 'seperti',
47
  'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
48
  ]
49
 
@@ -148,39 +149,55 @@ class ArticulationService:
148
  """Deteksi kata-kata pengisi (filler words)"""
149
  print("🔎 Detecting filler words...")
150
 
151
- words = transcript.lower().split()
 
152
  total_words = len(words)
153
 
154
  if total_words == 0:
155
  return {
156
  'filler_count': 0,
157
- 'filler_ratio': 0,
158
  'filler_words_found': []
159
  }
160
 
161
- # Count filler words
162
  filler_found = []
163
  filler_count = 0
164
 
165
  for word in words:
166
- # Clean word
167
- clean_word = re.sub(r'[^\w\s]', '', word)
168
- if clean_word in self.filler_words:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  filler_count += 1
170
- if clean_word not in filler_found:
171
- filler_found.append(clean_word)
172
-
173
- filler_ratio = filler_count / total_words
174
 
175
- print(f" Filler Words: {filler_count}/{total_words} ({filler_ratio*100:.1f}%)")
176
  if filler_found:
177
  print(f" Found: {', '.join(filler_found)}")
178
 
179
  return {
180
  'filler_count': filler_count,
181
- 'filler_ratio': filler_ratio,
182
- 'filler_words_found': filler_found,
183
- 'total_words': total_words
184
  }
185
 
186
  def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
@@ -451,11 +468,8 @@ class ArticulationService:
451
  'consistency': round(clarity['consistency'], 3)
452
  }
453
 
454
- result['filler_metrics'] = {
455
- 'count': filler['filler_count'],
456
- 'ratio': round(filler['filler_ratio'], 3),
457
- 'words_found': filler['filler_words_found']
458
- }
459
 
460
  result['stability_metrics'] = {
461
  'syllable_rate': round(stability['avg_syllable_rate'], 2),
@@ -477,18 +491,106 @@ class ArticulationService:
477
  \
478
 
479
  class ProfanityDetector:
 
 
 
480
  PROFANITY_WORDS = {
481
- 'anjir', 'anjay', 'shit', 'fuck', 'tolol', 'bego', 'goblok',
482
- 'bodoh', 'anjing', 'bangsat', 'brengsek'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  }
484
 
 
 
 
 
 
 
 
 
485
  @classmethod
486
  def detect_profanity(cls, text: str) -> dict:
 
 
 
 
 
 
487
  text_lower = text.lower()
488
- words = re.findall(r'\w+', text_lower)
489
- found_profanity = [w for w in words if w in cls.PROFANITY_WORDS]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  return {
491
  'has_profanity': len(found_profanity) > 0,
492
- 'profanity_count': len(found_profanity),
493
  'profanity_words': list(set(found_profanity))
494
  }
 
10
  from typing import Dict, List, Tuple, Optional
11
  import re
12
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
13
+ from rapidfuzz import fuzz
14
 
15
 
16
  class ArticulationService:
 
44
  # Filler words bahasa Indonesia
45
  self.filler_words = [
46
  'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
47
+ 'anu', 'itu', 'gitu', 'kayak', 'seperti',
48
  'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
49
  ]
50
 
 
149
  """Deteksi kata-kata pengisi (filler words)"""
150
  print("🔎 Detecting filler words...")
151
 
152
+ # Split by whitespace to preserve original form
153
+ words = transcript.split()
154
  total_words = len(words)
155
 
156
  if total_words == 0:
157
  return {
158
  'filler_count': 0,
 
159
  'filler_words_found': []
160
  }
161
 
162
+ # Count filler words using fuzzy matching + exact match for short words
163
  filler_found = []
164
  filler_count = 0
165
 
166
  for word in words:
167
+ # Clean word for checking (lowercase, remove punctuation)
168
+ clean_word = re.sub(r'[^\w\s]', '', word.lower())
169
+
170
+ # Skip empty words
171
+ if not clean_word:
172
+ continue
173
+
174
+ is_filler = False
175
+
176
+ # For short words (2-3 chars), use exact match to avoid false positives
177
+ if len(clean_word) <= 3:
178
+ if clean_word in self.filler_words:
179
+ is_filler = True
180
+ else:
181
+ # For longer words, use fuzzy matching with 90% threshold
182
+ for filler_word in self.filler_words:
183
+ similarity = fuzz.ratio(clean_word, filler_word)
184
+ if similarity >= 90: # 90% threshold untuk presisi lebih tinggi
185
+ is_filler = True
186
+ break
187
+
188
+ if is_filler:
189
  filler_count += 1
190
+ # Keep original word form (with punctuation like 'ehm...')
191
+ if word not in filler_found:
192
+ filler_found.append(word)
 
193
 
194
+ print(f" Filler Words: {filler_count}/{total_words}")
195
  if filler_found:
196
  print(f" Found: {', '.join(filler_found)}")
197
 
198
  return {
199
  'filler_count': filler_count,
200
+ 'filler_words_found': filler_found
 
 
201
  }
202
 
203
  def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
 
468
  'consistency': round(clarity['consistency'], 3)
469
  }
470
 
471
+ result['filler_count'] = filler['filler_count']
472
+ result['filler_words'] = filler['filler_words_found']
 
 
 
473
 
474
  result['stability_metrics'] = {
475
  'syllable_rate': round(stability['avg_syllable_rate'], 2),
 
491
  \
492
 
493
class ProfanityDetector:
    """Detect profanity using a hybrid approach (exact + fuzzy + leet-speak patterns)."""

    # Base profanity words (single-token Indonesian + English words).
    PROFANITY_WORDS = {
        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'anjim', 'anjing',
        'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci', 'bangke', 'bangor',
        'bangsat', 'bego', 'bejad', 'bencong', 'bodat', 'bodoh', 'bugil', 'bundir',
        'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium', 'colai', 'coli',
        'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu', 'dancuk', 'dewasa', 'dick',
        'dildo', 'encuk', 'fuck', 'gay', 'gei', 'gembel', 'gey', 'gigolo', 'gila',
        'goblog', 'goblok', 'haram', 'hencet', 'hentai', 'idiot', 'jablai', 'jablay',
        'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat', 'jingan', 'kampang',
        'keparat', 'kimak', 'kirik', 'klentit', 'klitoris', 'konthol', 'kontol',
        'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol', 'lonte', 'maho',
        'masturbasi', 'matane', 'mati', 'memek', 'mesum', 'modar', 'modyar', 'mokad',
        'najis', 'nazi', 'ndhasmu', 'nenen', 'ngentot', 'ngolom', 'ngulum', 'nigga',
        'nigger', 'onani', 'oon', 'orgasme', 'paksa', 'pantat', 'pantek', 'pecun',
        'peli', 'penis', 'pentil', 'pepek', 'perek', 'perkosa', 'piatu', 'porno',
        'pukimak', 'qontol', 'selangkang', 'sempak', 'senggama', 'setan', 'setubuh',
        'shit', 'silet', 'silit', 'sinting', 'sodomi', 'stres', 'telanjang', 'telaso',
        'tete', 'tewas', 'titit', 'togel', 'toket', 'tolol', 'tusbol', 'urin', 'vagina'
    }

    # Multi-word profanity phrases, matched against the whole lowercased text.
    PROFANITY_PHRASES = {
        'gak ada otak', 'tidak ada otak', 'ga ada otak'
    }

    # Character substitution map (leet speak), e.g. 't0l0l' -> 'tolol', 'f*ck' -> 'fck'.
    CHAR_SUBSTITUTIONS = {
        '0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's',
        '7': 't', '8': 'b', '@': 'a', '$': 's', '*': ''
    }

    @classmethod
    def normalize_word(cls, word: str) -> str:
        """Return *word* lowercased with common leet-speak substitutions undone."""
        normalized = word.lower()
        for char, replacement in cls.CHAR_SUBSTITUTIONS.items():
            normalized = normalized.replace(char, replacement)
        return normalized

    @classmethod
    def detect_profanity(cls, text: str) -> dict:
        """
        Detect profanity in *text* using a hybrid approach:

        1. Multi-word phrase match against the whole text.
        2. Exact word match (fast path).
        3. Leet-speak normalization (t0l0l -> tolol) + exact match.
        4. Fuzzy match (>= 85% similarity, rapidfuzz) for typo variations.

        Returns a dict with:
            'has_profanity'  - bool, True when anything was found
            'profanity_count' - total number of hits (phrases + words)
            'profanity_words' - unique matches, original spelling kept
        """
        text_lower = text.lower()
        raw_words = re.findall(r'\w+', text_lower)

        found_profanity = []
        profanity_count = 0

        # Step 1: check multi-word phrases first.
        for phrase in cls.PROFANITY_PHRASES:
            if phrase in text_lower:
                profanity_count += 1
                if phrase not in found_profanity:
                    found_profanity.append(phrase)

        # Step 2: check individual words.
        for word in raw_words:
            # A. Exact match, then B. leet-speak normalization.
            # (No empty-word guard needed: \w+ tokens are never empty.)
            is_profane = (word in cls.PROFANITY_WORDS
                          or cls.normalize_word(word) in cls.PROFANITY_WORDS)

            # C. Fuzzy match for typo variations (anjiir, anjiirr, ...).
            # Words of <= 3 chars are skipped to avoid false positives.
            if not is_profane and len(word) > 3:
                for profane_word in cls.PROFANITY_WORDS:
                    # Only compare words of similar length (+/- 3 chars).
                    if abs(len(word) - len(profane_word)) <= 3:
                        if fuzz.ratio(word, profane_word) >= 85:
                            is_profane = True
                            break

            if is_profane:
                profanity_count += 1
                # Keep the original spelling in the result list.
                if word not in found_profanity:
                    found_profanity.append(word)

        return {
            'has_profanity': len(found_profanity) > 0,
            'profanity_count': profanity_count,
            'profanity_words': list(set(found_profanity))
        }
requirements.txt CHANGED
@@ -20,6 +20,7 @@ protobuf==3.20.3
20
  # NLP
21
  sentence-transformers==2.2.2
22
  scikit-learn==1.3.2
 
23
 
24
  # Audio processing
25
  librosa==0.10.1
 
20
  # NLP
21
  sentence-transformers==2.2.2
22
  scikit-learn==1.3.2
23
+ rapidfuzz==3.5.2
24
 
25
  # Audio processing
26
  librosa==0.10.1