Spaces:
Sleeping
Sleeping
Commit
·
9abd5a8
1
Parent(s):
6dc05e5
feat: enhance filler detection & profanity detection with fuzzy matching
- Filler words: hybrid exact/fuzzy matching (90% threshold for long words)
- Remove 'ini' from filler list (legitimate word, not filler)
- Profanity: hybrid detection (exact + fuzzy + leet speak normalization)
- Add 130+ Indonesian profanity words and multi-word phrases
- Character substitution detection (t0l0l → tolol, f*ck → fuck)
- Fuzzy matching 85% threshold for typo variations
- Add rapidfuzz dependency for advanced string matching
- app/services/articulation.py +127 -25
- requirements.txt +1 -0
app/services/articulation.py
CHANGED
|
@@ -10,6 +10,7 @@ import numpy as np
|
|
| 10 |
from typing import Dict, List, Tuple, Optional
|
| 11 |
import re
|
| 12 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class ArticulationService:
|
|
@@ -43,7 +44,7 @@ class ArticulationService:
|
|
| 43 |
# Filler words bahasa Indonesia
|
| 44 |
self.filler_words = [
|
| 45 |
'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
|
| 46 |
-
'anu', '
|
| 47 |
'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
|
| 48 |
]
|
| 49 |
|
|
@@ -148,39 +149,55 @@ class ArticulationService:
|
|
| 148 |
"""Deteksi kata-kata pengisi (filler words)"""
|
| 149 |
print("🔎 Detecting filler words...")
|
| 150 |
|
| 151 |
-
|
|
|
|
| 152 |
total_words = len(words)
|
| 153 |
|
| 154 |
if total_words == 0:
|
| 155 |
return {
|
| 156 |
'filler_count': 0,
|
| 157 |
-
'filler_ratio': 0,
|
| 158 |
'filler_words_found': []
|
| 159 |
}
|
| 160 |
|
| 161 |
-
# Count filler words
|
| 162 |
filler_found = []
|
| 163 |
filler_count = 0
|
| 164 |
|
| 165 |
for word in words:
|
| 166 |
-
# Clean word
|
| 167 |
-
clean_word = re.sub(r'[^\w\s]', '', word)
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
filler_count += 1
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
filler_ratio = filler_count / total_words
|
| 174 |
|
| 175 |
-
print(f" Filler Words: {filler_count}/{total_words}
|
| 176 |
if filler_found:
|
| 177 |
print(f" Found: {', '.join(filler_found)}")
|
| 178 |
|
| 179 |
return {
|
| 180 |
'filler_count': filler_count,
|
| 181 |
-
'
|
| 182 |
-
'filler_words_found': filler_found,
|
| 183 |
-
'total_words': total_words
|
| 184 |
}
|
| 185 |
|
| 186 |
def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
|
|
@@ -451,11 +468,8 @@ class ArticulationService:
|
|
| 451 |
'consistency': round(clarity['consistency'], 3)
|
| 452 |
}
|
| 453 |
|
| 454 |
-
result['
|
| 455 |
-
|
| 456 |
-
'ratio': round(filler['filler_ratio'], 3),
|
| 457 |
-
'words_found': filler['filler_words_found']
|
| 458 |
-
}
|
| 459 |
|
| 460 |
result['stability_metrics'] = {
|
| 461 |
'syllable_rate': round(stability['avg_syllable_rate'], 2),
|
|
@@ -477,18 +491,106 @@ class ArticulationService:
|
|
| 477 |
\
|
| 478 |
|
| 479 |
class ProfanityDetector:
|
|
|
|
|
|
|
|
|
|
| 480 |
PROFANITY_WORDS = {
|
| 481 |
-
'anjir', 'anjay', '
|
| 482 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
}
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
@classmethod
|
| 486 |
def detect_profanity(cls, text: str) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
text_lower = text.lower()
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
return {
|
| 491 |
'has_profanity': len(found_profanity) > 0,
|
| 492 |
-
'profanity_count':
|
| 493 |
'profanity_words': list(set(found_profanity))
|
| 494 |
}
|
|
|
|
| 10 |
from typing import Dict, List, Tuple, Optional
|
| 11 |
import re
|
| 12 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
| 13 |
+
from rapidfuzz import fuzz
|
| 14 |
|
| 15 |
|
| 16 |
class ArticulationService:
|
|
|
|
| 44 |
# Filler words bahasa Indonesia
|
| 45 |
self.filler_words = [
|
| 46 |
'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm',
|
| 47 |
+
'anu', 'itu', 'gitu', 'kayak', 'seperti',
|
| 48 |
'ya', 'yaa', 'nah', 'terus', 'jadi', 'soalnya'
|
| 49 |
]
|
| 50 |
|
|
|
|
| 149 |
"""Deteksi kata-kata pengisi (filler words)"""
|
| 150 |
print("🔎 Detecting filler words...")
|
| 151 |
|
| 152 |
+
# Split by whitespace to preserve original form
|
| 153 |
+
words = transcript.split()
|
| 154 |
total_words = len(words)
|
| 155 |
|
| 156 |
if total_words == 0:
|
| 157 |
return {
|
| 158 |
'filler_count': 0,
|
|
|
|
| 159 |
'filler_words_found': []
|
| 160 |
}
|
| 161 |
|
| 162 |
+
# Count filler words using fuzzy matching + exact match for short words
|
| 163 |
filler_found = []
|
| 164 |
filler_count = 0
|
| 165 |
|
| 166 |
for word in words:
|
| 167 |
+
# Clean word for checking (lowercase, remove punctuation)
|
| 168 |
+
clean_word = re.sub(r'[^\w\s]', '', word.lower())
|
| 169 |
+
|
| 170 |
+
# Skip empty words
|
| 171 |
+
if not clean_word:
|
| 172 |
+
continue
|
| 173 |
+
|
| 174 |
+
is_filler = False
|
| 175 |
+
|
| 176 |
+
# For short words (2-3 chars), use exact match to avoid false positives
|
| 177 |
+
if len(clean_word) <= 3:
|
| 178 |
+
if clean_word in self.filler_words:
|
| 179 |
+
is_filler = True
|
| 180 |
+
else:
|
| 181 |
+
# For longer words, use fuzzy matching with 90% threshold
|
| 182 |
+
for filler_word in self.filler_words:
|
| 183 |
+
similarity = fuzz.ratio(clean_word, filler_word)
|
| 184 |
+
if similarity >= 90: # 90% threshold untuk presisi lebih tinggi
|
| 185 |
+
is_filler = True
|
| 186 |
+
break
|
| 187 |
+
|
| 188 |
+
if is_filler:
|
| 189 |
filler_count += 1
|
| 190 |
+
# Keep original word form (with punctuation like 'ehm...')
|
| 191 |
+
if word not in filler_found:
|
| 192 |
+
filler_found.append(word)
|
|
|
|
| 193 |
|
| 194 |
+
print(f" Filler Words: {filler_count}/{total_words}")
|
| 195 |
if filler_found:
|
| 196 |
print(f" Found: {', '.join(filler_found)}")
|
| 197 |
|
| 198 |
return {
|
| 199 |
'filler_count': filler_count,
|
| 200 |
+
'filler_words_found': filler_found
|
|
|
|
|
|
|
| 201 |
}
|
| 202 |
|
| 203 |
def analyze_speech_rate_stability(self, audio_path: str) -> Dict:
|
|
|
|
| 468 |
'consistency': round(clarity['consistency'], 3)
|
| 469 |
}
|
| 470 |
|
| 471 |
+
result['filler_count'] = filler['filler_count']
|
| 472 |
+
result['filler_words'] = filler['filler_words_found']
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
result['stability_metrics'] = {
|
| 475 |
'syllable_rate': round(stability['avg_syllable_rate'], 2),
|
|
|
|
| 491 |
\
|
| 492 |
|
| 493 |
class ProfanityDetector:
    """Detect profane/offensive words using a hybrid approach.

    Detection pipeline (see :meth:`detect_profanity`):
      1. exact match against ``PROFANITY_WORDS`` (raw and leet-normalized form)
      2. leet-speak normalization via ``CHAR_SUBSTITUTIONS`` (t0l0l -> tolol)
      3. fuzzy matching (rapidfuzz) for typo variations (anjiir -> anjir)
    """

    # Base profanity words (Indonesian root forms + common English)
    PROFANITY_WORDS = {
        'anjir', 'anjay', 'njir', 'njay', 'anjrit', 'njrit', 'anjim', 'anjing',
        'anjrot', 'asu', 'babi', 'bacot', 'bajingan', 'banci', 'bangke', 'bangor',
        'bangsat', 'bego', 'bejad', 'bencong', 'bodat', 'bodoh', 'bugil', 'bundir',
        'bunuh', 'burik', 'burit', 'cawek', 'cemen', 'cipok', 'cium', 'colai', 'coli',
        'colmek', 'cukimai', 'cukimay', 'culun', 'cumbu', 'dancuk', 'dewasa', 'dick',
        'dildo', 'encuk', 'fuck', 'gay', 'gei', 'gembel', 'gey', 'gigolo', 'gila',
        'goblog', 'goblok', 'haram', 'hencet', 'hentai', 'idiot', 'jablai', 'jablay',
        'jancok', 'jancuk', 'jangkik', 'jembut', 'jilat', 'jingan', 'kampang',
        'keparat', 'kimak', 'kirik', 'klentit', 'klitoris', 'konthol', 'kontol',
        'koplok', 'kunyuk', 'kutang', 'kutis', 'kwontol', 'lonte', 'maho',
        'masturbasi', 'matane', 'mati', 'memek', 'mesum', 'modar', 'modyar', 'mokad',
        'najis', 'nazi', 'ndhasmu', 'nenen', 'ngentot', 'ngolom', 'ngulum', 'nigga',
        'nigger', 'onani', 'oon', 'orgasme', 'paksa', 'pantat', 'pantek', 'pecun',
        'peli', 'penis', 'pentil', 'pepek', 'perek', 'perkosa', 'piatu', 'porno',
        'pukimak', 'qontol', 'selangkang', 'sempak', 'senggama', 'setan', 'setubuh',
        'shit', 'silet', 'silit', 'sinting', 'sodomi', 'stres', 'telanjang', 'telaso',
        'tete', 'tewas', 'titit', 'togel', 'toket', 'tolol', 'tusbol', 'urin', 'vagina'
    }

    # Multi-word profanity phrases, matched as substrings of the full text
    PROFANITY_PHRASES = {
        'gak ada otak', 'tidak ada otak', 'ga ada otak'
    }

    # Character substitution map (leet speak / censoring); '*' is treated as
    # a censor character and simply removed.
    CHAR_SUBSTITUTIONS = {
        '0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's',
        '7': 't', '8': 'b', '@': 'a', '$': 's', '*': ''
    }

    @classmethod
    def normalize_word(cls, word: str) -> str:
        """Lowercase *word* and undo common leet-speak character substitutions."""
        normalized = word.lower()
        for char, replacement in cls.CHAR_SUBSTITUTIONS.items():
            normalized = normalized.replace(char, replacement)
        return normalized

    @classmethod
    def detect_profanity(cls, text: str) -> dict:
        """
        Detect profanity using hybrid approach:
        1. Exact match for quick detection
        2. Leet-speak normalization for character substitution (t0l0l -> tolol)
        3. Fuzzy match for typo variations

        Returns a dict with 'has_profanity' (bool), 'profanity_count' (int,
        phrase hits + word hits) and 'profanity_words' (unique matches,
        in their original spelling).
        """
        text_lower = text.lower()

        # Tokenize keeping the leet/censor characters ('@', '$', '*') so that
        # censored forms like 'f*ck' or '$hit' survive as a single token.
        # A plain \w+ split would break them apart before normalization,
        # defeating CHAR_SUBSTITUTIONS entirely.
        raw_words = re.findall(r'[\w@$*]+', text_lower)

        found_profanity = []
        profanity_count = 0

        # Step 1: Check multi-word phrases first (substring match on full text)
        for phrase in cls.PROFANITY_PHRASES:
            if phrase in text_lower:
                profanity_count += 1
                if phrase not in found_profanity:
                    found_profanity.append(phrase)

        # Step 2: Check individual words
        for word in raw_words:
            normalized = cls.normalize_word(word)

            # A. Exact match on the raw or leet-normalized form (fastest)
            is_profane = (word in cls.PROFANITY_WORDS
                          or normalized in cls.PROFANITY_WORDS)

            # B. Fuzzy match for typo variations (anjiir, anjiirr, etc.).
            # Compare the NORMALIZED form so leet-speak typos also match;
            # short words are skipped to avoid false positives.
            if not is_profane and len(word) > 3:
                for profane_word in cls.PROFANITY_WORDS:
                    # Only compare words with similar length (±3 chars) to
                    # keep the scan over the lexicon cheap and precise
                    if abs(len(normalized) - len(profane_word)) <= 3:
                        if fuzz.ratio(normalized, profane_word) >= 85:  # 85% threshold for profanity
                            is_profane = True
                            break

            if is_profane:
                profanity_count += 1
                # Report the original (possibly censored/leet) spelling
                if word not in found_profanity:
                    found_profanity.append(word)

        return {
            'has_profanity': len(found_profanity) > 0,
            'profanity_count': profanity_count,
            'profanity_words': list(set(found_profanity))
        }
|
requirements.txt
CHANGED
|
@@ -20,6 +20,7 @@ protobuf==3.20.3
|
|
| 20 |
# NLP
|
| 21 |
sentence-transformers==2.2.2
|
| 22 |
scikit-learn==1.3.2
|
|
|
|
| 23 |
|
| 24 |
# Audio processing
|
| 25 |
librosa==0.10.1
|
|
|
|
| 20 |
# NLP
|
| 21 |
sentence-transformers==2.2.2
|
| 22 |
scikit-learn==1.3.2
|
| 23 |
+
rapidfuzz==3.5.2
|
| 24 |
|
| 25 |
# Audio processing
|
| 26 |
librosa==0.10.1
|