fariedalfarizi committed on
Commit
797d38d
·
1 Parent(s): 056938d

MAJOR UPGRADE: Whisper Large V3, 30+ Indonesian phonetics, Gradio JSON API, optimized weights

Browse files
Files changed (6) hide show
  1. README.md +59 -5
  2. app.py +12 -7
  3. app/api_gradio.py +236 -0
  4. app/interface.py +1 -1
  5. core/constants.py +30 -30
  6. core/scoring_engine.py +68 -69
README.md CHANGED
@@ -12,7 +12,7 @@ license: mit
12
 
13
  # 🎤 Sistem Penilaian Vokal Indonesia v2.0
14
 
15
- Sistem penilaian artikulasi vokal bahasa Indonesia menggunakan **Whisper Medium ASR** dan advanced audio signal processing.
16
 
17
  ## 🌟 Fitur
18
 
@@ -26,12 +26,66 @@ Sistem penilaian artikulasi vokal bahasa Indonesia menggunakan **Whisper Medium
26
 
27
  ### 6 Comprehensive Metrics
28
 
29
- 1. **Clarity Score**: Kejelasan pengucapan via Whisper ASR accuracy
30
  2. **Energy Score**: Kualitas volume dan energi suara
31
- 3. **Speech Rate**: Kecepatan bicara (syllables per second)
32
- 4. **Pitch Consistency**: Stabilitas nada suara
33
  5. **SNR Score**: Signal-to-Noise Ratio (kualitas rekaman)
34
- 6. **Articulation Score**: Kejernihan artikulasi dari spectral analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  ## 🚀 Cara Menggunakan
37
 
 
12
 
13
  # 🎤 Sistem Penilaian Vokal Indonesia v2.0
14
 
15
+ Sistem penilaian artikulasi vokal bahasa Indonesia menggunakan **Whisper Large V3** (Indonesian optimized) dan advanced audio signal processing.
16
 
17
  ## 🌟 Fitur
18
 
 
26
 
27
  ### 6 Comprehensive Metrics
28
 
29
+ 1. **Clarity Score (60% for Level 1)**: Kejelasan pengucapan via Whisper Large V3
30
  2. **Energy Score**: Kualitas volume dan energi suara
31
+ 3. **Speech Rate (Level 4-5)**: Kecepatan bicara optimal
32
+ 4. **Pitch Consistency (Level 4-5)**: Stabilitas nada suara
33
  5. **SNR Score**: Signal-to-Noise Ratio (kualitas rekaman)
34
+ 6. **Articulation Score (15% for Level 1)**: Kejernihan artikulasi spektral
35
+
36
+ ### JSON API (Gradio-based)
37
+
38
+ Tersedia JSON API dengan structured response untuk integrasi:
39
+
40
+ - **Tab 1**: UI Assessment (visual interface)
41
+ - **Tab 2**: JSON API (RESTful response)
42
+ - **Python Client**: `gradio_client` compatible
43
+ - **Response Format**: Structured JSON with scores, feedback, suggestions
44
+
45
+ ## 🎯 Optimized Scoring Weights
46
+
47
+ | Level | Clarity | Articulation | Speech Rate | Pitch | Energy | SNR |
48
+ |-------|---------|--------------|-------------|-------|--------|-----|
49
+ | 1 | 60% | 15% | 0% | 0% | 15% | 10% |
50
+ | 2 | 55% | 20% | 0% | 0% | 15% | 10% |
51
+ | 3 | 50% | 15% | 10% | 5% | 10% | 10% |
52
+ | 4 | 40% | 10% | 20% | 15% | 10% | 5% |
53
+ | 5 | 35% | 10% | 25% | 15% | 10% | 5% |
54
+
55
+ ## 📡 API Usage
56
+
57
+ ### Gradio Python Client
58
+
59
+ ```python
60
+ import gradio_client
61
+
62
+ client = gradio_client.Client("Cyberlace/latihan-artikulasi")
63
+
64
+ result = client.predict(
65
+ audio_file="audio.wav",
66
+ target_text="A",
67
+ level=1,
68
+ api_name="/score_audio_api"
69
+ )
70
+
71
+ print(result["data"]["overall"]["score"]) # 95.5
72
+ print(result["data"]["transcription"]["detected"]) # "A"
73
+ ```
74
+
75
+ ### JSON Response Structure
76
+
77
+ ```json
78
+ {
79
+ "success": true,
80
+ "data": {
81
+ "overall": {"score": 95.5, "grade": "A", "level": 1},
82
+ "transcription": {"target": "A", "detected": "A", "similarity": 100.0, "wer": 0.0},
83
+ "scores": {...},
84
+ "feedback": {"message": "...", "suggestions": [...]},
85
+ "audio_features": {...}
86
+ }
87
+ }
88
+ ```
89
 
90
  ## 🚀 Cara Menggunakan
91
 
app.py CHANGED
@@ -12,21 +12,26 @@ logging.getLogger("starlette").setLevel(logging.ERROR)
12
  logging.getLogger("uvicorn").setLevel(logging.ERROR)
13
 
14
  from app.interface import create_interface, initialize_model
15
- from api.routes import app as fastapi_app
16
 
17
  if __name__ == '__main__':
18
  print('Starting Vocal Articulation Assessment System v2.0...')
19
 
20
- # Initialize model
21
  initialize_model()
22
 
23
- # Create Gradio interface
24
- demo = create_interface()
 
25
 
26
- # Mount Gradio to FastAPI (correct order!)
27
- app = gr.mount_gradio_app(fastapi_app, demo, path="/")
 
 
 
 
28
 
29
- # Launch with specific config
30
  demo.launch(
31
  server_name='0.0.0.0',
32
  server_port=7860,
 
12
  logging.getLogger("uvicorn").setLevel(logging.ERROR)
13
 
14
  from app.interface import create_interface, initialize_model
15
+ from app.api_gradio import create_api_interface
16
 
17
  if __name__ == '__main__':
18
  print('Starting Vocal Articulation Assessment System v2.0...')
19
 
20
+ # Initialize model once
21
  initialize_model()
22
 
23
+ # Create UI and API interfaces
24
+ ui_demo = create_interface()
25
+ api_demo = create_api_interface()
26
 
27
+ # Combine both interfaces with tabs
28
+ demo = gr.TabbedInterface(
29
+ [ui_demo, api_demo],
30
+ ["🎤 Assessment UI", "📡 JSON API"],
31
+ title="Vocal Articulation System v2.0"
32
+ )
33
 
34
+ # Launch
35
  demo.launch(
36
  server_name='0.0.0.0',
37
  server_port=7860,
app/api_gradio.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =======================================
2
+ # GRADIO API ENDPOINT - JSON Response
3
+ # Alternative to FastAPI for HuggingFace Spaces
4
+ # =======================================
5
+
6
+ import gradio as gr
7
+ import json
8
+ from typing import Dict, Any
9
+ from app.interface import initialize_model
10
+
11
+ def score_audio_api(
12
+ audio_file: str,
13
+ target_text: str,
14
+ level: int
15
+ ) -> Dict[str, Any]:
16
+ """
17
+ API endpoint untuk scoring audio - Returns structured JSON
18
+
19
+ Args:
20
+ audio_file: Path ke audio file
21
+ target_text: Target text yang seharusnya diucapkan
22
+ level: Level artikulasi (1-5)
23
+
24
+ Returns:
25
+ JSON response dengan struktur lengkap
26
+ """
27
+ try:
28
+ scorer = initialize_model()
29
+
30
+ # Validate input
31
+ if not audio_file:
32
+ return {
33
+ "success": False,
34
+ "error": "No audio file provided",
35
+ "code": "MISSING_AUDIO"
36
+ }
37
+
38
+ if not target_text or not target_text.strip():
39
+ return {
40
+ "success": False,
41
+ "error": "No target text provided",
42
+ "code": "MISSING_TEXT"
43
+ }
44
+
45
+ # Score audio
46
+ result = scorer.score_audio(
47
+ audio_path=audio_file,
48
+ target_text=target_text,
49
+ level=level
50
+ )
51
+
52
+ # Return structured JSON
53
+ return {
54
+ "success": True,
55
+ "data": {
56
+ "overall": {
57
+ "score": result.overall_score,
58
+ "grade": result.grade,
59
+ "level": result.level
60
+ },
61
+ "transcription": {
62
+ "target": result.target,
63
+ "detected": result.transcription,
64
+ "similarity": round(result.similarity * 100, 2),
65
+ "wer": round(result.wer * 100, 2)
66
+ },
67
+ "scores": {
68
+ "clarity": result.clarity_score,
69
+ "energy": result.energy_score,
70
+ "speech_rate": result.speech_rate_score,
71
+ "pitch_consistency": result.pitch_consistency_score,
72
+ "snr": result.snr_score,
73
+ "articulation": result.articulation_score
74
+ },
75
+ "feedback": {
76
+ "message": result.feedback,
77
+ "suggestions": result.suggestions
78
+ },
79
+ "audio_features": result.audio_features
80
+ }
81
+ }
82
+
83
+ except Exception as e:
84
+ return {
85
+ "success": False,
86
+ "error": str(e),
87
+ "code": "PROCESSING_ERROR"
88
+ }
89
+
90
+
91
+ def create_api_interface():
92
+ """Create Gradio API interface with JSON output"""
93
+
94
+ with gr.Blocks(
95
+ title="Vocal Articulation API",
96
+ theme=gr.themes.Soft(primary_hue="blue")
97
+ ) as api_demo:
98
+
99
+ gr.Markdown("""
100
+ # 🎤 Vocal Articulation API v2.0
101
+ ## RESTful JSON API for Indonesian Vocal Assessment
102
+
103
+ **Model**: Whisper Large V3 (Indonesian Optimized)
104
+
105
+ ### Quick Start
106
+
107
+ 1. Upload audio file (MP3, WAV, M4A, etc.)
108
+ 2. Enter target text (what should be spoken)
109
+ 3. Select level (1-5)
110
+ 4. Get JSON response
111
+
112
+ ### API Response Structure
113
+
114
+ ```json
115
+ {
116
+ "success": true,
117
+ "data": {
118
+ "overall": {
119
+ "score": 85.5,
120
+ "grade": "B",
121
+ "level": 1
122
+ },
123
+ "transcription": {
124
+ "target": "A",
125
+ "detected": "A",
126
+ "similarity": 100.0,
127
+ "wer": 0.0
128
+ },
129
+ "scores": {
130
+ "clarity": 95.2,
131
+ "energy": 98.5,
132
+ "speech_rate": 80.0,
133
+ "pitch_consistency": 75.3,
134
+ "snr": 100.0,
135
+ "articulation": 92.1
136
+ },
137
+ "feedback": {
138
+ "message": "Sempurna! Pengucapan Anda sangat baik.",
139
+ "suggestions": []
140
+ },
141
+ "audio_features": {...}
142
+ }
143
+ }
144
+ ```
145
+
146
+ ---
147
+ """)
148
+
149
+ with gr.Row():
150
+ with gr.Column():
151
+ gr.Markdown("### Input")
152
+
153
+ audio_input = gr.Audio(
154
+ label="Audio File",
155
+ type="filepath",
156
+ sources=["upload", "microphone"]
157
+ )
158
+
159
+ target_input = gr.Textbox(
160
+ label="Target Text",
161
+ placeholder="e.g., A, BA, PSIKOLOGI",
162
+ info="Text yang seharusnya diucapkan"
163
+ )
164
+
165
+ level_input = gr.Slider(
166
+ label="Level (1=Vokal, 5=Kalimat)",
167
+ minimum=1,
168
+ maximum=5,
169
+ value=1,
170
+ step=1
171
+ )
172
+
173
+ submit_btn = gr.Button("Score Audio", variant="primary")
174
+
175
+ with gr.Column():
176
+ gr.Markdown("### JSON Response")
177
+
178
+ output_json = gr.JSON(
179
+ label="API Response",
180
+ show_label=True
181
+ )
182
+
183
+ gr.Markdown("""
184
+ ---
185
+
186
+ ### Level Descriptions
187
+
188
+ | Level | Name | Description | Examples |
189
+ |-------|------|-------------|----------|
190
+ | 1 | Vokal Tunggal | Single vowels | A, I, U, E, O |
191
+ | 2 | Konsonan Dasar | Basic consonants | BA, PA, DA, TA, KA |
192
+ | 3 | Suku Kata | Syllable combinations | BA BE BI BO BU |
193
+ | 4 | Kata Sulit | Complex words | PSIKOLOGI, STRATEGI |
194
+ | 5 | Kalimat Kompleks | Tongue twisters | ULAR LARI LURUS... |
195
+
196
+ ### Scoring Weights per Level
197
+
198
+ **Level 1-2**: Focus on Clarity (55-60%) + Articulation (15-20%)
199
+ **Level 3**: Balanced with Speech Rate (10%)
200
+ **Level 4-5**: Comprehensive with Speech Rate (20-25%) + Pitch (15%)
201
+
202
+ ### Error Codes
203
+
204
+ - `MISSING_AUDIO`: No audio file provided
205
+ - `MISSING_TEXT`: No target text provided
206
+ - `PROCESSING_ERROR`: Error during processing
207
+
208
+ ---
209
+
210
+ ### Python Usage Example
211
+
212
+ ```python
213
+ import gradio_client
214
+
215
+ client = gradio_client.Client("Cyberlace/latihan-artikulasi")
216
+
217
+ result = client.predict(
218
+ audio_file="audio.wav",
219
+ target_text="A",
220
+ level=1,
221
+ api_name="/score_audio_api"
222
+ )
223
+
224
+ print(result) # JSON response
225
+ ```
226
+ """)
227
+
228
+ # Connect button to API function
229
+ submit_btn.click(
230
+ fn=score_audio_api,
231
+ inputs=[audio_input, target_input, level_input],
232
+ outputs=output_json,
233
+ api_name="score_audio_api"
234
+ )
235
+
236
+ return api_demo
app/interface.py CHANGED
@@ -46,7 +46,7 @@ def initialize_model():
46
  global scorer
47
 
48
  if scorer is None:
49
- whisper_model = os.getenv("WHISPER_MODEL", "openai/whisper-medium")
50
  print(f"Loading Whisper model: {whisper_model}...")
51
  scorer = AdvancedVocalScoringSystem(whisper_model=whisper_model)
52
  print("Model loaded!")
 
46
  global scorer
47
 
48
  if scorer is None:
49
+ whisper_model = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3")
50
  print(f"Loading Whisper model: {whisper_model}...")
51
  scorer = AdvancedVocalScoringSystem(whisper_model=whisper_model)
52
  print("Model loaded!")
core/constants.py CHANGED
@@ -49,46 +49,46 @@ ARTICULATION_LEVELS = {
49
  }
50
  }
51
 
52
- # Scoring weights per level
53
  LEVEL_WEIGHTS = {
54
- 1: { # Vokal tunggal - fokus clarity & articulation
55
- 'clarity': 0.50,
56
- 'energy': 0.20,
57
- 'speech_rate': 0.0,
58
- 'pitch_consistency': 0.0,
59
- 'snr': 0.15,
60
- 'articulation': 0.15
61
- },
62
- 2: { # Konsonan dasar - fokus clarity & articulation
63
- 'clarity': 0.45,
64
- 'energy': 0.20,
65
  'speech_rate': 0.0,
66
  'pitch_consistency': 0.0,
67
- 'snr': 0.15,
68
- 'articulation': 0.20
69
  },
70
- 3: { # Kombinasi suku kata - mulai speech rate
71
- 'clarity': 0.40,
72
  'energy': 0.15,
73
  'speech_rate': 0.0,
74
  'pitch_consistency': 0.0,
75
- 'snr': 0.20,
76
- 'articulation': 0.25
77
- },
78
- 4: { # Kata sulit
79
- 'clarity': 0.45,
80
- 'energy': 0.15,
81
- 'speech_rate': 0.15,
82
- 'pitch_consistency': 0.10,
83
  'snr': 0.10,
84
- 'articulation': 0.05
85
  },
86
- 5: { # Kalimat kompleks
87
- 'clarity': 0.45,
88
  'energy': 0.10,
89
- 'speech_rate': 0.20,
90
- 'pitch_consistency': 0.10,
91
  'snr': 0.10,
92
- 'articulation': 0.05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  }
94
  }
 
49
  }
50
  }
51
 
52
+ # Optimized scoring weights per level
53
  LEVEL_WEIGHTS = {
54
+ 1: { # Vokal tunggal - MAX clarity & articulation
55
+ 'clarity': 0.60, # Paling penting: ASR accuracy
56
+ 'energy': 0.15,
 
 
 
 
 
 
 
 
57
  'speech_rate': 0.0,
58
  'pitch_consistency': 0.0,
59
+ 'snr': 0.10,
60
+ 'articulation': 0.15 # Penting: spectral clarity
61
  },
62
+ 2: { # Konsonan dasar - HIGH clarity
63
+ 'clarity': 0.55,
64
  'energy': 0.15,
65
  'speech_rate': 0.0,
66
  'pitch_consistency': 0.0,
 
 
 
 
 
 
 
 
67
  'snr': 0.10,
68
+ 'articulation': 0.20
69
  },
70
+ 3: { # Kombinasi suku kata - BALANCED
71
+ 'clarity': 0.50,
72
  'energy': 0.10,
73
+ 'speech_rate': 0.10, # Mulai masuk
74
+ 'pitch_consistency': 0.05,
75
  'snr': 0.10,
76
+ 'articulation': 0.15
77
+ },
78
+ 4: { # Kata sulit - ADD speech rate & pitch
79
+ 'clarity': 0.40,
80
+ 'energy': 0.10,
81
+ 'speech_rate': 0.20, # Penting
82
+ 'pitch_consistency': 0.15,
83
+ 'snr': 0.05,
84
+ 'articulation': 0.10
85
+ },
86
+ 5: { # Kalimat kompleks - COMPREHENSIVE
87
+ 'clarity': 0.35,
88
+ 'energy': 0.10,
89
+ 'speech_rate': 0.25, # Sangat penting
90
+ 'pitch_consistency': 0.15,
91
+ 'snr': 0.05,
92
+ 'articulation': 0.10
93
  }
94
  }
core/scoring_engine.py CHANGED
@@ -10,8 +10,6 @@ import librosa
10
  from transformers import (
11
  WhisperProcessor,
12
  WhisperForConditionalGeneration,
13
- Wav2Vec2Processor,
14
- Wav2Vec2ForCTC,
15
  pipeline
16
  )
17
  from typing import Dict, List, Tuple, Optional, Any
@@ -97,32 +95,21 @@ class AdvancedVocalScoringSystem:
97
 
98
  def __init__(
99
  self,
100
- whisper_model: str = "openai/whisper-medium",
101
- wav2vec2_model: str = "indonesian-nlp/wav2vec2-indonesian-javanese-sundanese",
102
  device: str = None
103
  ):
104
  """
105
- Initialize system dengan dual ASR: Wav2Vec2 (Indonesian) + Whisper (fallback)
106
 
107
  Args:
108
- whisper_model: Model Whisper untuk Level 4-5
109
- wav2vec2_model: Model Wav2Vec2 untuk Level 1-3 (Indonesian native)
110
  device: 'cuda' atau 'cpu'
111
  """
112
  self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
113
 
114
- print(f"🔄 Loading Indonesian Wav2Vec2: {wav2vec2_model}...")
115
 
116
- # Load Wav2Vec2 for Indonesian (better for short audio)
117
- self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model)
118
- self.wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model)
119
- self.wav2vec2_model.to(self.device)
120
- self.wav2vec2_model.eval()
121
-
122
- print(f"✅ Wav2Vec2 loaded on {self.device}")
123
- print(f"🔄 Loading Whisper model: {whisper_model}...")
124
-
125
- # Load Whisper model for complex sentences
126
  self.processor = WhisperProcessor.from_pretrained(whisper_model)
127
  self.model = WhisperForConditionalGeneration.from_pretrained(whisper_model)
128
  self.model.to(self.device)
@@ -252,41 +239,25 @@ class AdvancedVocalScoringSystem:
252
  level: int = 1
253
  ) -> Tuple[float, str, float, float]:
254
  """
255
- Score clarity using Wav2Vec2 (Level 1-3) or Whisper (Level 4-5)
256
 
257
  Returns:
258
  (clarity_score, transcription, similarity, wer)
259
  """
260
  try:
261
- # Use Wav2Vec2 for Level 1-3 (better for Indonesian short audio)
262
- if level <= 3:
263
- import librosa
264
- audio_np, sr = librosa.load(audio_path, sr=16000)
265
-
266
- # Process with Wav2Vec2
267
- inputs = self.wav2vec2_processor(
268
- audio_np,
269
- sampling_rate=16000,
270
- return_tensors="pt",
271
- padding=True
272
- )
273
-
274
- with torch.no_grad():
275
- logits = self.wav2vec2_model(inputs.input_values.to(self.device)).logits
276
-
277
- predicted_ids = torch.argmax(logits, dim=-1)
278
- transcription = self.wav2vec2_processor.batch_decode(predicted_ids)[0].upper().strip()
279
- else:
280
- # Use Whisper for Level 4-5 (better for long sentences)
281
- result = self.pipe(
282
- audio_path,
283
- return_timestamps=False,
284
- generate_kwargs={
285
- "language": "id",
286
- "task": "transcribe"
287
- }
288
- )
289
- transcription = result["text"].upper().strip()
290
 
291
  except Exception as e:
292
  print(f"⚠️ ASR Error: {e}")
@@ -550,43 +521,71 @@ class AdvancedVocalScoringSystem:
550
  def _phonetic_similarity(self, text1: str, text2: str) -> float:
551
  """
552
  Calculate phonetic similarity for Indonesian syllables
553
- Handles common confusions: T/D, P/B, K/G, S/Z
554
  """
555
- # Indonesian phonetic confusion pairs
556
  confusions = {
557
- 'T': ['D', 'TH'],
558
- 'D': ['T', 'DH'],
559
- 'P': ['B'],
560
- 'B': ['P'],
561
- 'K': ['G', 'C'],
 
562
  'G': ['K'],
563
- 'S': ['Z', 'SY'],
564
- 'Z': ['S'],
565
- 'A': ['AH'],
566
- 'E': ['EH']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  }
568
 
569
  if not text1 or not text2:
570
  return 0.0
571
 
572
- # Check if first letters are phonetically similar
 
 
 
 
 
 
 
 
573
  first1 = text1[0] if text1 else ''
574
  first2 = text2[0] if text2 else ''
575
 
576
- if first1 == first2:
577
- return 1.0
578
-
579
  # Check confusion pairs
580
  if first1 in confusions and first2 in confusions[first1]:
581
- return 0.8
582
  if first2 in confusions and first1 in confusions[first2]:
583
- return 0.8
584
 
585
  # Levenshtein distance for longer text
586
- if len(text1) > 1 and len(text2) > 1:
587
- return difflib.SequenceMatcher(None, text1, text2).ratio()
588
-
589
- return 0.0
590
 
591
  def _calculate_wer(self, predicted: str, target: str) -> float:
592
  """Calculate Word Error Rate"""
 
10
  from transformers import (
11
  WhisperProcessor,
12
  WhisperForConditionalGeneration,
 
 
13
  pipeline
14
  )
15
  from typing import Dict, List, Tuple, Optional, Any
 
95
 
96
  def __init__(
97
  self,
98
+ whisper_model: str = "openai/whisper-large-v3", # Best for Indonesian
 
99
  device: str = None
100
  ):
101
  """
102
+ Initialize system dengan Whisper Large V3 (best for Indonesian)
103
 
104
  Args:
105
+ whisper_model: Model Whisper (large-v3 recommended for Indonesian)
 
106
  device: 'cuda' atau 'cpu'
107
  """
108
  self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
109
 
110
+ print(f"🔄 Loading Whisper Large V3 for Indonesian...")
111
 
112
+ # Load Whisper Large V3 - best for all levels
 
 
 
 
 
 
 
 
 
113
  self.processor = WhisperProcessor.from_pretrained(whisper_model)
114
  self.model = WhisperForConditionalGeneration.from_pretrained(whisper_model)
115
  self.model.to(self.device)
 
239
  level: int = 1
240
  ) -> Tuple[float, str, float, float]:
241
  """
242
+ Score clarity using Whisper Large V3 with Indonesian optimization
243
 
244
  Returns:
245
  (clarity_score, transcription, similarity, wer)
246
  """
247
  try:
248
+ # Use Whisper Large V3 for all levels (best accuracy)
249
+ result = self.pipe(
250
+ audio_path,
251
+ return_timestamps=False,
252
+ generate_kwargs={
253
+ "language": "indonesian", # Full language name for better detection
254
+ "task": "transcribe",
255
+ "temperature": 0.0, # Deterministic output
256
+ "compression_ratio_threshold": 1.35, # Lower for short audio
257
+ "no_speech_threshold": 0.3 # Lower sensitivity
258
+ }
259
+ )
260
+ transcription = result["text"].upper().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  except Exception as e:
263
  print(f"⚠️ ASR Error: {e}")
 
521
  def _phonetic_similarity(self, text1: str, text2: str) -> float:
522
  """
523
  Calculate phonetic similarity for Indonesian syllables
524
+ Comprehensive Indonesian phonetic confusions
525
  """
526
+ # Comprehensive Indonesian phonetic confusion pairs
527
  confusions = {
528
+ # Plosives (Konsonan Letup)
529
+ 'T': ['D', 'TH', 'C'],
530
+ 'D': ['T', 'DH', 'J'],
531
+ 'P': ['B', 'F'],
532
+ 'B': ['P', 'V'],
533
+ 'K': ['G', 'C', 'Q'],
534
  'G': ['K'],
535
+ 'C': ['S', 'T', 'K'],
536
+
537
+ # Fricatives (Konsonan Geseran)
538
+ 'S': ['Z', 'SY', 'C'],
539
+ 'Z': ['S', 'J'],
540
+ 'F': ['P', 'V'],
541
+ 'V': ['F', 'B', 'W'],
542
+ 'H': ['KH'],
543
+
544
+ # Nasals (Konsonan Sengau)
545
+ 'M': ['N'],
546
+ 'N': ['M', 'NG', 'NY'],
547
+ 'NG': ['N'],
548
+ 'NY': ['N', 'Y'],
549
+
550
+ # Liquids (Konsonan Cair)
551
+ 'R': ['L'],
552
+ 'L': ['R'],
553
+
554
+ # Semivowels
555
+ 'W': ['V', 'U'],
556
+ 'Y': ['I', 'NY'],
557
+
558
+ # Vowels (Vokal)
559
+ 'A': ['AH', 'E'],
560
+ 'E': ['A', 'EH', 'I'],
561
+ 'I': ['E', 'Y'],
562
+ 'O': ['OH', 'U'],
563
+ 'U': ['O', 'W']
564
  }
565
 
566
  if not text1 or not text2:
567
  return 0.0
568
 
569
+ # Exact match
570
+ if text1 == text2:
571
+ return 1.0
572
+
573
+ # Check if one contains the other
574
+ if text1 in text2 or text2 in text1:
575
+ return 0.95
576
+
577
+ # Check first letter phonetic similarity
578
  first1 = text1[0] if text1 else ''
579
  first2 = text2[0] if text2 else ''
580
 
 
 
 
581
  # Check confusion pairs
582
  if first1 in confusions and first2 in confusions[first1]:
583
+ return 0.85
584
  if first2 in confusions and first1 in confusions[first2]:
585
+ return 0.85
586
 
587
  # Fallback for all remaining cases: difflib SequenceMatcher ratio (not Levenshtein)
588
+ return difflib.SequenceMatcher(None, text1, text2).ratio()
 
 
 
589
 
590
  def _calculate_wer(self, predicted: str, target: str) -> float:
591
  """Calculate Word Error Rate"""