fariedalfarizi committed on
Commit
7de9a5a
·
1 Parent(s): 4838f88

Fix: Add JSON sanitization for numpy types to prevent Content-Length errors

Browse files
Files changed (2) hide show
  1. api/routes.py +8 -41
  2. core/scoring_engine.py +62 -35
api/routes.py CHANGED
@@ -208,26 +208,9 @@ async def score_audio(
208
  # Clean up temp file
209
  os.unlink(tmp_path)
210
 
211
- # Convert result to dict and ensure proper serialization
212
- response_data = {
213
- "success": True,
214
- "overall_score": float(result.overall_score),
215
- "grade": str(result.grade),
216
- "clarity_score": float(result.clarity_score),
217
- "energy_score": float(result.energy_score),
218
- "speech_rate_score": float(result.speech_rate_score),
219
- "pitch_consistency_score": float(result.pitch_consistency_score),
220
- "snr_score": float(result.snr_score),
221
- "articulation_score": float(result.articulation_score),
222
- "transcription": str(result.transcription),
223
- "target": str(result.target),
224
- "similarity": float(result.similarity),
225
- "wer": float(result.wer),
226
- "feedback": str(result.feedback),
227
- "suggestions": [str(s) for s in result.suggestions],
228
- "audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
229
- "level": int(result.level)
230
- }
231
 
232
  # Return with explicit JSON response
233
  return JSONResponse(
@@ -300,27 +283,11 @@ async def batch_score_audio(
300
  # Clean up
301
  os.unlink(tmp_path)
302
 
303
- # Properly serialize result
304
- results.append({
305
- "filename": audio.filename,
306
- "success": True,
307
- "overall_score": float(result.overall_score),
308
- "grade": str(result.grade),
309
- "clarity_score": float(result.clarity_score),
310
- "energy_score": float(result.energy_score),
311
- "speech_rate_score": float(result.speech_rate_score),
312
- "pitch_consistency_score": float(result.pitch_consistency_score),
313
- "snr_score": float(result.snr_score),
314
- "articulation_score": float(result.articulation_score),
315
- "transcription": str(result.transcription),
316
- "target": str(result.target),
317
- "similarity": float(result.similarity),
318
- "wer": float(result.wer),
319
- "feedback": str(result.feedback),
320
- "suggestions": [str(s) for s in result.suggestions],
321
- "audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
322
- "level": int(result.level)
323
- })
324
 
325
  except Exception as e:
326
  if 'tmp_path' in locals() and os.path.exists(tmp_path):
 
208
  # Clean up temp file
209
  os.unlink(tmp_path)
210
 
211
+ # Convert result to dict with JSON-safe types
212
+ response_data = result.to_dict()
213
+ response_data["success"] = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  # Return with explicit JSON response
216
  return JSONResponse(
 
283
  # Clean up
284
  os.unlink(tmp_path)
285
 
286
+ # Convert to dict with JSON-safe types
287
+ result_dict = result.to_dict()
288
+ result_dict["filename"] = audio.filename
289
+ result_dict["success"] = True
290
+ results.append(result_dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
  except Exception as e:
293
  if 'tmp_path' in locals() and os.path.exists(tmp_path):
core/scoring_engine.py CHANGED
@@ -13,13 +13,34 @@ from transformers import (
13
  WhisperForConditionalGeneration,
14
  pipeline
15
  )
16
- from typing import Dict, List, Tuple, Optional
17
- from dataclasses import dataclass
18
  import difflib
19
  import re
 
20
 
21
  from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # =======================================
24
  # SCORE RESULT DATACLASS
25
  # =======================================
@@ -52,6 +73,11 @@ class ScoreResult:
52
  feedback: str
53
  suggestions: List[str]
54
  level: int
 
 
 
 
 
55
 
56
  # =======================================
57
  # ADVANCED SCORING SYSTEM
@@ -178,22 +204,22 @@ class AdvancedVocalScoringSystem:
178
  )
179
 
180
  return ScoreResult(
181
- overall_score=round(overall_score, 2),
182
- grade=grade,
183
- clarity_score=round(clarity_score, 2),
184
- energy_score=round(energy_score, 2),
185
- speech_rate_score=round(speech_rate_score, 2),
186
- pitch_consistency_score=round(pitch_consistency_score, 2),
187
- snr_score=round(snr_score, 2),
188
- articulation_score=round(articulation_score, 2),
189
- transcription=transcription,
190
- target=target_text.upper(),
191
- similarity=round(similarity, 4),
192
- wer=round(wer, 4),
193
  audio_features=audio_features,
194
- feedback=feedback,
195
- suggestions=suggestions,
196
- level=level
197
  )
198
 
199
  # =======================================
@@ -245,12 +271,12 @@ class AdvancedVocalScoringSystem:
245
  energy_score (0-100)
246
  """
247
  # RMS energy
248
- rms = np.sqrt(np.mean(audio**2))
249
- rms_db = 20 * np.log10(rms + 1e-10)
250
 
251
  # Optimal range: -30 to -10 dB
252
  if -30 <= rms_db <= -10:
253
- energy_score = 100
254
  elif -40 <= rms_db < -30:
255
  energy_score = 60 + (rms_db + 40) * 4
256
  elif -10 < rms_db <= -5:
@@ -260,7 +286,7 @@ class AdvancedVocalScoringSystem:
260
  else:
261
  energy_score = max(0, 60 - (rms_db + 5) * 5)
262
 
263
- return min(100, max(0, energy_score))
264
 
265
  def _score_speech_rate(
266
  self,
@@ -502,34 +528,35 @@ class AdvancedVocalScoringSystem:
502
  sr: int,
503
  transcription: str
504
  ) -> Dict:
505
- """Extract comprehensive audio features"""
506
  try:
507
- duration = len(audio) / sr
508
- rms = np.sqrt(np.mean(audio**2))
509
- rms_db = 20 * np.log10(rms + 1e-10)
510
- zcr = librosa.zero_crossings(audio).sum() / len(audio)
511
 
512
  # Spectral features
513
- spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean()
514
- spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean()
515
- spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean()
516
 
517
  # Tempo
518
  tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
 
519
 
520
  return {
521
  'duration': round(duration, 3),
522
  'rms_db': round(rms_db, 2),
523
  'zero_crossing_rate': round(zcr, 4),
524
- 'spectral_centroid': round(float(spectral_centroid), 2),
525
- 'spectral_rolloff': round(float(spectral_rolloff), 2),
526
- 'spectral_bandwidth': round(float(spectral_bandwidth), 2),
527
- 'tempo': round(float(tempo), 2),
528
- 'transcription': transcription
529
  }
530
  except Exception as e:
531
  return {
532
- 'duration': len(audio) / sr,
533
  'error': str(e)
534
  }
535
 
 
13
  WhisperForConditionalGeneration,
14
  pipeline
15
  )
16
+ from typing import Dict, List, Tuple, Optional, Any
17
+ from dataclasses import dataclass, asdict
18
  import difflib
19
  import re
20
+ import json
21
 
22
  from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
23
 
24
+ # =======================================
25
+ # HELPER FUNCTIONS FOR JSON SERIALIZATION
26
+ # =======================================
27
+
28
def sanitize_for_json(obj: Any) -> Any:
    """Recursively convert numpy/torch values into plain Python objects.

    Args:
        obj: Any value; containers (dict, list, tuple) are walked recursively.

    Returns:
        - ``bool`` / ``int`` / ``float`` for numpy boolean, integer and
          floating scalars,
        - nested lists for ``np.ndarray`` and ``torch.Tensor``,
        - a new ``dict`` with sanitized values (numpy scalar keys are
          converted to their native equivalents as well),
        - a ``list`` for any list or tuple,
        - ``obj`` itself, unchanged, for every other type.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch;
    # left as-is it makes json.dumps raise TypeError.
    if isinstance(obj, np.bool_):
        return bool(obj)
    # np.integer / np.floating are the abstract bases of every sized variant
    # (int32, int64, float32, float64, ...), so listing those is unnecessary.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        # Only numpy scalar keys are rewritten: json.dumps rejects e.g.
        # np.int64 keys, while every other key type is passed through so that
        # hashability is never affected.
        return {
            (sanitize_for_json(key) if isinstance(key, np.generic) else key):
                sanitize_for_json(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [sanitize_for_json(item) for item in obj]
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu().numpy().tolist()
    return obj
43
+
44
  # =======================================
45
  # SCORE RESULT DATACLASS
46
  # =======================================
 
73
  feedback: str
74
  suggestions: List[str]
75
  level: int
76
+
77
def to_dict(self) -> Dict:
    """Return this result's fields as a dict holding only JSON-safe values."""
    # asdict() flattens the dataclass into a plain dict; sanitize_for_json
    # then converts any numpy/torch values it contains to native Python types.
    return sanitize_for_json(asdict(self))
81
 
82
  # =======================================
83
  # ADVANCED SCORING SYSTEM
 
204
  )
205
 
206
  return ScoreResult(
207
+ overall_score=float(round(overall_score, 2)),
208
+ grade=str(grade),
209
+ clarity_score=float(round(clarity_score, 2)),
210
+ energy_score=float(round(energy_score, 2)),
211
+ speech_rate_score=float(round(speech_rate_score, 2)),
212
+ pitch_consistency_score=float(round(pitch_consistency_score, 2)),
213
+ snr_score=float(round(snr_score, 2)),
214
+ articulation_score=float(round(articulation_score, 2)),
215
+ transcription=str(transcription),
216
+ target=str(target_text.upper()),
217
+ similarity=float(round(similarity, 4)),
218
+ wer=float(round(wer, 4)),
219
  audio_features=audio_features,
220
+ feedback=str(feedback),
221
+ suggestions=[str(s) for s in suggestions],
222
+ level=int(level)
223
  )
224
 
225
  # =======================================
 
271
  energy_score (0-100)
272
  """
273
  # RMS energy
274
+ rms = float(np.sqrt(np.mean(audio**2)))
275
+ rms_db = float(20 * np.log10(rms + 1e-10))
276
 
277
  # Optimal range: -30 to -10 dB
278
  if -30 <= rms_db <= -10:
279
+ energy_score = 100.0
280
  elif -40 <= rms_db < -30:
281
  energy_score = 60 + (rms_db + 40) * 4
282
  elif -10 < rms_db <= -5:
 
286
  else:
287
  energy_score = max(0, 60 - (rms_db + 5) * 5)
288
 
289
+ return float(min(100, max(0, energy_score)))
290
 
291
  def _score_speech_rate(
292
  self,
 
528
  sr: int,
529
  transcription: str
530
  ) -> Dict:
531
+ """Extract comprehensive audio features - all values as Python native types"""
532
  try:
533
+ duration = float(len(audio) / sr)
534
+ rms = float(np.sqrt(np.mean(audio**2)))
535
+ rms_db = float(20 * np.log10(rms + 1e-10))
536
+ zcr = float(librosa.zero_crossings(audio).sum() / len(audio))
537
 
538
  # Spectral features
539
+ spectral_centroid = float(librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean())
540
+ spectral_rolloff = float(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean())
541
+ spectral_bandwidth = float(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean())
542
 
543
  # Tempo
544
  tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
545
+ tempo = float(tempo)
546
 
547
  return {
548
  'duration': round(duration, 3),
549
  'rms_db': round(rms_db, 2),
550
  'zero_crossing_rate': round(zcr, 4),
551
+ 'spectral_centroid': round(spectral_centroid, 2),
552
+ 'spectral_rolloff': round(spectral_rolloff, 2),
553
+ 'spectral_bandwidth': round(spectral_bandwidth, 2),
554
+ 'tempo': round(tempo, 2),
555
+ 'transcription': str(transcription)
556
  }
557
  except Exception as e:
558
  return {
559
+ 'duration': float(len(audio) / sr),
560
  'error': str(e)
561
  }
562