Spaces:

Cyberlace
/

latihan-artikulasi

Running on Zero

App Files Community

fariedalfarizi committed on 4 days ago

Commit

7de9a5a

1 Parent(s): 4838f88

Fix: Add JSON sanitization for numpy types to prevent Content-Length errors

Browse files

Files changed (2) hide show

api/routes.py +8 -41
core/scoring_engine.py +62 -35

api/routes.py CHANGED Viewed

@@ -208,26 +208,9 @@ async def score_audio(
         # Clean up temp file
         os.unlink(tmp_path)
-        # Convert result to dict and ensure proper serialization
-        response_data = {
-            "success": True,
-            "overall_score": float(result.overall_score),
-            "grade": str(result.grade),
-            "clarity_score": float(result.clarity_score),
-            "energy_score": float(result.energy_score),
-            "speech_rate_score": float(result.speech_rate_score),
-            "pitch_consistency_score": float(result.pitch_consistency_score),
-            "snr_score": float(result.snr_score),
-            "articulation_score": float(result.articulation_score),
-            "transcription": str(result.transcription),
-            "target": str(result.target),
-            "similarity": float(result.similarity),
-            "wer": float(result.wer),
-            "feedback": str(result.feedback),
-            "suggestions": [str(s) for s in result.suggestions],
-            "audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
-            "level": int(result.level)
-        }
         # Return with explicit JSON response
         return JSONResponse(
@@ -300,27 +283,11 @@ async def batch_score_audio(
             # Clean up
             os.unlink(tmp_path)
-            # Properly serialize result
-            results.append({
-                "filename": audio.filename,
-                "success": True,
-                "overall_score": float(result.overall_score),
-                "grade": str(result.grade),
-                "clarity_score": float(result.clarity_score),
-                "energy_score": float(result.energy_score),
-                "speech_rate_score": float(result.speech_rate_score),
-                "pitch_consistency_score": float(result.pitch_consistency_score),
-                "snr_score": float(result.snr_score),
-                "articulation_score": float(result.articulation_score),
-                "transcription": str(result.transcription),
-                "target": str(result.target),
-                "similarity": float(result.similarity),
-                "wer": float(result.wer),
-                "feedback": str(result.feedback),
-                "suggestions": [str(s) for s in result.suggestions],
-                "audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
-                "level": int(result.level)
-            })
         except Exception as e:
             if 'tmp_path' in locals() and os.path.exists(tmp_path):

         # Clean up temp file
         os.unlink(tmp_path)
+        # Convert result to dict with JSON-safe types
+        response_data = result.to_dict()
+        response_data["success"] = True
         # Return with explicit JSON response
         return JSONResponse(
             # Clean up
             os.unlink(tmp_path)
+            # Convert to dict with JSON-safe types
+            result_dict = result.to_dict()
+            result_dict["filename"] = audio.filename
+            result_dict["success"] = True
+            results.append(result_dict)
         except Exception as e:
             if 'tmp_path' in locals() and os.path.exists(tmp_path):

core/scoring_engine.py CHANGED Viewed

@@ -13,13 +13,34 @@ from transformers import (
     WhisperForConditionalGeneration,
     pipeline
 )
-from typing import Dict, List, Tuple, Optional
-from dataclasses import dataclass
 import difflib
 import re
 from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
 # =======================================
 # SCORE RESULT DATACLASS
 # =======================================
@@ -52,6 +73,11 @@ class ScoreResult:
     feedback: str
     suggestions: List[str]
     level: int
 # =======================================
 # ADVANCED SCORING SYSTEM
@@ -178,22 +204,22 @@ class AdvancedVocalScoringSystem:
         )
         return ScoreResult(
-            overall_score=round(overall_score, 2),
-            grade=grade,
-            clarity_score=round(clarity_score, 2),
-            energy_score=round(energy_score, 2),
-            speech_rate_score=round(speech_rate_score, 2),
-            pitch_consistency_score=round(pitch_consistency_score, 2),
-            snr_score=round(snr_score, 2),
-            articulation_score=round(articulation_score, 2),
-            transcription=transcription,
-            target=target_text.upper(),
-            similarity=round(similarity, 4),
-            wer=round(wer, 4),
             audio_features=audio_features,
-            feedback=feedback,
-            suggestions=suggestions,
-            level=level
         )
     # =======================================
@@ -245,12 +271,12 @@ class AdvancedVocalScoringSystem:
             energy_score (0-100)
         """
         # RMS energy
-        rms = np.sqrt(np.mean(audio**2))
-        rms_db = 20 * np.log10(rms + 1e-10)
         # Optimal range: -30 to -10 dB
         if -30 <= rms_db <= -10:
-            energy_score = 100
         elif -40 <= rms_db < -30:
             energy_score = 60 + (rms_db + 40) * 4
         elif -10 < rms_db <= -5:
@@ -260,7 +286,7 @@ class AdvancedVocalScoringSystem:
         else:
             energy_score = max(0, 60 - (rms_db + 5) * 5)
-        return min(100, max(0, energy_score))
     def _score_speech_rate(
         self,
@@ -502,34 +528,35 @@ class AdvancedVocalScoringSystem:
         sr: int,
         transcription: str
     ) -> Dict:
-        """Extract comprehensive audio features"""
         try:
-            duration = len(audio) / sr
-            rms = np.sqrt(np.mean(audio**2))
-            rms_db = 20 * np.log10(rms + 1e-10)
-            zcr = librosa.zero_crossings(audio).sum() / len(audio)
             # Spectral features
-            spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean()
-            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean()
-            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean()
             # Tempo
             tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
             return {
                 'duration': round(duration, 3),
                 'rms_db': round(rms_db, 2),
                 'zero_crossing_rate': round(zcr, 4),
-                'spectral_centroid': round(float(spectral_centroid), 2),
-                'spectral_rolloff': round(float(spectral_rolloff), 2),
-                'spectral_bandwidth': round(float(spectral_bandwidth), 2),
-                'tempo': round(float(tempo), 2),
-                'transcription': transcription
             }
         except Exception as e:
             return {
-                'duration': len(audio) / sr,
                 'error': str(e)
             }

     WhisperForConditionalGeneration,
     pipeline
 )
+from typing import Dict, List, Tuple, Optional, Any
+from dataclasses import dataclass, asdict
 import difflib
 import re
+import json
 from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
+# =======================================
+# HELPER FUNCTIONS FOR JSON SERIALIZATION
+# =======================================
+def sanitize_for_json(obj: Any) -> Any:
+    """Convert numpy/torch types to Python native types for JSON serialization"""
+    if isinstance(obj, (np.integer, np.int64, np.int32)):
+        return int(obj)
+    elif isinstance(obj, (np.floating, np.float64, np.float32)):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: sanitize_for_json(value) for key, value in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        return [sanitize_for_json(item) for item in obj]
+    elif isinstance(obj, torch.Tensor):
+        return obj.detach().cpu().numpy().tolist()
+    return obj
 # =======================================
 # SCORE RESULT DATACLASS
 # =======================================
     feedback: str
     suggestions: List[str]
     level: int
+    def to_dict(self) -> Dict:
+        """Convert to dict with JSON-safe types"""
+        data = asdict(self)
+        return sanitize_for_json(data)
 # =======================================
 # ADVANCED SCORING SYSTEM
         )
         return ScoreResult(
+            overall_score=float(round(overall_score, 2)),
+            grade=str(grade),
+            clarity_score=float(round(clarity_score, 2)),
+            energy_score=float(round(energy_score, 2)),
+            speech_rate_score=float(round(speech_rate_score, 2)),
+            pitch_consistency_score=float(round(pitch_consistency_score, 2)),
+            snr_score=float(round(snr_score, 2)),
+            articulation_score=float(round(articulation_score, 2)),
+            transcription=str(transcription),
+            target=str(target_text.upper()),
+            similarity=float(round(similarity, 4)),
+            wer=float(round(wer, 4)),
             audio_features=audio_features,
+            feedback=str(feedback),
+            suggestions=[str(s) for s in suggestions],
+            level=int(level)
         )
     # =======================================
             energy_score (0-100)
         """
         # RMS energy
+        rms = float(np.sqrt(np.mean(audio**2)))
+        rms_db = float(20 * np.log10(rms + 1e-10))
         # Optimal range: -30 to -10 dB
         if -30 <= rms_db <= -10:
+            energy_score = 100.0
         elif -40 <= rms_db < -30:
             energy_score = 60 + (rms_db + 40) * 4
         elif -10 < rms_db <= -5:
         else:
             energy_score = max(0, 60 - (rms_db + 5) * 5)
+        return float(min(100, max(0, energy_score)))
     def _score_speech_rate(
         self,
         sr: int,
         transcription: str
     ) -> Dict:
+        """Extract comprehensive audio features - all values as Python native types"""
         try:
+            duration = float(len(audio) / sr)
+            rms = float(np.sqrt(np.mean(audio**2)))
+            rms_db = float(20 * np.log10(rms + 1e-10))
+            zcr = float(librosa.zero_crossings(audio).sum() / len(audio))
             # Spectral features
+            spectral_centroid = float(librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean())
+            spectral_rolloff = float(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean())
+            spectral_bandwidth = float(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean())
             # Tempo
             tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
+            tempo = float(tempo)
             return {
                 'duration': round(duration, 3),
                 'rms_db': round(rms_db, 2),
                 'zero_crossing_rate': round(zcr, 4),
+                'spectral_centroid': round(spectral_centroid, 2),
+                'spectral_rolloff': round(spectral_rolloff, 2),
+                'spectral_bandwidth': round(spectral_bandwidth, 2),
+                'tempo': round(tempo, 2),
+                'transcription': str(transcription)
             }
         except Exception as e:
             return {
+                'duration': float(len(audio) / sr),
                 'error': str(e)
             }