Spaces: Running on Zero
Commit 7de9a5a · 1 parent: 4838f88
Fix: Add JSON sanitization for numpy types to prevent Content-Length errors
Browse files
- api/routes.py +8 -41
- core/scoring_engine.py +62 -35
api/routes.py
CHANGED
|
@@ -208,26 +208,9 @@ async def score_audio(
|
|
| 208 |
# Clean up temp file
|
| 209 |
os.unlink(tmp_path)
|
| 210 |
|
| 211 |
-
# Convert result to dict
|
| 212 |
-
response_data =
|
| 213 |
-
|
| 214 |
-
"overall_score": float(result.overall_score),
|
| 215 |
-
"grade": str(result.grade),
|
| 216 |
-
"clarity_score": float(result.clarity_score),
|
| 217 |
-
"energy_score": float(result.energy_score),
|
| 218 |
-
"speech_rate_score": float(result.speech_rate_score),
|
| 219 |
-
"pitch_consistency_score": float(result.pitch_consistency_score),
|
| 220 |
-
"snr_score": float(result.snr_score),
|
| 221 |
-
"articulation_score": float(result.articulation_score),
|
| 222 |
-
"transcription": str(result.transcription),
|
| 223 |
-
"target": str(result.target),
|
| 224 |
-
"similarity": float(result.similarity),
|
| 225 |
-
"wer": float(result.wer),
|
| 226 |
-
"feedback": str(result.feedback),
|
| 227 |
-
"suggestions": [str(s) for s in result.suggestions],
|
| 228 |
-
"audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
|
| 229 |
-
"level": int(result.level)
|
| 230 |
-
}
|
| 231 |
|
| 232 |
# Return with explicit JSON response
|
| 233 |
return JSONResponse(
|
|
@@ -300,27 +283,11 @@ async def batch_score_audio(
|
|
| 300 |
# Clean up
|
| 301 |
os.unlink(tmp_path)
|
| 302 |
|
| 303 |
-
#
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
"grade": str(result.grade),
|
| 309 |
-
"clarity_score": float(result.clarity_score),
|
| 310 |
-
"energy_score": float(result.energy_score),
|
| 311 |
-
"speech_rate_score": float(result.speech_rate_score),
|
| 312 |
-
"pitch_consistency_score": float(result.pitch_consistency_score),
|
| 313 |
-
"snr_score": float(result.snr_score),
|
| 314 |
-
"articulation_score": float(result.articulation_score),
|
| 315 |
-
"transcription": str(result.transcription),
|
| 316 |
-
"target": str(result.target),
|
| 317 |
-
"similarity": float(result.similarity),
|
| 318 |
-
"wer": float(result.wer),
|
| 319 |
-
"feedback": str(result.feedback),
|
| 320 |
-
"suggestions": [str(s) for s in result.suggestions],
|
| 321 |
-
"audio_features": {k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in result.audio_features.items()},
|
| 322 |
-
"level": int(result.level)
|
| 323 |
-
})
|
| 324 |
|
| 325 |
except Exception as e:
|
| 326 |
if 'tmp_path' in locals() and os.path.exists(tmp_path):
|
|
|
|
| 208 |
# Clean up temp file
|
| 209 |
os.unlink(tmp_path)
|
| 210 |
|
| 211 |
+
# Convert result to dict with JSON-safe types
|
| 212 |
+
response_data = result.to_dict()
|
| 213 |
+
response_data["success"] = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
# Return with explicit JSON response
|
| 216 |
return JSONResponse(
|
|
|
|
| 283 |
# Clean up
|
| 284 |
os.unlink(tmp_path)
|
| 285 |
|
| 286 |
+
# Convert to dict with JSON-safe types
|
| 287 |
+
result_dict = result.to_dict()
|
| 288 |
+
result_dict["filename"] = audio.filename
|
| 289 |
+
result_dict["success"] = True
|
| 290 |
+
results.append(result_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
except Exception as e:
|
| 293 |
if 'tmp_path' in locals() and os.path.exists(tmp_path):
|
core/scoring_engine.py
CHANGED
|
@@ -13,13 +13,34 @@ from transformers import (
|
|
| 13 |
WhisperForConditionalGeneration,
|
| 14 |
pipeline
|
| 15 |
)
|
| 16 |
-
from typing import Dict, List, Tuple, Optional
|
| 17 |
-
from dataclasses import dataclass
|
| 18 |
import difflib
|
| 19 |
import re
|
|
|
|
| 20 |
|
| 21 |
from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# =======================================
|
| 24 |
# SCORE RESULT DATACLASS
|
| 25 |
# =======================================
|
|
@@ -52,6 +73,11 @@ class ScoreResult:
|
|
| 52 |
feedback: str
|
| 53 |
suggestions: List[str]
|
| 54 |
level: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# =======================================
|
| 57 |
# ADVANCED SCORING SYSTEM
|
|
@@ -178,22 +204,22 @@ class AdvancedVocalScoringSystem:
|
|
| 178 |
)
|
| 179 |
|
| 180 |
return ScoreResult(
|
| 181 |
-
overall_score=round(overall_score, 2),
|
| 182 |
-
grade=grade,
|
| 183 |
-
clarity_score=round(clarity_score, 2),
|
| 184 |
-
energy_score=round(energy_score, 2),
|
| 185 |
-
speech_rate_score=round(speech_rate_score, 2),
|
| 186 |
-
pitch_consistency_score=round(pitch_consistency_score, 2),
|
| 187 |
-
snr_score=round(snr_score, 2),
|
| 188 |
-
articulation_score=round(articulation_score, 2),
|
| 189 |
-
transcription=transcription,
|
| 190 |
-
target=target_text.upper(),
|
| 191 |
-
similarity=round(similarity, 4),
|
| 192 |
-
wer=round(wer, 4),
|
| 193 |
audio_features=audio_features,
|
| 194 |
-
feedback=feedback,
|
| 195 |
-
suggestions=suggestions,
|
| 196 |
-
level=level
|
| 197 |
)
|
| 198 |
|
| 199 |
# =======================================
|
|
@@ -245,12 +271,12 @@ class AdvancedVocalScoringSystem:
|
|
| 245 |
energy_score (0-100)
|
| 246 |
"""
|
| 247 |
# RMS energy
|
| 248 |
-
rms = np.sqrt(np.mean(audio**2))
|
| 249 |
-
rms_db = 20 * np.log10(rms + 1e-10)
|
| 250 |
|
| 251 |
# Optimal range: -30 to -10 dB
|
| 252 |
if -30 <= rms_db <= -10:
|
| 253 |
-
energy_score = 100
|
| 254 |
elif -40 <= rms_db < -30:
|
| 255 |
energy_score = 60 + (rms_db + 40) * 4
|
| 256 |
elif -10 < rms_db <= -5:
|
|
@@ -260,7 +286,7 @@ class AdvancedVocalScoringSystem:
|
|
| 260 |
else:
|
| 261 |
energy_score = max(0, 60 - (rms_db + 5) * 5)
|
| 262 |
|
| 263 |
-
return min(100, max(0, energy_score))
|
| 264 |
|
| 265 |
def _score_speech_rate(
|
| 266 |
self,
|
|
@@ -502,34 +528,35 @@ class AdvancedVocalScoringSystem:
|
|
| 502 |
sr: int,
|
| 503 |
transcription: str
|
| 504 |
) -> Dict:
|
| 505 |
-
"""Extract comprehensive audio features"""
|
| 506 |
try:
|
| 507 |
-
duration = len(audio) / sr
|
| 508 |
-
rms = np.sqrt(np.mean(audio**2))
|
| 509 |
-
rms_db = 20 * np.log10(rms + 1e-10)
|
| 510 |
-
zcr = librosa.zero_crossings(audio).sum() / len(audio)
|
| 511 |
|
| 512 |
# Spectral features
|
| 513 |
-
spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean()
|
| 514 |
-
spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean()
|
| 515 |
-
spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean()
|
| 516 |
|
| 517 |
# Tempo
|
| 518 |
tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
|
|
|
|
| 519 |
|
| 520 |
return {
|
| 521 |
'duration': round(duration, 3),
|
| 522 |
'rms_db': round(rms_db, 2),
|
| 523 |
'zero_crossing_rate': round(zcr, 4),
|
| 524 |
-
'spectral_centroid': round(
|
| 525 |
-
'spectral_rolloff': round(
|
| 526 |
-
'spectral_bandwidth': round(
|
| 527 |
-
'tempo': round(
|
| 528 |
-
'transcription': transcription
|
| 529 |
}
|
| 530 |
except Exception as e:
|
| 531 |
return {
|
| 532 |
-
'duration': len(audio) / sr,
|
| 533 |
'error': str(e)
|
| 534 |
}
|
| 535 |
|
|
|
|
| 13 |
WhisperForConditionalGeneration,
|
| 14 |
pipeline
|
| 15 |
)
|
| 16 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 17 |
+
from dataclasses import dataclass, asdict
|
| 18 |
import difflib
|
| 19 |
import re
|
| 20 |
+
import json
|
| 21 |
|
| 22 |
from .constants import ARTICULATION_LEVELS, LEVEL_WEIGHTS
|
| 23 |
|
| 24 |
+
# =======================================
|
| 25 |
+
# HELPER FUNCTIONS FOR JSON SERIALIZATION
|
| 26 |
+
# =======================================
|
| 27 |
+
|
| 28 |
+
def sanitize_for_json(obj: Any) -> Any:
    """Recursively convert numpy/torch values to JSON-serializable Python natives.

    Args:
        obj: Any value. Dicts, lists and tuples are walked recursively;
            numpy scalars, numpy arrays and torch tensors are converted.

    Returns:
        An equivalent structure built from ``dict``, ``list``, ``int``,
        ``float``, ``bool``, ``str`` and ``None``. Tuples become lists.
        Non-finite floats (NaN, +/-Inf) become ``None`` because strict JSON
        encoders reject them. Values of any other type are returned unchanged.
    """
    # Tensors: detach from the graph, move to host memory, then reuse the
    # ndarray path so the elements get the same treatment.
    if isinstance(obj, torch.Tensor):
        return sanitize_for_json(obj.detach().cpu().numpy())
    if isinstance(obj, np.ndarray):
        # Recurse over the converted list so NaN/Inf entries and elements of
        # object-dtype arrays are sanitized as well.
        return sanitize_for_json(obj.tolist())
    # np.bool_ is not an np.integer subclass and is not JSON serializable.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, (float, np.floating)):
        value = float(obj)
        # NaN / Infinity are not valid JSON; emit null instead.
        return value if np.isfinite(value) else None
    if isinstance(obj, dict):
        # numpy scalar keys (e.g. np.int64) are rejected by json.dumps, so
        # unwrap them to their Python equivalents.
        return {
            (key.item() if isinstance(key, np.generic) else key): sanitize_for_json(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [sanitize_for_json(item) for item in obj]
    return obj
|
| 43 |
+
|
| 44 |
# =======================================
|
| 45 |
# SCORE RESULT DATACLASS
|
| 46 |
# =======================================
|
|
|
|
| 73 |
feedback: str
|
| 74 |
suggestions: List[str]
|
| 75 |
level: int
|
| 76 |
+
|
| 77 |
+
def to_dict(self) -> Dict:
    """Return the dataclass fields as a dict passed through sanitize_for_json."""
    return sanitize_for_json(asdict(self))
|
| 81 |
|
| 82 |
# =======================================
|
| 83 |
# ADVANCED SCORING SYSTEM
|
|
|
|
| 204 |
)
|
| 205 |
|
| 206 |
return ScoreResult(
|
| 207 |
+
overall_score=float(round(overall_score, 2)),
|
| 208 |
+
grade=str(grade),
|
| 209 |
+
clarity_score=float(round(clarity_score, 2)),
|
| 210 |
+
energy_score=float(round(energy_score, 2)),
|
| 211 |
+
speech_rate_score=float(round(speech_rate_score, 2)),
|
| 212 |
+
pitch_consistency_score=float(round(pitch_consistency_score, 2)),
|
| 213 |
+
snr_score=float(round(snr_score, 2)),
|
| 214 |
+
articulation_score=float(round(articulation_score, 2)),
|
| 215 |
+
transcription=str(transcription),
|
| 216 |
+
target=str(target_text.upper()),
|
| 217 |
+
similarity=float(round(similarity, 4)),
|
| 218 |
+
wer=float(round(wer, 4)),
|
| 219 |
audio_features=audio_features,
|
| 220 |
+
feedback=str(feedback),
|
| 221 |
+
suggestions=[str(s) for s in suggestions],
|
| 222 |
+
level=int(level)
|
| 223 |
)
|
| 224 |
|
| 225 |
# =======================================
|
|
|
|
| 271 |
energy_score (0-100)
|
| 272 |
"""
|
| 273 |
# RMS energy
|
| 274 |
+
rms = float(np.sqrt(np.mean(audio**2)))
|
| 275 |
+
rms_db = float(20 * np.log10(rms + 1e-10))
|
| 276 |
|
| 277 |
# Optimal range: -30 to -10 dB
|
| 278 |
if -30 <= rms_db <= -10:
|
| 279 |
+
energy_score = 100.0
|
| 280 |
elif -40 <= rms_db < -30:
|
| 281 |
energy_score = 60 + (rms_db + 40) * 4
|
| 282 |
elif -10 < rms_db <= -5:
|
|
|
|
| 286 |
else:
|
| 287 |
energy_score = max(0, 60 - (rms_db + 5) * 5)
|
| 288 |
|
| 289 |
+
return float(min(100, max(0, energy_score)))
|
| 290 |
|
| 291 |
def _score_speech_rate(
|
| 292 |
self,
|
|
|
|
| 528 |
sr: int,
|
| 529 |
transcription: str
|
| 530 |
) -> Dict:
|
| 531 |
+
"""Extract comprehensive audio features - all values as Python native types"""
|
| 532 |
try:
|
| 533 |
+
duration = float(len(audio) / sr)
|
| 534 |
+
rms = float(np.sqrt(np.mean(audio**2)))
|
| 535 |
+
rms_db = float(20 * np.log10(rms + 1e-10))
|
| 536 |
+
zcr = float(librosa.zero_crossings(audio).sum() / len(audio))
|
| 537 |
|
| 538 |
# Spectral features
|
| 539 |
+
spectral_centroid = float(librosa.feature.spectral_centroid(y=audio, sr=sr)[0].mean())
|
| 540 |
+
spectral_rolloff = float(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0].mean())
|
| 541 |
+
spectral_bandwidth = float(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0].mean())
|
| 542 |
|
| 543 |
# Tempo
|
| 544 |
tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
|
| 545 |
+
tempo = float(tempo)
|
| 546 |
|
| 547 |
return {
|
| 548 |
'duration': round(duration, 3),
|
| 549 |
'rms_db': round(rms_db, 2),
|
| 550 |
'zero_crossing_rate': round(zcr, 4),
|
| 551 |
+
'spectral_centroid': round(spectral_centroid, 2),
|
| 552 |
+
'spectral_rolloff': round(spectral_rolloff, 2),
|
| 553 |
+
'spectral_bandwidth': round(spectral_bandwidth, 2),
|
| 554 |
+
'tempo': round(tempo, 2),
|
| 555 |
+
'transcription': str(transcription)
|
| 556 |
}
|
| 557 |
except Exception as e:
|
| 558 |
return {
|
| 559 |
+
'duration': float(len(audio) / sr),
|
| 560 |
'error': str(e)
|
| 561 |
}
|
| 562 |
|