| """ | |
| Uzbek Speech Recognition with NVIDIA FastConformer | |
| Client-side audio recording, server-side transcription | |
| Optimized for Hugging Face Spaces deployment | |
| """ | |
| from flask import Flask, render_template, request, jsonify | |
| from flask_cors import CORS | |
| import nemo.collections.asr as nemo_asr | |
| import tempfile | |
| import os | |
| import threading | |
| import librosa | |
| import soundfile as sf | |
| app = Flask(__name__) | |
| CORS(app) # Enable CORS for WebGL Unity builds | |
| print("=" * 60) | |
| print("๐ Loading NVIDIA FastConformer for Uzbek...") | |
| print("=" * 60) | |
| asr_model = None | |
| model_loaded = False | |
| last_transcribed_text = "" | |
| last_animation_data = [] | |
| last_transcription_lock = threading.Lock() | |
| # Available letters in Unity | |
| AVAILABLE_LETTERS = [ | |
| 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', | |
| 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' | |
| ] | |
def load_nvidia_model():
    """Load NeMo ASR model in background thread"""
    global asr_model, model_loaded
    try:
        print("📥 Loading nvidia/stt_uz_fastconformer_hybrid_large_pc...")
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
            model_name="nvidia/stt_uz_fastconformer_hybrid_large_pc"
        )
        model_loaded = True
        print("✅ NVIDIA FastConformer loaded successfully!")
    except Exception as e:
        print(f"❌ Model loading error: {e}")
        model_loaded = False

# Start model loading in background so the server can come up immediately
model_thread = threading.Thread(target=load_nvidia_model, daemon=True)
model_thread.start()
def process_text_to_letters(text):
    """
    Convert text to fingerspelling.
    Each word is shown letter by letter.
    """
    words = text.lower().split()
    animation_data = []
    for word in words:
        if not word:
            continue
        # Clean word - keep only letters available in Unity
        # (membership in AVAILABLE_LETTERS already implies alphanumeric)
        clean_word = ''.join(c for c in word if c in AVAILABLE_LETTERS)
        if not clean_word:
            continue
        letters = list(clean_word)
        animation_data.append({
            "type": "letters",
            "word": clean_word,
            "letters": letters
        })
        print(f"🔤 Word '{clean_word}' → Fingerspelling: {' '.join(letters).upper()}")
    return animation_data
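
# Illustrative example (hypothetical input string, not from the original code):
#   process_text_to_letters("Salom dunyo") would return
#   [{"type": "letters", "word": "salom", "letters": ["s", "a", "l", "o", "m"]},
#    {"type": "letters", "word": "dunyo", "letters": ["d", "u", "n", "y", "o"]}]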
def transcribe_uzbek_audio(audio_path):
    """Transcribe audio file using NeMo model"""
    global asr_model
    try:
        # Ensure audio is in correct format (16kHz mono)
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        # Save as temporary WAV file; close the handle before NeMo reads it
        temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_wav.close()
        sf.write(temp_wav.name, audio, 16000)
        # Transcribe
        outputs = asr_model.transcribe([temp_wav.name])
        # Clean up temp file
        try:
            os.unlink(temp_wav.name)
        except OSError:
            pass
        if outputs and len(outputs) > 0:
            result = outputs[0]
            # Recent NeMo versions return Hypothesis objects; extract the text
            if hasattr(result, 'text'):
                return result.text
            elif isinstance(result, list):
                result = " ".join(str(item) for item in result)
                return str(result)
            else:
                return str(result)
    except Exception as e:
        print(f"❌ Transcription error: {e}")
        return ""
    # Reached when the model produced no output at all
    return ""
@app.route('/')
def index():
    """Serve main web interface"""
    return render_template('testt.html')
@app.route('/message')
def get_message():
    """Unity endpoint - send fingerspelling sequence"""
    global last_animation_data
    with last_transcription_lock:
        # Create letter sequence for Unity
        letter_sequence = []
        for item in last_animation_data:
            if item["type"] == "letters":
                for letter in item["letters"]:
                    letter_sequence.append(f"letter_{letter}")
        message_string = " ".join(letter_sequence)
        print(f"📤 Sending to Unity: {message_string}")
        return jsonify({
            "message": message_string,
            "animation_data": last_animation_data,
            "language": "uz"
        })
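
# Shape of a GET /message response (values are illustrative):
# {
#   "message": "letter_s letter_a letter_l letter_o letter_m",
#   "animation_data": [{"type": "letters", "word": "salom", "letters": ["s", "a", "l", "o", "m"]}],
#   "language": "uz"
# }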
@app.route('/transcribe', methods=['POST'])  # route path assumed; only the 'audio' form key is documented
def transcribe():
    """
    Receive audio from browser, transcribe, return fingerspelling data.
    Expects audio file in FormData with key 'audio'.
    """
    global model_loaded, last_transcribed_text, last_animation_data
    if not model_loaded:
        return jsonify({
            "success": False,
            "error": "Model is still loading... Please wait."
        }), 503
    try:
        # Check if audio file was sent
        if 'audio' not in request.files:
            return jsonify({
                "success": False,
                "error": "No audio file received"
            }), 400
        audio_file = request.files['audio']
        if audio_file.filename == '':
            return jsonify({
                "success": False,
                "error": "Empty audio file"
            }), 400
        print(f"📁 Received audio file: {audio_file.filename}")
        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_path = tmp_file.name
            audio_file.save(tmp_path)
        print("🎙️ Transcribing audio...")
        # Transcribe
        uzbek_text = transcribe_uzbek_audio(tmp_path)
        # Clean up temp file
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        if uzbek_text:
            print(f"✅ Transcribed: {uzbek_text}")
            # Convert to fingerspelling
            animation_data = process_text_to_letters(uzbek_text)
            # Save for Unity endpoint
            with last_transcription_lock:
                last_transcribed_text = uzbek_text
                last_animation_data = animation_data
            return jsonify({
                "success": True,
                "uzbek": uzbek_text,
                "animation_data": animation_data,
                "language": "uz",
                "model": "NVIDIA FastConformer"
            })
        else:
            return jsonify({
                "success": False,
                "error": "Could not transcribe audio. Please speak clearly."
            }), 400
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({
            "success": False,
            "error": f"Server error: {str(e)}"
        }), 500
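
# Quick check from the command line (filename is illustrative and the
# '/transcribe' path matches the assumed route above):
#   curl -X POST -F "audio=@recording.wav" http://localhost:7860/transcribe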
@app.route('/health')  # route path assumed for the health check
def health():
    """Health check endpoint"""
    return jsonify({
        "status": "running",
        "model_loaded": model_loaded,
        "model": "NVIDIA FastConformer (uz)" if model_loaded else "loading...",
        "wer": "16.46%" if model_loaded else "N/A",
        "mode": "Client-side Recording + Server-side Transcription"
    })
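
# Polling this endpoint tells clients when the background model load has finished,
# e.g. `curl http://localhost:7860/health` returns {"status": "running",
# "model_loaded": true, ...} once load_nvidia_model() completes.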
if __name__ == '__main__':
    print("=" * 60)
    print("🚀 NVIDIA FastConformer - Fingerspelling Server")
    print("=" * 60)
    print("📱 Interface: http://0.0.0.0:7860")
    print("🎯 WER: 16.46%")
    print("🎤 MODE: Client-side recording")
    print("🎮 Unity endpoint: /message")
    print("=" * 60)
    # Hugging Face Spaces uses port 7860
    app.run(host='0.0.0.0', port=7860, debug=False)