""" Uzbek Speech Recognition with NVIDIA FastConformer Client-side audio recording, server-side transcription Optimized for Hugging Face Spaces deployment """ from flask import Flask, render_template, request, jsonify from flask_cors import CORS import nemo.collections.asr as nemo_asr import tempfile import os import threading import librosa import soundfile as sf app = Flask(__name__) CORS(app) # Enable CORS for WebGL Unity builds print("=" * 60) print("🔄 Loading NVIDIA FastConformer for Uzbek...") print("=" * 60) asr_model = None model_loaded = False last_transcribed_text = "" last_animation_data = [] last_transcription_lock = threading.Lock() # Available letters in Unity AVAILABLE_LETTERS = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' ] def load_nvidia_model(): """Load NeMo ASR model in background thread""" global asr_model, model_loaded try: print("📥 Loading nvidia/stt_uz_fastconformer_hybrid_large_pc...") asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained( model_name="nvidia/stt_uz_fastconformer_hybrid_large_pc" ) model_loaded = True print("✅ NVIDIA FastConformer loaded successfully!") except Exception as e: print(f"❌ Model loading error: {e}") model_loaded = False # Start model loading in background model_thread = threading.Thread(target=load_nvidia_model, daemon=True) model_thread.start() def process_text_to_letters(text): """ Convert text to fingerspelling Each word shown letter by letter """ words = text.lower().split() animation_data = [] for word in words: if not word: continue # Clean word - keep only available letters clean_word = ''.join(c for c in word if c.isalnum() and c in AVAILABLE_LETTERS) if not clean_word: continue letters = list(clean_word) if letters: animation_data.append({ "type": "letters", "word": clean_word, "letters": letters }) print(f"🔤 Word '{clean_word}' → Fingerspelling: {' '.join(letters).upper()}") return animation_data def transcribe_uzbek_audio(audio_path): """Transcribe audio file using NeMo model""" global asr_model try: # Ensure audio is in correct format (16kHz mono) audio, sr = librosa.load(audio_path, sr=16000, mono=True) # Save as temporary WAV file temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) sf.write(temp_wav.name, audio, 16000) # Transcribe outputs = asr_model.transcribe([temp_wav.name]) # Clean up temp file try: os.unlink(temp_wav.name) except: pass if outputs and len(outputs) > 0: result = outputs[0] # FIX: Extract text from Hypothesis object if hasattr(result, 'text'): return result.text # Get the .text attribute elif isinstance(result, list): result = " ".join(str(item) for item in result) return str(result) else: return str(result) except Exception as e: print(f"❌ Transcription error: {e}") return "" return "" @app.route('/') def index(): """Serve main web interface""" return render_template('testt.html') @app.route('/message', methods=['GET']) def get_message(): """Unity endpoint - send fingerspelling sequence""" global last_animation_data with last_transcription_lock: # Create letter sequence for Unity letter_sequence = [] for item in last_animation_data: if item["type"] == "letters": for letter in item["letters"]: letter_sequence.append(f"letter_{letter}") message_string = " ".join(letter_sequence) print(f"📤 Sending to Unity: {message_string}") return jsonify({ "message": message_string, "animation_data": last_animation_data, "language": "uz" }) @app.route('/transcribe', methods=['POST']) def transcribe(): """ Receive audio from browser, transcribe, return fingerspelling data Expects audio file in FormData with key 'audio' """ global model_loaded, last_transcribed_text, last_animation_data if not model_loaded: return jsonify({ "success": False, "error": "Model is still loading... Please wait." }), 503 try: # Check if audio file was sent if 'audio' not in request.files: return jsonify({ "success": False, "error": "No audio file received" }), 400 audio_file = request.files['audio'] if audio_file.filename == '': return jsonify({ "success": False, "error": "Empty audio file" }), 400 print(f"📁 Received audio file: {audio_file.filename}") # Save to temporary file with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file: tmp_path = tmp_file.name audio_file.save(tmp_path) print("🎙️ Transcribing audio...") # Transcribe uzbek_text = transcribe_uzbek_audio(tmp_path) # Clean up temp file try: os.unlink(tmp_path) except: pass if uzbek_text: print(f"✅ Transcribed: {uzbek_text}") # Convert to fingerspelling animation_data = process_text_to_letters(uzbek_text) # Save for Unity endpoint with last_transcription_lock: last_transcribed_text = uzbek_text last_animation_data = animation_data return jsonify({ "success": True, "uzbek": uzbek_text, "animation_data": animation_data, "language": "uz", "model": "NVIDIA FastConformer" }) else: return jsonify({ "success": False, "error": "Could not transcribe audio. Please speak clearly." }), 400 except Exception as e: print(f"❌ Error: {e}") import traceback traceback.print_exc() return jsonify({ "success": False, "error": f"Server error: {str(e)}" }), 500 @app.route('/health') def health(): """Health check endpoint""" return jsonify({ "status": "running", "model_loaded": model_loaded, "model": "NVIDIA FastConformer (uz)" if model_loaded else "loading...", "wer": "16.46%" if model_loaded else "N/A", "mode": "Client-side Recording + Server-side Transcription" }) if __name__ == '__main__': print("=" * 60) print("🚀 NVIDIA FastConformer - Fingerspelling Server") print("=" * 60) print("📱 Interface: http://0.0.0.0:7860") print("🎯 WER: 16.46%") print("🔤 MODE: Client-side recording") print("🎮 Unity endpoint: /message") print("=" * 60) # Hugging Face Spaces uses port 7860 app.run(host='0.0.0.0', port=7860, debug=False)