| """ | |
| Uzbek Speech Recognition with NVIDIA FastConformer | |
| Client-side audio recording, server-side transcription | |
| Optimized for Hugging Face Spaces deployment | |
| """ | |
| from flask import Flask, render_template, request, jsonify | |
| from flask_cors import CORS | |
| import nemo.collections.asr as nemo_asr | |
| import tempfile | |
| import os | |
| import threading | |
| import librosa | |
| import soundfile as sf | |
| app = Flask(__name__) | |
| CORS(app) # Enable CORS for WebGL Unity builds | |
| print("=" * 60) | |
| print("๐ Loading NVIDIA FastConformer for Uzbek...") | |
| print("=" * 60) | |
| asr_model = None | |
| model_loaded = False | |
| last_transcribed_text = "" | |
| last_animation_data = [] | |
| last_transcription_lock = threading.Lock() | |
| # Available letters in Unity | |
| AVAILABLE_LETTERS = [ | |
| 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', | |
| 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' | |
| ] | |
def load_nvidia_model():
    """Load NeMo ASR model in background thread"""
    global asr_model, model_loaded
    try:
        print("📥 Loading nvidia/stt_uz_fastconformer_hybrid_large_pc...")
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
            model_name="nvidia/stt_uz_fastconformer_hybrid_large_pc"
        )
        model_loaded = True
        print("✅ NVIDIA FastConformer loaded successfully!")
    except Exception as e:
        print(f"❌ Model loading error: {e}")
        model_loaded = False

# Start model loading in background so the server can come up immediately
model_thread = threading.Thread(target=load_nvidia_model, daemon=True)
model_thread.start()
def process_text_to_letters(text):
    """
    Convert text to fingerspelling.
    Each word is shown letter by letter.
    """
    words = text.lower().split()
    animation_data = []
    for word in words:
        if not word:
            continue
        # Clean word - keep only letters available in Unity
        # (membership in AVAILABLE_LETTERS already implies alphanumeric)
        clean_word = ''.join(c for c in word if c in AVAILABLE_LETTERS)
        if not clean_word:
            continue
        letters = list(clean_word)
        animation_data.append({
            "type": "letters",
            "word": clean_word,
            "letters": letters
        })
        print(f"🔤 Word '{clean_word}' → Fingerspelling: {' '.join(letters).upper()}")
    return animation_data
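
# Illustrative example (hypothetical input string, not from the original code):
#   process_text_to_letters("Salom dunyo") would return
#   [{"type": "letters", "word": "salom", "letters": ["s", "a", "l", "o", "m"]},
#    {"type": "letters", "word": "dunyo", "letters": ["d", "u", "n", "y", "o"]}]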
def transcribe_uzbek_audio(audio_path):
    """Transcribe audio file using NeMo model"""
    global asr_model
    try:
        # Ensure audio is in correct format (16kHz mono)
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        # Save as temporary WAV file; close the handle before NeMo reads it
        temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_wav.close()
        sf.write(temp_wav.name, audio, 16000)
        # Transcribe
        outputs = asr_model.transcribe([temp_wav.name])
        # Clean up temp file
        try:
            os.unlink(temp_wav.name)
        except OSError:
            pass
        if outputs and len(outputs) > 0:
            result = outputs[0]
            # Recent NeMo versions return Hypothesis objects; extract the text
            if hasattr(result, 'text'):
                return result.text
            elif isinstance(result, list):
                result = " ".join(str(item) for item in result)
                return str(result)
            else:
                return str(result)
    except Exception as e:
        print(f"❌ Transcription error: {e}")
        return ""
    # Reached when the model produced no output at all
    return ""
@app.route('/')
def index():
    """Serve main web interface"""
    return render_template('testt.html')
@app.route('/message')
def get_message():
    """Unity endpoint - send fingerspelling sequence"""
    global last_animation_data
    with last_transcription_lock:
        # Create letter sequence for Unity
        letter_sequence = []
        for item in last_animation_data:
            if item["type"] == "letters":
                for letter in item["letters"]:
                    letter_sequence.append(f"letter_{letter}")
        message_string = " ".join(letter_sequence)
        print(f"📤 Sending to Unity: {message_string}")
        return jsonify({
            "message": message_string,
            "animation_data": last_animation_data,
            "language": "uz"
        })
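
# Shape of a GET /message response (values are illustrative):
# {
#   "message": "letter_s letter_a letter_l letter_o letter_m",
#   "animation_data": [{"type": "letters", "word": "salom", "letters": ["s", "a", "l", "o", "m"]}],
#   "language": "uz"
# }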
@app.route('/transcribe', methods=['POST'])  # route path assumed; only the 'audio' form key is documented
def transcribe():
    """
    Receive audio from browser, transcribe, return fingerspelling data.
    Expects audio file in FormData with key 'audio'.
    """
    global model_loaded, last_transcribed_text, last_animation_data
    if not model_loaded:
        return jsonify({
            "success": False,
            "error": "Model is still loading... Please wait."
        }), 503
    try:
        # Check if audio file was sent
        if 'audio' not in request.files:
            return jsonify({
                "success": False,
                "error": "No audio file received"
            }), 400
        audio_file = request.files['audio']
        if audio_file.filename == '':
            return jsonify({
                "success": False,
                "error": "Empty audio file"
            }), 400
        print(f"📁 Received audio file: {audio_file.filename}")
        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_path = tmp_file.name
            audio_file.save(tmp_path)
        print("🎙️ Transcribing audio...")
        # Transcribe
        uzbek_text = transcribe_uzbek_audio(tmp_path)
        # Clean up temp file
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        if uzbek_text:
            print(f"✅ Transcribed: {uzbek_text}")
            # Convert to fingerspelling
            animation_data = process_text_to_letters(uzbek_text)
            # Save for Unity endpoint
            with last_transcription_lock:
                last_transcribed_text = uzbek_text
                last_animation_data = animation_data
            return jsonify({
                "success": True,
                "uzbek": uzbek_text,
                "animation_data": animation_data,
                "language": "uz",
                "model": "NVIDIA FastConformer"
            })
        else:
            return jsonify({
                "success": False,
                "error": "Could not transcribe audio. Please speak clearly."
            }), 400
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({
            "success": False,
            "error": f"Server error: {str(e)}"
        }), 500
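
# Quick check from the command line (filename is illustrative and the
# '/transcribe' path matches the assumed route above):
#   curl -X POST -F "audio=@recording.wav" http://localhost:7860/transcribe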
@app.route('/health')  # route path assumed for the health check
def health():
    """Health check endpoint"""
    return jsonify({
        "status": "running",
        "model_loaded": model_loaded,
        "model": "NVIDIA FastConformer (uz)" if model_loaded else "loading...",
        "wer": "16.46%" if model_loaded else "N/A",
        "mode": "Client-side Recording + Server-side Transcription"
    })
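
# Polling this endpoint tells clients when the background model load has finished,
# e.g. `curl http://localhost:7860/health` returns {"status": "running",
# "model_loaded": true, ...} once load_nvidia_model() completes.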
if __name__ == '__main__':
    print("=" * 60)
    print("🚀 NVIDIA FastConformer - Fingerspelling Server")
    print("=" * 60)
    print("📱 Interface: http://0.0.0.0:7860")
    print("🎯 WER: 16.46%")
    print("🎤 MODE: Client-side recording")
    print("🎮 Unity endpoint: /message")
    print("=" * 60)
    # Hugging Face Spaces uses port 7860
    app.run(host='0.0.0.0', port=7860, debug=False)