benjaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa committed
Commit d349e36 · verified · 1 Parent(s): 17bc2b4

Update app.py

Files changed (1)
  1. app.py +199 -143
app.py CHANGED

@@ -8,19 +8,23 @@ from transformers import (
     WhisperForConditionalGeneration,
     AutoModelForCausalLM,
     AutoTokenizer,
-    pipeline
 )
-from TTS.api import TTS
 import io
-import numpy as np
-import soundfile as sf
 import tempfile
 import os
+import requests
 
-app = FastAPI(title="Asistente de Voz API")
+app = FastAPI(title="Asistente de Voz API - Versión Simple")
 
 # ============================================
-# LOAD MODELS AT STARTUP
+# HUGGING FACE TOKEN (OPTIONAL)
+# ============================================
+# For private models or a larger quota, get a token at:
+# https://huggingface.co/settings/tokens
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+
+# ============================================
+# LOAD MODELS
 # ============================================
 
 print("🔄 Cargando modelos...")
@@ -31,22 +35,14 @@ whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
 whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 whisper_model.eval()
 
-# 2. LANGUAGE MODEL (conversational)
+# 2. LANGUAGE MODEL (smaller and faster)
 print("🤖 Cargando modelo de lenguaje...")
-# Option A: small Spanish model (recommended for the ESP32)
-llm_tokenizer = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish")
-llm_model = AutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish")
-
-# Option B: more capable model (needs more RAM)
-# llm_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
-# llm_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
-
-# 3. TTS (Text-to-Speech)
-print("🔊 Cargando TTS...")
-# Coqui TTS with a Spanish model
-tts = TTS(model_name="tts_models/es/css10/vits", progress_bar=False, gpu=False)
+# Small Spanish GPT-2
+llm_tokenizer = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish-medium")
+llm_model = AutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish-medium")
+llm_model.eval()
 
-print("✅ Todos los modelos cargados!\n")
+print("✅ Modelos cargados!\n")
 
 # ============================================
 # DATA MODELS
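The LLM swap (DeepESP/gpt2-spanish → DeepESP/gpt2-spanish-medium, now with an explicit eval()) can be smoke-tested outside the server. A minimal sketch mirroring the generation style the /chat endpoint uses below:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    tok = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish-medium")
    model = AutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish-medium")
    model.eval()

    # Same prompt shape the endpoints use
    inputs = tok.encode("Pregunta: ¿Qué es un ESP32?\nRespuesta:", return_tensors="pt")
    with torch.no_grad():
        out = model.generate(inputs, max_length=80, do_sample=True, top_p=0.9,
                             pad_token_id=tok.eos_token_id)
    print(tok.decode(out[0], skip_special_tokens=True))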
@@ -54,51 +50,59 @@ print("✅ Todos los modelos cargados!\n")
 
 class ChatRequest(BaseModel):
     question: str
-    max_length: int = 100
+    max_length: int = 150
 
 class TTSRequest(BaseModel):
     text: str
 
 # ============================================
-# ENDPOINT 1: TRANSCRIPTION (speech-to-text)
+# HELPER FUNCTIONS
 # ============================================
 
-@app.post("/transcribe")
-async def transcribe_audio(file: UploadFile = File(...)):
-    """
-    Convert WAV audio to text with Whisper
-    """
+def process_audio_file(audio_bytes):
+    """Process audio bytes and convert them to the expected format"""
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        tmp.write(audio_bytes)
+        tmp_path = tmp.name
+
     try:
-        print(f"📥 Recibiendo audio: {file.filename}")
-
-        # Read the audio
-        audio_bytes = await file.read()
-
-        # Save to a temp file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp.write(audio_bytes)
-            tmp_path = tmp.name
-
-        # Load with torchaudio
+        # Load the audio
         waveform, sample_rate = torchaudio.load(tmp_path)
 
-        # Resample to 16 kHz if needed
+        # Resample to 16 kHz
         if sample_rate != 16000:
             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
             waveform = resampler(waveform)
 
-        # Convert to mono if stereo
+        # Convert to mono
         if waveform.shape[0] > 1:
             waveform = torch.mean(waveform, dim=0, keepdim=True)
 
-        # Process with Whisper
+        return waveform.squeeze().numpy()
+    finally:
+        os.unlink(tmp_path)
+
+# ============================================
+# ENDPOINT 1: TRANSCRIPTION
+# ============================================
+
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    """Convert WAV audio to text"""
+    try:
+        print(f"📥 Recibiendo audio: {file.filename}")
+
+        # Process the audio
+        audio_bytes = await file.read()
+        waveform = process_audio_file(audio_bytes)
+
+        # Transcribe with Whisper
         input_features = whisper_processor(
-            waveform.squeeze().numpy(),
+            waveform,
            sampling_rate=16000,
             return_tensors="pt"
         ).input_features
 
-        # Generate the transcription
         with torch.no_grad():
             predicted_ids = whisper_model.generate(input_features)
 
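With the temp-file handling factored into process_audio_file (whose finally block now guarantees cleanup even on errors, which the old endpoint did not), /transcribe takes a plain multipart upload. A client sketch, assuming a local server and a WAV file named sample.wav:

    import requests

    with open("sample.wav", "rb") as f:
        r = requests.post("http://localhost:7860/transcribe",
                          files={"file": ("sample.wav", f, "audio/wav")},
                          timeout=60)
    r.raise_for_status()
    print(r.json())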
@@ -107,9 +111,6 @@ async def transcribe_audio(file: UploadFile = File(...)):
             skip_special_tokens=True
         )[0]
 
-        # Clean up the temp file
-        os.unlink(tmp_path)
-
         print(f"✅ Transcrito: {transcription}")
 
         return JSONResponse({
@@ -118,18 +119,16 @@ async def transcribe_audio(file: UploadFile = File(...)):
         })
 
     except Exception as e:
-        print(f"❌ Error en transcripción: {str(e)}")
+        print(f"❌ Error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 # ============================================
-# ENDPOINT 2: CHAT (conversational AI)
+# ENDPOINT 2: AI CHAT
 # ============================================
 
 @app.post("/chat")
 async def chat(request: ChatRequest):
-    """
-    Generate a reply with the language model
-    """
+    """Generate an AI reply"""
     try:
         question = request.question.strip()
         print(f"💬 Pregunta: {question}")
@@ -140,8 +139,11 @@ async def chat(request: ChatRequest):
                 "success": False
             })
 
-        # Prepare the prompt
-        prompt = f"Pregunta: {question}\nRespuesta:"
+        # Build a Spanish-language context prompt
+        prompt = f"""Eres un asistente virtual amigable. Responde de forma breve y clara.
+
+Pregunta: {question}
+Respuesta:"""
 
         # Generate the reply
         inputs = llm_tokenizer.encode(prompt, return_tensors="pt")
@@ -151,24 +153,30 @@ async def chat(request: ChatRequest):
                 inputs,
                 max_length=request.max_length,
                 num_return_sequences=1,
-                temperature=0.7,
+                temperature=0.8,
                 top_p=0.9,
                 do_sample=True,
-                pad_token_id=llm_tokenizer.eos_token_id
+                pad_token_id=llm_tokenizer.eos_token_id,
+                repetition_penalty=1.2
             )
 
-        # Decode the reply
+        # Decode
         full_text = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Extract only the answer (after "Respuesta:")
+        # Extract only the answer
         if "Respuesta:" in full_text:
             answer = full_text.split("Respuesta:")[-1].strip()
         else:
-            answer = full_text.strip()
+            answer = full_text.replace(prompt, "").strip()
 
-        # Limit the length
+        # Clean up and truncate
+        answer = answer.split("\n")[0].strip()  # first line only
         if len(answer) > 200:
-            answer = answer[:200] + "..."
+            answer = answer[:200].rsplit(" ", 1)[0] + "..."
+
+        # Fall back to a default reply if empty
+        if not answer or len(answer) < 5:
+            answer = "Interesante pregunta. Déjame pensar en eso."
 
         print(f"✅ Respuesta: {answer}")
 
@@ -178,182 +186,230 @@ async def chat(request: ChatRequest):
         })
 
     except Exception as e:
-        print(f"❌ Error en chat: {str(e)}")
+        print(f"❌ Error: {str(e)}")
         return JSONResponse({
-            "answer": "Lo siento, tuve un error al procesar tu pregunta",
-            "success": False,
-            "error": str(e)
+            "answer": "Lo siento, tuve un problema procesando tu pregunta",
+            "success": False
         })
 
 # ============================================
-# ENDPOINT 3: TEXT-TO-SPEECH
+# ENDPOINT 3: TTS (via the HF API)
 # ============================================
 
 @app.post("/tts")
 async def text_to_speech(request: TTSRequest):
     """
-    Convert text to audio with Coqui TTS
+    Convert text to speech with the Hugging Face API
+    IMPORTANT: requires an internet connection
     """
     try:
         text = request.text.strip()
-        print(f"🔊 Generando voz para: {text[:50]}...")
+        print(f"🔊 Generando voz: {text[:50]}...")
 
         if not text:
             raise HTTPException(status_code=400, detail="Texto vacío")
 
-        # Limit the length to avoid timeouts
+        # Limit the length
         if len(text) > 300:
             text = text[:300] + "..."
 
-        # Generate audio with TTS
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp_path = tmp.name
-
-        # Generate the audio
-        tts.tts_to_file(
-            text=text,
-            file_path=tmp_path
-        )
+        # Use the Hugging Face Inference API for TTS
+        # Model: Facebook MMS TTS, Spanish
+        API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-spa"
 
-        # Read the generated audio
-        with open(tmp_path, "rb") as f:
-            audio_data = f.read()
-
-        # Clean up
-        os.unlink(tmp_path)
+        headers = {}
+        if HF_TOKEN:
+            headers["Authorization"] = f"Bearer {HF_TOKEN}"
 
-        print(f"✅ Audio generado: {len(audio_data)} bytes")
-
-        # Return as a stream
-        return StreamingResponse(
-            io.BytesIO(audio_data),
-            media_type="audio/wav",
-            headers={
-                "Content-Disposition": "attachment; filename=speech.wav"
-            }
+        # Call the API
+        response = requests.post(
+            API_URL,
+            headers=headers,
+            json={"inputs": text},
+            timeout=30
         )
 
+        if response.status_code == 200:
+            print(f"✅ Audio generado: {len(response.content)} bytes")
+
+            return StreamingResponse(
+                io.BytesIO(response.content),
+                media_type="audio/flac",
+                headers={
+                    "Content-Disposition": "attachment; filename=speech.flac"
+                }
+            )
+        else:
+            print(f"❌ Error API TTS: {response.status_code}")
+            raise HTTPException(
+                status_code=response.status_code,
+                detail=f"Error en TTS: {response.text}"
+            )
+
+    except requests.exceptions.Timeout:
+        print("⏱️ Timeout en TTS")
+        raise HTTPException(status_code=504, detail="Timeout generando audio")
     except Exception as e:
-        print(f"❌ Error en TTS: {str(e)}")
+        print(f"❌ Error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
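TTS is now relayed from the Hugging Face Inference API, so the endpoint returns FLAC bytes rather than the WAV that Coqui produced locally. A client sketch, assuming a local server:

    import requests

    r = requests.post("http://localhost:7860/tts",
                      json={"text": "Hola, ¿cómo estás?"},
                      timeout=60)
    r.raise_for_status()
    with open("speech.flac", "wb") as f:
        f.write(r.content)
    print(len(r.content), "bytes,", r.headers.get("content-type"))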
 # ============================================
-# ENDPOINT 4: FULL PIPELINE (OPTIONAL)
+# ENDPOINT 4: FULL PIPELINE
 # ============================================
 
 @app.post("/complete")
 async def complete_conversation(file: UploadFile = File(...)):
     """
     Full pipeline: audio → text → AI → audio
-    (simpler alternative for the ESP32)
     """
     try:
-        print("🔄 Iniciando proceso completo...")
+        print("\n" + "="*50)
+        print("🔄 PROCESO COMPLETO INICIADO")
+        print("="*50)
 
-        # 1. Transcribe
+        # STEP 1: transcribe
+        print("\n📝 PASO 1: Transcribiendo...")
         audio_bytes = await file.read()
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp.write(audio_bytes)
-            tmp_path = tmp.name
-
-        waveform, sample_rate = torchaudio.load(tmp_path)
-        if sample_rate != 16000:
-            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-            waveform = resampler(waveform)
-        if waveform.shape[0] > 1:
-            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        waveform = process_audio_file(audio_bytes)
 
         input_features = whisper_processor(
-            waveform.squeeze().numpy(),
+            waveform,
             sampling_rate=16000,
             return_tensors="pt"
         ).input_features
 
         with torch.no_grad():
             predicted_ids = whisper_model.generate(input_features)
+
         transcription = whisper_processor.batch_decode(
-            predicted_ids, skip_special_tokens=True
-        )[0]
+            predicted_ids,
+            skip_special_tokens=True
+        )[0].strip()
 
-        os.unlink(tmp_path)
-        print(f"✅ Transcrito: {transcription}")
+        print(f"✅ Transcripción: {transcription}")
+
+        if not transcription or len(transcription) < 3:
+            transcription = "No te escuché bien"
+
+        # STEP 2: generate the reply
+        print("\n🤖 PASO 2: Generando respuesta IA...")
+        prompt = f"""Eres un asistente virtual amigable. Responde breve.
+
+Pregunta: {transcription}
+Respuesta:"""
 
-        # 2. Generate the reply
-        prompt = f"Pregunta: {transcription}\nRespuesta:"
         inputs = llm_tokenizer.encode(prompt, return_tensors="pt")
 
         with torch.no_grad():
             outputs = llm_model.generate(
-                inputs, max_length=100, temperature=0.7,
-                top_p=0.9, do_sample=True,
-                pad_token_id=llm_tokenizer.eos_token_id
+                inputs,
+                max_length=150,
+                temperature=0.8,
+                top_p=0.9,
+                do_sample=True,
+                pad_token_id=llm_tokenizer.eos_token_id,
+                repetition_penalty=1.2
             )
 
         full_text = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
         if "Respuesta:" in full_text:
             answer = full_text.split("Respuesta:")[-1].strip()
         else:
-            answer = full_text.strip()
+            answer = full_text.replace(prompt, "").strip()
 
+        answer = answer.split("\n")[0].strip()
         if len(answer) > 200:
-            answer = answer[:200]
+            answer = answer[:200].rsplit(" ", 1)[0] + "..."
+
+        if not answer or len(answer) < 5:
+            answer = "Entiendo tu pregunta."
 
         print(f"✅ Respuesta: {answer}")
 
-        # 3. Generate the audio
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            audio_path = tmp.name
+        # STEP 3: generate the audio
+        print("\n🔊 PASO 3: Generando audio...")
+        API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-spa"
 
-        tts.tts_to_file(text=answer, file_path=audio_path)
+        headers = {}
+        if HF_TOKEN:
+            headers["Authorization"] = f"Bearer {HF_TOKEN}"
 
-        with open(audio_path, "rb") as f:
-            audio_data = f.read()
+        response = requests.post(
+            API_URL,
+            headers=headers,
+            json={"inputs": answer},
+            timeout=30
+        )
 
-        os.unlink(audio_path)
+        if response.status_code != 200:
+            print(f"⚠️ Error TTS, usando respuesta de texto")
+            return JSONResponse({
+                "transcription": transcription,
+                "answer": answer,
+                "audio_error": True
+            })
 
-        print("✅ Proceso completo!")
+        print("✅ Audio generado correctamente")
+        print("="*50 + "\n")
 
+        # Return the audio with metadata in the headers
         return StreamingResponse(
-            io.BytesIO(audio_data),
-            media_type="audio/wav",
+            io.BytesIO(response.content),
+            media_type="audio/flac",
             headers={
                 "X-Transcription": transcription,
-                "X-Answer": answer
+                "X-Answer": answer,
+                "Content-Disposition": "attachment; filename=response.flac"
             }
         )
 
     except Exception as e:
-        print(f"❌ Error: {str(e)}")
+        print(f"❌ ERROR COMPLETO: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
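/complete now answers with either FLAC audio (transcription and answer carried in response headers) or a JSON fallback when the TTS call fails. A client sketch covering both branches, assuming a local server; note that accented Spanish in the X-Transcription/X-Answer headers can be fragile, since HTTP headers are not reliably UTF-8:

    import requests

    with open("pregunta.wav", "rb") as f:
        r = requests.post("http://localhost:7860/complete",
                          files={"file": ("pregunta.wav", f, "audio/wav")},
                          timeout=120)
    r.raise_for_status()

    if r.headers.get("content-type", "").startswith("audio/"):
        print("Transcription:", r.headers.get("X-Transcription"))
        print("Answer:", r.headers.get("X-Answer"))
        with open("respuesta.flac", "wb") as out:
            out.write(r.content)
    else:
        print(r.json())  # {"transcription": ..., "answer": ..., "audio_error": True}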
 # ============================================
-# UTILITY ENDPOINTS
+# INFO ENDPOINTS
 # ============================================
 
 @app.get("/")
 async def root():
     return {
-        "message": "🤖 API Asistente de Voz",
-        "version": "1.0",
+        "message": "🤖 API Asistente de Voz ESP32",
+        "version": "2.0 - Simplificada",
+        "status": "online",
         "endpoints": {
-            "/transcribe": "POST - Audio WAV → Texto",
-            "/chat": "POST - Pregunta → Respuesta IA",
-            "/tts": "POST - Texto → Audio",
-            "/complete": "POST - Audio → Audio (proceso completo)"
+            "POST /transcribe": "Audio WAV → Texto",
+            "POST /chat": "Pregunta → Respuesta IA",
+            "POST /tts": "Texto → Audio",
+            "POST /complete": "Audio → Audio (recomendado)"
+        },
+        "models": {
+            "stt": "openai/whisper-small",
+            "llm": "DeepESP/gpt2-spanish-medium",
+            "tts": "facebook/mms-tts-spa (API)"
         }
     }
 
 @app.get("/health")
 async def health_check():
     return {
-        "status": "ok",
-        "models": {
-            "whisper": "loaded",
-            "llm": "loaded",
-            "tts": "loaded"
+        "status": "healthy",
+        "models_loaded": {
+            "whisper": whisper_model is not None,
+            "llm": llm_model is not None,
+            "tts": "API externa"
         }
     }
 
+@app.get("/test")
+async def test_endpoint():
+    """Simple test endpoint"""
+    return {
+        "message": "¡Servidor funcionando correctamente!",
+        "test": "OK"
+    }
+
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
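A quick post-deploy check is to hit the informational endpoints this commit adds or keeps. A minimal sketch, assuming a local server (swap in the Space URL otherwise):

    import requests

    base = "http://localhost:7860"
    for path in ("/test", "/health", "/"):
        r = requests.get(base + path, timeout=10)
        print(path, r.status_code, r.json())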