import os

import gradio as gr
from dotenv import load_dotenv

# Make sure static directory exists
if not os.path.exists("static"):
    os.makedirs("static")

# Load environment variables
load_dotenv()
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings

# Imports from your custom modules
from brain_of_the_doctor import analyze_image_with_query, encode_image
from breast_cancer_classifer import breast_cancer_detection_model
from voice_of_the_doctor import text_to_speech_with_veena
from voice_of_the_patient import transcribe_with_groq

# System prompt for diagnosis
system_prompt = """You have to act as a professional doctor; I know you are not, but this is for learning purposes.
What's in this image? Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in your response.
Your response should be in one long paragraph. Also always answer as if you are answering a real person.
Do not say 'In the image I see' but say 'With what I see, I think you have ...'
Don't respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""


# Helper functions
def contains_hsi_keywords(speech_text):
    keywords = ["hsi", "hyperspectral imaging", "tissue"]
    return any(keyword.lower() in speech_text.lower() for keyword in keywords)


def is_hsi_image(image_path):
    return image_path and image_path.lower().endswith(('.mat', '.npy', '.hdr'))


# Reset state
def clear_state():
    return None, None, gr.update(value=""), gr.update(value=""), gr.update(value=None)


# Main processing function
def process_inputs(audio_filepath, image_pil):
    speech_text = ""
    diagnosis = ""
    audio_output_path = "static/final.wav"  # .wav output path for Veena TTS

    try:
        # Step 1: Transcribe audio if provided
        if audio_filepath:
            speech_text = transcribe_with_groq(
                GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
                audio_filepath=audio_filepath,
                stt_model="whisper-large-v3"
            )

        # Step 2: Handle image if provided
        image_filepath = None
        encoded_image = None
        if image_pil is not None:
            image_filepath = "static/temp_image.png"
            image_pil.save(image_filepath)
            encoded_image = encode_image(image_filepath)

        # Step 3: Check for HSI images
        if contains_hsi_keywords(speech_text) or (image_filepath and is_hsi_image(image_filepath)):
            diagnosis = breast_cancer_detection_model(image_filepath)

        # Step 4: Multimodal diagnosis logic
        elif encoded_image and speech_text:
            diagnosis = analyze_image_with_query(
                query=system_prompt + " " + speech_text,
                encoded_image=encoded_image,
                model="meta-llama/llama-4-scout-17b-16e-instruct"
            )
        elif encoded_image:
            diagnosis = analyze_image_with_query(
                query=system_prompt,
                encoded_image=encoded_image,
                model="meta-llama/llama-4-scout-17b-16e-instruct"
            )
        elif speech_text:
            diagnosis = analyze_image_with_query(
                query=system_prompt + " " + speech_text,
                encoded_image=None,
                model="meta-llama/llama-4-scout-17b-16e-instruct"
            )
        else:
            diagnosis = "Please provide at least an image or audio to begin diagnosis."

        # Step 5: Generate voice output using Maya Veena TTS
        text_to_speech_with_veena(
            input_text=diagnosis,
            output_filepath=audio_output_path
        )

    except Exception as e:
        print("Error during processing:", e)
        diagnosis = "An error occurred while processing. Please try again."

    return speech_text, diagnosis, audio_output_path if os.path.exists(audio_output_path) else None
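
# --- Optional smoke test (hedged sketch, not part of the Gradio app) ---
# A minimal way to exercise process_inputs from the command line without
# launching the UI. The file paths below are hypothetical placeholders;
# point them at real local files and set GROQ_API_KEY before calling it.
def smoke_test(audio_path="static/sample_symptoms.wav",
               image_path="static/sample_scan.png"):
    from PIL import Image  # Pillow is already required for gr.Image(type="pil")

    image = Image.open(image_path) if os.path.exists(image_path) else None
    audio = audio_path if os.path.exists(audio_path) else None
    transcript, diagnosis, voice_path = process_inputs(audio, image)
    print("Transcript:", transcript)
    print("Diagnosis:", diagnosis)
    print("Voice file:", voice_path)
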
# Replay audio
def replay_audio():
    audio_path = "static/final.wav"
    return audio_path if os.path.exists(audio_path) else None


# Download audio
def download_audio():
    return "static/final.wav" if os.path.exists("static/final.wav") else None


# --------- Gradio UI ----------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="indigo"), css="""
body { background: linear-gradient(to right, #ccfbf1, #e0f2fe); }
.gr-box {
    border-radius: 20px;
    padding: 12px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
    background-color: #ffffff;
}
.gr-button {
    font-weight: bold;
    font-size: 16px;
    padding: 12px 20px;
    border-radius: 12px;
    transition: transform 0.1s ease;
}
.gr-button:active { transform: scale(0.96); }
.gr-textbox textarea {
    font-size: 15px;
    line-height: 1.5;
    background-color: #f8fafc;
}
.gr-audio .audio-upload-box { display: none !important; }
.loading-spinner {
    font-size: 16px;
    color: #0f766e;
    text-align: center;
    padding: 10px;
}
@media (max-width: 768px) {
    .gr-box { padding: 8px; }
    .gr-button { font-size: 14px; padding: 10px 14px; }
}
""") as iface:
    gr.Markdown("An AI tool that listens, sees, and speaks to offer quick medical guidance")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak your symptoms")
            image_input = gr.Image(type="pil", label="Upload Medical Image")

        with gr.Column():
            speech_output = gr.Textbox(label="Transcribed Text", lines=4, interactive=False)
            diagnosis_output = gr.Textbox(label="Doctor's Diagnosis", lines=4, interactive=False)
            audio_output = gr.Audio(label="Doctor's Voice", interactive=False)
            download_btn = gr.File(label="Download Voice Output")

    with gr.Row():
        analyze_btn = gr.Button("Analyze & Diagnose")
        clear_btn = gr.Button("Clear")

    # Button events
    analyze_btn.click(
        process_inputs,
        [audio_input, image_input],
        [speech_output, diagnosis_output, audio_output]
    ).then(download_audio, [], [download_btn])  # populate the download file once analysis finishes
    clear_btn.click(clear_state, [], [audio_input, image_input, speech_output, diagnosis_output, audio_output])

# ---- Run on Hugging Face ----
if __name__ == "__main__":
    iface.launch()