Spaces:

Nourhenem
/

medical-agent

Sleeping

File size: 7,877 Bytes

1eb76aa

#!/usr/bin/env python3
"""
Complete Pipeline Test
Tests the full pipeline including Langfuse transcription download
"""

import os
import sys
import time
from pathlib import Path
from datetime import datetime

# Add the current directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def test_complete_pipeline():
    """Test the complete pipeline including Langfuse transcription download."""
    print("🏥 Complete Medical Document Pipeline Test")
    print("=" * 70)
    print("This test will:")
    print("1. Download transcriptions from Langfuse")
    print("2. Run the complete document processing pipeline")
    print("3. Validate the results")
    print("=" * 70)

    # Step 1: Download transcriptions from Langfuse
    print("\n📥 Step 1: Downloading transcriptions from Langfuse...")
    try:
        from medical_transcription_retriever import MedicalTranscriptionRetriever

        retriever = MedicalTranscriptionRetriever()
        saved_files = retriever.run(
            limit=5, save_to_file=True, save_by_user=True)

        if not saved_files:
            print("❌ No transcriptions downloaded from Langfuse")
            print("Please check your Langfuse configuration and try again")
            return None

        print(
            f"✅ Successfully downloaded transcriptions: {len(saved_files)} files")

    except Exception as e:
        print(f"❌ Error downloading transcriptions: {e}")
        print("Continuing with existing transcriptions if available...")

    # Step 2: Check if we have transcription files
    transcriptions_dir = "transcriptions"
    if not os.path.exists(transcriptions_dir):
        print(f"❌ Transcriptions directory not found: {transcriptions_dir}")
        return None

    transcription_files = list(Path(transcriptions_dir).glob("*.json"))
    if not transcription_files:
        print(f"❌ No transcription files found in {transcriptions_dir}")
        return None

    print(f"📁 Found {len(transcription_files)} transcription files")

    # Step 3: Test with the first transcription file
    first_transcription = transcription_files[0]
    print(f"📄 Using transcription file: {first_transcription.name}")

    try:
        # Step 4: Initialize the orchestrator
        print(
            "\n🚀 Step 2: Initializing orchestrator with automatic SFTP model detection...")
        from langchain_medical_agents_refactored import MedicalDocumentOrchestrator

        orchestrator = MedicalDocumentOrchestrator(
            template_path=None,  # Let the SFTP agent find the template
            transcription_path=str(first_transcription),
            transcriptions_dir=transcriptions_dir
        )

        # Step 5: Run the complete pipeline
        print("\n🔄 Step 3: Running complete pipeline...")
        print("This will include:")
        print("  📥 Step 0: SFTP Download (.rtf → .doc) - AUTOMATIC MODEL DETECTION")
        print("  📋 Step 1: Template Analysis")
        print("  ✏️ Step 2: Transcription Correction")
        print("  🔬 Step 3: Medical Data Analysis")
        print("  📝 Step 4: Title Generation")
        print("  📝 Step 5: Section Generation")
        print("  📄 Step 6: Document Assembly")
        print("  📋 Step 7: Validation")

        start_time = time.time()
        output_file = orchestrator.run_full_pipeline()
        end_time = time.time()

        execution_time = end_time - start_time
        print(f"\n⏱️ Pipeline execution time: {execution_time:.2f} seconds")

        print(f"\n🎉 Pipeline completed successfully!")
        print(f"📄 Output file: {output_file}")

        # Step 6: Show SFTP download summary
        if orchestrator.downloaded_models:
            successful_downloads = [
                m for m in orchestrator.downloaded_models if m['status'] == 'success']
            failed_downloads = [
                m for m in orchestrator.downloaded_models if m['status'] == 'error']

            print(f"\n📥 SFTP Download Summary:")
            print(
                f"  ✅ Successfully downloaded: {len(successful_downloads)} models")
            print(f"  ❌ Failed downloads: {len(failed_downloads)} models")

            if successful_downloads:
                print("  📁 Downloaded models:")
                for model in successful_downloads[:5]:  # Show first 5
                    print(
                        f"    - {model['model_id']}: {model['local_filename']}")
                if len(successful_downloads) > 5:
                    print(f"    ... and {len(successful_downloads) - 5} more")

        # Step 7: Verify output file exists
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            print(f"\n✅ Output file verified:")
            print(f"  📄 File: {output_file}")
            print(f"  📏 Size: {file_size} bytes")

            # Check if file is readable
            try:
                from docx import Document
                doc = Document(output_file)
                paragraph_count = len(doc.paragraphs)
                print(f"  📝 Paragraphs: {paragraph_count}")
                print(f"  ✅ Document is readable and valid")
            except Exception as e:
                print(f"  ⚠️ Document validation failed: {e}")
        else:
            print(f"\n❌ Output file not found: {output_file}")

        return output_file

    except Exception as e:
        print(f"❌ Error running pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def cleanup_test_files():
    """Clean up test files after testing."""
    print("\n🧹 Cleaning up test files...")

    # Remove generated documents
    for file in Path("./transcriptions").glob("*.json"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    for file in Path("./").glob("*.docx"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    for file in Path("./").glob("*.json"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    # Remove downloaded models
    models_dir = "models"
    if os.path.exists(models_dir):
        for file in Path(models_dir).glob("*.doc"):
            try:
                os.remove(file)
                print(f"🗑️ Removed: {file}")
            except Exception as e:
                print(f"⚠️ Could not remove {file}: {e}")


def main():
    """Main test function."""
    print("🧪 Complete Pipeline Test with Langfuse Integration")
    print("=" * 70)

    # Check if we're in the right directory
    if not os.path.exists("transcriptions"):
        print("❌ Please run this script from the project root directory")
        print("   (where the 'transcriptions' folder is located)")
        return

    # Show current configuration
    try:
        from sftp_config import print_sftp_config
        print_sftp_config()
    except ImportError:
        print("⚠️ SFTP config not available")

    # Run the complete pipeline test
    result = test_complete_pipeline()

    if result:
        print(f"\n🎉 Complete pipeline test completed successfully!")
        print(f"📄 Generated document: {result}")

        # Ask if user wants to clean up
        cleanup = input(
            "\n🧹 Do you want to clean up test files? (y/n): ").lower().strip()
        if cleanup in ['y', 'yes']:
            cleanup_test_files()
    else:
        print(f"\n❌ Complete pipeline test failed")


if __name__ == "__main__":
    main()