Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Document Validator | |
| Validates generated medical documents against original transcriptions | |
| """ | |
| import re | |
| from typing import Dict, Any, List | |
| from docx import Document | |
| from langchain.prompts import ChatPromptTemplate | |
# Measurement: a number (dot OR comma as decimal separator, since the
# vocabulary below is French) followed by a unit.  The trailing negative
# lookahead rejects a unit that is only the beginning of a longer word, so
# "5 grammes" is not reported as the measurement "5 g".
_MEASUREMENT_RE = re.compile(
    r'(\d+(?:[.,]\d+)?)\s*(mm|cm|kg|cc|ml|g|mg)(?![^\W\d_])'
)

# Keyword vocabularies: anatomy, findings, examination types.
_KEYWORD_RES = (
    re.compile(
        r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b'),
    re.compile(
        r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b'),
    re.compile(
        r'\b(?:échographie|radiographie|scanner|irm|examen)\b'),
)


def _extract_medical_entities(text: str) -> List[str]:
    """Return the sorted, de-duplicated medical entities found in *text*."""
    lowered = text.lower()
    entities = set()

    # Canonicalise measurements as "<number> <unit>" with a dot separator so
    # that "3,5cm", "3.5 cm" and "3,5 cm" all compare equal.
    for match in _MEASUREMENT_RE.finditer(lowered):
        number, unit = match.groups()
        entities.add(f"{number.replace(',', '.')} {unit}")

    for keyword_re in _KEYWORD_RES:
        entities.update(keyword_re.findall(lowered))

    # Sorted so the result (and everything derived from it) is deterministic.
    return sorted(entities)


def _normalize_heading(text: str) -> str:
    """Lower-case *text*, turn non-breaking spaces into spaces, drop colons."""
    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


def validate_generated_document(template_path: str, transcription_path: str, generated_doc_path: str) -> Dict[str, Any]:
    """Validate a generated document against its transcription and template.

    Two checks are performed:

    * Entity coverage: measurements and medical keywords are extracted from
      the transcription and from the generated document; the coverage is the
      percentage of transcription entities that also occur in the document.
    * Structure: every template section heading must match at least one
      paragraph of the generated document (case-insensitive substring match
      in either direction, ignoring colons and non-breaking spaces).

    Args:
        template_path: Path handed to ``analyze_word_template``.
        transcription_path: Path handed to ``load_transcription``.
        generated_doc_path: Path of the generated document, opened with
            ``docx.Document``.

    Returns:
        A dict with the keys ``overall_score``, ``structure_valid``,
        ``entities_coverage``, ``missing_sections``, ``missing_entities``,
        ``transcription_entities_count``, ``generated_entities_count``,
        ``found_sections`` and ``template_sections``.  ``overall_score`` is
        the mean of the structure score (1.0 if no section is missing, else
        0.5) and the entity coverage expressed as a fraction.

    Raises:
        KeyError: If an entry of the template analysis ``sections`` list has
            no ``'text'`` key.
    """
    # Imported lazily, as in the original implementation.
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    # --- Extract content from the generated document -----------------------
    # NOTE(review): only ``doc.paragraphs`` is inspected; text placed inside
    # tables is presumably not covered - confirm whether that matters here.
    doc = Document(generated_doc_path)
    stripped_paragraphs = [paragraph.text.strip() for paragraph in doc.paragraphs]
    generated_text = "\n".join(
        text for text in stripped_paragraphs
        # Date/time header lines are excluded from the content comparison.
        if text and not text.startswith(("Date:", "Heure:"))
    )

    # --- Entity coverage ---------------------------------------------------
    transcription_text = load_transcription(transcription_path)
    transcription_entities = _extract_medical_entities(transcription_text)
    generated_entities = _extract_medical_entities(generated_text)

    generated_lookup = set(generated_entities)
    missing_entities = [
        entity for entity in transcription_entities
        if entity not in generated_lookup
    ]
    if transcription_entities:
        covered = len(transcription_entities) - len(missing_entities)
        coverage_percentage = covered / len(transcription_entities) * 100
    else:
        # Nothing to cover counts as full coverage.
        coverage_percentage = 100

    # --- Structure ---------------------------------------------------------
    template_analysis = analyze_word_template(template_path)
    template_sections = [
        section['text'] for section in template_analysis.get('sections', [])
    ]

    # Headings that normalise to an empty string cannot be matched
    # meaningfully ('' is a substring of everything), so they are neither
    # searched for nor reported as missing.
    checkable_sections = []
    for section in template_sections:
        normalized = _normalize_heading(section)
        if normalized:
            checkable_sections.append((section, normalized))

    found_sections = []
    for text in stripped_paragraphs:
        text_clean = _normalize_heading(text)
        if not text_clean:
            # A blank paragraph would otherwise "match" the first heading,
            # because '' is contained in every string.
            continue
        for section, template_clean in checkable_sections:
            if template_clean in text_clean or text_clean in template_clean:
                if section not in found_sections:
                    found_sections.append(section)
                # A paragraph is attributed to the first heading it matches.
                break

    missing_sections = [
        section for section, _ in checkable_sections
        if section not in found_sections
    ]
    structure_valid = not missing_sections

    # --- Overall score -----------------------------------------------------
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    return {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections,
    }
def create_validation_chain(llm):
    """Build the chain that asks an LLM for a validation summary.

    The prompt consists of a system message describing the validator role and
    a human message with the placeholders ``transcription``,
    ``generated_content``, ``structure_valid``, ``entities_coverage``,
    ``missing_sections`` and ``missing_entities``.

    Args:
        llm: Object piped after the prompt with the ``|`` operator.

    Returns:
        The result of ``prompt | llm``.
    """
    system_message = """You are a medical document validation expert.
Analyze if the generated medical document contains all important medical information from the original transcription.
Provide a brief validation summary with:
- Overall quality assessment
- Missing important information (if any)
- Key recommendations"""

    human_message = """Validate the content coverage between the original transcription and the generated document.
ORIGINAL TRANSCRIPTION:
{transcription}
GENERATED DOCUMENT CONTENT:
{generated_content}
VALIDATION METRICS:
- Structure Valid: {structure_valid}
- Entities Coverage: {entities_coverage:.1f}%
- Missing Sections: {missing_sections}
- Missing Entities: {missing_entities}
Provide a concise validation summary."""

    messages = [
        ("system", system_message),
        ("human", human_message),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return prompt | llm