Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Improved Template Analyzer - Enhanced section detection | |
| Fixes issues with section detection and provides better analysis | |
| """ | |
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from docx import Document
from dotenv import load_dotenv
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import tool
from langchain_openai import ChatOpenAI
# Load environment variables at import time; test_with_agent() later reads
# OPENAI_API_KEY through os.getenv().
load_dotenv()
def analyze_word_template_tool(template_path: str) -> Dict[str, Any]:
    """Analyze a Word document template to extract structure and sections.

    Args:
        template_path: Path of the .docx template to analyze.

    Returns:
        A dict with three keys:
        'sections' (list of {'text', 'index', 'style'} for every non-empty
        paragraph containing a section keyword), 'formatting' (paragraph
        index -> formatting of the paragraph's first run) and
        'document_info' (title, author, subject).

    Raises:
        FileNotFoundError: If template_path does not exist.
    """
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)
    analysis: Dict[str, Any] = {
        'sections': [],
        'formatting': {},
        'document_info': {},
    }

    # Improved section detection regex - includes all common medical sections
    section_patterns = [
        r'\b(clinique|examen|observation)\b',
        r'\b(technique|matériel|méthode|procédure)\b',
        r'\b(résultat|resultat|resultats|résultats)\b',
        r'\b(conclusion|diagnostic|impression)\b',
        r'\b(échographie|echographie|imagerie)\b',
        r'\b(recommandation|traitement|suivi)\b',
        r'\b(analyse|commentaire|discussion)\b',
        r'\b(antécédents|histoire|anamnèse)\b',
        r'\b(indication|objectif)\b',
        r'\b(biologie|laboratoire)\b',
    ]
    # Compile the combined pattern once instead of handing the raw string to
    # re.search() for every paragraph.
    section_regex = re.compile('|'.join(section_patterns), re.IGNORECASE)

    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if not text:
            continue

        # Record the paragraph as a section when it contains a keyword.
        if section_regex.search(text):
            analysis['sections'].append({
                'text': text,
                'index': i,
                'style': paragraph.style.name if paragraph.style else 'Normal',
            })

        # Formatting is sampled from the first run of the paragraph only.
        if paragraph.runs:
            run = paragraph.runs[0]
            alignment = paragraph.alignment
            analysis['formatting'][i] = {
                'bold': run.bold,
                'italic': run.italic,
                'font_name': run.font.name,
                'font_size': run.font.size.pt if run.font.size else None,
                # Stringified so the result stays JSON-serialisable and matches
                # ImprovedTemplateAnalyzer.analyze_word_template. Compared with
                # None explicitly so an alignment whose value is 0 is kept.
                'alignment': str(alignment) if alignment is not None else None,
            }

    # Analyze document properties
    if doc.core_properties:
        analysis['document_info'] = {
            'title': doc.core_properties.title or 'Word Document',
            'author': doc.core_properties.author or '',
            'subject': doc.core_properties.subject or '',
        }

    return analysis
class ImprovedTemplateAnalyzer:
    """Enhanced template analyzer with better section detection.

    Offers a direct analysis of a .docx template (analyze_word_template) and
    a LangChain agent that runs the module-level analyze_word_template_tool.
    """

    def __init__(self):
        """Initialize the template analyzer."""
        print("🔍 Improved Template Analyzer initialized")

        # Section type -> regex of keywords announcing that section.
        # Insertion order matters: _detect_section_type returns the first
        # type whose pattern matches.
        self.section_patterns = {
            'clinique': r'\b(clinique|examen|observation|examen_clinique)\b',
            'technique': r'\b(technique|matériel|méthode|procédure|protocole)\b',
            'resultats': r'\b(résultat|resultat|resultats|résultats|findings)\b',
            'conclusion': r'\b(conclusion|diagnostic|impression|synthèse)\b',
            'imagerie': r'\b(échographie|echographie|imagerie|radiologie)\b',
            'recommandations': r'\b(recommandation|traitement|suivi|conduite)\b',
            'analyse': r'\b(analyse|commentaire|discussion|interprétation)\b',
            'antecedents': r'\b(antécédents|histoire|anamnèse|contexte)\b',
            'indication': r'\b(indication|objectif|but|demande)\b',
            'biologie': r'\b(biologie|laboratoire|bilan|analyses)\b',
        }

    def analyze_word_template(self, template_path: str) -> Dict[str, Any]:
        """Analyze a Word document template to extract structure and sections.

        Args:
            template_path: Path of the .docx template to analyze.

        Returns:
            A dict with the keys 'sections', 'formatting', 'document_info',
            'all_text', 'structure' and 'detected_section_types'.

        Raises:
            FileNotFoundError: If template_path does not exist.
        """
        if not os.path.exists(template_path):
            raise FileNotFoundError(f"Template file not found: {template_path}")

        print(f"📄 Analyzing template: {template_path}")
        doc = Document(template_path)

        analysis: Dict[str, Any] = {
            'sections': [],
            'formatting': {},
            'document_info': {},
            'all_text': [],
            'structure': {},
            'detected_section_types': [],
        }

        for i, paragraph in enumerate(doc.paragraphs):
            text = paragraph.text.strip()

            # Store all non-empty text for reference
            if text:
                analysis['all_text'].append({
                    'index': i,
                    'text': text,
                    'length': len(text),
                })

            # Check for sections using improved detection
            section_type = self._detect_section_type(text)
            if section_type:
                analysis['sections'].append({
                    'text': text,
                    'index': i,
                    'style': paragraph.style.name if paragraph.style else 'Normal',
                    'section_type': section_type,
                    'is_header': self._is_likely_header(text),
                })
                if section_type not in analysis['detected_section_types']:
                    analysis['detected_section_types'].append(section_type)

            # Formatting is sampled from the first run of the paragraph only.
            if paragraph.runs:
                run = paragraph.runs[0]
                alignment = paragraph.alignment
                analysis['formatting'][i] = {
                    'bold': run.bold,
                    'italic': run.italic,
                    'font_name': run.font.name,
                    'font_size': run.font.size.pt if run.font.size else None,
                    # Compare with None explicitly: a truthiness test would
                    # drop an alignment member whose integer value is 0.
                    'alignment': str(alignment) if alignment is not None else None,
                }

        # Analyze document properties
        props = doc.core_properties
        if props:
            analysis['document_info'] = {
                'title': props.title or 'Word Document',
                'author': props.author or '',
                'subject': props.subject or '',
                'created': props.created.isoformat() if props.created else None,
                'modified': props.modified.isoformat() if props.modified else None,
            }

        # Extract document structure
        analysis['structure'] = self._extract_structure(analysis['sections'])
        return analysis

    def _detect_section_type(self, text: str) -> Optional[str]:
        """Return the section type announced by ``text``, or None.

        Patterns are tried in the insertion order of self.section_patterns
        against the lower-cased text; the first match wins.
        """
        text_lower = text.lower()

        for section_type, pattern in self.section_patterns.items():
            if re.search(pattern, text_lower):
                return section_type

        # Safety net for short "Word:" headers. The default patterns already
        # contain these five words, so this only matters if
        # self.section_patterns has been changed.
        if ':' in text and len(text.split()) <= 3:
            first_word = text.split(':')[0].strip().lower()
            if first_word in ('clinique', 'technique', 'resultats', 'résultats', 'conclusion'):
                return 'resultats' if first_word == 'résultats' else first_word

        return None

    def _is_likely_header(self, text: str) -> bool:
        """Determine if text is likely a section header.

        A header must be short (under 100 characters) AND show at least one
        header cue: a trailing colon, all-uppercase text, or at most three
        words. Shortness alone is not enough, otherwise nearly every
        paragraph would qualify.
        """
        if len(text) >= 100:
            return False
        return text.endswith(':') or text.isupper() or len(text.split()) <= 3

    def _extract_structure(self, sections: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize detected sections.

        Returns a dict with 'detected_sections' (text/type/index per
        section), 'section_types' (distinct types in first-seen order) and
        'total_sections'. Sections lacking 'section_type' count as 'unknown'.
        """
        structure: Dict[str, Any] = {
            'detected_sections': [],
            'section_types': [],
            'total_sections': len(sections),
        }

        for section in sections:
            section_type = section.get('section_type', 'unknown')
            structure['detected_sections'].append({
                'text': section['text'],
                'type': section_type,
                'index': section['index'],
            })
            if section_type not in structure['section_types']:
                structure['section_types'].append(section_type)

        return structure

    def save_analysis(self, analysis: Dict[str, Any], output_path: Optional[str] = None):
        """Save analysis results to a JSON file.

        Args:
            analysis: The analysis dict to serialize.
            output_path: Destination file. When omitted, a timestamped
                ``improved_template_analysis_<YYYYmmdd_HHMMSS>.json`` is
                written in the current directory.

        Returns:
            The path written, or None if saving failed (the error is printed).
        """
        if not output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"improved_template_analysis_{timestamp}.json"

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, ensure_ascii=False, indent=2)
            print(f"💾 Analysis saved to: {output_path}")
            return output_path
        except Exception as e:
            # Best effort: report the failure and let the caller continue.
            print(f"❌ Error saving analysis: {e}")
            return None

    def display_analysis_summary(self, analysis: Dict[str, Any]):
        """Print a summary of an analysis produced by analyze_word_template."""
        print("\n📊 IMPROVED TEMPLATE ANALYSIS SUMMARY")
        print("=" * 60)
        print(f"Total paragraphs: {len(analysis['all_text'])}")
        print(f"Detected sections: {len(analysis['sections'])}")

        if analysis['detected_section_types']:
            print(f"Section types found: {', '.join(analysis['detected_section_types'])}")

        print(f"Document title: {analysis['document_info'].get('title', 'N/A')}")
        print(f"Document author: {analysis['document_info'].get('author', 'N/A')}")

        print("\n🔍 DETECTED SECTIONS:")
        for number, section in enumerate(analysis['structure']['detected_sections'], start=1):
            print(f" {number}. [{section['type']}] {section['text']}")

        print("\n📄 ALL PARAGRAPHS:")
        for number, text_item in enumerate(analysis['all_text'], start=1):
            print(f" {number}. {text_item['text']}")

    def test_with_sample_template(self, template_path: str):
        """Run the direct analysis on a template, print it and save it.

        Returns:
            The analysis dict, or None if the analysis raised (the traceback
            is printed).
        """
        print(f"🚀 Testing Improved Template Analyzer with: {template_path}")
        print("=" * 60)

        try:
            analysis = self.analyze_word_template(template_path)
            self.display_analysis_summary(analysis)
            output_file = self.save_analysis(analysis)

            print("\n✅ Improved analysis completed successfully!")
            # save_analysis returns None on failure (and has already printed
            # the error), so only announce a file that was really written.
            if output_file:
                print(f"📁 Results saved to: {output_file}")
            return analysis
        except Exception as e:
            print(f"❌ Error during analysis: {e}")
            import traceback
            traceback.print_exc()
            return None

    def create_template_analyzer_agent(self, llm):
        """Create the improved template analyzer agent.

        Args:
            llm: Chat model handed to create_openai_tools_agent.

        Returns:
            An AgentExecutor whose single tool is analyze_word_template_tool.
        """
        template_analyzer_prompt = ChatPromptTemplate.from_messages([
            ("system",
             "You are an enhanced medical document template analyzer.\n"
             "Analyze the provided Word template and extract its structure, sections, and formatting.\n"
             "Pay special attention to detecting ALL sections including: CLINIQUE, TECHNIQUE, RESULTATS, and CONCLUSION.\n"
             "Provide a detailed analysis that can be used by other agents."),
            ("human",
             "Analyze the template at {template_path} and provide a comprehensive analysis. "
             "Make sure to detect all sections including RESULTATS."),
            MessagesPlaceholder("agent_scratchpad"),
        ])

        # AgentExecutor expects tool objects rather than bare callables. The
        # module-level function is a plain function, so wrap it with
        # LangChain's `tool` helper unless it already is a tool.
        analysis_tool = analyze_word_template_tool
        if not hasattr(analysis_tool, "invoke"):
            analysis_tool = tool(analysis_tool)
        tools = [analysis_tool]

        template_analyzer_agent = create_openai_tools_agent(
            llm=llm,
            tools=tools,
            prompt=template_analyzer_prompt,
        )

        return AgentExecutor(
            agent=template_analyzer_agent,
            tools=tools,
            verbose=True,
        )

    def test_with_agent(self, template_path: str):
        """Run the LangChain agent on a template and save its result.

        Requires OPENAI_API_KEY in the environment.

        Returns:
            The dict returned by the agent executor, or None when the API key
            is missing or the agent run raised (the traceback is printed).
        """
        print(f"🤖 Testing Improved Template Analyzer AGENT with: {template_path}")
        print("=" * 60)

        try:
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("❌ OpenAI API key not found in environment variables")
                return None

            llm = ChatOpenAI(
                model="gpt-4o-mini",
                temperature=0,
                api_key=api_key,
            )

            print("🔧 Creating improved template analyzer agent...")
            agent_executor = self.create_template_analyzer_agent(llm)

            print("🚀 Running enhanced agent analysis...")
            result = agent_executor.invoke({
                "template_path": template_path
            })

            print("✅ Enhanced agent analysis completed!")
            print("\n📋 AGENT OUTPUT:")
            print("=" * 50)
            print(result['output'])

            # Saving is best effort: a write or serialization problem must
            # not throw away an agent result that was already obtained.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            agent_output_file = f"improved_agent_analysis_{timestamp}.json"
            try:
                with open(agent_output_file, 'w', encoding='utf-8') as f:
                    # default=str is deliberate, in case the agent result
                    # carries objects json cannot encode natively.
                    json.dump(result, f, ensure_ascii=False, indent=2, default=str)
                print(f"\n💾 Enhanced agent result saved to: {agent_output_file}")
            except (OSError, TypeError, ValueError) as save_error:
                print(f"❌ Error saving agent result: {save_error}")

            return result
        except Exception as e:
            print(f"❌ Error during enhanced agent analysis: {e}")
            import traceback
            traceback.print_exc()
            return None
def main():
    """Command-line entry point.

    When ``sample.docx`` exists in the working directory, runs the LLM agent
    analysis on it. Otherwise prompts for a template path and runs the direct
    (non-agent) analysis on that file.
    """
    print("🏥 Improved Template Analyzer - Enhanced Section Detection")
    print("=" * 60)

    # Initialize analyzer
    analyzer = ImprovedTemplateAnalyzer()

    sample_path = "sample.docx"

    if os.path.exists(sample_path):
        print(f"📄 Found sample file: {sample_path}")
        print("🤖 Running enhanced **agent** analysis with GPT...")
        # Only the LLM agent is run for the sample file.
        agent_result = analyzer.test_with_agent(sample_path)
        if agent_result:
            print("\n🎉 Enhanced agent analysis completed successfully!")
            print("\n=== AGENT RAW OUTPUT ===\n", agent_result)
        return

    print("❌ sample.docx not found. Please provide the correct path.")
    try:
        template_path = input("Enter the path to your Word template file: ").strip()
    except EOFError:
        # No stdin available (non-interactive run): treat as no path given.
        template_path = ""

    if template_path and os.path.exists(template_path):
        analyzer.test_with_sample_template(template_path)
    else:
        print("❌ Invalid file path provided")


if __name__ == "__main__":
    main()