Spaces:

Chirapath
/

SB-PoC

Configuration error

App Files Files Community

Chirapath committed on Jul 9

Commit

1628024

verified ·

1 Parent(s): 2c570fe

Upload 3 files

Browse files

Files changed (3) hide show

service/ner_service.py +1582 -0
service/ocr_service.py +588 -0
service/rag_service.py +1367 -0

service/ner_service.py ADDED Viewed

	@@ -0,0 +1,1582 @@

+#!/usr/bin/env python3
+"""
+Enhanced NER Analysis Service - Cleaned and Optimized
+Advanced Named Entity Recognition with Thai language support,
+relationship extraction, and graph database exports
+"""
+import os
+import io
+import json
+import logging
+import re
+import csv
+import tempfile
+import zipfile
+from datetime import datetime
+from typing import Optional, List, Dict, Any, Union, Tuple
+from pathlib import Path
+from contextlib import asynccontextmanager
+from collections import defaultdict
+import xml.etree.ElementTree as ET
+import httpx
+import asyncpg
+from azure.storage.blob import BlobServiceClient
+from azure.core.credentials import AzureKeyCredential
+from fastapi import FastAPI, File, UploadFile, HTTPException, Form, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from pydantic import BaseModel, HttpUrl, field_validator
+import uvicorn
+import docx
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from openai import AzureOpenAI
+# Import unified configuration
try:
    # Preferred path: the project-wide configuration package.
    from configs import get_config
    config = get_config().ner
    unified_config = get_config()
    print("✅ Using unified configuration")
except ImportError:
    print("⚠️  Unified config not available, using fallback configuration")
    # Fallback configuration: read everything from the environment (.env supported).
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        # --- Server ---
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("NER_PORT", "8500"))
        DEBUG = os.getenv("DEBUG", "False").lower() == "true"

        # --- Database ---
        POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
        POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # --- APIs ---
        OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")
        DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
        DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")
        AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

        # --- Storage ---
        AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # --- Limits ---
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB, in bytes
        MAX_TEXT_LENGTH = 100000  # 100,000 characters (compared against len(text))
        SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}

        # --- Vocabulary: entity labels ---
        ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PRODUCT", "EVENT",
            "VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM",
            "WEAPON", "DRUG", "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL"
        ]

        # --- Vocabulary: relationship labels (English and Thai) ---
        RELATIONSHIP_TYPES = [
            "works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เกี่ยวข้องกับ", "เป็นเจ้าของ",
            "arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of",
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "หลักฐานของ"
        ]

    config = FallbackConfig()
# Setup logging: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Export directory (relative to the working directory); created at import time.
EXPORT_DIR = Path("exports")
EXPORT_DIR.mkdir(exist_ok=True)

# Module-level state shared by the functions below.
pg_pool = None            # asyncpg pool, assigned by init_database()
vector_available = False  # set by init_database() after probing the vector extension
clients = {}              # lazily created API clients, keyed by name
+# Pydantic Models
class NERRequest(BaseModel):
    # Input for one analysis: optional raw text and/or a URL, plus option flags.

    # --- Source ---
    text: Optional[str] = None
    url: Optional[HttpUrl] = None

    # --- Analysis switches ---
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True

    # --- Graph export ---
    generate_graph_files: bool = True
    export_formats: List[str] = ["neo4j", "json", "graphml"]

    @field_validator('text')
    @classmethod
    def validate_text_length(cls, v):
        # None / empty text passes through; only over-long text is rejected.
        if not v:
            return v
        if len(v) > config.MAX_TEXT_LENGTH:
            raise ValueError(f"Text too long (max {config.MAX_TEXT_LENGTH} characters)")
        return v
class MultiInputRequest(BaseModel):
    # Input for a multi-source analysis: optional lists of texts and URLs, plus option flags.

    # --- Sources ---
    texts: Optional[List[str]] = None
    urls: Optional[List[HttpUrl]] = None

    # --- Analysis switches ---
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True
    combine_results: bool = True

    # --- Graph export ---
    generate_graph_files: bool = True
    export_formats: List[str] = ["neo4j", "json", "graphml"]
class EntityResult(BaseModel):
    # One extracted entity.

    # --- Identity ---
    id: str
    text: str
    label: str

    # --- Scoring and span ---
    confidence: float
    start_pos: int
    end_pos: int

    # --- Provenance (optional) ---
    source_type: Optional[str] = None
    source_index: Optional[int] = None

    # --- Aggregates ---
    frequency: int = 1
    importance_score: float = 0.0
    metadata: Optional[Dict[str, Any]] = None
class RelationshipResult(BaseModel):
    # One extracted relationship between two entities.

    # --- Identity ---
    id: str

    # --- Endpoints: entity ids and their display text ---
    source_entity_id: str
    target_entity_id: str
    source_entity: str
    target_entity: str

    # --- Relationship details ---
    relationship_type: str
    confidence: float
    strength: float
    context: str
    evidence_count: int = 1
    bidirectional: bool = False
    metadata: Optional[Dict[str, Any]] = None
class NodeResult(BaseModel):
    # Graph node; create_enhanced_graph_data() builds one per entity.

    id: str
    label: str   # filled with the entity text by create_enhanced_graph_data()
    type: str    # filled with the entity label by create_enhanced_graph_data()
    confidence: float
    frequency: int = 1
    importance_score: float = 0.0
    properties: Dict[str, Any]
class LinkResult(BaseModel):
    # Graph edge; create_enhanced_graph_data() builds one per resolvable relationship.

    id: str
    source: str        # node id of the source entity
    target: str        # node id of the target entity
    relationship: str  # relationship type
    confidence: float
    strength: float
    evidence_count: int = 1
    properties: Dict[str, Any]
class GraphData(BaseModel):
    # Complete graph payload: nodes, links and summary metadata.

    nodes: List[NodeResult]
    links: List[LinkResult]
    metadata: Dict[str, Any]
class ExportFiles(BaseModel):
    # Export artefact references; a field stays None until something assigns it.

    # --- Neo4j CSV pair ---
    neo4j_nodes: Optional[str] = None
    neo4j_relationships: Optional[str] = None

    # --- Single-file graph formats ---
    json_export: Optional[str] = None
    graphml_export: Optional[str] = None
    gexf_export: Optional[str] = None

    # --- Generic CSV pair ---
    csv_nodes: Optional[str] = None
    csv_edges: Optional[str] = None

    # --- Report and bundle ---
    analysis_report: Optional[str] = None
    download_bundle: Optional[str] = None
class NERResponse(BaseModel):
    # Result of one analysis.

    # --- Status / identity ---
    success: bool
    analysis_id: str

    # --- Source description ---
    source_text: str
    source_type: str
    language: str

    # --- Extraction output ---
    entities: List[EntityResult]
    keywords: List[str]
    relationships: List[RelationshipResult]
    summary: str
    embeddings: Optional[List[float]] = None

    # --- Graph and exports ---
    graph_data: GraphData
    export_files: ExportFiles

    # --- Statistics ---
    processing_time: float
    character_count: int
    word_count: int
    sentence_count: int
    entity_relationship_stats: Dict[str, Any]

    # --- Failure detail (optional) ---
    error: Optional[str] = None
class MultiNERResponse(BaseModel):
    # Result of a multi-source analysis: one combined result plus one per source.

    success: bool
    analysis_id: str
    combined_analysis: NERResponse
    individual_analyses: List[NERResponse]
    processing_time: float
    total_sources: int
    error: Optional[str] = None
+# Utility Functions
def generate_unique_id(prefix: str = "item") -> str:
    """Generate a collision-resistant ID of the form ``<prefix>_<epoch_ms>_<hex>``.

    Args:
        prefix: Leading tag for the ID (for example ``"ent"`` or ``"rel"``).

    Returns:
        A string starting with ``prefix`` followed by the current epoch time in
        milliseconds and a random 12-character hex suffix.
    """
    # A naive datetime's .timestamp() is interpreted as local time, so
    # datetime.now() (local) gives the true epoch; utcnow() would be skewed
    # by the UTC offset on hosts that are not running in UTC.
    epoch_ms = int(datetime.now().timestamp() * 1000)
    # The millisecond alone repeats when IDs are minted in a tight loop, so a
    # random suffix keeps every ID distinct.
    suffix = os.urandom(6).hex()
    return f"{prefix}_{epoch_ms}_{suffix}"
def normalize_text(text: str) -> str:
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed to one space."""
    lowered = text.strip().lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Score how alike two texts are, from 0.0 to 1.0.

    Texts that are equal after ``normalize_text`` score 1.0. Otherwise the
    score is the Jaccard overlap (shared words / all words) of their
    whitespace-separated word sets.
    """
    left, right = normalize_text(text1), normalize_text(text2)
    if left == right:
        return 1.0

    left_words = set(left.split())
    right_words = set(right.split())

    # Both empty counts as identical; exactly one empty counts as disjoint.
    if not (left_words or right_words):
        return 1.0
    if not (left_words and right_words):
        return 0.0

    shared = left_words & right_words
    combined = left_words | right_words
    return len(shared) / len(combined) if combined else 0.0
def deduplicate_entities(entities: List[Dict[str, Any]], similarity_threshold: float = 0.8) -> List[Dict[str, Any]]:
    """Remove duplicate entities based on text similarity.

    Entities are visited in order. One is dropped when its normalized text was
    already kept, or when it is at least ``similarity_threshold`` similar to a
    kept entity with equal or higher confidence. A more confident newcomer
    replaces the first similar kept entity it meets and is appended at the end.

    Items that are not dicts, or whose ``text`` is missing, null, not a string
    or blank, are skipped instead of raising, because the input may be
    unvalidated JSON.

    Args:
        entities: Entity dicts; kept dicts get an ``id`` assigned in place
            when they have none.
        similarity_threshold: Minimum similarity for two texts to count as
            the same entity.

    Returns:
        The kept entity dicts (the same objects that were passed in).
    """
    if not entities:
        return []

    deduplicated: List[Dict[str, Any]] = []
    processed_texts = set()

    for entity in entities:
        if not isinstance(entity, dict):
            continue
        raw_text = entity.get('text')
        # dict.get's default only covers a missing key, not an explicit null.
        if not isinstance(raw_text, str):
            continue
        entity_text = raw_text.strip()
        normalized_text = normalize_text(entity_text)
        if not entity_text or normalized_text in processed_texts:
            continue

        is_duplicate = False
        for index, existing_entity in enumerate(deduplicated):
            similarity = calculate_text_similarity(entity_text, existing_entity.get('text', ''))
            if similarity < similarity_threshold:
                continue
            if _entity_confidence(entity) > _entity_confidence(existing_entity):
                # Delete by position; safe because the loop stops right after.
                del deduplicated[index]
            else:
                is_duplicate = True
            break

        if not is_duplicate:
            entity['id'] = entity.get('id') or generate_unique_id('ent')
            deduplicated.append(entity)
            processed_texts.add(normalized_text)

    return deduplicated


def _entity_confidence(entity: Dict[str, Any]) -> float:
    """Return the entity's confidence as a float; missing or invalid values count as 0."""
    try:
        return float(entity.get('confidence') or 0)
    except (TypeError, ValueError):
        return 0.0
def detect_language(text: str) -> str:
    """Classify *text* as ``"th"``, ``"mixed"`` or ``"en"`` by its share of Thai characters.

    Only Thai characters and ASCII letters are counted. A Thai share above
    0.3 gives ``"th"``, above 0.1 gives ``"mixed"``, and anything else
    (including empty text or text with no counted characters) gives ``"en"``.
    """
    if not text:
        return "en"

    thai_count = sum(1 for _ in re.finditer(r'[ก-๙]', text))
    latin_count = sum(1 for _ in re.finditer(r'[a-zA-Z]', text))
    counted = thai_count + latin_count
    if counted == 0:
        return "en"

    thai_share = thai_count / counted
    if thai_share > 0.3:
        return "th"
    if thai_share > 0.1:
        return "mixed"
    return "en"
def get_text_stats(text: str) -> Dict[str, int]:
    """Count characters, words, sentences, paragraphs and lines in *text*.

    Words are whitespace-separated tokens, sentences are runs of ``.``, ``!``
    or ``?``, paragraphs are non-blank chunks between blank lines, and lines
    are newline-separated pieces.
    """
    sentence_marks = re.findall(r'[.!?]+', text)
    paragraph_total = sum(1 for chunk in text.split('\n\n') if chunk.strip())
    lines = text.split('\n')

    stats = {}
    stats["character_count"] = len(text)
    stats["word_count"] = len(text.split())
    stats["sentence_count"] = len(sentence_marks)
    stats["paragraph_count"] = paragraph_total
    stats["line_count"] = len(lines)
    return stats
+# Client Management
def get_blob_client():
    """Return the cached blob service client, creating it on first use.

    Returns None when the storage URL or SAS token is not configured, or when
    construction fails (the failure is logged).
    """
    cached = clients.get('blob')
    if cached is not None:
        return cached
    if not (config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN):
        return None

    try:
        clients['blob'] = BlobServiceClient(
            account_url=config.AZURE_STORAGE_ACCOUNT_URL,
            credential=config.AZURE_BLOB_SAS_TOKEN,
        )
    except Exception as e:
        logger.error(f"Failed to initialize blob client: {e}")
    return clients.get('blob')
def get_deepseek_client():
    """Return the cached DeepSeek chat client, creating it on first use.

    Returns None when the endpoint or API key is not configured, or when
    construction fails (the failure is logged).
    """
    cached = clients.get('deepseek')
    if cached is not None:
        return cached
    if not (config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY):
        return None

    try:
        clients['deepseek'] = ChatCompletionsClient(
            endpoint=config.DEEPSEEK_ENDPOINT,
            credential=AzureKeyCredential(config.DEEPSEEK_API_KEY),
            api_version="2024-05-01-preview",
        )
    except Exception as e:
        logger.error(f"Failed to initialize DeepSeek client: {e}")
    return clients.get('deepseek')
def get_openai_client():
    """Return the cached Azure OpenAI client, creating it on first use.

    Returns None when the endpoint or API key is not configured, or when
    construction fails (the failure is logged).
    """
    cached = clients.get('openai')
    if cached is not None:
        return cached
    if not (config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY):
        return None

    try:
        clients['openai'] = AzureOpenAI(
            api_version="2024-12-01-preview",
            azure_endpoint=config.AZURE_OPENAI_ENDPOINT,
            api_key=config.AZURE_OPENAI_API_KEY,
        )
    except Exception as e:
        logger.error(f"Failed to initialize OpenAI client: {e}")
    return clients.get('openai')
+# Database Operations
async def init_database():
    """Create the connection pool, probe for the vector extension and create tables.

    Assigns the module globals ``pg_pool`` and ``vector_available``.

    Returns:
        True when the pool was created and the tables were set up, False when
        any step failed (the error is logged, not raised).
    """
    global pg_pool, vector_available
    logger.info("🔄 Connecting to database...")
    try:
        pg_pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            database=config.POSTGRES_DATABASE,
            ssl='require',
            min_size=2,
            max_size=10,
            command_timeout=60
        )
        async with pg_pool.acquire() as conn:
            logger.info("✅ Database connected")
            # Check vector extension
            try:
                await conn.execute("CREATE EXTENSION IF NOT EXISTS vector;")
                await conn.fetchval("SELECT '[1,2,3]'::vector(3)")
                vector_available = True
                logger.info("✅ Vector extension available")
            except Exception as e:
                # Catch Exception rather than everything, so task cancellation
                # and KeyboardInterrupt still propagate.
                vector_available = False
                logger.info("⚠️  Vector extension not available (using JSONB)")
                logger.debug(f"Vector extension probe failed: {e}")
            # Create tables
            await create_tables(conn)
            logger.info("✅ Database setup complete")
        return True
    except Exception as e:
        logger.error(f"❌ Database init failed: {e}")
        return False
async def create_tables(conn):
    """Create the analysis, entity and relationship tables and their indexes.

    All statements use ``IF NOT EXISTS``. A failure while creating a table
    propagates to the caller. A failure while creating the indexes is logged
    as a warning and otherwise ignored.

    Args:
        conn: Connection object whose ``execute`` coroutine runs the SQL.
    """
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS ner_analyses (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            analysis_id VARCHAR(255) UNIQUE NOT NULL,
            source_text TEXT NOT NULL,
            source_type VARCHAR(50) NOT NULL,
            language VARCHAR(10) DEFAULT 'en',
            entities JSONB NOT NULL DEFAULT '[]',
            keywords JSONB NOT NULL DEFAULT '[]',
            relationships JSONB NOT NULL DEFAULT '[]',
            summary TEXT DEFAULT '',
            embeddings JSONB DEFAULT '[]',
            graph_data JSONB DEFAULT '{}',
            export_files JSONB DEFAULT '{}',
            text_stats JSONB DEFAULT '{}',
            er_stats JSONB DEFAULT '{}',
            processing_time FLOAT DEFAULT 0,
            entity_types JSONB DEFAULT '[]',
            relationship_types JSONB DEFAULT '[]',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
    """)
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS entities (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            entity_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            text VARCHAR(1000) NOT NULL,
            label VARCHAR(100) NOT NULL,
            confidence FLOAT DEFAULT 0,
            start_pos INTEGER DEFAULT 0,
            end_pos INTEGER DEFAULT 0,
            frequency INTEGER DEFAULT 1,
            importance_score FLOAT DEFAULT 0,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS relationships (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            relationship_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            source_entity_id VARCHAR(255) NOT NULL,
            target_entity_id VARCHAR(255) NOT NULL,
            source_entity VARCHAR(1000) NOT NULL,
            target_entity VARCHAR(1000) NOT NULL,
            relationship_type VARCHAR(200) NOT NULL,
            confidence FLOAT DEFAULT 0,
            strength FLOAT DEFAULT 0,
            context TEXT DEFAULT '',
            evidence_count INTEGER DEFAULT 1,
            bidirectional BOOLEAN DEFAULT FALSE,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)
    # Create indexes (best effort: the tables work without them)
    try:
        await conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_analysis_id ON ner_analyses(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_entities_analysis ON entities(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_relationships_analysis ON relationships(analysis_id);
        """)
    except Exception as e:
        # Still non-fatal, but leave a trace and let cancellation propagate.
        logger.warning(f"Index creation failed: {e}")
+# Text Extraction
def extract_text_from_file(file_content: bytes, filename: str) -> str:
    """Return the text held in *file_content*, chosen by the extension of *filename*.

    ``.docx`` content is parsed and its paragraphs are joined with newlines.
    Every other extension is decoded as UTF-8, dropping undecodable bytes.
    """
    suffix = Path(filename).suffix.lower()
    if suffix == '.docx':
        document = docx.Document(io.BytesIO(file_content))
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    return file_content.decode('utf-8', errors='ignore')
async def get_text_from_ocr(file_content: bytes, filename: str) -> str:
    """Upload a file to the OCR service and return the extracted text.

    Args:
        file_content: Raw bytes of the file.
        filename: Name sent with the upload.

    Returns:
        The ``content`` field of the service's JSON reply (``''`` when absent).

    Raises:
        HTTPException: Status 500 when the request fails or the service
            answers with anything other than HTTP 200.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            files = {'file': (filename, file_content)}
            response = await client.post(f"{config.OCR_SERVICE_URL}/ocr/upload", files=files)
            if response.status_code == 200:
                return response.json().get('content', '')
            # Log the status so a non-200 reply is not a silent failure.
            logger.error(f"OCR service returned HTTP {response.status_code}")
    except Exception as e:
        logger.error(f"OCR service error: {e}")
    raise HTTPException(status_code=500, detail="OCR processing failed")
async def get_text_from_url(url: str) -> str:
    """Ask the OCR service to fetch *url* and return the extracted text.

    Args:
        url: Address to process; converted with ``str()`` before sending.

    Returns:
        The ``content`` field of the service's JSON reply (``''`` when absent).

    Raises:
        HTTPException: Status 500 when the request fails or the service
            answers with anything other than HTTP 200.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            response = await client.post(f"{config.OCR_SERVICE_URL}/ocr/url",
                                       json={"url": str(url), "extract_images": True})
            if response.status_code == 200:
                return response.json().get('content', '')
            # Log the status so a non-200 reply is not a silent failure.
            logger.error(f"URL processing returned HTTP {response.status_code}")
    except Exception as e:
        logger.error(f"URL processing error: {e}")
    raise HTTPException(status_code=500, detail="URL processing failed")
+# Enhanced NER and Relationship Analysis
async def analyze_with_deepseek(text: str, language: str = None) -> Dict[str, Any]:
    """Extract entities, keywords, relationships and a summary with the DeepSeek model.

    Falls back to ``extract_manual_entities_and_relationships`` when the client
    is not configured, when the reply holds no usable JSON object, or when any
    error occurs.

    Args:
        text: Text to analyse; only the first 8000 characters are sent.
        language: ``"th"`` selects the Thai prompts, anything else the English
            ones. Detected from *text* when empty.

    Returns:
        Dict with keys ``entities``, ``keywords`` (at most 20),
        ``relationships`` and ``summary``.
    """
    import asyncio  # local import: not among this module's top-level imports

    deepseek_client = get_deepseek_client()
    if not deepseek_client:
        logger.warning("DeepSeek not configured, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)
    try:
        if not language:
            language = detect_language(text)
        if language == "th":
            system_prompt = """คุณเป็นผู้เชี่ยวชาญในการจดจำนามเอกลักษณ์และการสกัดความสัมพันธ์สำหรับภาษาไทย
วิเคราะห์ข้อความและสกัดข้อมูลดังนี้:
1. นามเอกลักษณ์ทุกประเภท (บุคคล องค์กร สถานที่ วันที่ เวลา เงิน ฯลฯ)
2. ความสัมพันธ์ระหว่างนามเอกลักษณ์ - ต้องสกัดทุกความสัมพันธ์ที่พบ
3. คำหลักสำคัญจากข้อความ
4. สรุปที่ครอบคลุม
ให้ผลลัพธ์เป็น JSON:
{
    "entities": [{"text": "ข้อความ", "label": "ประเภท", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
    "keywords": ["คำหลัก1", "คำหลัก2"],
    "relationships": [{"source_entity": "A", "target_entity": "B", "relationship_type": "ประเภท", "confidence": 0.9, "context": "บริบท"}],
    "summary": "สรุป"
}"""
        else:
            system_prompt = """You are an expert in Named Entity Recognition and relationship extraction.
Analyze the text and extract:
1. All named entities (people, organizations, locations, dates, money, etc.)
2. ALL relationships between entities - extract every relationship found
3. Important keywords from the text
4. Comprehensive summary
Return ONLY valid JSON:
{
    "entities": [{"text": "entity text", "label": "TYPE", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
    "keywords": ["keyword1", "keyword2"],
    "relationships": [{"source_entity": "Entity A", "target_entity": "Entity B", "relationship_type": "relationship_type", "confidence": 0.9, "context": "context"}],
    "summary": "Comprehensive summary"
}"""
        user_prompt = f"วิเคราะห์ข้อความนี้:\n\n{text[:8000]}" if language == "th" else f"Analyze this text:\n\n{text[:8000]}"

        # complete() is a blocking call; run it in the default executor so the
        # event loop keeps serving other requests while the model answers.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: deepseek_client.complete(
                messages=[
                    SystemMessage(content=system_prompt),
                    UserMessage(content=user_prompt)
                ],
                max_tokens=6000,
                model=config.DEEPSEEK_MODEL,
                temperature=0.1
            ),
        )
        result_text = (response.choices[0].message.content or '').strip()
        json_result = _parse_llm_json(result_text)

        if isinstance(json_result, dict) and json_result:
            # "or" also covers keys that are present but null.
            entities = deduplicate_entities(json_result.get('entities') or [])
            keywords = json_result.get('keywords') or []
            relationships = json_result.get('relationships') or []
            summary = json_result.get('summary') or ''
            # Ensure relationships are extracted
            if len(relationships) == 0 and len(entities) >= 2:
                logger.warning("No relationships found by DeepSeek, applying rule-based extraction")
                rule_based_relationships = extract_rule_based_relationships(entities, text, language)
                relationships.extend(rule_based_relationships)
            # Fill in the bookkeeping fields the model does not return.
            for rel in relationships:
                if 'id' not in rel:
                    rel['id'] = generate_unique_id('rel')
                if 'strength' not in rel:
                    rel['strength'] = rel.get('confidence', 0.8)
                if 'evidence_count' not in rel:
                    rel['evidence_count'] = 1
                if 'bidirectional' not in rel:
                    rel['bidirectional'] = False
            return {
                "entities": entities,
                "keywords": keywords[:20],
                "relationships": relationships,
                "summary": summary or f"Analysis of {len(text)} characters"
            }
        logger.warning("JSON parsing failed, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)
    except Exception as e:
        logger.error(f"DeepSeek analysis error: {e}")
        return extract_manual_entities_and_relationships(text, language)


def _parse_llm_json(result_text: str) -> Optional[Any]:
    """Parse the outermost ``{...}`` span of a model reply as JSON, or return None."""
    # Defensive: a reasoning model may put a <think>...</think> block before
    # its answer; braces inside it would corrupt the slice taken below.
    cleaned = re.sub(r'<think>.*?</think>', '', result_text, flags=re.DOTALL)
    start_idx = cleaned.find('{')
    end_idx = cleaned.rfind('}') + 1
    if start_idx == -1 or end_idx <= start_idx:
        return None
    json_text = cleaned[start_idx:end_idx]

    try:
        parsed = json.loads(json_text)
    except ValueError:
        parsed = None
    else:
        logger.info("✅ Successfully parsed JSON from DeepSeek")
        return parsed

    # Second attempt: repair Python-style quoting and booleans.
    fixed_json = json_text.replace("'", '"').replace('True', 'true').replace('False', 'false')
    try:
        parsed = json.loads(fixed_json)
    except ValueError:
        return None
    logger.info("✅ Successfully parsed fixed JSON")
    return parsed
def extract_rule_based_relationships(entities: List[Dict], text: str, language: str) -> List[Dict]:
    """Extract relationships between known entities using regular-expression rules.

    Each rule captures the text before and after a trigger phrase. Both sides
    are resolved with ``find_best_entity_match``, and a relationship is emitted
    only when they resolve to two different entities.

    Args:
        entities: Entity dicts (each with a ``text`` key) that relationship
            endpoints must resolve to.
        text: Text to scan.
        language: ``"th"`` selects the Thai rules, anything else the English ones.

    Returns:
        Relationship dicts with confidence and strength fixed at 0.7 and
        ``metadata.extraction_method == 'rule_based'``.
    """
    relationships = []
    if len(entities) < 2:
        return relationships

    # Each rule is (pattern, relationship type, swap). ``swap`` is True when
    # the phrase reads "<target> <verb> <source>" for the emitted type.
    if language == "th":
        patterns = [
            (r'(.+?)\s*ทำงาน(?:ที่|ใน|กับ)\s*(.+)', 'ทำงานที่', False),
            (r'(.+?)\s*เป็น(?:เจ้าของ|ของ)\s*(.+)', 'เป็นเจ้าของ', False),
            (r'(.+?)\s*ตั้งอยู่(?:ที่|ใน)\s*(.+)', 'ตั้งอยู่ที่', False),
            # Passive voice ("X ถูกจับกุมโดย Y"): X is the one arrested, as in
            # the English "arrested by" rule.
            (r'(.+?)\s*ถูก\s*(?:จับกุม|จับ)(?:\s*โดย)?\s*(.+)', 'จับกุมโดย', False),
            # Active voice ("Y จับกุม X"): the arrester comes first, so swap the
            # sides to keep the "arrested by" direction. The lookbehind leaves
            # passive phrases to the rule above.
            (r'(.+?)\s*(?<!ถูก)(?:จับกุม|จับ)\s*(.+)', 'จับกุมโดย', True),
        ]
    else:
        patterns = [
            (r'(.+?)\s*(?:works?\s+(?:for|at|in)|employed\s+by)\s*(.+)', 'works_for', False),
            (r'(.+?)\s*(?:owns?|possesses?)\s*(.+)', 'owns', False),
            (r'(.+?)\s*(?:located\s+(?:in|at)|based\s+in)\s*(.+)', 'located_in', False),
            (r'(.+?)\s*(?:arrested\s+by|detained\s+by)\s*(.+)', 'arrested_by', False),
        ]

    for pattern, rel_type, swap in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE):
            left_entity = find_best_entity_match(match.group(1).strip(), entities)
            right_entity = find_best_entity_match(match.group(2).strip(), entities)
            if not left_entity or not right_entity or left_entity == right_entity:
                continue
            if swap:
                source_entity, target_entity = right_entity, left_entity
            else:
                source_entity, target_entity = left_entity, right_entity
            relationships.append({
                'id': generate_unique_id('rel'),
                'source_entity': source_entity['text'],
                'target_entity': target_entity['text'],
                'relationship_type': rel_type,
                'confidence': 0.7,
                'strength': 0.7,
                'context': match.group(0),
                'evidence_count': 1,
                'bidirectional': False,
                'metadata': {'extraction_method': 'rule_based'}
            })
    return relationships
def find_best_entity_match(text: str, entities: List[Dict]) -> Optional[Dict]:
    """Return the entity that best matches *text*, or None.

    The first entity whose normalized text equals the normalized *text* wins.
    Otherwise the entity with the highest similarity score is returned,
    provided that score is above 0.6; on a tie the earliest entity wins.
    """
    wanted = normalize_text(text)
    exact = next(
        (candidate for candidate in entities if normalize_text(candidate['text']) == wanted),
        None,
    )
    if exact is not None:
        return exact

    if not entities:
        return None

    scores = [calculate_text_similarity(text, candidate['text']) for candidate in entities]
    # max() keeps the first of equal maxima, matching a strict ">" scan.
    top_index = max(range(len(scores)), key=scores.__getitem__)
    if scores[top_index] > 0.6:
        return entities[top_index]
    return None
def extract_manual_entities_and_relationships(text: str, language: str = None) -> Dict[str, Any]:
    """Extract entities, keywords and relationships with regular expressions only.

    Args:
        text: Text to analyse.
        language: ``"th"`` selects the Thai patterns, anything else the English
            ones. Detected from *text* when empty.

    Returns:
        Dict with keys ``entities`` (deduplicated), ``keywords`` (up to 20,
        most frequent first), ``relationships`` and ``summary``.
    """
    if not language:
        language = detect_language(text)
    entities = []
    keywords = []
    # Patterns are matched case-sensitively. The English PERSON and
    # ORGANIZATION rules rely on capitalisation to find names, so a global
    # IGNORECASE would let them match ordinary lower-case words. Parts that
    # should ignore case use a scoped (?i:...) group instead.
    if language == "th":
        patterns = {
            'PERSON': [r'(?:คุณ|นาย|นาง|นางสาว|ดร\.?)\s*[ก-๙\w\s]+'],
            'ORGANIZATION': [r'บริษัท\s+[ก-๙\w\s]+(?:จำกัด|มหาชน)', r'สถานีตำรวจ[ก-๙\w\s]+'],
            'LOCATION': [r'จังหวัด[ก-๙\w\s]+', r'กรุงเทพมหานคร|กรุงเทพฯ?'],
            'MONEY': [r'\d+(?:,\d{3})*\s*(?:บาท|ล้านบาท|พันบาท)'],
            'DATE': [r'\d{1,2}\/\d{1,2}\/\d{4}'],
        }
        words = re.findall(r'[ก-๙]+', text)
        thai_stop_words = {'และ', 'หรือ', 'แต่', 'ใน', 'ที่', 'เพื่อ', 'กับ', 'จาก', 'โดย', 'ของ'}
        keywords = [word for word in words if word not in thai_stop_words and len(word) > 2]
    else:
        patterns = {
            'PERSON': [r'\b(?i:Mr|Mrs|Ms|Dr|Prof)\.\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*'],
            'ORGANIZATION': [r'\b[A-Z][a-zA-Z]+\s+(?:Inc|Corp|Company|Ltd|Co|LLC|Corporation|Limited|University)\b'],
            'LOCATION': [r'\b(?i:New York|Los Angeles|Chicago|Bangkok|London|Paris|Berlin)\b'],
            'MONEY': [r'\$[\d,]+\.?\d*', r'\b\d+(?:,\d{3})*\s*(?i:dollars?|USD|million|billion)\b'],
            'DATE': [r'\b\d{1,2}\/\d{1,2}\/\d{4}\b'],
        }
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
        english_stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        keywords = [word.lower() for word in words if word.lower() not in english_stop_words]

    # Extract entities
    for label, pattern_list in patterns.items():
        for pattern in pattern_list:
            for match in re.finditer(pattern, text, re.UNICODE):
                entity_text = match.group().strip()
                if len(entity_text) > 1:
                    entities.append({
                        "id": generate_unique_id('ent'),
                        "text": entity_text,
                        "label": label,
                        "confidence": 0.8,
                        "start_pos": match.start(),
                        "end_pos": match.end(),
                        "frequency": 1,
                        "importance_score": 0.7,
                        "metadata": {"source": "manual_extraction"}
                    })

    # Deduplicate
    entities = deduplicate_entities(entities)

    # Keep the 20 most frequent keywords. The sort is stable and the dict keeps
    # insertion order, so ties go to the word seen first and the result is
    # repeatable (a plain set slice is not).
    keyword_counts = defaultdict(int)
    for word in keywords:
        keyword_counts[word] += 1
    keywords = sorted(keyword_counts, key=lambda word: -keyword_counts[word])[:20]

    # Extract relationships
    relationships = []
    if len(entities) >= 2:
        relationships = extract_rule_based_relationships(entities, text, language)
    summary = f"Analysis of {len(text)} characters found {len(entities)} entities and {len(relationships)} relationships"
    return {
        "entities": entities,
        "keywords": keywords,
        "relationships": relationships,
        "summary": summary
    }
async def generate_embeddings(text: str) -> List[float]:
    """Return a 1536-dimension embedding for the first 8000 characters of *text*.

    Returns an empty list when the OpenAI client is not configured, when *text*
    is empty or blank, or when the request fails (the failure is logged).
    """
    import asyncio  # local import: not among this module's top-level imports

    openai_client = get_openai_client()
    if not openai_client:
        return []
    # There is nothing to embed in blank text, so skip the request.
    if not text or not text.strip():
        return []
    try:
        # embeddings.create() is a blocking call; run it in the default
        # executor so the event loop is not stalled while it completes.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: openai_client.embeddings.create(
                input=[text[:8000]],
                model=config.EMBEDDING_MODEL,
                dimensions=1536
            ),
        )
        return response.data[0].embedding
    except Exception as e:
        logger.error(f"Embedding failed: {e}")
        return []
def create_enhanced_graph_data(entities: List[Dict], relationships: List[Dict]) -> GraphData:
    """Build graph nodes, links and summary metadata from entities and relationships.

    One node is created per entity. A link is created for each relationship
    whose two endpoint names resolve to an entity, first by exact text and then
    by normalized text (case and whitespace ignored). Relationships with an
    unresolved endpoint get no link but still count in the metadata totals.

    Missing ``label`` or ``relationship_type`` values are reported as
    ``"UNKNOWN"`` instead of raising, because the input may be unvalidated
    model output.

    Args:
        entities: Entity dicts; each must have a ``text`` key.
        relationships: Relationship dicts naming ``source_entity`` and
            ``target_entity`` by entity text.

    Returns:
        GraphData with ``nodes``, ``links`` and ``metadata``.
    """
    nodes = []
    links = []
    entity_map = {}       # exact entity text -> node id
    normalized_map = {}   # normalized entity text -> node id

    # Create nodes
    for entity in entities:
        node_id = entity.get('id', generate_unique_id('ent'))
        entity_label = entity.get('label') or 'UNKNOWN'
        entity_map[entity['text']] = node_id
        normalized_map[normalize_text(entity['text'])] = node_id
        node_properties = {
            "original_text": entity['text'],
            "entity_type": entity_label,
            "confidence": entity.get('confidence', 0.0),
            "start_position": entity.get('start_pos', 0),
            "end_position": entity.get('end_pos', 0),
            "frequency": entity.get('frequency', 1),
            "importance_score": entity.get('importance_score', 0.0),
            "metadata": entity.get('metadata', {})
        }
        nodes.append(NodeResult(
            id=node_id,
            label=entity['text'],
            type=entity_label,
            confidence=entity.get('confidence', 0.0),
            frequency=entity.get('frequency', 1),
            importance_score=entity.get('importance_score', 0.0),
            properties=node_properties
        ))

    def resolve_node_id(name):
        # Exact text first; otherwise ignore case and whitespace differences.
        if not isinstance(name, str):
            return None
        return entity_map.get(name) or normalized_map.get(normalize_text(name))

    # Create links
    for rel in relationships:
        source_id = resolve_node_id(rel.get('source_entity'))
        target_id = resolve_node_id(rel.get('target_entity'))
        if source_id and target_id:
            rel_type = rel.get('relationship_type') or 'UNKNOWN'
            link_id = rel.get('id', generate_unique_id('link'))
            link_properties = {
                "relationship_type": rel_type,
                "confidence": rel.get('confidence', 0.0),
                "strength": rel.get('strength', rel.get('confidence', 0.0)),
                "context": rel.get('context', ''),
                "evidence_count": rel.get('evidence_count', 1),
                "bidirectional": rel.get('bidirectional', False),
                "metadata": rel.get('metadata', {})
            }
            links.append(LinkResult(
                id=link_id,
                source=source_id,
                target=target_id,
                relationship=rel_type,
                confidence=rel.get('confidence', 0.0),
                strength=rel.get('strength', rel.get('confidence', 0.0)),
                evidence_count=rel.get('evidence_count', 1),
                properties=link_properties
            ))

    # Calculate metadata
    entity_types = defaultdict(int)
    relationship_types = defaultdict(int)
    for entity in entities:
        entity_types[entity.get('label') or 'UNKNOWN'] += 1
    for rel in relationships:
        relationship_types[rel.get('relationship_type') or 'UNKNOWN'] += 1
    metadata = {
        "total_entities": len(entities),
        "total_relationships": len(relationships),
        "entity_type_distribution": dict(entity_types),
        "relationship_type_distribution": dict(relationship_types),
        # Relationships per possible unordered entity pair.
        "graph_density": len(relationships) / (len(entities) * (len(entities) - 1) / 2) if len(entities) > 1 else 0,
        "average_entity_confidence": sum(entity.get('confidence', 0) for entity in entities) / len(entities) if entities else 0,
        "average_relationship_confidence": sum(rel.get('confidence', 0) for rel in relationships) / len(relationships) if relationships else 0,
        "unique_entity_types": len(entity_types),
        "unique_relationship_types": len(relationship_types)
    }
    return GraphData(
        nodes=nodes,
        links=links,
        metadata=metadata
    )
+# Export Functions (simplified)
+async def generate_export_files(analysis_id: str, entities: List[Dict], relationships: List[Dict],
+                               graph_data: GraphData, formats: List[str]) -> ExportFiles:
+    """Generate export files for various formats"""
+    export_files = ExportFiles()
+    analysis_dir = EXPORT_DIR / analysis_id
+    analysis_dir.mkdir(exist_ok=True)
+    try:
+        if "neo4j" in formats:
+            nodes_file, rels_file = await generate_neo4j_csv(analysis_dir, entities, relationships)
+            export_files.neo4j_nodes = str(nodes_file)
+            export_files.neo4j_relationships = str(rels_file)
+        if "json" in formats:
+            json_file = await generate_json_export(analysis_dir, entities, relationships, graph_data)
+            export_files.json_export = str(json_file)
+        if "graphml" in formats:
+            graphml_file = await generate_graphml_export(analysis_dir, entities, relationships)
+            export_files.graphml_export = str(graphml_file)
+        logger.info(f"✅ Generated export files for analysis {analysis_id}")
+    except Exception as e:
+        logger.error(f"❌ Export file generation failed: {e}")
+    return export_files
+async def generate_neo4j_csv(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Tuple[Path, Path]:
+    """Generate Neo4j compatible CSV files"""
+    nodes_file = export_dir / "neo4j_nodes.csv"
+    with open(nodes_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            'nodeId:ID', 'text', 'label:LABEL', 'confidence:float',
+            'frequency:int', 'importance:float'
+        ])
+        for entity in entities:
+            writer.writerow([
+                entity.get('id', generate_unique_id('ent')),
+                entity['text'],
+                entity['label'],
+                entity.get('confidence', 0.0),
+                entity.get('frequency', 1),
+                entity.get('importance_score', 0.0)
+            ])
+    rels_file = export_dir / "neo4j_relationships.csv"
+    entity_map = {entity['text']: entity.get('id', generate_unique_id('ent')) for entity in entities}
+    with open(rels_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            ':START_ID', ':END_ID', ':TYPE', 'confidence:float',
+            'strength:float', 'context'
+        ])
+        for rel in relationships:
+            source_id = entity_map.get(rel['source_entity'])
+            target_id = entity_map.get(rel['target_entity'])
+            if source_id and target_id:
+                writer.writerow([
+                    source_id,
+                    target_id,
+                    rel['relationship_type'].upper().replace(' ', '_'),
+                    rel.get('confidence', 0.0),
+                    rel.get('strength', rel.get('confidence', 0.0)),
+                    rel.get('context', '')
+                ])
+    return nodes_file, rels_file
+async def generate_json_export(export_dir: Path, entities: List[Dict], relationships: List[Dict], graph_data: GraphData) -> Path:
+    """Generate comprehensive JSON export"""
+    json_file = export_dir / "analysis_export.json"
+    export_data = {
+        "metadata": {
+            "export_timestamp": datetime.utcnow().isoformat(),
+            "format_version": "1.0",
+            "total_entities": len(entities),
+            "total_relationships": len(relationships)
+        },
+        "entities": entities,
+        "relationships": relationships,
+        "graph_data": graph_data.dict(),
+        "statistics": {
+            "entity_types": list(set(e['label'] for e in entities)),
+            "relationship_types": list(set(r['relationship_type'] for r in relationships)),
+            "average_confidence": sum(e.get('confidence', 0) for e in entities) / len(entities) if entities else 0
+        }
+    }
+    with open(json_file, 'w', encoding='utf-8') as f:
+        json.dump(export_data, f, indent=2, ensure_ascii=False)
+    return json_file
+async def generate_graphml_export(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Path:
+    """Generate GraphML format"""
+    graphml_file = export_dir / "graph_export.graphml"
+    # Create GraphML structure
+    root = ET.Element('graphml')
+    root.set('xmlns', 'http://graphml.graphdrawing.org/xmlns')
+    # Define attributes
+    ET.SubElement(root, 'key', id='label', **{'for': 'node', 'attr.name': 'label', 'attr.type': 'string'})
+    ET.SubElement(root, 'key', id='type', **{'for': 'node', 'attr.name': 'type', 'attr.type': 'string'})
+    ET.SubElement(root, 'key', id='rel_type', **{'for': 'edge', 'attr.name': 'relationship', 'attr.type': 'string'})
+    graph = ET.SubElement(root, 'graph', id='G', edgedefault='directed')
+    # Add nodes
+    entity_map = {}
+    for entity in entities:
+        node_id = entity.get('id', generate_unique_id('ent'))
+        entity_map[entity['text']] = node_id
+        node = ET.SubElement(graph, 'node', id=node_id)
+        label_data = ET.SubElement(node, 'data', key='label')
+        label_data.text = entity['text']
+        type_data = ET.SubElement(node, 'data', key='type')
+        type_data.text = entity['label']
+    # Add edges
+    for i, rel in enumerate(relationships):
+        source_id = entity_map.get(rel['source_entity'])
+        target_id = entity_map.get(rel['target_entity'])
+        if source_id and target_id:
+            edge = ET.SubElement(graph, 'edge', id=f"e{i}", source=source_id, target=target_id)
+            rel_data = ET.SubElement(edge, 'data', key='rel_type')
+            rel_data.text = rel['relationship_type']
+    # Write to file
+    tree = ET.ElementTree(root)
+    tree.write(graphml_file, encoding='utf-8', xml_declaration=True)
+    return graphml_file
+def calculate_er_stats(entities: List[Dict], relationships: List[Dict]) -> Dict[str, Any]:
+    """Calculate Entity-Relationship statistics"""
+    if not entities:
+        return {}
+    entity_types = defaultdict(int)
+    relationship_types = defaultdict(int)
+    for entity in entities:
+        entity_types[entity['label']] += 1
+    for rel in relationships:
+        relationship_types[rel['relationship_type']] += 1
+    return {
+        "total_entities": len(entities),
+        "total_relationships": len(relationships),
+        "entity_type_distribution": dict(entity_types),
+        "relationship_type_distribution": dict(relationship_types),
+        "graph_density": len(relationships) / (len(entities) * (len(entities) - 1) / 2) if len(entities) > 1 else 0,
+        "unique_entity_types": len(entity_types),
+        "unique_relationship_types": len(relationship_types)
+    }
+async def save_to_database(data: Dict[str, Any]) -> bool:
+    if not pg_pool:
+        logger.error("No database pool available")
+        return False
+    try:
+        async with pg_pool.acquire() as conn:
+            await conn.execute("""
+                INSERT INTO ner_analyses (
+                    analysis_id, source_text, source_type, language, entities, keywords,
+                    relationships, summary, embeddings, graph_data, export_files, text_stats,
+                    er_stats, processing_time, entity_types, relationship_types
+                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
+                ON CONFLICT (analysis_id) DO UPDATE SET
+                    entities = EXCLUDED.entities,
+                    relationships = EXCLUDED.relationships,
+                    summary = EXCLUDED.summary
+                """,
+                data['analysis_id'],
+                data['source_text'][:10000],
+                data['source_type'],
+                data['language'],
+                json.dumps(data['entities'], ensure_ascii=False),
+                json.dumps(data['keywords'], ensure_ascii=False),
+                json.dumps(data['relationships'], ensure_ascii=False),
+                data['summary'],
+                json.dumps(data.get('embeddings', [])),
+                json.dumps(data.get('graph_data', {}), ensure_ascii=False, default=str),
+                json.dumps(data.get('export_files', {}), ensure_ascii=False, default=str),
+                json.dumps(data.get('text_stats', {})),
+                json.dumps(data.get('er_stats', {})),
+                float(data.get('processing_time', 0)),
+                json.dumps(list(set(entity.get('label', '') for entity in data.get('entities', [])))),
+                json.dumps(list(set(rel.get('relationship_type', '') for rel in data.get('relationships', []))))
+            )
+        logger.info(f"✅ Analysis {data['analysis_id']} saved to database")
+        return True
+    except Exception as e:
+        logger.error(f"❌ DB save failed for {data.get('analysis_id', 'unknown')}: {e}")
+        return False
+async def save_to_blob(analysis_id: str, data: Dict[str, Any]) -> bool:
+    blob_client = get_blob_client()
+    if not blob_client:
+        return False
+    try:
+        blob_name = f"ner_analysis/{analysis_id}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.json"
+        blob_client_obj = blob_client.get_blob_client(container=config.BLOB_CONTAINER, blob=blob_name)
+        blob_client_obj.upload_blob(json.dumps(data, indent=2, ensure_ascii=False, default=str), overwrite=True)
+        return True
+    except Exception as e:
+        logger.error(f"Blob save failed: {e}")
+        return False
+# App Lifecycle
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    logger.info("🚀 Starting Enhanced NER Analysis Service...")
+    logger.info("🔄 Database initialization...")
+    db_ok = await init_database()
+    if not db_ok:
+        logger.error("❌ Database initialization failed!")
+        raise RuntimeError("Database initialization failed")
+    logger.info("🔄 Initializing API clients...")
+    get_deepseek_client()
+    get_openai_client()
+    get_blob_client()
+    logger.info("🔄 Creating export directories...")
+    EXPORT_DIR.mkdir(exist_ok=True)
+    logger.info("🎉 Enhanced NER Analysis Service is ready!")
+    logger.info(f"📡 Server running on http://{config.HOST}:{config.PORT}")
+    yield
+    logger.info("🛑 Shutting down...")
+    if pg_pool:
+        await pg_pool.close()
+        logger.info("✅ Database connections closed")
+# FastAPI application object; startup and shutdown are handled by lifespan().
+app = FastAPI(
+    title="Enhanced NER Analysis Service",
+    description="Advanced Named Entity Recognition with relationship extraction and graph exports",
+    version="2.0.0",
+    lifespan=lifespan
+)
+# CORS: every origin, method and header is allowed.
+# NOTE(review): a wildcard origin combined with allow_credentials=True is very
+# permissive - confirm this is intended outside of a proof-of-concept setup.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# API Endpoints
+@app.get("/")
+async def root():
+    deepseek_available = bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY)
+    openai_available = bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY)
+    blob_available = bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN)
+    return {
+        "message": "Enhanced NER Analysis Service",
+        "version": "2.0.0",
+        "status": "operational",
+        "supported_entities": config.ENTITY_TYPES,
+        "supported_relationships": config.RELATIONSHIP_TYPES[:10],
+        "export_formats": ["neo4j", "json", "graphml"],
+        "features": {
+            "ner_analysis": True,
+            "relationship_extraction": True,
+            "thai_language_support": True,
+            "graph_database_export": True,
+            "embedding_generation": openai_available,
+            "deepseek_analysis": deepseek_available,
+            "blob_storage": blob_available
+        }
+    }
+@app.get("/health")
+async def health():
+    deepseek_available = bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY)
+    openai_available = bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY)
+    blob_available = bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN)
+    return {
+        "status": "healthy",
+        "service": "NER Analysis Service",
+        "version": "2.0.0",
+        "database": pg_pool is not None,
+        "vector_extension": vector_available,
+        "deepseek": deepseek_available,
+        "openai": openai_available,
+        "blob_storage": blob_available,
+        "supported_entity_count": len(config.ENTITY_TYPES),
+        "supported_relationship_count": len(config.RELATIONSHIP_TYPES),
+        "export_formats": ["neo4j", "json", "graphml"]
+    }
+@app.post("/analyze/text", response_model=NERResponse)
+async def analyze_text(request: NERRequest, background_tasks: BackgroundTasks):
+    """Analyze text for entities and relationships"""
+    start_time = datetime.utcnow()
+    analysis_id = f"text_{int(start_time.timestamp())}"
+    if not request.text or not request.text.strip():
+        raise HTTPException(status_code=400, detail="Text is required")
+    try:
+        language = detect_language(request.text)
+        text_stats = get_text_stats(request.text)
+        # Enhanced analysis
+        analysis_result = await analyze_with_deepseek(request.text, language)
+        # Generate embeddings if requested
+        embeddings = []
+        if request.include_embeddings:
+            embeddings = await generate_embeddings(request.text)
+        # Create enhanced graph
+        graph_data = create_enhanced_graph_data(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Calculate ER statistics
+        er_stats = calculate_er_stats(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Generate export files if requested
+        export_files = ExportFiles()
+        if request.generate_graph_files:
+            export_files = await generate_export_files(
+                analysis_id,
+                analysis_result.get('entities', []),
+                analysis_result.get('relationships', []),
+                graph_data,
+                request.export_formats
+            )
+        processing_time = (datetime.utcnow() - start_time).total_seconds()
+        response_data = {
+            "analysis_id": analysis_id,
+            "source_text": request.text,
+            "source_type": "text_input",
+            "language": language,
+            "entities": analysis_result.get('entities', []),
+            "keywords": analysis_result.get('keywords', []),
+            "relationships": analysis_result.get('relationships', []),
+            "summary": analysis_result.get('summary', ''),
+            "embeddings": embeddings,
+            "graph_data": graph_data,
+            "export_files": export_files,
+            "text_stats": text_stats,
+            "er_stats": er_stats,
+            "processing_time": processing_time,
+            "character_count": text_stats["character_count"],
+            "word_count": text_stats["word_count"],
+            "sentence_count": text_stats["sentence_count"]
+        }
+        # Save to database in background
+        background_tasks.add_task(save_to_database, response_data)
+        background_tasks.add_task(save_to_blob, analysis_id, response_data)
+        return NERResponse(
+            success=True,
+            entity_relationship_stats=er_stats,
+            **response_data
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Text analysis failed: {e}")
+        return NERResponse(
+            success=False,
+            analysis_id=analysis_id,
+            source_text=request.text[:1000],
+            source_type="text_input",
+            language="unknown",
+            entities=[],
+            keywords=[],
+            relationships=[],
+            summary="",
+            graph_data=GraphData(nodes=[], links=[], metadata={}),
+            export_files=ExportFiles(),
+            processing_time=(datetime.utcnow() - start_time).total_seconds(),
+            character_count=0,
+            word_count=0,
+            sentence_count=0,
+            entity_relationship_stats={},
+            error=str(e)
+        )
+@app.post("/analyze/file", response_model=NERResponse)
+async def analyze_file(
+    file: UploadFile = File(...),
+    extract_relationships: bool = Form(True),
+    include_embeddings: bool = Form(True),
+    include_summary: bool = Form(True),
+    generate_graph_files: bool = Form(True),
+    export_formats: str = Form("neo4j,json"),
+    background_tasks: BackgroundTasks = None
+):
+    """Analyze uploaded file for entities and relationships"""
+    start_time = datetime.utcnow()
+    analysis_id = f"file_{int(start_time.timestamp())}"
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No filename")
+    try:
+        file_content = await file.read()
+        if len(file_content) > config.MAX_FILE_SIZE:
+            raise HTTPException(status_code=400, detail="File too large")
+        file_ext = Path(file.filename).suffix.lower()
+        export_format_list = export_formats.split(',') if export_formats else ["json"]
+        if file_ext in config.SUPPORTED_TEXT_FORMATS:
+            text = extract_text_from_file(file_content, file.filename)
+            source_type = "text_file"
+        elif file_ext in config.SUPPORTED_OCR_FORMATS:
+            text = await get_text_from_ocr(file_content, file.filename)
+            source_type = "ocr_file"
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported format: {file_ext}")
+        if not text.strip():
+            raise HTTPException(status_code=400, detail="No text extracted")
+        language = detect_language(text)
+        text_stats = get_text_stats(text)
+        # Enhanced analysis
+        analysis_result = await analyze_with_deepseek(text, language)
+        # Generate embeddings
+        embeddings = []
+        if include_embeddings:
+            embeddings = await generate_embeddings(text)
+        # Create enhanced graph
+        graph_data = create_enhanced_graph_data(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Calculate ER statistics
+        er_stats = calculate_er_stats(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Generate export files
+        export_files = ExportFiles()
+        if generate_graph_files:
+            export_files = await generate_export_files(
+                analysis_id,
+                analysis_result.get('entities', []),
+                analysis_result.get('relationships', []),
+                graph_data,
+                export_format_list
+            )
+        processing_time = (datetime.utcnow() - start_time).total_seconds()
+        response_data = {
+            "analysis_id": analysis_id,
+            "source_text": text,
+            "source_type": source_type,
+            "language": language,
+            "entities": analysis_result.get('entities', []),
+            "keywords": analysis_result.get('keywords', []),
+            "relationships": analysis_result.get('relationships', []),
+            "summary": analysis_result.get('summary', ''),
+            "embeddings": embeddings,
+            "graph_data": graph_data,
+            "export_files": export_files,
+            "text_stats": text_stats,
+            "er_stats": er_stats,
+            "processing_time": processing_time,
+            "character_count": text_stats["character_count"],
+            "word_count": text_stats["word_count"],
+            "sentence_count": text_stats["sentence_count"]
+        }
+        # Save in background
+        if background_tasks:
+            background_tasks.add_task(save_to_database, response_data)
+            background_tasks.add_task(save_to_blob, analysis_id, response_data)
+        return NERResponse(
+            success=True,
+            entity_relationship_stats=er_stats,
+            **response_data
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"File analysis failed: {e}")
+        return NERResponse(
+            success=False,
+            analysis_id=analysis_id,
+            source_text="",
+            source_type="file_input",
+            language="unknown",
+            entities=[],
+            keywords=[],
+            relationships=[],
+            summary="",
+            graph_data=GraphData(nodes=[], links=[], metadata={}),
+            export_files=ExportFiles(),
+            processing_time=(datetime.utcnow() - start_time).total_seconds(),
+            character_count=0,
+            word_count=0,
+            sentence_count=0,
+            entity_relationship_stats={},
+            error=str(e)
+        )
+@app.post("/analyze/url", response_model=NERResponse)
+async def analyze_url(request: NERRequest, background_tasks: BackgroundTasks):
+    """Analyze URL content for entities and relationships"""
+    start_time = datetime.utcnow()
+    analysis_id = f"url_{int(start_time.timestamp())}"
+    if not request.url:
+        raise HTTPException(status_code=400, detail="URL is required")
+    try:
+        text = await get_text_from_url(str(request.url))
+        if not text.strip():
+            raise HTTPException(status_code=400, detail="No text extracted from URL")
+        language = detect_language(text)
+        text_stats = get_text_stats(text)
+        # Enhanced analysis
+        analysis_result = await analyze_with_deepseek(text, language)
+        # Generate embeddings
+        embeddings = []
+        if request.include_embeddings:
+            embeddings = await generate_embeddings(text)
+        # Create enhanced graph
+        graph_data = create_enhanced_graph_data(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Calculate ER statistics
+        er_stats = calculate_er_stats(
+            analysis_result.get('entities', []),
+            analysis_result.get('relationships', [])
+        )
+        # Generate export files
+        export_files = ExportFiles()
+        if request.generate_graph_files:
+            export_files = await generate_export_files(
+                analysis_id,
+                analysis_result.get('entities', []),
+                analysis_result.get('relationships', []),
+                graph_data,
+                request.export_formats
+            )
+        processing_time = (datetime.utcnow() - start_time).total_seconds()
+        response_data = {
+            "analysis_id": analysis_id,
+            "source_text": text,
+            "source_type": "url_content",
+            "language": language,
+            "entities": analysis_result.get('entities', []),
+            "keywords": analysis_result.get('keywords', []),
+            "relationships": analysis_result.get('relationships', []),
+            "summary": analysis_result.get('summary', ''),
+            "embeddings": embeddings,
+            "graph_data": graph_data,
+            "export_files": export_files,
+            "text_stats": text_stats,
+            "er_stats": er_stats,
+            "processing_time": processing_time,
+            "character_count": text_stats["character_count"],
+            "word_count": text_stats["word_count"],
+            "sentence_count": text_stats["sentence_count"]
+        }
+        # Save in background
+        background_tasks.add_task(save_to_database, response_data)
+        background_tasks.add_task(save_to_blob, analysis_id, response_data)
+        return NERResponse(
+            success=True,
+            entity_relationship_stats=er_stats,
+            **response_data
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"URL analysis failed: {e}")
+        return NERResponse(
+            success=False,
+            analysis_id=analysis_id,
+            source_text="",
+            source_type="url_content",
+            language="unknown",
+            entities=[],
+            keywords=[],
+            relationships=[],
+            summary="",
+            graph_data=GraphData(nodes=[], links=[], metadata={}),
+            export_files=ExportFiles(),
+            processing_time=(datetime.utcnow() - start_time).total_seconds(),
+            character_count=0,
+            word_count=0,
+            sentence_count=0,
+            entity_relationship_stats={},
+            error=str(e)
+        )
+@app.get("/download/{analysis_id}/{file_type}")
+async def download_export_file(analysis_id: str, file_type: str):
+    """Download specific export file for an analysis"""
+    try:
+        analysis_dir = EXPORT_DIR / analysis_id
+        if not analysis_dir.exists():
+            raise HTTPException(status_code=404, detail=f"Analysis {analysis_id} not found")
+        file_mapping = {
+            "neo4j_nodes": "neo4j_nodes.csv",
+            "neo4j_relationships": "neo4j_relationships.csv",
+            "json": "analysis_export.json",
+            "graphml": "graph_export.graphml"
+        }
+        if file_type not in file_mapping:
+            raise HTTPException(status_code=400, detail=f"Invalid file type: {file_type}")
+        file_path = analysis_dir / file_mapping[file_type]
+        if not file_path.exists():
+            raise HTTPException(status_code=404, detail=f"File {file_type} not found")
+        return FileResponse(path=file_path, filename=file_mapping[file_type])
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Download failed for {analysis_id}/{file_type}: {e}")
+        raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
+@app.get("/entity-types")
+async def get_entity_types():
+    """Get all supported entity types"""
+    return {
+        "success": True,
+        "entity_types": config.ENTITY_TYPES,
+        "total_count": len(config.ENTITY_TYPES)
+    }
+@app.get("/relationship-types")
+async def get_relationship_types():
+    """Get all supported relationship types"""
+    return {
+        "success": True,
+        "relationship_types": config.RELATIONSHIP_TYPES,
+        "total_count": len(config.RELATIONSHIP_TYPES)
+    }
+if __name__ == "__main__":
+    print("🔧 Loading enhanced NER configuration...")
+    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
+    print(f"🏷️  Enhanced with {len(config.ENTITY_TYPES)} entity types")
+    print(f"🔗 Enhanced with {len(config.RELATIONSHIP_TYPES)} relationship types")
+    uvicorn.run(
+        "ner_service:app",
+        host=config.HOST,
+        port=config.PORT,
+        reload=config.DEBUG,
+        log_level="info"
+    )

service/ocr_service.py ADDED Viewed

	@@ -0,0 +1,588 @@

+#!/usr/bin/env python3
+"""
+OCR Backend API with Azure Document Intelligence - Cleaned and Optimized
+Supports file uploads, URL processing, and web scraping fallback
+"""
+import os
+import io
+import requests
+import numpy as np
+import logging
+from typing import Optional, List, Dict, Any
+from urllib.parse import urlparse, urljoin
+from pathlib import Path
+import mimetypes
+from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, HttpUrl
+import uvicorn
+# Load the OCR settings: prefer the project's unified configuration and fall
+# back to environment variables when the configs module cannot be imported.
+try:
+    from configs import get_config
+    config = get_config().ocr
+    print("✅ Using unified configuration")
+except ImportError:
+    print("⚠️  Unified config not available, using fallback configuration")
+    from dotenv import load_dotenv
+    # Must run before the class body below, which reads the environment once
+    # when the class is defined.
+    load_dotenv()
+    class FallbackConfig:
+        # Server settings
+        HOST = os.getenv("HOST", "0.0.0.0")
+        PORT = int(os.getenv("OCR_PORT", "8400"))
+        # DEBUG defaults to enabled when the variable is not set.
+        DEBUG = os.getenv("DEBUG", "True").lower() == "true"
+        # Azure Document Intelligence configuration (empty string when unset)
+        AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
+        AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")
+        # Web scraping configuration
+        MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE", "10"))
+        # NOTE(review): presumably seconds - confirm where REQUEST_TIMEOUT is used.
+        REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
+        USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+        # File size limits
+        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
+    config = FallbackConfig()
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+from azure.core.exceptions import HttpResponseError
+from bs4 import BeautifulSoup
+from PIL import Image
+# Configure logging: INFO level with timestamp, logger name and level in each record
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Initialize FastAPI app; debug mode follows config.DEBUG
+app = FastAPI(
+    title="OCR Backend API",
+    description="OCR service with Azure Document Intelligence, supporting file uploads, URLs, and web scraping",
+    version="2.0.0",
+    debug=config.DEBUG
+)
+# CORS configuration: every origin, method and header is allowed.
+# NOTE(review): a wildcard origin combined with allow_credentials=True is very
+# permissive - confirm this is intended outside of a proof-of-concept setup.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Pydantic models
+# Request model carrying a URL to process.
+class URLRequest(BaseModel):
+    # Validated as an HTTP(S) URL by pydantic.
+    url: HttpUrl
+    # NOTE(review): presumably toggles OCR of images found on a scraped page -
+    # confirm against the endpoint that consumes this model.
+    extract_images: bool = True
+# Response model for OCR results.
+class OCRResponse(BaseModel):
+    success: bool
+    # NOTE(review): presumably the full recognised text - confirm against format_ocr_result.
+    content: str
+    # One dict per page; the exact keys are not visible in this part of the file.
+    pages: List[Dict[str, Any]]
+    source_type: str  # 'file_upload', 'direct_url', 'web_scraped'
+    # Optional fields default to None.
+    source_url: Optional[str] = None
+    error: Optional[str] = None
+# Result model for the web-scraping path.
+# NOTE(review): field meanings below are inferred from the names - confirm
+# against the scraping code that populates this model.
+class WebScrapingResult(BaseModel):
+    # Text taken from the page itself.
+    text_content: str
+    # Image references discovered on the page.
+    images_found: List[str]
+    # One OCR result dict per processed image.
+    ocr_results: List[Dict[str, Any]]
+# Utility functions
+def format_bounding_box(bounding_box):
+    """Format bounding box coordinates for display"""
+    if not bounding_box:
+        return "N/A"
+    reshaped_bounding_box = np.array(bounding_box).reshape(-1, 2)
+    return ", ".join(["[{}, {}]".format(x, y) for x, y in reshaped_bounding_box])
+def is_supported_file_type(content_type: str, filename: str = "") -> bool:
+    """Check if the file type is supported for OCR"""
+    supported_types = {
+        'application/pdf',
+        'image/jpeg',
+        'image/jpg',
+        'image/png',
+        'image/tiff',
+        'image/bmp',
+        'image/gif'
+    }
+    if content_type and content_type.lower() in supported_types:
+        return True
+    # Check by file extension if content type is unclear
+    if filename:
+        supported_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp', '.gif'}
+        file_ext = Path(filename).suffix.lower()
+        return file_ext in supported_extensions
+    return False
def get_document_intelligence_client():
    """Build a DocumentIntelligenceClient from the configured endpoint and key.

    Raises:
        HTTPException: 500 when the endpoint or key is an empty string or
            still holds its placeholder value.
    """
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    endpoint_unset = endpoint in ("", "YOUR_FORM_RECOGNIZER_ENDPOINT")
    key_unset = api_key in ("", "YOUR_FORM_RECOGNIZER_KEY")
    if endpoint_unset or key_unset:
        raise HTTPException(
            status_code=500,
            detail="Azure Document Intelligence credentials not configured"
        )

    credential = AzureKeyCredential(api_key)
    return DocumentIntelligenceClient(endpoint=endpoint, credential=credential)
async def process_ocr_from_url(url: str) -> Dict[str, Any]:
    """Process OCR from a direct URL.

    Submits ``url`` to the "prebuilt-read" model and returns the result
    formatted by format_ocr_result() with source type "direct_url".

    Raises:
        HTTPException: 400 when Azure reports an HttpResponseError, 500 for
            any other failure. An HTTPException raised while building the
            client is passed through unchanged instead of being re-wrapped
            as "Unexpected error".
    """
    # Imported locally so the function does not rely on a module-level import.
    import asyncio

    try:
        client = get_document_intelligence_client()
        logger.info(f"Processing OCR from URL: {url}")

        def _analyze():
            # Synchronous SDK call: submit the document and wait for the result.
            poller = client.begin_analyze_document(
                "prebuilt-read",
                AnalyzeDocumentRequest(url_source=url)
            )
            return poller.result()

        # Run the blocking SDK call in a worker thread so the event loop
        # keeps serving other requests while Azure processes the document.
        result = await asyncio.to_thread(_analyze)
        return format_ocr_result(result, "direct_url", url)
    except HTTPException:
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
async def process_ocr_from_bytes(file_bytes: bytes, filename: str = "") -> Dict[str, Any]:
    """Process OCR from file bytes.

    Submits ``file_bytes`` to the "prebuilt-read" model and returns the result
    formatted by format_ocr_result() with source type "file_upload".
    ``filename`` is used for logging and passed on as the source identifier.

    Raises:
        HTTPException: 400 when Azure reports an HttpResponseError, 500 for
            any other failure. An HTTPException raised while building the
            client is passed through unchanged instead of being re-wrapped
            as "Unexpected error".
    """
    # Imported locally so the function does not rely on a module-level import.
    import asyncio

    try:
        client = get_document_intelligence_client()
        logger.info(f"Processing OCR from file: {filename} ({len(file_bytes)} bytes)")

        def _analyze():
            # Synchronous SDK call: submit the document and wait for the result.
            poller = client.begin_analyze_document(
                "prebuilt-read",
                AnalyzeDocumentRequest(bytes_source=file_bytes)
            )
            return poller.result()

        # Run the blocking SDK call in a worker thread so the event loop
        # keeps serving other requests while Azure processes the document.
        result = await asyncio.to_thread(_analyze)
        return format_ocr_result(result, "file_upload", filename)
    except HTTPException:
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for file {filename}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing file {filename}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
def format_ocr_result(result, source_type: str, source_identifier: str = "") -> Dict[str, Any]:
    """Convert an analysis result object into the service's response dict.

    Args:
        result: Object exposing ``pages`` and, optionally, ``content`` and
            ``styles``.
        source_type: Label stored in the response (e.g. "direct_url").
        source_identifier: Reported as ``source_url`` only when
            ``source_type`` is "direct_url"; otherwise ``source_url`` is None.

    Returns:
        Dict with ``success``, ``content``, ``pages``, ``source_type``,
        ``source_url``, ``handwritten_detected`` and ``error`` keys.
    """

    def _describe_page(page) -> Dict[str, Any]:
        # One dict per page: geometry plus its text lines and words.
        summary = {
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
        }
        summary["lines"] = [
            {
                "line_number": position,
                "content": text_line.content,
                "bounding_box": format_bounding_box(text_line.polygon) if hasattr(text_line, 'polygon') else "N/A",
            }
            for position, text_line in enumerate(getattr(page, 'lines', None) or [])
        ]
        summary["words"] = [
            {"content": token.content, "confidence": getattr(token, 'confidence', None)}
            for token in (getattr(page, 'words', None) or [])
        ]
        return summary

    # True as soon as any style entry is flagged as handwritten.
    found_handwriting = any(
        getattr(style, 'is_handwritten', False)
        for style in (getattr(result, 'styles', None) or [])
    )

    return {
        "success": True,
        "content": getattr(result, 'content', ""),
        "pages": [_describe_page(page) for page in result.pages],
        "source_type": source_type,
        "source_url": source_identifier if source_type == "direct_url" else None,
        "handwritten_detected": found_handwriting,
        "error": None,
    }
async def scrape_web_content(url: str, extract_images: bool = True) -> WebScrapingResult:
    """Download a web page, extract its text and optionally OCR its images.

    At most ``config.MAX_IMAGES_PER_PAGE`` ``<img>`` tags are considered.
    Each image URL is recorded in ``images_found``; an OCR entry is added
    only when the image's content type is supported and OCR returned
    non-blank text. A failure on one image is logged and skipped.

    Raises:
        HTTPException: 400 when fetching the page fails with a
            ``requests.RequestException``, 500 for any other error.
    """
    try:
        request_headers = {'User-Agent': config.USER_AGENT}
        logger.info(f"Scraping web content from: {url}")
        page = requests.get(url, headers=request_headers, timeout=config.REQUEST_TIMEOUT)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'html.parser')
        page_text = document.get_text(separator=' ', strip=True)

        image_urls = []
        image_ocr = []
        image_tags = document.find_all('img')[:config.MAX_IMAGES_PER_PAGE] if extract_images else []
        for tag in image_tags:
            source = tag.get('src')
            if not source:
                continue
            # Resolve relative references against the page URL
            absolute_url = urljoin(url, source)
            image_urls.append(absolute_url)
            try:
                # A HEAD request is enough to learn the content type
                probe = requests.head(absolute_url, headers=request_headers, timeout=10)
                if not is_supported_file_type(probe.headers.get('content-type', '')):
                    continue
                recognised = await process_ocr_from_url(absolute_url)
                if recognised['content'].strip():  # keep only images with actual text
                    image_ocr.append({
                        "image_url": absolute_url,
                        "ocr_content": recognised['content'],
                        "pages": recognised['pages']
                    })
            except Exception as e:
                logger.warning(f"Failed to process image {absolute_url}: {e}")

        return WebScrapingResult(
            text_content=page_text,
            images_found=image_urls,
            ocr_results=image_ocr
        )
    except requests.RequestException as e:
        logger.error(f"Failed to scrape URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to scrape URL: {e}")
    except Exception as e:
        logger.error(f"Unexpected error scraping URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error during web scraping: {e}")
def check_url_is_direct_file(url: str) -> tuple[bool, str]:
    """Check if URL points directly to a file.

    Issues a HEAD request (following redirects) and classifies the target
    from its Content-Type, falling back to a file name taken from the
    Content-Disposition header or, failing that, from the URL path.

    Returns:
        ``(is_file, content_type)``; ``(False, "")`` if the check itself
        fails for any reason (the failure is logged as a warning).
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }
        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        content_type = response.headers.get('content-type', '').lower()

        # Check content disposition for filename
        content_disposition = response.headers.get('content-disposition', '')
        filename = ""
        if 'filename=' in content_disposition:
            # Keep only the filename value: cut at the next parameter
            # separator and drop surrounding whitespace and quotes, so
            # 'attachment; filename="a.pdf"; size=1' yields 'a.pdf'.
            raw_value = content_disposition.split('filename=', 1)[1]
            filename = raw_value.split(';', 1)[0].strip().strip('"')

        # Parse URL for filename
        if not filename:
            filename = Path(urlparse(url).path).name

        is_file = is_supported_file_type(content_type, filename)
        return is_file, content_type
    except Exception as e:
        logger.warning(f"Failed to check URL {url}: {e}")
        return False, ""
+# API Endpoints
@app.get("/")
async def root():
    # Service banner: name, version, feature flags and configured limits.
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    # Credentials count as available only when set and not left at placeholders.
    azure_di_available = bool(
        endpoint
        and api_key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and api_key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    features = {
        "file_upload": True,
        "url_processing": True,
        "web_scraping": True,
        "azure_document_intelligence": azure_di_available,
        "supported_formats": ["PDF", "JPEG", "PNG", "TIFF", "BMP", "GIF"],
    }
    limits = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout_seconds": config.REQUEST_TIMEOUT,
    }
    return {
        "message": "OCR Backend API",
        "version": "2.0.0",
        "status": "operational",
        "features": features,
        "limits": limits,
    }
@app.get("/health")
async def health_check():
    # Health report: always "healthy", plus the state of the Azure credentials.
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    azure_di_available = bool(
        endpoint
        and api_key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and api_key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    azure_di_status = "not_configured"
    if azure_di_available:
        try:
            # Only verifies that the client can be constructed.
            get_document_intelligence_client()
        except Exception as e:
            azure_di_status = f"error: {str(e)[:100]}"
        else:
            azure_di_status = "configured"

    configuration = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "endpoint_configured": bool(endpoint),
        "key_configured": bool(api_key),
    }
    return {
        "status": "healthy",
        "service": "OCR Backend API",
        "version": "2.0.0",
        "azure_document_intelligence": azure_di_status,
        "configuration": configuration,
    }
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload_file(file: UploadFile = File(...)):
    """Upload a file for OCR processing"""
    # Reject unsupported media types before reading the body.
    type_ok = is_supported_file_type(file.content_type, file.filename)
    if not type_ok:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file.content_type}. Supported types: PDF, JPEG, PNG, TIFF, BMP, GIF"
        )

    try:
        payload = await file.read()

        # Enforce the configured size limit on the fully read upload.
        if len(payload) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        ocr_payload = await process_ocr_from_bytes(payload, file.filename)
        return OCRResponse(**ocr_payload)
    except HTTPException:
        # Already carries the intended status code; pass it through.
        raise
    except Exception as e:
        logger.error(f"Unexpected error processing uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
@app.post("/ocr/url", response_model=OCRResponse)
async def ocr_from_url(request: URLRequest):
    """Process OCR from URL - either direct file or web scraping"""
    url_str = str(request.url)

    # A URL that resolves to a supported file goes straight to OCR.
    is_direct_file, _ = check_url_is_direct_file(url_str)
    if is_direct_file:
        try:
            direct_result = await process_ocr_from_url(url_str)
            return OCRResponse(**direct_result)
        except HTTPException:
            raise
        except Exception as e:
            # Any other failure falls through to the web-scraping path below.
            logger.error(f"Failed to process direct file URL: {e}")

    # Otherwise treat the URL as a web page.
    try:
        scraped = await scrape_web_content(url_str, request.extract_images)

        # Page text first, then one section per image that produced OCR text.
        combined_content = scraped.text_content
        if scraped.ocr_results:
            image_sections = "".join(
                f"\nImage: {entry['image_url']}\n" + entry['ocr_content'] + "\n"
                for entry in scraped.ocr_results
            )
            combined_content += "\n\n--- OCR from Images ---\n" + image_sections

        page_summary = {
            "page_number": 1,
            "content_type": "web_scraped",
            "text_content": scraped.text_content,
            "images_found": len(scraped.images_found),
            "ocr_results": len(scraped.ocr_results)
        }
        return OCRResponse(
            success=True,
            content=combined_content,
            pages=[page_summary],
            source_type="web_scraped",
            source_url=url_str,
            error=None
        )
    except HTTPException:
        raise
    except Exception as e:
        # Report the failure in the response body rather than as an HTTP error.
        logger.error(f"Failed to process URL {url_str}: {e}")
        return OCRResponse(
            success=False,
            content="",
            pages=[],
            source_type="web_scraped",
            source_url=url_str,
            error=str(e)
        )
@app.post("/ocr/analyze")
async def analyze_document(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    extract_images: bool = Form(True)
):
    """Unified endpoint for document analysis - accepts either file upload or URL"""
    # Exactly one of `file` / `url` must be supplied.
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")
    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    try:
        if file:
            # Uploaded file: validate type and size, then OCR the bytes.
            if not is_supported_file_type(file.content_type, file.filename):
                raise HTTPException(
                    status_code=400,
                    detail=f"Unsupported file type: {file.content_type}"
                )
            file_bytes = await file.read()
            if len(file_bytes) > config.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
                )
            return await process_ocr_from_bytes(file_bytes, file.filename)

        # URL: reuse the /ocr/url handler.
        try:
            url_request = URLRequest(url=url, extract_images=extract_images)
        except ValueError as e:
            # pydantic's ValidationError derives from ValueError; a malformed
            # URL is a client error, not an internal one.
            raise HTTPException(status_code=400, detail=f"Invalid URL: {e}")
        response = await ocr_from_url(url_request)
        # Prefer pydantic v2's model_dump(); fall back to .dict() on v1.
        if hasattr(response, "model_dump"):
            return response.model_dump()
        return response.dict()
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error in analyze_document: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
+# Additional utility endpoints
@app.get("/supported-formats")
async def get_supported_formats():
    """Get list of supported file formats"""
    # Static lists of what the service advertises, plus two configured limits.
    format_names = {
        "documents": ["PDF"],
        "images": ["JPEG", "JPG", "PNG", "TIFF", "TIF", "BMP", "GIF"],
    }
    mime_types = [
        "application/pdf",
        "image/jpeg",
        "image/jpg",
        "image/png",
        "image/tiff",
        "image/bmp",
        "image/gif",
    ]
    return {
        "supported_formats": format_names,
        "content_types": mime_types,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
    }
@app.get("/config")
async def get_configuration():
    """Get current service configuration (for debugging)"""
    # Only reports whether the Azure credentials are set, never their values.
    credentials_present = bool(
        config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
        config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    )
    settings = {
        "host": config.HOST,
        "port": config.PORT,
        "debug": config.DEBUG,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "azure_di_configured": credentials_present,
    }
    return {
        "service": "OCR Backend API",
        "version": "2.0.0",
        "configuration": settings,
    }
if __name__ == "__main__":
    # Print a short startup summary, then serve the app with uvicorn.
    print("🔧 Loading OCR service configuration...")
    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
    azure_di_label = '✅ Configured' if config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT else '❌ Not configured'
    print(f"📄 Azure Document Intelligence: {azure_di_label}")
    max_size_mb = config.MAX_FILE_SIZE / (1024*1024)
    print(f"📊 Max file size: {max_size_mb:.0f}MB")

    # The app is passed as an import string; auto-reload follows DEBUG.
    uvicorn.run(
        "ocr_service:app",
        log_level="info",
        reload=config.DEBUG,
        port=config.PORT,
        host=config.HOST,
    )

service/rag_service.py ADDED Viewed

	@@ -0,0 +1,1367 @@

+#!/usr/bin/env python3
+"""
+RAG (Retrieval-Augmented Generation) Backend API - Cleaned and Optimized
+Integrates OCR, Azure OpenAI embeddings, and PostgreSQL vector storage
+"""
+import os
+import uuid
+import asyncio
+import requests
+import json
+import tempfile
+import traceback
+import logging
+from typing import Optional, List, Dict, Any, Union
+from datetime import datetime
+from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Query, Depends
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, HttpUrl
+import uvicorn
+# Import unified configuration
try:
    # Preferred path: the project's unified configuration module.
    from configs import get_config
    config = get_config().rag
    unified_config = get_config()
    print("✅ Using unified configuration")
except ImportError:
    # `configs` is not importable: read settings from the environment,
    # after loading a .env file if one is present.
    print("⚠️  Unified config not available, using fallback configuration")
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        # --- Server ---
        HOST = os.environ.get("HOST", "0.0.0.0")
        PORT = int(os.environ.get("RAG_PORT", "8401"))
        DEBUG = os.environ.get("DEBUG", "True").lower() == "true"

        # --- OCR service ---
        OCR_SERVICE_URL = os.environ.get("OCR_SERVICE_URL", "http://localhost:8400")

        # --- PostgreSQL ---
        PG_HOST = os.environ.get("POSTGRES_HOST", "")
        PG_PORT = int(os.environ.get("POSTGRES_PORT", "5432"))
        PG_DATABASE = os.environ.get("PG_DATABASE", "vectorsearch")
        PG_USER = os.environ.get("POSTGRES_USER", "")
        PG_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "")
        PG_SSL_MODE = os.environ.get("PG_SSL_MODE", "require")

        # --- Azure OpenAI ---
        AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
        AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")
        AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

        # --- Chunking ---
        CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
        CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
        MIN_CHUNK_SIZE = int(os.environ.get("MIN_CHUNK_SIZE", "50"))

        # --- Processing limits (not read from the environment) ---
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
        REQUEST_TIMEOUT = 300

    config = FallbackConfig()
+import asyncpg
+import numpy as np
+from openai import AzureOpenAI
+import re
+from pathlib import Path
+from urllib.parse import urlparse
# Configure logging: INFO level on the root logger with a timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# FastAPI application object; every route below is registered on it.
app = FastAPI(
    debug=config.DEBUG,
    version="2.0.0",
    title="RAG Backend API",
    description="Retrieval-Augmented Generation service with OCR, embeddings, and vector search",
)

# CORS: any origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is
# discouraged by the FastAPI CORS documentation - confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
+# Pydantic Models
class DocumentUploadRequest(BaseModel):
    # Optional descriptive fields for an uploaded document.
    title: Optional[str] = None
    keywords: Optional[list[str]] = None
    metadata: Optional[dict[str, Any]] = None
    # Per-request chunking overrides.
    # NOTE(review): presumably passed to create_text_chunks(), where None
    # selects the configured default - confirm against the caller.
    chunk_size: Optional[int] = None
    chunk_overlap: Optional[int] = None


class URLProcessRequest(BaseModel):
    # Same optional fields as DocumentUploadRequest, for a document given by URL.
    url: HttpUrl
    title: Optional[str] = None
    keywords: Optional[list[str]] = None
    metadata: Optional[dict[str, Any]] = None
    extract_images: bool = True
    chunk_size: Optional[int] = None
    chunk_overlap: Optional[int] = None


class SearchRequest(BaseModel):
    # Search parameters: query text, result cap, minimum similarity and an
    # optional metadata filter.
    query: str
    limit: int = 10
    similarity_threshold: float = 0.2
    filter_metadata: Optional[dict[str, Any]] = None
class DocumentChunk(BaseModel):
    # One stored chunk of a document; the embedding may be omitted.
    id: str
    document_id: str
    content: str
    chunk_index: int
    embedding: Optional[list[float]] = None
    metadata: dict[str, Any]
    created_at: datetime


class DocumentResponse(BaseModel):
    # Document-level record; source_url is required but may be null.
    id: str
    title: str
    source_type: str
    source_url: Optional[str]
    total_chunks: int
    keywords: list[str]
    metadata: dict[str, Any]
    created_at: datetime
    processing_status: str


class SearchResult(BaseModel):
    # A matching chunk, its similarity score and details of its document.
    chunk: DocumentChunk
    similarity_score: float
    document_info: dict[str, Any]


class SearchResponse(BaseModel):
    # Envelope for a search: the query, its hits, their count and a timing value.
    query: str
    results: list[SearchResult]
    total_results: int
    processing_time: float
# Shared asyncpg pool; created on first use by get_db_pool().
db_pool: Optional[Any] = None
# Result of detect_uuid_method(), cached after the first detection.
_uuid_method: Optional[str] = None
async def detect_uuid_method(conn) -> str:
    """Pick a UUID generation strategy once and remember it.

    Probes ``conn`` in order: the built-in ``gen_random_uuid()``, then the
    ``uuid-ossp`` extension, and finally falls back to Python-side UUIDs.

    Returns:
        "built-in", "uuid-ossp" or "python". The result is cached in the
        module-level ``_uuid_method``, so later calls do not touch ``conn``.
    """
    global _uuid_method
    if _uuid_method is not None:
        return _uuid_method

    # 1) Built-in gen_random_uuid() (PostgreSQL 13+)
    try:
        await conn.fetchval("SELECT gen_random_uuid()")
    except Exception:
        pass
    else:
        _uuid_method = "built-in"
        logger.info("Using built-in gen_random_uuid() for UUID generation")
        return _uuid_method

    # 2) uuid-ossp extension
    try:
        await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
        await conn.fetchval("SELECT uuid_generate_v4()")
    except Exception as e:
        reason = str(e)
        if "not allow-listed" in reason or "not allowlisted" in reason.lower():
            logger.info("uuid-ossp extension not allowlisted (normal for Azure PostgreSQL)")
        else:
            logger.warning(f"uuid-ossp extension not available: {e}")
    else:
        _uuid_method = "uuid-ossp"
        logger.info("Using uuid-ossp extension for UUID generation")
        return _uuid_method

    # 3) Generate UUIDs in Python
    _uuid_method = "python"
    logger.info("Using Python-generated UUIDs")
    return _uuid_method
async def get_db_pool():
    """Return the shared asyncpg pool, creating it on first use.

    Raises:
        Exception: whatever ``asyncpg.create_pool`` raised, after logging it.
    """
    global db_pool
    if db_pool is not None:
        return db_pool

    try:
        logger.info(f"Creating database pool with host: {config.PG_HOST}:{config.PG_PORT}")
        pool_settings = dict(
            host=config.PG_HOST,
            port=config.PG_PORT,
            database=config.PG_DATABASE,
            user=config.PG_USER,
            password=config.PG_PASSWORD,
            ssl=config.PG_SSL_MODE,
            min_size=1,
            max_size=10,
            command_timeout=60,
        )
        db_pool = await asyncpg.create_pool(**pool_settings)
    except Exception as e:
        logger.error(f"Failed to create database pool: {e}")
        raise
    return db_pool
async def get_db_connection():
    """Acquire a connection from the shared pool.

    The caller is responsible for handing it back via release_db_connection().
    """
    shared_pool = await get_db_pool()
    connection = await shared_pool.acquire()
    return connection
async def release_db_connection(connection):
    """Hand ``connection`` back to the shared pool."""
    shared_pool = await get_db_pool()
    await shared_pool.release(connection)
+# Azure OpenAI Client
def get_openai_client():
    """Build an AzureOpenAI client from the configured endpoint, key and API version.

    Raises:
        HTTPException: 500 when the endpoint or key is an empty string or
            still holds its placeholder value.
    """
    endpoint = config.AZURE_OPENAI_ENDPOINT
    api_key = config.AZURE_OPENAI_API_KEY

    endpoint_unset = endpoint in ("", "YOUR_AZURE_OPENAI_ENDPOINT")
    key_unset = api_key in ("", "YOUR_AZURE_OPENAI_KEY")
    if endpoint_unset or key_unset:
        raise HTTPException(
            status_code=500,
            detail="Azure OpenAI credentials not configured"
        )

    return AzureOpenAI(
        api_key=api_key,
        azure_endpoint=endpoint,
        api_version=config.AZURE_OPENAI_API_VERSION,
    )
+# Text Processing Functions
def clean_text(text: str) -> str:
    """Clean and normalize text.

    Collapses every whitespace run to a single space, then removes characters
    that are neither word characters, whitespace, nor basic punctuation
    (``. , ! ? ; : - ( )``), and finally trims the ends.

    Unicode combining marks are preserved: ``\\w`` does not match them, so
    without this exception the vowel signs and tone marks of scripts such as
    Thai were stripped, corrupting the words they belong to.
    """
    # Imported locally so the function does not rely on a module-level import.
    import unicodedata

    def _keep_combining_marks(match):
        # Keep the character only if it is a combining mark (category M*).
        char = match.group()
        return char if unicodedata.category(char).startswith('M') else ''

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation and combining marks
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', _keep_combining_marks, text)
    return text.strip()
def embedding_to_vector_string(embedding: List[float]) -> str:
    """Serialize an embedding as a bracketed, comma-separated vector literal.

    Each component is passed through ``float`` and the values are joined
    without spaces, e.g. ``[1, 2.5]`` becomes ``'[1.0,2.5]'``.

    Raises:
        ValueError: if ``embedding`` is empty or None.
    """
    if not embedding:
        raise ValueError("Embedding cannot be empty")
    components = [str(float(value)) for value in embedding]
    return f"[{','.join(components)}]"
def create_text_chunks(text: str, chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
    """Split text into overlapping chunks.

    Chunks are at most ``chunk_size`` characters long. Where possible a chunk
    ends just after a sentence ending ('. ', '! ', '? ' or a blank line), and
    each following chunk starts ``chunk_overlap`` characters before the end
    of the previous one. Chunks shorter than ``config.MIN_CHUNK_SIZE`` after
    stripping are dropped.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length; defaults to ``config.CHUNK_SIZE``.
        chunk_overlap: Characters shared between consecutive chunks; defaults
            to ``config.CHUNK_OVERLAP``.

    Returns:
        The list of chunks; ``[text]`` unchanged when the text already fits
        into a single chunk.

    Raises:
        ValueError: if the text needs splitting and ``chunk_size`` is not
            positive or ``chunk_overlap`` is not in ``[0, chunk_size)``.
            Such values can never make progress through the text.
    """
    if chunk_size is None:
        chunk_size = config.CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = config.CHUNK_OVERLAP

    if len(text) <= chunk_size:
        return [text]

    # Validated only once splitting is actually required, so short inputs
    # keep returning [text] exactly as before.
    if chunk_size <= 0 or chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError(
            "chunk_size must be positive and chunk_overlap must satisfy "
            "0 <= chunk_overlap < chunk_size"
        )

    sentence_endings = ('. ', '! ', '? ', '\n\n')
    min_chunk_size = config.MIN_CHUNK_SIZE
    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to break at a sentence boundary
        if end < text_length:
            # Only accept boundaries past the overlap region: a boundary any
            # earlier would put the next start at or before the current one,
            # which made the loop stall forever or skip text.
            earliest = start + chunk_overlap
            for ending in sentence_endings:
                boundary = text.rfind(ending, earliest, end)
                if boundary != -1:
                    end = boundary + len(ending)
                    break

        chunk = text[start:end].strip()
        if len(chunk) >= min_chunk_size:
            chunks.append(chunk)

        # The whole text is covered; another pass would only repeat the tail.
        if end >= text_length:
            break

        # Next chunk starts inside the current one; always > start here.
        start = end - chunk_overlap

    return chunks
async def generate_embedding(text: str) -> List[float]:
    """Generate embedding using Azure OpenAI.

    Text longer than 8000 characters is truncated (with a warning) before it
    is sent to the ``config.AZURE_OPENAI_DEPLOYMENT`` embedding model.

    Returns:
        The embedding vector of the (stripped) text.

    Raises:
        HTTPException: 500 when the text is blank, the service returns no
            embedding, or the request fails. An HTTPException raised while
            building the client is passed through unchanged rather than
            being wrapped a second time.
    """
    try:
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Truncate text if it's too long
        if len(text) > 8000:
            text = text[:8000]
            logger.warning("Truncated text for embedding generation")

        client = get_openai_client()
        # The SDK call is synchronous; run it in a worker thread so the
        # event loop is not blocked for the duration of the HTTP request.
        response = await asyncio.to_thread(
            client.embeddings.create,
            input=[text.strip()],
            model=config.AZURE_OPENAI_DEPLOYMENT
        )

        if not response.data:
            raise ValueError("No embedding data returned from Azure OpenAI")
        embedding = response.data[0].embedding
        if not embedding:
            raise ValueError("Empty embedding returned from Azure OpenAI")

        logger.debug(f"Generated embedding with {len(embedding)} dimensions")
        return embedding
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to generate embedding: {e}")
        logger.error(f"Text length: {len(text) if text else 0}")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {e}")
+# OCR Integration
async def process_with_ocr(file_bytes: bytes = None, url: str = None, extract_images: bool = True, filename: str = None) -> Dict[str, Any]:
    """Process document using OCR service.

    Plain-text uploads (``.txt``, ``.md``, ``.rst``, ``.log``) that decode as
    UTF-8 are returned directly without contacting the OCR service. Any other
    upload, or a URL, is posted to ``{config.OCR_SERVICE_URL}/ocr/analyze``.

    Args:
        file_bytes: Raw document bytes; takes precedence over ``url``.
        url: Document or web page URL, used when no bytes are given.
        extract_images: Forwarded to the OCR service as a form field.
        filename: Original file name; selects the plain-text shortcut and is
            sent as the upload name ('document.pdf' when missing).

    Returns:
        The OCR service's JSON response, or an equivalent dict built locally
        for plain-text files.

    Raises:
        HTTPException: 500 when neither input is given, the OCR service is
            unreachable, or it answers with a non-200 status.
    """
    try:
        logger.info(f"Processing with OCR service at {config.OCR_SERVICE_URL}")
        analyze_endpoint = f"{config.OCR_SERVICE_URL}/ocr/analyze"

        if file_bytes:
            # For plain text files, bypass OCR
            text_extensions = ('.txt', '.md', '.rst', '.log')
            if filename and filename.lower().endswith(text_extensions):
                try:
                    content = file_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    logger.warning(f"Failed to decode {filename} as UTF-8, sending to OCR service")
                else:
                    logger.info(f"Processing plain text file directly: {filename}")
                    if len(content.strip()) < config.MIN_CHUNK_SIZE:
                        logger.info(f"Text file {filename} is short ({len(content)} chars) but will process anyway")
                    return {
                        'success': True,
                        'content': content,
                        'pages': [{
                            'page_number': 1,
                            'content_type': 'text',
                            'text_content': content,
                            'source': 'direct_text',
                            'character_count': len(content)
                        }],
                        'source_type': 'text_file',
                        'source_url': None,
                        'error': None
                    }

            # Use OCR service. The bytes are already in memory, so they are
            # uploaded directly; no temporary file has to be written,
            # reopened and cleaned up.
            logger.info(f"Uploading file for OCR processing ({len(file_bytes)} bytes)")
            files = {
                'file': (filename or 'document.pdf', file_bytes, 'application/octet-stream')
            }
            data = {
                'extract_images': str(extract_images).lower()
            }
            # requests is synchronous; run it in a worker thread so the event
            # loop is not blocked for up to REQUEST_TIMEOUT seconds.
            response = await asyncio.to_thread(
                requests.post,
                analyze_endpoint,
                files=files,
                data=data,
                timeout=config.REQUEST_TIMEOUT
            )
        elif url:
            # Process URL with OCR service
            logger.info(f"Processing URL for OCR: {url}")
            data = {
                'url': url,
                'extract_images': str(extract_images).lower()
            }
            response = await asyncio.to_thread(
                requests.post,
                analyze_endpoint,
                data=data,
                timeout=config.REQUEST_TIMEOUT
            )
        else:
            raise ValueError("Either file_bytes or url must be provided")

        # Check response
        logger.info(f"OCR service response status: {response.status_code}")
        if response.status_code != 200:
            logger.error(f"OCR service error: {response.status_code} - {response.text}")
            raise HTTPException(
                status_code=500,
                detail=f"OCR processing failed: {response.status_code} {response.reason}"
            )

        result = response.json()
        logger.info(f"OCR processing completed successfully. Success: {result.get('success', False)}")
        return result
    except HTTPException:
        # Already a well-formed API error; do not wrap it a second time.
        raise
    except requests.RequestException as e:
        logger.error(f"OCR service request error: {e}")
        raise HTTPException(status_code=500, detail=f"OCR service connection failed: {e}")
    except Exception as e:
        logger.error(f"OCR processing error: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"OCR processing failed: {e}")
+# UUID Generation Helper
+async def generate_uuid(conn) -> str:
+    """Generate UUID using the best available method"""
+    try:
+        uuid_method = await detect_uuid_method(conn)
+        if uuid_method == "built-in":
+            uuid_val = await conn.fetchval("SELECT gen_random_uuid()")
+            return str(uuid_val)
+        elif uuid_method == "uuid-ossp":
+            uuid_val = await conn.fetchval("SELECT uuid_generate_v4()")
+            return str(uuid_val)
+        else:
+            return str(uuid.uuid4())
+    except Exception as e:
+        logger.warning(f"Database UUID generation failed, using Python fallback: {e}")
+        return str(uuid.uuid4())
+# Database Operations
+async def create_document_record(
+    title: str,
+    source_type: str,
+    source_url: str = None,
+    keywords: List[str] = None,
+    metadata: Dict[str, Any] = None
+) -> str:
+    """Create document record in database"""
+    conn = await get_db_connection()
+    try:
+        document_id = await generate_uuid(conn)
+        await conn.execute("""
+            INSERT INTO documents (id, title, source_type, source_url, keywords, metadata, created_at, processing_status)
+            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+        """, document_id, title, source_type, source_url, keywords or [],
+             json.dumps(metadata or {}), datetime.utcnow(), "processing")
+        return document_id
+    finally:
+        await release_db_connection(conn)
+async def store_document_chunk(
+    document_id: str,
+    content: str,
+    chunk_index: int,
+    embedding: List[float],
+    metadata: Dict[str, Any] = None
+) -> str:
+    """Store document chunk with embedding"""
+    conn = await get_db_connection()
+    try:
+        chunk_id = await generate_uuid(conn)
+        # Convert embedding to PostgreSQL vector format
+        embedding_vector = embedding_to_vector_string(embedding)
+        await conn.execute("""
+            INSERT INTO document_chunks (id, document_id, content, chunk_index, embedding, metadata, created_at)
+            VALUES ($1, $2, $3, $4, $5::vector, $6, $7)
+        """, chunk_id, document_id, content, chunk_index, embedding_vector,
+             json.dumps(metadata or {}), datetime.utcnow())
+        return chunk_id
+    finally:
+        await release_db_connection(conn)
+async def update_document_status(document_id: str, status: str, total_chunks: int = None):
+    """Update document processing status"""
+    conn = await get_db_connection()
+    try:
+        if total_chunks is not None:
+            await conn.execute("""
+                UPDATE documents SET processing_status = $1, total_chunks = $2 WHERE id = $3
+            """, status, total_chunks, document_id)
+        else:
+            await conn.execute("""
+                UPDATE documents SET processing_status = $1 WHERE id = $2
+            """, status, document_id)
+    finally:
+        await release_db_connection(conn)
+async def search_similar_chunks(
+    query_embedding: List[float],
+    limit: int = 10,
+    similarity_threshold: float = 0.2,
+    filter_metadata: Dict[str, Any] = None
+) -> List[Dict[str, Any]]:
+    """Search for similar document chunks using vector similarity"""
+    conn = await get_db_connection()
+    try:
+        logger.info(f"Searching for similar chunks with threshold {similarity_threshold}, limit {limit}")
+        # Validate inputs
+        if not query_embedding or len(query_embedding) == 0:
+            raise ValueError("Query embedding cannot be empty")
+        logger.info(f"Query embedding dimensions: {len(query_embedding)}")
+        # Convert query embedding to PostgreSQL vector format
+        query_vector = embedding_to_vector_string(query_embedding)
+        # Check if we have any chunks
+        total_chunks = await conn.fetchval("""
+            SELECT COUNT(*) FROM document_chunks dc
+            JOIN documents d ON dc.document_id = d.id
+            WHERE d.processing_status = 'completed' AND dc.embedding IS NOT NULL
+        """)
+        logger.info(f"Total available chunks for search: {total_chunks}")
+        if total_chunks == 0:
+            logger.warning("No chunks available for search")
+            return []
+        # Build the query
+        base_query = """
+            SELECT
+                dc.id, dc.document_id, dc.content, dc.chunk_index, dc.embedding,
+                dc.metadata as chunk_metadata, dc.created_at,
+                d.title, d.source_type, d.source_url, d.keywords, d.metadata as doc_metadata,
+                1 - (dc.embedding <=> $1::vector) as similarity_score
+            FROM document_chunks dc
+            JOIN documents d ON dc.document_id = d.id
+            WHERE d.processing_status = 'completed'
+            AND dc.embedding IS NOT NULL
+        """
+        params = [query_vector]
+        param_count = 1
+        # Add similarity threshold
+        if similarity_threshold > 0:
+            base_query += " AND 1 - (dc.embedding <=> $1::vector) >= $2"
+            params.append(similarity_threshold)
+            param_count += 1
+        # Add metadata filtering
+        if filter_metadata:
+            for key, value in filter_metadata.items():
+                base_query += f" AND d.metadata->>$" + str(param_count + 1) + " = $" + str(param_count + 2)
+                params.extend([key, str(value)])
+                param_count += 2
+                break  # Handle only one filter for now
+        base_query += " ORDER BY similarity_score DESC LIMIT $" + str(param_count + 1)
+        params.append(limit)
+        logger.info(f"Executing vector search query with {len(params)} parameters")
+        try:
+            rows = await conn.fetch(base_query, *params)
+            logger.info(f"Vector search query returned {len(rows)} rows")
+        except Exception as db_error:
+            logger.error(f"Database query error: {db_error}")
+            raise HTTPException(status_code=500, detail=f"Vector search query failed: {db_error}")
+        # Debug: show similarity scores if no results
+        if len(rows) == 0 and similarity_threshold > 0:
+            logger.warning(f"No results found with threshold {similarity_threshold}, trying without threshold")
+            debug_query = """
+                SELECT
+                    dc.id, dc.content,
+                    1 - (dc.embedding <=> $1::vector) as similarity_score
+                FROM document_chunks dc
+                JOIN documents d ON dc.document_id = d.id
+                WHERE d.processing_status = 'completed'
+                AND dc.embedding IS NOT NULL
+                ORDER BY similarity_score DESC
+                LIMIT 3
+            """
+            debug_rows = await conn.fetch(debug_query, query_vector)
+            logger.info(f"Debug: Top 3 similarity scores: {[(r['similarity_score'], r['content'][:50]) for r in debug_rows]}")
+        results = []
+        for row in rows:
+            try:
+                # Safely parse JSON metadata
+                chunk_metadata = {}
+                doc_metadata = {}
+                if row['chunk_metadata']:
+                    try:
+                        chunk_metadata = json.loads(row['chunk_metadata'])
+                    except json.JSONDecodeError:
+                        logger.warning(f"Invalid chunk metadata JSON for chunk {row['id']}")
+                if row['doc_metadata']:
+                    try:
+                        doc_metadata = json.loads(row['doc_metadata'])
+                    except json.JSONDecodeError:
+                        logger.warning(f"Invalid document metadata JSON for document {row['document_id']}")
+                # Convert UUID objects to strings
+                chunk_id = str(row['id']) if row['id'] else None
+                document_id = str(row['document_id']) if row['document_id'] else None
+                results.append({
+                    'chunk_id': chunk_id,
+                    'document_id': document_id,
+                    'content': row['content'],
+                    'chunk_index': row['chunk_index'],
+                    'chunk_metadata': chunk_metadata,
+                    'created_at': row['created_at'],
+                    'document_title': row['title'],
+                    'source_type': row['source_type'],
+                    'source_url': row['source_url'],
+                    'keywords': row['keywords'] or [],
+                    'document_metadata': doc_metadata,
+                    'similarity_score': float(row['similarity_score'])
+                })
+            except Exception as row_error:
+                logger.error(f"Error processing search result row: {row_error}")
+                continue
+        logger.info(f"Vector search returned {len(results)} results")
+        if results:
+            logger.info(f"Top result similarity: {results[0]['similarity_score']:.4f}")
+        return results
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Vector search failed: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")
+    finally:
+        await release_db_connection(conn)
+# Database initialization
+async def init_database():
+    """Initialize database tables"""
+    conn = await get_db_connection()
+    try:
+        logger.info("🔄 Initializing database tables...")
+        # Create documents table
+        await conn.execute("""
+            CREATE TABLE IF NOT EXISTS documents (
+                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+                title VARCHAR(500) NOT NULL,
+                source_type VARCHAR(50) NOT NULL,
+                source_url TEXT,
+                keywords TEXT[] DEFAULT '{}',
+                metadata JSONB DEFAULT '{}',
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                processing_status VARCHAR(20) DEFAULT 'processing',
+                total_chunks INTEGER DEFAULT 0
+            );
+        """)
+        # Create document_chunks table
+        await conn.execute("""
+            CREATE TABLE IF NOT EXISTS document_chunks (
+                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+                document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+                content TEXT NOT NULL,
+                chunk_index INTEGER NOT NULL,
+                embedding vector(1536),
+                metadata JSONB DEFAULT '{}',
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            );
+        """)
+        # Create indexes
+        try:
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(processing_status);
+                CREATE INDEX IF NOT EXISTS idx_chunks_document ON document_chunks(document_id);
+                CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops);
+            """)
+        except Exception as e:
+            logger.warning(f"Could not create some indexes (vector extension may not be available): {e}")
+        logger.info("✅ Database tables initialized")
+    finally:
+        await release_db_connection(conn)
+# App Lifecycle
+@app.on_event("startup")
+async def startup_event():
+    """Application startup"""
+    logger.info("🚀 Starting RAG Backend API...")
+    try:
+        # Test database connection
+        await get_db_pool()
+        logger.info("✅ Database connection established")
+        # Initialize database
+        await init_database()
+        # Test Azure OpenAI
+        try:
+            get_openai_client()
+            logger.info("✅ Azure OpenAI client configured")
+        except Exception as e:
+            logger.warning(f"⚠️  Azure OpenAI client configuration issue: {e}")
+        logger.info("🎉 RAG Backend API is ready!")
+    except Exception as e:
+        logger.error(f"❌ Startup failed: {e}")
+        raise
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Application shutdown"""
+    logger.info("🛑 Shutting down RAG Backend API...")
+    if db_pool:
+        await db_pool.close()
+        logger.info("✅ Database connections closed")
+# API Endpoints
+@app.get("/")
+async def root():
+    return {
+        "message": "RAG Backend API",
+        "version": "2.0.0",
+        "status": "running",
+        "features": {
+            "document_upload": True,
+            "url_processing": True,
+            "vector_search": True,
+            "ocr_integration": True,
+            "azure_openai_embeddings": True,
+            "postgresql_vector_storage": True
+        },
+        "configuration": {
+            "chunk_size": config.CHUNK_SIZE,
+            "chunk_overlap": config.CHUNK_OVERLAP,
+            "min_chunk_size": config.MIN_CHUNK_SIZE,
+            "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024)
+        },
+        "endpoints": {
+            "health": "/health",
+            "docs": "/docs",
+            "upload": "/documents/upload",
+            "url_process": "/documents/url",
+            "search": "/search",
+            "list_documents": "/documents"
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    health_status = {
+        "status": "unknown",
+        "service": "RAG Backend API",
+        "version": "2.0.0",
+        "timestamp": datetime.utcnow().isoformat(),
+        "database": "unknown",
+        "openai": "unknown",
+        "uuid_method": "unknown",
+        "ocr_service": "unknown",
+        "configuration": {
+            "pg_host": config.PG_HOST,
+            "pg_port": config.PG_PORT,
+            "pg_database": config.PG_DATABASE,
+            "ocr_service_url": config.OCR_SERVICE_URL,
+            "chunk_size": config.CHUNK_SIZE
+        },
+        "errors": []
+    }
+    # Test database connection
+    try:
+        test_conn = await asyncpg.connect(
+            host=config.PG_HOST,
+            port=config.PG_PORT,
+            database=config.PG_DATABASE,
+            user=config.PG_USER,
+            password=config.PG_PASSWORD,
+            ssl=config.PG_SSL_MODE,
+            timeout=10
+        )
+        db_version = await test_conn.fetchval("SELECT version()")
+        health_status["database"] = "connected"
+        health_status["database_version"] = db_version
+        # Check UUID generation method
+        uuid_method = await detect_uuid_method(test_conn)
+        health_status["uuid_method"] = uuid_method
+        await test_conn.close()
+    except Exception as db_error:
+        health_status["database"] = "failed"
+        health_status["errors"].append(f"Database connection failed: {db_error}")
+    # Test OpenAI
+    try:
+        if (config.AZURE_OPENAI_ENDPOINT == "" or
+            config.AZURE_OPENAI_API_KEY == ""):
+            health_status["openai"] = "not_configured"
+        else:
+            client = get_openai_client()
+            # Test with a simple embedding request
+            test_response = client.embeddings.create(
+                input=["Health check test"],
+                model=config.AZURE_OPENAI_DEPLOYMENT
+            )
+            if test_response.data:
+                health_status["openai"] = "configured"
+                health_status["embedding_dimensions"] = len(test_response.data[0].embedding)
+            else:
+                health_status["openai"] = "failed"
+                health_status["errors"].append("OpenAI embedding test failed")
+    except Exception as openai_error:
+        health_status["openai"] = "failed"
+        health_status["errors"].append(f"OpenAI configuration failed: {openai_error}")
+    # Test OCR service
+    try:
+        ocr_response = requests.get(f"{config.OCR_SERVICE_URL}/health", timeout=5)
+        if ocr_response.status_code == 200:
+            health_status["ocr_service"] = "available"
+        else:
+            health_status["ocr_service"] = "unavailable"
+    except:
+        health_status["ocr_service"] = "unavailable"
+    # Determine overall status
+    if health_status["database"] == "connected" and health_status["openai"] in ["configured", "not_configured"]:
+        health_status["status"] = "healthy"
+    elif health_status["database"] == "connected":
+        health_status["status"] = "degraded"
+    else:
+        health_status["status"] = "unhealthy"
+    return health_status
+@app.post("/documents/upload")
+async def upload_document(
+    file: UploadFile = File(...),
+    title: str = Form(None),
+    keywords: str = Form(None),  # JSON string of list
+    metadata: str = Form(None),  # JSON string
+    chunk_size: int = Form(None),
+    chunk_overlap: int = Form(None)
+):
+    """Upload and process a document"""
+    document_id = None
+    try:
+        # Parse form data
+        keywords_list = json.loads(keywords) if keywords else []
+        metadata_dict = json.loads(metadata) if metadata else {}
+        # Set default title
+        if not title:
+            title = file.filename or "Untitled Document"
+        # Read file content
+        logger.info(f"Processing uploaded file: {file.filename} ({file.content_type})")
+        file_bytes = await file.read()
+        if not file_bytes or len(file_bytes) == 0:
+            raise HTTPException(status_code=400, detail="Empty file uploaded")
+        if len(file_bytes) > config.MAX_FILE_SIZE:
+            raise HTTPException(status_code=400, detail="File too large")
+        # Process with OCR
+        logger.info(f"Processing document with OCR: {title}")
+        ocr_result = await process_with_ocr(file_bytes=file_bytes, filename=file.filename)
+        if not ocr_result.get('success', False):
+            error_msg = ocr_result.get('error', 'Unknown OCR error')
+            logger.error(f"OCR processing failed: {error_msg}")
+            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")
+        # Extract text content
+        content = ocr_result.get('content', '')
+        if not content or not content.strip():
+            raise HTTPException(status_code=400, detail="No text content extracted from document")
+        # Clean the text
+        cleaned_content = clean_text(content)
+        if not cleaned_content or len(cleaned_content.strip()) == 0:
+            raise HTTPException(status_code=400, detail="No text content after cleaning")
+        # Allow shorter content for testing
+        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
+            logger.warning(f"Content is short ({len(cleaned_content)} chars) but processing anyway")
+        # Create document record
+        document_id = await create_document_record(
+            title=title,
+            source_type='file_upload',
+            keywords=keywords_list,
+            metadata={
+                **metadata_dict,
+                'filename': file.filename,
+                'content_type': file.content_type,
+                'file_size': len(file_bytes),
+                'ocr_pages': len(ocr_result.get('pages', []))
+            }
+        )
+        # Create text chunks
+        chunks = create_text_chunks(
+            cleaned_content,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap
+        )
+        if not chunks:
+            raise HTTPException(status_code=400, detail="No valid chunks created from document")
+        # Process chunks and generate embeddings
+        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")
+        successful_chunks = 0
+        for i, chunk_content in enumerate(chunks):
+            try:
+                if not chunk_content or len(chunk_content.strip()) < 10:
+                    logger.warning(f"Skipping chunk {i} - too small")
+                    continue
+                # Generate embedding
+                embedding = await generate_embedding(chunk_content)
+                # Store chunk
+                await store_document_chunk(
+                    document_id=document_id,
+                    content=chunk_content,
+                    chunk_index=i,
+                    embedding=embedding,
+                    metadata={
+                        'chunk_size': len(chunk_content),
+                        'position': i
+                    }
+                )
+                successful_chunks += 1
+            except Exception as e:
+                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
+                continue
+        if successful_chunks == 0:
+            await update_document_status(document_id, "failed")
+            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")
+        # Update document status
+        await update_document_status(document_id, "completed", successful_chunks)
+        logger.info(f"Document {document_id} processed successfully with {successful_chunks} chunks")
+        return {
+            "success": True,
+            "document_id": document_id,
+            "title": title,
+            "total_chunks": successful_chunks,
+            "message": "Document processed successfully"
+        }
+    except HTTPException:
+        if document_id:
+            try:
+                await update_document_status(document_id, "failed")
+            except:
+                pass
+        raise
+    except Exception as e:
+        if document_id:
+            try:
+                await update_document_status(document_id, "failed")
+            except:
+                pass
+        logger.error(f"Unexpected error processing document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Document processing failed: {e}")
+@app.post("/documents/url")
+async def process_url(request: URLProcessRequest):
+    """Process document from URL"""
+    document_id = None
+    try:
+        url_str = str(request.url)
+        # Set default title
+        title = request.title or f"Document from {urlparse(url_str).netloc}"
+        # Process with OCR
+        logger.info(f"Processing URL with OCR: {url_str}")
+        ocr_result = await process_with_ocr(url=url_str, extract_images=request.extract_images)
+        if not ocr_result.get('success', False):
+            error_msg = ocr_result.get('error', 'Unknown OCR error')
+            logger.error(f"OCR processing failed for URL: {error_msg}")
+            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")
+        # Extract text content
+        content = ocr_result.get('content', '')
+        if not content or not content.strip():
+            raise HTTPException(status_code=400, detail="No text content extracted from URL")
+        # Clean the text
+        cleaned_content = clean_text(content)
+        if not cleaned_content or len(cleaned_content.strip()) == 0:
+            raise HTTPException(status_code=400, detail="No text content after cleaning")
+        # Allow shorter content for testing
+        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
+            logger.warning(f"URL content is short ({len(cleaned_content)} chars) but processing anyway")
+        # Create document record
+        document_id = await create_document_record(
+            title=title,
+            source_type=ocr_result.get('source_type', 'url'),
+            source_url=url_str,
+            keywords=request.keywords or [],
+            metadata={
+                **(request.metadata or {}),
+                'url': url_str,
+                'extract_images': request.extract_images,
+                'ocr_pages': len(ocr_result.get('pages', []))
+            }
+        )
+        # Create text chunks
+        chunks = create_text_chunks(
+            cleaned_content,
+            chunk_size=request.chunk_size,
+            chunk_overlap=request.chunk_overlap
+        )
+        if not chunks:
+            raise HTTPException(status_code=400, detail="No valid chunks created from URL content")
+        # Process chunks and generate embeddings
+        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")
+        successful_chunks = 0
+        for i, chunk_content in enumerate(chunks):
+            try:
+                if not chunk_content or len(chunk_content.strip()) < 10:
+                    logger.warning(f"Skipping chunk {i} - too small")
+                    continue
+                # Generate embedding
+                embedding = await generate_embedding(chunk_content)
+                # Store chunk
+                await store_document_chunk(
+                    document_id=document_id,
+                    content=chunk_content,
+                    chunk_index=i,
+                    embedding=embedding,
+                    metadata={
+                        'chunk_size': len(chunk_content),
+                        'position': i
+                    }
+                )
+                successful_chunks += 1
+            except Exception as e:
+                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
+                continue
+        if successful_chunks == 0:
+            await update_document_status(document_id, "failed")
+            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")
+        # Update document status
+        await update_document_status(document_id, "completed", successful_chunks)
+        logger.info(f"URL document {document_id} processed successfully with {successful_chunks} chunks")
+        return {
+            "success": True,
+            "document_id": document_id,
+            "title": title,
+            "total_chunks": successful_chunks,
+            "source_url": url_str,
+            "message": "URL processed successfully"
+        }
+    except HTTPException:
+        if document_id:
+            try:
+                await update_document_status(document_id, "failed")
+            except:
+                pass
+        raise
+    except Exception as e:
+        if document_id:
+            try:
+                await update_document_status(document_id, "failed")
+            except:
+                pass
+        logger.error(f"Unexpected error processing URL: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"URL processing failed: {e}")
+@app.post("/search", response_model=SearchResponse)
+async def search_documents(request: SearchRequest):
+    """Search documents using vector similarity"""
+    try:
+        import time
+        start_time = time.time()
+        # Validate input
+        if not request.query or not request.query.strip():
+            raise HTTPException(status_code=400, detail="Query cannot be empty")
+        query_text = request.query.strip()
+        logger.info(f"Performing vector search for query: '{query_text}'")
+        # Generate embedding for query
+        try:
+            query_embedding = await generate_embedding(query_text)
+        except Exception as e:
+            logger.error(f"Failed to generate query embedding: {e}")
+            raise HTTPException(status_code=500, detail=f"Query embedding generation failed: {e}")
+        # Search for similar chunks
+        try:
+            results = await search_similar_chunks(
+                query_embedding=query_embedding,
+                limit=request.limit,
+                similarity_threshold=request.similarity_threshold,
+                filter_metadata=request.filter_metadata
+            )
+        except Exception as e:
+            logger.error(f"Vector search failed: {e}")
+            raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")
+        # Format results
+        search_results = []
+        for result in results:
+            try:
+                chunk = DocumentChunk(
+                    id=result['chunk_id'],
+                    document_id=result['document_id'],
+                    content=result['content'],
+                    chunk_index=result['chunk_index'],
+                    metadata=result['chunk_metadata'],
+                    created_at=result['created_at']
+                )
+                search_results.append(SearchResult(
+                    chunk=chunk,
+                    similarity_score=result['similarity_score'],
+                    document_info={
+                        'title': result['document_title'],
+                        'source_type': result['source_type'],
+                        'source_url': result['source_url'],
+                        'keywords': result['keywords'],
+                        'metadata': result['document_metadata']
+                    }
+                ))
+            except Exception as result_error:
+                logger.error(f"Error formatting search result: {result_error}")
+                continue
+        processing_time = time.time() - start_time
+        logger.info(f"Search completed: {len(search_results)} results in {processing_time:.3f}s")
+        return SearchResponse(
+            query=request.query,
+            results=search_results,
+            total_results=len(search_results),
+            processing_time=processing_time
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Search failed with unexpected error: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Search failed: {e}")
+@app.get("/documents")
+async def list_documents(
+    limit: int = Query(10, ge=1, le=100),
+    offset: int = Query(0, ge=0),
+    status: str = Query(None)
+):
+    """List documents with pagination"""
+    conn = await get_db_connection()
+    try:
+        # Build query
+        base_query = """
+            SELECT id, title, source_type, source_url, keywords, metadata,
+                   created_at, processing_status, total_chunks
+            FROM documents
+        """
+        params = []
+        if status:
+            base_query += " WHERE processing_status = $1"
+            params.append(status)
+        base_query += " ORDER BY created_at DESC LIMIT $" + str(len(params) + 1) + " OFFSET $" + str(len(params) + 2)
+        params.extend([limit, offset])
+        rows = await conn.fetch(base_query, *params)
+        documents = []
+        for row in rows:
+            documents.append({
+                'id': str(row['id']),
+                'title': row['title'],
+                'source_type': row['source_type'],
+                'source_url': row['source_url'],
+                'keywords': row['keywords'],
+                'metadata': json.loads(row['metadata']) if row['metadata'] else {},
+                'created_at': row['created_at'].isoformat(),
+                'processing_status': row['processing_status'],
+                'total_chunks': row['total_chunks']
+            })
+        # Get total count
+        count_query = "SELECT COUNT(*) FROM documents"
+        if status:
+            count_query += " WHERE processing_status = $1"
+            total_count = await conn.fetchval(count_query, status)
+        else:
+            total_count = await conn.fetchval(count_query)
+        return {
+            "documents": documents,
+            "total": total_count,
+            "limit": limit,
+            "offset": offset
+        }
+    finally:
+        await release_db_connection(conn)
+@app.get("/documents/{document_id}")
+async def get_document(document_id: str):
+    """Get document details"""
+    conn = await get_db_connection()
+    try:
+        # Get document
+        doc_row = await conn.fetchrow("""
+            SELECT id, title, source_type, source_url, keywords, metadata,
+                   created_at, processing_status, total_chunks
+            FROM documents WHERE id = $1
+        """, document_id)
+        if not doc_row:
+            raise HTTPException(status_code=404, detail="Document not found")
+        # Get chunks
+        chunk_rows = await conn.fetch("""
+            SELECT id, content, chunk_index, metadata, created_at
+            FROM document_chunks
+            WHERE document_id = $1
+            ORDER BY chunk_index
+        """, document_id)
+        return {
+            'id': str(doc_row['id']),
+            'title': doc_row['title'],
+            'source_type': doc_row['source_type'],
+            'source_url': doc_row['source_url'],
+            'keywords': doc_row['keywords'],
+            'metadata': json.loads(doc_row['metadata']) if doc_row['metadata'] else {},
+            'created_at': doc_row['created_at'].isoformat(),
+            'processing_status': doc_row['processing_status'],
+            'total_chunks': doc_row['total_chunks'],
+            'chunks': [
+                {
+                    'id': str(chunk['id']),
+                    'content': chunk['content'],
+                    'chunk_index': chunk['chunk_index'],
+                    'metadata': json.loads(chunk['metadata']) if chunk['metadata'] else {},
+                    'created_at': chunk['created_at'].isoformat()
+                }
+                for chunk in chunk_rows
+            ]
+        }
+    finally:
+        await release_db_connection(conn)
+@app.delete("/documents/{document_id}")
+async def delete_document(document_id: str):
+    """Delete document and its chunks"""
+    conn = await get_db_connection()
+    try:
+        # Check if document exists
+        exists = await conn.fetchval("SELECT EXISTS(SELECT 1 FROM documents WHERE id = $1)", document_id)
+        if not exists:
+            raise HTTPException(status_code=404, detail="Document not found")
+        # Delete chunks first (foreign key constraint)
+        await conn.execute("DELETE FROM document_chunks WHERE document_id = $1", document_id)
+        # Delete document
+        await conn.execute("DELETE FROM documents WHERE id = $1", document_id)
+        return {"message": "Document deleted successfully"}
+    finally:
+        await release_db_connection(conn)
+if __name__ == "__main__":
+    print("🔧 Loading RAG service configuration...")
+    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
+    print(f"🗄️  Database: {config.PG_HOST}:{config.PG_PORT}/{config.PG_DATABASE}")
+    print(f"🤖 Azure OpenAI: {'✅ Configured' if config.AZURE_OPENAI_ENDPOINT else '❌ Not configured'}")
+    print(f"🔍 OCR Service: {config.OCR_SERVICE_URL}")
+    uvicorn.run(
+        "rag_service:app",
+        host=config.HOST,
+        port=config.PORT,
+        reload=config.DEBUG,
+        log_level="info"
+    )