Chirapath commited on
Commit
1628024
·
verified ·
1 Parent(s): 2c570fe

Upload 3 files

Browse files
Files changed (3) hide show
  1. service/ner_service.py +1582 -0
  2. service/ocr_service.py +588 -0
  3. service/rag_service.py +1367 -0
service/ner_service.py ADDED
@@ -0,0 +1,1582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced NER Analysis Service - Cleaned and Optimized
4
+ Advanced Named Entity Recognition with Thai language support,
5
+ relationship extraction, and graph database exports
6
+ """
7
+
8
+ import os
9
+ import io
10
+ import json
11
+ import logging
12
+ import re
13
+ import csv
14
+ import tempfile
15
+ import zipfile
16
+ from datetime import datetime
17
+ from typing import Optional, List, Dict, Any, Union, Tuple
18
+ from pathlib import Path
19
+ from contextlib import asynccontextmanager
20
+ from collections import defaultdict
21
+ import xml.etree.ElementTree as ET
22
+
23
+ import httpx
24
+ import asyncpg
25
+ from azure.storage.blob import BlobServiceClient
26
+ from azure.core.credentials import AzureKeyCredential
27
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form, BackgroundTasks
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from fastapi.responses import FileResponse
30
+ from pydantic import BaseModel, HttpUrl, field_validator
31
+ import uvicorn
32
+ import docx
33
+ from azure.ai.inference import ChatCompletionsClient
34
+ from azure.ai.inference.models import SystemMessage, UserMessage
35
+ from openai import AzureOpenAI
36
+
37
+ # Import unified configuration
38
# Prefer the project's unified configuration module. When it cannot be
# imported, fall back to settings read from environment variables.
try:
    from configs import get_config
    config = get_config().ner
    unified_config = get_config()
    print("✅ Using unified configuration")
except ImportError:
    print("⚠️ Unified config not available, using fallback configuration")

    # python-dotenv is only a convenience for loading a local .env file; the
    # fallback still works from the process environment when it is missing.
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()

    class FallbackConfig:
        """Environment-variable backed settings used without ``configs``."""

        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("NER_PORT", "8500"))
        DEBUG = os.getenv("DEBUG", "False").lower() == "true"

        # Database
        POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
        POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # APIs
        OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")
        DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
        DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
        DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")
        AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

        # Storage
        AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # Limits
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MiB, in bytes
        MAX_TEXT_LENGTH = 100000  # compared against len(text), i.e. characters

        SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}

        ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PRODUCT", "EVENT",
            "VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM",
            "WEAPON", "DRUG", "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL",
        ]

        # English relationship labels followed by their Thai counterparts.
        RELATIONSHIP_TYPES = [
            "works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เกี่ยวข้องกับ", "เป็นเจ้าของ",
            "arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of",
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "หลักฐานของ",
        ]

    config = FallbackConfig()
    # Keep the name defined on both paths so later references cannot raise
    # NameError; None signals that no unified configuration is loaded.
    unified_config = None
96
+
97
+ # Setup logging
98
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Directory for generated export files; created at import time.
EXPORT_DIR = Path("exports")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Module-level state shared by the functions below.
pg_pool = None            # asyncpg pool, assigned by init_database()
vector_available = False  # set by init_database() after probing the vector type
clients = {}              # lazily created API clients, keyed by short name
109
+
110
+ # Pydantic Models
111
# Options for analysing a single input, supplied either as raw text or a URL.
class NERRequest(BaseModel):
    text: Optional[str] = None
    url: Optional[HttpUrl] = None
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True
    generate_graph_files: bool = True
    export_formats: List[str] = ["neo4j", "json", "graphml"]

    @field_validator('text')
    @classmethod
    def validate_text_length(cls, v):
        # Reject over-long text up front; None and empty strings pass through.
        if v and len(v) > config.MAX_TEXT_LENGTH:
            raise ValueError(f"Text too long (max {config.MAX_TEXT_LENGTH} characters)")
        return v
126
+
127
# Options for analysing several texts and/or URLs in one request.
class MultiInputRequest(BaseModel):
    texts: Optional[List[str]] = None
    urls: Optional[List[HttpUrl]] = None
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True
    combine_results: bool = True
    generate_graph_files: bool = True
    export_formats: List[str] = ["neo4j", "json", "graphml"]
136
+
137
# One extracted entity: its text, label, confidence and character offsets.
class EntityResult(BaseModel):
    id: str
    text: str
    label: str
    confidence: float
    start_pos: int
    end_pos: int
    source_type: Optional[str] = None
    source_index: Optional[int] = None
    frequency: int = 1
    importance_score: float = 0.0
    metadata: Optional[Dict[str, Any]] = None
149
+
150
# One extracted relationship between two entities, identified both by entity
# id and by entity text.
class RelationshipResult(BaseModel):
    id: str
    source_entity_id: str
    target_entity_id: str
    source_entity: str
    target_entity: str
    relationship_type: str
    confidence: float
    strength: float
    context: str
    evidence_count: int = 1
    bidirectional: bool = False
    metadata: Optional[Dict[str, Any]] = None
163
+
164
# Graph node built from an entity.
class NodeResult(BaseModel):
    id: str
    label: str
    type: str
    confidence: float
    frequency: int = 1
    importance_score: float = 0.0
    properties: Dict[str, Any]
172
+
173
# Graph edge built from a relationship; source/target hold node ids.
class LinkResult(BaseModel):
    id: str
    source: str
    target: str
    relationship: str
    confidence: float
    strength: float
    evidence_count: int = 1
    properties: Dict[str, Any]
182
+
183
# Complete graph payload: nodes, links and free-form metadata.
class GraphData(BaseModel):
    nodes: List[NodeResult]
    links: List[LinkResult]
    metadata: Dict[str, Any]
187
+
188
# Optional string references to each generated export artefact; a field stays
# None when that artefact was not produced.
class ExportFiles(BaseModel):
    neo4j_nodes: Optional[str] = None
    neo4j_relationships: Optional[str] = None
    json_export: Optional[str] = None
    graphml_export: Optional[str] = None
    csv_nodes: Optional[str] = None
    csv_edges: Optional[str] = None
    gexf_export: Optional[str] = None
    analysis_report: Optional[str] = None
    download_bundle: Optional[str] = None
198
+
199
# Full result of one analysis: extracted items, graph, export references and
# text statistics.
class NERResponse(BaseModel):
    success: bool
    analysis_id: str
    source_text: str
    source_type: str
    language: str
    entities: List[EntityResult]
    keywords: List[str]
    relationships: List[RelationshipResult]
    summary: str
    embeddings: Optional[List[float]] = None
    graph_data: GraphData
    export_files: ExportFiles
    processing_time: float
    character_count: int
    word_count: int
    sentence_count: int
    entity_relationship_stats: Dict[str, Any]
    error: Optional[str] = None
218
+
219
# Result of a multi-input analysis: one combined analysis plus the analysis
# of each individual source.
class MultiNERResponse(BaseModel):
    success: bool
    analysis_id: str
    combined_analysis: NERResponse
    individual_analyses: List[NERResponse]
    processing_time: float
    total_sources: int
    error: Optional[str] = None
227
+
228
+ # Utility Functions
229
def generate_unique_id(prefix: str = "item") -> str:
    """Return an identifier shaped ``<prefix>_<epoch-ms>_<random-hex>``.

    A millisecond timestamp alone repeats when the function is called several
    times within one millisecond (for example once per entity in a loop), so
    a short random suffix is appended to keep the IDs distinct.

    Args:
        prefix: Leading label for the identifier.

    Returns:
        The generated identifier string.
    """
    # datetime.now() is naive local time, which timestamp() converts to the
    # correct epoch; a naive utcnow() would be skewed by the UTC offset.
    epoch_ms = int(datetime.now().timestamp() * 1000)
    return f"{prefix}_{epoch_ms}_{os.urandom(4).hex()}"
232
+
233
def normalize_text(text: str) -> str:
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    Used to compare strings regardless of case and spacing.
    """
    return re.sub(r'\s+', ' ', text.strip().lower())
236
+
237
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Return the Jaccard similarity of the two texts' word sets.

    Both inputs are normalised with ``normalize_text`` first. Texts that are
    equal after normalisation score 1.0; if exactly one has no words the
    score is 0.0. Words are split on whitespace, so text written without
    spaces counts as a single word.

    Returns:
        A value in the range [0.0, 1.0].
    """
    norm1 = normalize_text(text1)
    norm2 = normalize_text(text2)

    if norm1 == norm2:
        return 1.0

    words1 = set(norm1.split())
    words2 = set(norm2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union cannot be empty.
    return len(words1 & words2) / len(words1 | words2)
257
+
258
def deduplicate_entities(entities: List[Dict[str, Any]], similarity_threshold: float = 0.8) -> List[Dict[str, Any]]:
    """Drop entities whose text duplicates or closely matches an earlier one.

    Two entities are near-duplicates when ``calculate_text_similarity`` of
    their texts is at least *similarity_threshold*. Of such a pair the one
    with the higher ``confidence`` is kept; on a tie the earlier one wins.
    Entries that are not dicts, or whose ``text`` is missing, not a string or
    blank, are skipped because entity lists may come from untrusted output.

    Args:
        entities: Entity dicts. Kept dicts are mutated in place to receive an
            ``id`` when they have none.
        similarity_threshold: Minimum similarity for a near-duplicate.

    Returns:
        The surviving entity dicts.
    """
    if not entities:
        return []

    def confidence_of(item: Dict[str, Any]) -> float:
        # Tolerate missing or non-numeric confidence values.
        try:
            return float(item.get('confidence', 0) or 0)
        except (TypeError, ValueError):
            return 0.0

    deduplicated: List[Dict[str, Any]] = []
    processed_texts = set()

    for entity in entities:
        if not isinstance(entity, dict):
            continue
        raw_text = entity.get('text')
        if not isinstance(raw_text, str):
            continue

        entity_text = raw_text.strip()
        normalized_text = normalize_text(entity_text)
        if not entity_text or normalized_text in processed_texts:
            continue

        # Index of the first already-kept entity that is a near-duplicate.
        match_index = next(
            (
                index
                for index, kept in enumerate(deduplicated)
                if calculate_text_similarity(entity_text, kept.get('text', '')) >= similarity_threshold
            ),
            None,
        )
        if match_index is not None:
            if confidence_of(entity) > confidence_of(deduplicated[match_index]):
                # The newcomer is more confident: it replaces the kept one.
                del deduplicated[match_index]
            else:
                continue

        if not entity.get('id'):
            entity['id'] = generate_unique_id('ent')
        deduplicated.append(entity)
        processed_texts.add(normalized_text)

    return deduplicated
292
+
293
def detect_language(text: str) -> str:
    """Classify *text* as Thai, English or mixed by counting letters.

    Only Thai characters (U+0E01 to U+0E59) and ASCII letters are counted.
    The Thai share of those letters decides the result.

    Returns:
        ``"th"`` when more than 30% of counted letters are Thai, ``"mixed"``
        when more than 10%, otherwise ``"en"``. Empty text, or text with no
        counted letters, yields ``"en"``.
    """
    if not text:
        return "en"

    thai_chars = len(re.findall(r'[ก-๙]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    total_chars = thai_chars + english_chars

    if total_chars == 0:
        return "en"

    thai_ratio = thai_chars / total_chars

    if thai_ratio > 0.3:
        return "th"
    if thai_ratio > 0.1:
        return "mixed"
    return "en"
313
+
314
def get_text_stats(text: str) -> Dict[str, int]:
    """Return simple size statistics for *text*.

    Returns:
        A dict with ``character_count``, ``word_count`` (whitespace-separated
        tokens), ``sentence_count`` (runs of ``.``, ``!`` or ``?``),
        ``paragraph_count`` (non-blank chunks separated by a blank line) and
        ``line_count`` (newline-separated pieces, so empty text counts as 1).
    """
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]
    return {
        "character_count": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(re.findall(r'[.!?]+', text)),
        "paragraph_count": len(paragraphs),
        "line_count": len(text.split('\n')),
    }
323
+
324
+ # Client Management
325
def get_blob_client():
    """Return the cached ``BlobServiceClient``, creating it on first use.

    Returns:
        The client, or ``None`` when the storage account URL or SAS token is
        not configured or construction failed. A failure is logged and
        retried on the next call.
    """
    if clients.get('blob') is not None:
        return clients['blob']

    if config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN:
        try:
            clients['blob'] = BlobServiceClient(
                account_url=config.AZURE_STORAGE_ACCOUNT_URL,
                credential=config.AZURE_BLOB_SAS_TOKEN,
            )
        except Exception as e:
            logger.error(f"Failed to initialize blob client: {e}")
    return clients.get('blob')
335
+
336
def get_deepseek_client():
    """Return the cached DeepSeek ``ChatCompletionsClient``, creating it lazily.

    Returns:
        The client, or ``None`` when the endpoint or API key is not
        configured or construction failed. A failure is logged and retried
        on the next call.
    """
    if clients.get('deepseek') is not None:
        return clients['deepseek']

    if config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY:
        try:
            clients['deepseek'] = ChatCompletionsClient(
                endpoint=config.DEEPSEEK_ENDPOINT,
                credential=AzureKeyCredential(config.DEEPSEEK_API_KEY),
                api_version="2024-05-01-preview",
            )
        except Exception as e:
            logger.error(f"Failed to initialize DeepSeek client: {e}")
    return clients.get('deepseek')
347
+
348
def get_openai_client():
    """Return the cached ``AzureOpenAI`` client, creating it on first use.

    Returns:
        The client, or ``None`` when the endpoint or API key is not
        configured or construction failed. A failure is logged and retried
        on the next call.
    """
    if clients.get('openai') is not None:
        return clients['openai']

    if config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY:
        try:
            clients['openai'] = AzureOpenAI(
                api_version="2024-12-01-preview",
                azure_endpoint=config.AZURE_OPENAI_ENDPOINT,
                api_key=config.AZURE_OPENAI_API_KEY,
            )
        except Exception as e:
            logger.error(f"Failed to initialize OpenAI client: {e}")
    return clients.get('openai')
359
+
360
+ # Database Operations
361
async def init_database():
    """Open the PostgreSQL pool, probe the vector type and create tables.

    Assigns the module globals ``pg_pool`` and ``vector_available``.

    Returns:
        ``True`` when the pool was opened and the tables were created,
        ``False`` when any step failed (the error is logged).
    """
    global pg_pool, vector_available

    logger.info("🔄 Connecting to database...")
    try:
        pg_pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            database=config.POSTGRES_DATABASE,
            ssl='require',
            min_size=2,
            max_size=10,
            command_timeout=60,
        )

        async with pg_pool.acquire() as conn:
            logger.info("✅ Database connected")

            # The vector extension is optional: try to enable it and cast a
            # literal to confirm the type really works.
            try:
                await conn.execute("CREATE EXTENSION IF NOT EXISTS vector;")
                await conn.fetchval("SELECT '[1,2,3]'::vector(3)")
                vector_available = True
                logger.info("✅ Vector extension available")
            except Exception:
                # Deliberately not a bare except, so task cancellation and
                # KeyboardInterrupt still propagate.
                vector_available = False
                logger.info("⚠️ Vector extension not available (using JSONB)")

            await create_tables(conn)
            logger.info("✅ Database setup complete")

        return True
    except Exception as e:
        # NOTE(review): if the failure happens after the pool was created,
        # pg_pool stays assigned and open — confirm callers expect that.
        logger.error(f"❌ Database init failed: {e}")
        return False
399
+
400
async def create_tables(conn):
    """Create the analysis, entity and relationship tables if missing.

    Args:
        conn: Connection with an awaitable ``execute`` method, as yielded by
            ``pg_pool.acquire()``.

    Table creation errors propagate to the caller. Index creation is
    best-effort: a failure is logged and otherwise ignored.
    """
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS ner_analyses (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            analysis_id VARCHAR(255) UNIQUE NOT NULL,
            source_text TEXT NOT NULL,
            source_type VARCHAR(50) NOT NULL,
            language VARCHAR(10) DEFAULT 'en',
            entities JSONB NOT NULL DEFAULT '[]',
            keywords JSONB NOT NULL DEFAULT '[]',
            relationships JSONB NOT NULL DEFAULT '[]',
            summary TEXT DEFAULT '',
            embeddings JSONB DEFAULT '[]',
            graph_data JSONB DEFAULT '{}',
            export_files JSONB DEFAULT '{}',
            text_stats JSONB DEFAULT '{}',
            er_stats JSONB DEFAULT '{}',
            processing_time FLOAT DEFAULT 0,
            entity_types JSONB DEFAULT '[]',
            relationship_types JSONB DEFAULT '[]',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
    """)

    await conn.execute("""
        CREATE TABLE IF NOT EXISTS entities (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            entity_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            text VARCHAR(1000) NOT NULL,
            label VARCHAR(100) NOT NULL,
            confidence FLOAT DEFAULT 0,
            start_pos INTEGER DEFAULT 0,
            end_pos INTEGER DEFAULT 0,
            frequency INTEGER DEFAULT 1,
            importance_score FLOAT DEFAULT 0,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)

    await conn.execute("""
        CREATE TABLE IF NOT EXISTS relationships (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            relationship_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            source_entity_id VARCHAR(255) NOT NULL,
            target_entity_id VARCHAR(255) NOT NULL,
            source_entity VARCHAR(1000) NOT NULL,
            target_entity VARCHAR(1000) NOT NULL,
            relationship_type VARCHAR(200) NOT NULL,
            confidence FLOAT DEFAULT 0,
            strength FLOAT DEFAULT 0,
            context TEXT DEFAULT '',
            evidence_count INTEGER DEFAULT 1,
            bidirectional BOOLEAN DEFAULT FALSE,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)

    # Indexes only speed up lookups, so a failure here must not abort setup.
    try:
        await conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_analysis_id ON ner_analyses(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_entities_analysis ON entities(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_relationships_analysis ON relationships(analysis_id);
        """)
    except Exception as e:
        logger.warning(f"Index creation skipped: {e}")
474
+
475
+ # Text Extraction
476
def extract_text_from_file(file_content: bytes, filename: str) -> str:
    """Extract plain text from an uploaded file's bytes.

    ``.docx`` files are parsed with python-docx and their paragraphs joined
    with newlines. Every other extension, including ``.txt``, is decoded as
    UTF-8 with undecodable bytes dropped.

    Args:
        file_content: Raw file bytes.
        filename: Original file name; only its extension is used,
            case-insensitively.

    Returns:
        The extracted text.
    """
    file_ext = Path(filename).suffix.lower()

    if file_ext == '.docx':
        doc = docx.Document(io.BytesIO(file_content))
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    # NOTE(review): .doc and .rtf are not parsed; they are decoded as raw
    # UTF-8 like any other file — confirm that is acceptable.
    return file_content.decode('utf-8', errors='ignore')
486
+
487
async def get_text_from_ocr(file_content: bytes, filename: str) -> str:
    """Send a file to the OCR service and return the text it extracted.

    Posts the file as multipart form data to ``<OCR_SERVICE_URL>/ocr/upload``.

    Returns:
        The ``content`` field of the JSON response, or ``''`` if absent.

    Raises:
        HTTPException: Status 500 when the request fails or the service
            answers with a non-200 status.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            files = {'file': (filename, file_content)}
            response = await client.post(f"{config.OCR_SERVICE_URL}/ocr/upload", files=files)
            if response.status_code == 200:
                return response.json().get('content', '')
            # Record why the request is about to be reported as failed.
            logger.error(f"OCR service returned HTTP {response.status_code}")
    except Exception as e:
        logger.error(f"OCR service error: {e}")
    raise HTTPException(status_code=500, detail="OCR processing failed")
498
+
499
async def get_text_from_url(url: str) -> str:
    """Ask the OCR service to fetch *url* and return the extracted text.

    Posts ``{"url": ..., "extract_images": true}`` as JSON to
    ``<OCR_SERVICE_URL>/ocr/url``.

    Returns:
        The ``content`` field of the JSON response, or ``''`` if absent.

    Raises:
        HTTPException: Status 500 when the request fails or the service
            answers with a non-200 status.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            response = await client.post(
                f"{config.OCR_SERVICE_URL}/ocr/url",
                json={"url": str(url), "extract_images": True},
            )
            if response.status_code == 200:
                return response.json().get('content', '')
            # Record why the request is about to be reported as failed.
            logger.error(f"URL processing returned HTTP {response.status_code}")
    except Exception as e:
        logger.error(f"URL processing error: {e}")
    raise HTTPException(status_code=500, detail="URL processing failed")
510
+
511
+ # Enhanced NER and Relationship Analysis
512
# System prompt used when the text is detected as Thai.
_DEEPSEEK_SYSTEM_PROMPT_TH = """คุณเป็นผู้เชี่ยวชาญในการจดจำนามเอกลักษณ์และการสกัดความสัมพันธ์สำหรับภาษาไทย

วิเคราะห์ข้อความและสกัดข้อมูลดังนี้:
1. นามเอกลักษณ์ทุกประเภท (บุคคล องค์กร สถานที่ วันที่ เวลา เงิน ฯลฯ)
2. ความสัมพันธ์ระหว่างนามเอกลักษณ์ - ต้องสกัดทุกความสัมพันธ์ที่พบ
3. คำหลักสำคัญจากข้อความ
4. สรุปที่ครอบคลุม

ให้ผลลัพธ์เป็น JSON:
{
"entities": [{"text": "ข้อความ", "label": "ประเภท", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
"keywords": ["คำหลัก1", "คำหลัก2"],
"relationships": [{"source_entity": "A", "target_entity": "B", "relationship_type": "ประเภท", "confidence": 0.9, "context": "บริบท"}],
"summary": "สรุป"
}"""

# System prompt used for every other language.
_DEEPSEEK_SYSTEM_PROMPT_EN = """You are an expert in Named Entity Recognition and relationship extraction.

Analyze the text and extract:
1. All named entities (people, organizations, locations, dates, money, etc.)
2. ALL relationships between entities - extract every relationship found
3. Important keywords from the text
4. Comprehensive summary

Return ONLY valid JSON:
{
"entities": [{"text": "entity text", "label": "TYPE", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
"keywords": ["keyword1", "keyword2"],
"relationships": [{"source_entity": "Entity A", "target_entity": "Entity B", "relationship_type": "relationship_type", "confidence": 0.9, "context": "context"}],
"summary": "Comprehensive summary"
}"""


def _parse_llm_json(result_text: str) -> Optional[Dict[str, Any]]:
    """Parse the outermost ``{...}`` span of *result_text*; None on failure."""
    start_idx = result_text.find('{')
    end_idx = result_text.rfind('}') + 1
    if start_idx == -1 or end_idx <= start_idx:
        return None

    json_text = result_text[start_idx:end_idx]
    try:
        parsed = json.loads(json_text)
        logger.info("✅ Successfully parsed JSON from DeepSeek")
    except ValueError:
        # Second chance for Python-literal style output (single quotes,
        # capitalised booleans). A crude textual fix that can still fail.
        fixed_json = json_text.replace("'", '"').replace('True', 'true').replace('False', 'false')
        try:
            parsed = json.loads(fixed_json)
            logger.info("✅ Successfully parsed fixed JSON")
        except ValueError:
            return None

    # Only a JSON object carries the expected keys.
    return parsed if isinstance(parsed, dict) else None


async def analyze_with_deepseek(text: str, language: str = None) -> Dict[str, Any]:
    """Extract entities, keywords, relationships and a summary via DeepSeek.

    Falls back to ``extract_manual_entities_and_relationships`` when the
    client is not configured, when the reply cannot be parsed into a
    non-empty JSON object, or when any error occurs.

    Args:
        text: Text to analyse; only the first 8000 characters are sent.
        language: ``"th"`` selects the Thai prompt. Detected from *text*
            when omitted.

    Returns:
        A dict with ``entities``, ``keywords`` (at most 20),
        ``relationships`` and ``summary``.
    """
    deepseek_client = get_deepseek_client()
    if not deepseek_client:
        logger.warning("DeepSeek not configured, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)

    try:
        if not language:
            language = detect_language(text)

        if language == "th":
            system_prompt = _DEEPSEEK_SYSTEM_PROMPT_TH
            user_prompt = f"วิเคราะห์ข้อความนี้:\n\n{text[:8000]}"
        else:
            system_prompt = _DEEPSEEK_SYSTEM_PROMPT_EN
            user_prompt = f"Analyze this text:\n\n{text[:8000]}"

        # NOTE(review): this call is not awaited, so it looks synchronous and
        # would block the event loop while the model responds — confirm.
        response = deepseek_client.complete(
            messages=[
                SystemMessage(content=system_prompt),
                UserMessage(content=user_prompt),
            ],
            max_tokens=6000,
            model=config.DEEPSEEK_MODEL,
            temperature=0.1,
        )

        result_text = response.choices[0].message.content.strip()
        json_result = _parse_llm_json(result_text)

        if json_result:
            entities = deduplicate_entities(json_result.get('entities', []))
            keywords = json_result.get('keywords') or []
            summary = json_result.get('summary', '')

            # Keep only well-formed relationship objects from the reply.
            raw_relationships = json_result.get('relationships') or []
            if not isinstance(raw_relationships, list):
                raw_relationships = []
            relationships = [rel for rel in raw_relationships if isinstance(rel, dict)]

            # If the model found entities but no relationships, try the
            # rule-based extractor as a backstop.
            if len(relationships) == 0 and len(entities) >= 2:
                logger.warning("No relationships found by DeepSeek, applying rule-based extraction")
                relationships.extend(extract_rule_based_relationships(entities, text, language))

            # Fill in fields the rest of the pipeline expects on every item.
            for rel in relationships:
                if 'id' not in rel:
                    rel['id'] = generate_unique_id('rel')
                if 'strength' not in rel:
                    rel['strength'] = rel.get('confidence', 0.8)
                if 'evidence_count' not in rel:
                    rel['evidence_count'] = 1
                if 'bidirectional' not in rel:
                    rel['bidirectional'] = False

            return {
                "entities": entities,
                "keywords": keywords[:20],
                "relationships": relationships,
                "summary": summary or f"Analysis of {len(text)} characters",
            }

        logger.warning("JSON parsing failed, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)

    except Exception as e:
        logger.error(f"DeepSeek analysis error: {e}")
        return extract_manual_entities_and_relationships(text, language)
624
+
625
def extract_rule_based_relationships(entities: List[Dict], text: str, language: str) -> List[Dict]:
    """Find relationships by matching ``<source> <verb phrase> <target>`` patterns.

    Each pattern captures the text before and after a verb phrase. Both
    captures are resolved to known entities with ``find_best_entity_match``;
    a relationship is emitted only when both resolve to different entities.

    Args:
        entities: Entity dicts with a ``text`` key.
        text: Text to scan.
        language: ``"th"`` selects the Thai patterns; anything else selects
            the English ones.

    Returns:
        Relationship dicts with a fixed confidence and strength of 0.7.
        Empty when fewer than two entities are given.
    """
    relationships = []

    if len(entities) < 2:
        return relationships

    # (regex, relationship label) pairs; groups 1 and 2 are source and target.
    if language == "th":
        patterns = [
            (r'(.+?)\s*ทำงาน(?:ที่|ใน|กับ)\s*(.+)', 'ทำงานที่'),
            (r'(.+?)\s*เป็น(?:เจ้าของ|ของ)\s*(.+)', 'เป็นเจ้าของ'),
            (r'(.+?)\s*ตั้งอยู่(?:ที่|ใน)\s*(.+)', 'ตั้งอยู่ที่'),
            (r'(.+?)\s*(?:จับกุม|จับ)\s*(.+)', 'จับกุมโดย'),
        ]
    else:
        patterns = [
            (r'(.+?)\s*(?:works?\s+(?:for|at|in)|employed\s+by)\s*(.+)', 'works_for'),
            (r'(.+?)\s*(?:owns?|possesses?)\s*(.+)', 'owns'),
            (r'(.+?)\s*(?:located\s+(?:in|at)|based\s+in)\s*(.+)', 'located_in'),
            (r'(.+?)\s*(?:arrested\s+by|detained\s+by)\s*(.+)', 'arrested_by'),
        ]

    for pattern, rel_type in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE):
            source_entity = find_best_entity_match(match.group(1).strip(), entities)
            target_entity = find_best_entity_match(match.group(2).strip(), entities)

            if source_entity and target_entity and source_entity != target_entity:
                relationships.append({
                    'id': generate_unique_id('rel'),
                    'source_entity': source_entity['text'],
                    'target_entity': target_entity['text'],
                    'relationship_type': rel_type,
                    'confidence': 0.7,
                    'strength': 0.7,
                    'context': match.group(0),
                    'evidence_count': 1,
                    'bidirectional': False,
                    'metadata': {'extraction_method': 'rule_based'},
                })

    return relationships
672
+
673
def find_best_entity_match(text: str, entities: List[Dict]) -> Optional[Dict]:
    """Return the entity whose text best matches *text*, or ``None``.

    An entity whose normalised text equals the normalised input wins
    immediately. Otherwise the entity with the highest
    ``calculate_text_similarity`` score is returned, provided that score is
    above 0.6. Entities without a ``text`` key are treated as empty text.
    """
    text_norm = normalize_text(text)

    # Pass 1: exact match after normalisation.
    for entity in entities:
        if normalize_text(entity.get('text', '')) == text_norm:
            return entity

    # Pass 2: best fuzzy match above the 0.6 cut-off.
    best_match = None
    best_score = 0.6
    for entity in entities:
        score = calculate_text_similarity(text, entity.get('text', ''))
        if score > best_score:
            best_score = score
            best_match = entity

    return best_match
691
+
692
def extract_manual_entities_and_relationships(text: str, language: str = None) -> Dict[str, Any]:
    """Regex-based extraction used when the LLM path is unavailable.

    Args:
        text: Text to analyse.
        language: ``"th"`` selects Thai patterns and stop words; anything
            else selects the English ones. Detected from *text* when omitted.

    Returns:
        A dict with ``entities`` (deduplicated), ``keywords`` (at most 20,
        in order of first appearance), ``relationships`` and ``summary``.
    """
    if not language:
        language = detect_language(text)

    entities = []

    # Each label maps to (regex, flags) pairs. Flags are per pattern because
    # re.IGNORECASE would let "[A-Z]" match lower-case letters and so defeat
    # patterns that rely on capitalisation.
    if language == "th":
        flags = re.UNICODE | re.IGNORECASE
        patterns = {
            'PERSON': [(r'(?:คุณ|นาย|นาง|นางสาว|ดร\.?)\s*[ก-๙\w\s]+', flags)],
            'ORGANIZATION': [
                (r'บริษัท\s+[ก-๙\w\s]+(?:จำกัด|มหาชน)', flags),
                (r'สถานีตำรวจ[ก-๙\w\s]+', flags),
            ],
            'LOCATION': [
                (r'จังหวัด[ก-๙\w\s]+', flags),
                (r'กรุงเทพมหานคร|กรุงเทพฯ?', flags),
            ],
            'MONEY': [(r'\d+(?:,\d{3})*\s*(?:บาท|ล้านบาท|พันบาท)', flags)],
            'DATE': [(r'\d{1,2}\/\d{1,2}\/\d{4}', flags)],
        }
        words = re.findall(r'[ก-๙]+', text)
        thai_stop_words = {'และ', 'หรือ', 'แต่', 'ใน', 'ที่', 'เพื่อ', 'กับ', 'จาก', 'โดย', 'ของ'}
        keywords = [word for word in words if word not in thai_stop_words and len(word) > 2]
    else:
        case_sensitive = re.UNICODE
        case_insensitive = re.UNICODE | re.IGNORECASE
        patterns = {
            'PERSON': [
                (r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*', case_sensitive),
            ],
            'ORGANIZATION': [
                (r'\b[A-Z][a-zA-Z]+\s+(?:Inc|Corp|Company|Ltd|Co|LLC|Corporation|Limited|University)\b', case_sensitive),
            ],
            'LOCATION': [
                (r'\b(?:New York|Los Angeles|Chicago|Bangkok|London|Paris|Berlin)\b', case_insensitive),
            ],
            'MONEY': [
                (r'\$[\d,]+\.?\d*', case_insensitive),
                (r'\b\d+(?:,\d{3})*\s*(?:dollars?|USD|million|billion)\b', case_insensitive),
            ],
            'DATE': [(r'\b\d{1,2}\/\d{1,2}\/\d{4}\b', case_insensitive)],
        }
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
        english_stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        keywords = [word.lower() for word in words if word.lower() not in english_stop_words]

    # Extract entities
    for label, pattern_list in patterns.items():
        for pattern, pattern_flags in pattern_list:
            for match in re.finditer(pattern, text, pattern_flags):
                entity_text = match.group().strip()
                if len(entity_text) > 1:
                    entities.append({
                        "id": generate_unique_id('ent'),
                        "text": entity_text,
                        "label": label,
                        "confidence": 0.8,
                        "start_pos": match.start(),
                        "end_pos": match.end(),
                        "frequency": 1,
                        "importance_score": 0.7,
                        "metadata": {"source": "manual_extraction"},
                    })

    entities = deduplicate_entities(entities)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # 20 keywords kept are the same on every run.
    keywords = list(dict.fromkeys(keywords))[:20]

    # Extract relationships
    relationships = []
    if len(entities) >= 2:
        relationships = extract_rule_based_relationships(entities, text, language)

    summary = f"Analysis of {len(text)} characters found {len(entities)} entities and {len(relationships)} relationships"

    return {
        "entities": entities,
        "keywords": keywords,
        "relationships": relationships,
        "summary": summary,
    }
759
+
760
async def generate_embeddings(text: str) -> List[float]:
    """Return a 1536-dimension embedding for the first 8000 characters of *text*.

    Returns:
        The embedding vector, or ``[]`` when the OpenAI client is not
        configured, *text* is empty or blank, or the request fails (the
        failure is logged).
    """
    openai_client = get_openai_client()
    if not openai_client:
        return []

    # Nothing to embed; skip a request that could not produce a useful vector.
    if not text or not text.strip():
        return []

    try:
        response = openai_client.embeddings.create(
            input=[text[:8000]],
            model=config.EMBEDDING_MODEL,
            dimensions=1536,
        )
        return response.data[0].embedding
    except Exception as e:
        logger.error(f"Embedding failed: {e}")
        return []
775
+
776
def create_enhanced_graph_data(entities: List[Dict], relationships: List[Dict]) -> GraphData:
    """Assemble the graph model (nodes, links, summary metadata) for an analysis.

    Each entity becomes a node. A relationship becomes a link only when both
    its ``source_entity`` and ``target_entity`` texts match a known entity.
    The metadata counts cover all inputs, including relationships that did
    not produce a link.
    """
    graph_nodes = []
    text_to_id = {}

    # Nodes: one per entity, remembered by text so links can be resolved below.
    for ent in entities:
        ent_id = ent.get('id', generate_unique_id('ent'))
        ent_text = ent['text']
        text_to_id[ent_text] = ent_id

        ent_label = ent['label']
        ent_confidence = ent.get('confidence', 0.0)
        ent_frequency = ent.get('frequency', 1)
        ent_importance = ent.get('importance_score', 0.0)

        graph_nodes.append(NodeResult(
            id=ent_id,
            label=ent_text,
            type=ent_label,
            confidence=ent_confidence,
            frequency=ent_frequency,
            importance_score=ent_importance,
            properties={
                "original_text": ent_text,
                "entity_type": ent_label,
                "confidence": ent_confidence,
                "start_position": ent.get('start_pos', 0),
                "end_position": ent.get('end_pos', 0),
                "frequency": ent_frequency,
                "importance_score": ent_importance,
                "metadata": ent.get('metadata', {}),
            },
        ))

    # Links: skip relationships whose endpoints are not both known entities.
    graph_links = []
    for relation in relationships:
        src_id = text_to_id.get(relation['source_entity'])
        dst_id = text_to_id.get(relation['target_entity'])
        if not (src_id and dst_id):
            continue

        edge_id = relation.get('id', generate_unique_id('link'))
        rel_type = relation['relationship_type']
        rel_confidence = relation.get('confidence', 0.0)
        # Strength falls back to the confidence when not given explicitly.
        rel_strength = relation.get('strength', relation.get('confidence', 0.0))
        rel_evidence = relation.get('evidence_count', 1)

        graph_links.append(LinkResult(
            id=edge_id,
            source=src_id,
            target=dst_id,
            relationship=rel_type,
            confidence=rel_confidence,
            strength=rel_strength,
            evidence_count=rel_evidence,
            properties={
                "relationship_type": rel_type,
                "confidence": rel_confidence,
                "strength": rel_strength,
                "context": relation.get('context', ''),
                "evidence_count": rel_evidence,
                "bidirectional": relation.get('bidirectional', False),
                "metadata": relation.get('metadata', {}),
            },
        ))

    # Summary metadata.
    label_counts = defaultdict(int)
    for ent in entities:
        label_counts[ent['label']] += 1

    rel_type_counts = defaultdict(int)
    for relation in relationships:
        rel_type_counts[relation['relationship_type']] += 1

    n_entities = len(entities)
    n_relationships = len(relationships)

    if n_entities > 1:
        # Relationships divided by the number of unordered entity pairs.
        density = n_relationships / (n_entities * (n_entities - 1) / 2)
    else:
        density = 0

    if entities:
        avg_entity_conf = sum(ent.get('confidence', 0) for ent in entities) / n_entities
    else:
        avg_entity_conf = 0

    if relationships:
        avg_rel_conf = sum(relation.get('confidence', 0) for relation in relationships) / n_relationships
    else:
        avg_rel_conf = 0

    summary = {
        "total_entities": n_entities,
        "total_relationships": n_relationships,
        "entity_type_distribution": dict(label_counts),
        "relationship_type_distribution": dict(rel_type_counts),
        "graph_density": density,
        "average_entity_confidence": avg_entity_conf,
        "average_relationship_confidence": avg_rel_conf,
        "unique_entity_types": len(label_counts),
        "unique_relationship_types": len(rel_type_counts),
    }

    return GraphData(nodes=graph_nodes, links=graph_links, metadata=summary)
864
+
865
+ # Export Functions (simplified)
866
async def generate_export_files(analysis_id: str, entities: List[Dict], relationships: List[Dict],
                                graph_data: GraphData, formats: List[str]) -> ExportFiles:
    """Write the requested export files for one analysis and report their paths.

    Files are written under ``EXPORT_DIR / analysis_id``. Recognised entries
    in *formats* are ``"neo4j"``, ``"json"`` and ``"graphml"``; anything else
    is ignored. Generation is best-effort: a failure is logged and the
    ``ExportFiles`` populated so far is returned.
    """
    export_files = ExportFiles()
    analysis_dir = EXPORT_DIR / analysis_id
    # parents=True: do not depend on the startup hook having created EXPORT_DIR.
    analysis_dir.mkdir(parents=True, exist_ok=True)

    # Treat a missing format list as "nothing requested" instead of failing
    # on the membership tests below.
    requested = formats or []

    try:
        if "neo4j" in requested:
            nodes_file, rels_file = await generate_neo4j_csv(analysis_dir, entities, relationships)
            export_files.neo4j_nodes = str(nodes_file)
            export_files.neo4j_relationships = str(rels_file)

        if "json" in requested:
            json_file = await generate_json_export(analysis_dir, entities, relationships, graph_data)
            export_files.json_export = str(json_file)

        if "graphml" in requested:
            graphml_file = await generate_graphml_export(analysis_dir, entities, relationships)
            export_files.graphml_export = str(graphml_file)

        logger.info(f"βœ… Generated export files for analysis {analysis_id}")

    except Exception as e:
        logger.error(f"❌ Export file generation failed: {e}")

    return export_files
894
+
895
async def generate_neo4j_csv(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Tuple[Path, Path]:
    """Write the Neo4j node and relationship CSV files into *export_dir*.

    Creates ``neo4j_nodes.csv`` (one row per entity) and
    ``neo4j_relationships.csv`` (one row per relationship whose source and
    target texts both match an entity).

    Returns:
        ``(nodes_file, relationships_file)`` paths.
    """
    # Resolve every entity's ID exactly once. Generating the fallback ID
    # separately for the node rows and for the relationship lookup would give
    # an entity without an 'id' two different IDs, leaving the relationship
    # rows pointing at nodes that do not exist in the nodes file.
    node_ids = [
        entity['id'] if 'id' in entity else generate_unique_id('ent')
        for entity in entities
    ]
    entity_map = {
        entity['text']: node_id
        for entity, node_id in zip(entities, node_ids)
    }

    nodes_file = export_dir / "neo4j_nodes.csv"
    with open(nodes_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'nodeId:ID', 'text', 'label:LABEL', 'confidence:float',
            'frequency:int', 'importance:float'
        ])
        for entity, node_id in zip(entities, node_ids):
            writer.writerow([
                node_id,
                entity['text'],
                entity['label'],
                entity.get('confidence', 0.0),
                entity.get('frequency', 1),
                entity.get('importance_score', 0.0)
            ])

    rels_file = export_dir / "neo4j_relationships.csv"
    with open(rels_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            ':START_ID', ':END_ID', ':TYPE', 'confidence:float',
            'strength:float', 'context'
        ])
        for rel in relationships:
            source_id = entity_map.get(rel['source_entity'])
            target_id = entity_map.get(rel['target_entity'])
            # Drop relationships that reference text not present as an entity.
            if not (source_id and target_id):
                continue
            writer.writerow([
                source_id,
                target_id,
                rel['relationship_type'].upper().replace(' ', '_'),
                rel.get('confidence', 0.0),
                rel.get('strength', rel.get('confidence', 0.0)),
                rel.get('context', '')
            ])

    return nodes_file, rels_file
941
+
942
async def generate_json_export(export_dir: Path, entities: List[Dict], relationships: List[Dict], graph_data: GraphData) -> Path:
    """Dump the full analysis to ``analysis_export.json`` inside *export_dir*.

    The document holds export metadata, the raw entities and relationships,
    the graph model (via ``graph_data.dict()``) and a small statistics
    section. Returns the path of the written file.
    """
    target = export_dir / "analysis_export.json"

    header = {
        "export_timestamp": datetime.utcnow().isoformat(),
        "format_version": "1.0",
        "total_entities": len(entities),
        "total_relationships": len(relationships),
    }
    graph_dump = graph_data.dict()

    if entities:
        mean_confidence = sum(item.get('confidence', 0) for item in entities) / len(entities)
    else:
        mean_confidence = 0

    stats = {
        "entity_types": list({item['label'] for item in entities}),
        "relationship_types": list({item['relationship_type'] for item in relationships}),
        "average_confidence": mean_confidence,
    }

    payload = {
        "metadata": header,
        "entities": entities,
        "relationships": relationships,
        "graph_data": graph_dump,
        "statistics": stats,
    }

    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    return target
968
+
969
async def generate_graphml_export(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Path:
    """Write the entity graph as GraphML to ``graph_export.graphml``.

    Nodes carry ``label`` and ``type`` data. Edges carry ``rel_type`` and are
    emitted only when both endpoint texts match an entity. Returns the path
    of the written file.
    """
    output_path = export_dir / "graph_export.graphml"

    document = ET.Element('graphml')
    document.set('xmlns', 'http://graphml.graphdrawing.org/xmlns')

    # Attribute declarations: (key id, element kind it applies to, attribute name).
    key_specs = (
        ('label', 'node', 'label'),
        ('type', 'node', 'type'),
        ('rel_type', 'edge', 'relationship'),
    )
    for key_id, applies_to, attr_name in key_specs:
        ET.SubElement(document, 'key', {
            'id': key_id,
            'for': applies_to,
            'attr.name': attr_name,
            'attr.type': 'string',
        })

    graph_el = ET.SubElement(document, 'graph', id='G', edgedefault='directed')

    # Nodes, remembering text -> id so edges can be resolved afterwards.
    ids_by_text = {}
    for ent in entities:
        ent_id = ent.get('id', generate_unique_id('ent'))
        ids_by_text[ent['text']] = ent_id

        node_el = ET.SubElement(graph_el, 'node', id=ent_id)
        ET.SubElement(node_el, 'data', key='label').text = ent['text']
        ET.SubElement(node_el, 'data', key='type').text = ent['label']

    # Edges; the edge id keeps the relationship's position in the input list.
    for index, relation in enumerate(relationships):
        src = ids_by_text.get(relation['source_entity'])
        dst = ids_by_text.get(relation['target_entity'])
        if not (src and dst):
            continue

        edge_el = ET.SubElement(graph_el, 'edge', id=f"e{index}", source=src, target=dst)
        ET.SubElement(edge_el, 'data', key='rel_type').text = relation['relationship_type']

    ET.ElementTree(document).write(output_path, encoding='utf-8', xml_declaration=True)

    return output_path
1015
+
1016
def calculate_er_stats(entities: List[Dict], relationships: List[Dict]) -> Dict[str, Any]:
    """Summarise entity and relationship counts for an analysis.

    Returns an empty dict when there are no entities. Otherwise returns the
    totals, per-type distributions, the number of distinct types, and the
    graph density (relationships divided by the number of unordered entity
    pairs; 0 when there is a single entity).
    """
    entity_total = len(entities)
    if entity_total == 0:
        return {}

    per_label = defaultdict(int)
    for item in entities:
        per_label[item['label']] += 1

    per_rel_type = defaultdict(int)
    for item in relationships:
        per_rel_type[item['relationship_type']] += 1

    relationship_total = len(relationships)

    if entity_total > 1:
        density = relationship_total / (entity_total * (entity_total - 1) / 2)
    else:
        density = 0

    return {
        "total_entities": entity_total,
        "total_relationships": relationship_total,
        "entity_type_distribution": dict(per_label),
        "relationship_type_distribution": dict(per_rel_type),
        "graph_density": density,
        "unique_entity_types": len(per_label),
        "unique_relationship_types": len(per_rel_type),
    }
1040
+
1041
async def save_to_database(data: Dict[str, Any]) -> bool:
    """Insert (or partially update) one analysis row in ``ner_analyses``.

    Args:
        data: The analysis payload. ``analysis_id``, ``source_text``,
            ``source_type``, ``language``, ``entities``, ``keywords``,
            ``relationships`` and ``summary`` are required; the remaining
            fields are optional.

    Returns:
        True when the row was written, False when no pool is available or the
        write failed (the error is logged, not raised).
    """
    if not pg_pool:
        logger.error("No database pool available")
        return False

    def _jsonable(obj):
        """json.dumps fallback: expand pydantic-style models, stringify the rest."""
        # graph_data / export_files arrive as model objects. Plain
        # ``default=str`` would store their repr as one JSON string instead
        # of a JSON object, so expand them to dicts first.
        for attr in ('model_dump', 'dict'):
            dump = getattr(obj, attr, None)
            if callable(dump):
                return dump()
        return str(obj)

    try:
        async with pg_pool.acquire() as conn:
            # NOTE(review): on conflict only entities/relationships/summary
            # are refreshed; the other columns keep the earlier values.
            await conn.execute("""
                INSERT INTO ner_analyses (
                    analysis_id, source_text, source_type, language, entities, keywords,
                    relationships, summary, embeddings, graph_data, export_files, text_stats,
                    er_stats, processing_time, entity_types, relationship_types
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
                ON CONFLICT (analysis_id) DO UPDATE SET
                    entities = EXCLUDED.entities,
                    relationships = EXCLUDED.relationships,
                    summary = EXCLUDED.summary
                """,
                data['analysis_id'],
                data['source_text'][:10000],  # stored text is capped at 10k characters
                data['source_type'],
                data['language'],
                json.dumps(data['entities'], ensure_ascii=False),
                json.dumps(data['keywords'], ensure_ascii=False),
                json.dumps(data['relationships'], ensure_ascii=False),
                data['summary'],
                json.dumps(data.get('embeddings', [])),
                json.dumps(data.get('graph_data', {}), ensure_ascii=False, default=_jsonable),
                json.dumps(data.get('export_files', {}), ensure_ascii=False, default=_jsonable),
                json.dumps(data.get('text_stats', {})),
                json.dumps(data.get('er_stats', {})),
                float(data.get('processing_time', 0)),
                json.dumps(list(set(entity.get('label', '') for entity in data.get('entities', [])))),
                json.dumps(list(set(rel.get('relationship_type', '') for rel in data.get('relationships', []))))
            )

        logger.info(f"βœ… Analysis {data['analysis_id']} saved to database")
        return True
    except Exception as e:
        logger.error(f"❌ DB save failed for {data.get('analysis_id', 'unknown')}: {e}")
        return False
1082
+
1083
async def save_to_blob(analysis_id: str, data: Dict[str, Any]) -> bool:
    """Upload the analysis payload as a timestamped JSON blob.

    The blob is named ``ner_analysis/<analysis_id>_<UTC timestamp>.json`` in
    ``config.BLOB_CONTAINER``. Returns True on success, False when no blob
    client is configured or the upload fails (the error is logged).
    """
    blob_client = get_blob_client()
    if not blob_client:
        return False

    def _jsonable(obj):
        """json.dumps fallback: expand pydantic-style models, stringify the rest."""
        # Without this, model objects in the payload (graph_data,
        # export_files) would be uploaded as repr strings, not JSON objects.
        for attr in ('model_dump', 'dict'):
            dump = getattr(obj, attr, None)
            if callable(dump):
                return dump()
        return str(obj)

    try:
        blob_name = f"ner_analysis/{analysis_id}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.json"
        blob_client_obj = blob_client.get_blob_client(container=config.BLOB_CONTAINER, blob=blob_name)
        body = json.dumps(data, indent=2, ensure_ascii=False, default=_jsonable)
        blob_client_obj.upload_blob(body, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Blob save failed: {e}")
        return False
1096
+
1097
+ # App Lifecycle
1098
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: set up shared resources, then release them.

    Startup initialises the database (aborting with ``RuntimeError`` when
    that fails), calls each API-client getter once, and ensures the export
    directory exists. Shutdown closes the database pool if one is set.
    """
    logger.info("πŸš€ Starting Enhanced NER Analysis Service...")

    logger.info("πŸ”„ Database initialization...")
    if not await init_database():
        logger.error("❌ Database initialization failed!")
        raise RuntimeError("Database initialization failed")

    logger.info("πŸ”„ Initializing API clients...")
    for client_getter in (get_deepseek_client, get_openai_client, get_blob_client):
        client_getter()

    logger.info("πŸ”„ Creating export directories...")
    EXPORT_DIR.mkdir(exist_ok=True)

    logger.info("πŸŽ‰ Enhanced NER Analysis Service is ready!")
    logger.info(f"πŸ“‘ Server running on http://{config.HOST}:{config.PORT}")

    yield

    logger.info("πŸ›‘ Shutting down...")
    if pg_pool:
        await pg_pool.close()
        logger.info("βœ… Database connections closed")
1125
+
1126
+ # FastAPI App
1127
# Application instance; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(
    title="Enhanced NER Analysis Service",
    description="Advanced Named Entity Recognition with relationship extraction and graph exports",
    version="2.0.0",
    lifespan=lifespan
)

# NOTE(review): allow_origins=["*"] together with allow_credentials=True lets
# pages from any origin make credentialed requests to this API. Confirm this
# is intended outside local development, or list the trusted origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
1141
+
1142
+ # API Endpoints
1143
@app.get("/")
async def root():
    """Describe the service: version, supported types, export formats, features.

    The optional features (embeddings, DeepSeek analysis, blob storage) are
    reported as available when both of their configuration values are set.
    """
    feature_flags = {
        "ner_analysis": True,
        "relationship_extraction": True,
        "thai_language_support": True,
        "graph_database_export": True,
        "embedding_generation": bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY),
        "deepseek_analysis": bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY),
        "blob_storage": bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN),
    }

    return {
        "message": "Enhanced NER Analysis Service",
        "version": "2.0.0",
        "status": "operational",
        "supported_entities": config.ENTITY_TYPES,
        # Only the first ten relationship types are listed here.
        "supported_relationships": config.RELATIONSHIP_TYPES[:10],
        "export_formats": ["neo4j", "json", "graphml"],
        "features": feature_flags,
    }
1166
+
1167
@app.get("/health")
async def health():
    """Report service health and which backing integrations are configured."""
    has_deepseek = bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY)
    has_openai = bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY)
    has_blob = bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN)

    report = {
        "status": "healthy",
        "service": "NER Analysis Service",
        "version": "2.0.0",
    }
    # "database" reflects only whether a pool object exists.
    report["database"] = pg_pool is not None
    report["vector_extension"] = vector_available
    report["deepseek"] = has_deepseek
    report["openai"] = has_openai
    report["blob_storage"] = has_blob
    report["supported_entity_count"] = len(config.ENTITY_TYPES)
    report["supported_relationship_count"] = len(config.RELATIONSHIP_TYPES)
    report["export_formats"] = ["neo4j", "json", "graphml"]
    return report
1186
+
1187
def _new_analysis_id(prefix: str, start_time: datetime) -> str:
    """Return ``<prefix>_<epoch seconds>_<8 hex chars>``.

    The random suffix keeps IDs distinct when several requests start within
    the same second. Without it, such requests share an export directory and
    a database key.
    """
    import uuid

    return f"{prefix}_{int(start_time.timestamp())}_{uuid.uuid4().hex[:8]}"


async def _run_analysis_pipeline(analysis_id: str, text: str, source_type: str, start_time: datetime,
                                 include_embeddings: bool, generate_graph_files: bool,
                                 export_formats: List[str]) -> Dict[str, Any]:
    """Run language detection, NER, embeddings, graph building and exports.

    Returns the payload dict that is both persisted in the background and
    unpacked into the success ``NERResponse``.
    """
    language = detect_language(text)
    text_stats = get_text_stats(text)

    analysis_result = await analyze_with_deepseek(text, language)
    entities = analysis_result.get('entities', [])
    relationships = analysis_result.get('relationships', [])

    embeddings = []
    if include_embeddings:
        embeddings = await generate_embeddings(text)

    graph_data = create_enhanced_graph_data(entities, relationships)
    er_stats = calculate_er_stats(entities, relationships)

    export_files = ExportFiles()
    if generate_graph_files:
        export_files = await generate_export_files(
            analysis_id, entities, relationships, graph_data, export_formats
        )

    processing_time = (datetime.utcnow() - start_time).total_seconds()

    return {
        "analysis_id": analysis_id,
        "source_text": text,
        "source_type": source_type,
        "language": language,
        "entities": entities,
        "keywords": analysis_result.get('keywords', []),
        "relationships": relationships,
        "summary": analysis_result.get('summary', ''),
        "embeddings": embeddings,
        "graph_data": graph_data,
        "export_files": export_files,
        "text_stats": text_stats,
        "er_stats": er_stats,
        "processing_time": processing_time,
        "character_count": text_stats["character_count"],
        "word_count": text_stats["word_count"],
        "sentence_count": text_stats["sentence_count"]
    }


def _success_response(response_data: Dict[str, Any], background_tasks: Optional[BackgroundTasks]) -> NERResponse:
    """Schedule background persistence (when possible) and build the success response."""
    if background_tasks is not None:
        background_tasks.add_task(save_to_database, response_data)
        background_tasks.add_task(save_to_blob, response_data["analysis_id"], response_data)

    return NERResponse(
        success=True,
        entity_relationship_stats=response_data["er_stats"],
        **response_data
    )


def _failure_response(analysis_id: str, source_type: str, start_time: datetime,
                      error: Exception, source_text: str = "") -> NERResponse:
    """Build the ``success=False`` response returned when analysis raises."""
    return NERResponse(
        success=False,
        analysis_id=analysis_id,
        source_text=source_text,
        source_type=source_type,
        language="unknown",
        entities=[],
        keywords=[],
        relationships=[],
        summary="",
        graph_data=GraphData(nodes=[], links=[], metadata={}),
        export_files=ExportFiles(),
        processing_time=(datetime.utcnow() - start_time).total_seconds(),
        character_count=0,
        word_count=0,
        sentence_count=0,
        entity_relationship_stats={},
        error=str(error)
    )


@app.post("/analyze/text", response_model=NERResponse)
async def analyze_text(request: NERRequest, background_tasks: BackgroundTasks):
    """Analyze text for entities and relationships.

    Raises HTTP 400 when the text is missing or blank. Any other failure is
    logged and reported as a ``success=False`` response carrying the first
    1000 characters of the input.
    """
    start_time = datetime.utcnow()
    analysis_id = _new_analysis_id("text", start_time)

    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    try:
        response_data = await _run_analysis_pipeline(
            analysis_id,
            request.text,
            "text_input",
            start_time,
            request.include_embeddings,
            request.generate_graph_files,
            request.export_formats,
        )
        return _success_response(response_data, background_tasks)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Text analysis failed: {e}")
        return _failure_response(analysis_id, "text_input", start_time, e,
                                 source_text=request.text[:1000])


@app.post("/analyze/file", response_model=NERResponse)
async def analyze_file(
    file: UploadFile = File(...),
    extract_relationships: bool = Form(True),
    include_embeddings: bool = Form(True),
    include_summary: bool = Form(True),
    generate_graph_files: bool = Form(True),
    export_formats: str = Form("neo4j,json"),
    background_tasks: BackgroundTasks = None
):
    """Analyze uploaded file for entities and relationships.

    Text formats are decoded directly and OCR formats go through the OCR
    path. Raises HTTP 400 for a missing filename, an oversized file, an
    unsupported extension, or when no text could be extracted.

    ``export_formats`` is a comma-separated list. Entries are trimmed and
    lower-cased, so ``"neo4j, json"`` selects both formats.
    """
    start_time = datetime.utcnow()
    analysis_id = _new_analysis_id("file", start_time)

    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename")

    try:
        file_content = await file.read()
        if len(file_content) > config.MAX_FILE_SIZE:
            raise HTTPException(status_code=400, detail="File too large")

        file_ext = Path(file.filename).suffix.lower()

        # Normalise the form value; fall back to JSON when nothing usable remains.
        export_format_list = [
            fmt.strip().lower() for fmt in (export_formats or "").split(',') if fmt.strip()
        ] or ["json"]

        if file_ext in config.SUPPORTED_TEXT_FORMATS:
            text = extract_text_from_file(file_content, file.filename)
            source_type = "text_file"
        elif file_ext in config.SUPPORTED_OCR_FORMATS:
            text = await get_text_from_ocr(file_content, file.filename)
            source_type = "ocr_file"
        else:
            raise HTTPException(status_code=400, detail=f"Unsupported format: {file_ext}")

        if not text.strip():
            raise HTTPException(status_code=400, detail="No text extracted")

        response_data = await _run_analysis_pipeline(
            analysis_id,
            text,
            source_type,
            start_time,
            include_embeddings,
            generate_graph_files,
            export_format_list,
        )
        return _success_response(response_data, background_tasks)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"File analysis failed: {e}")
        return _failure_response(analysis_id, "file_input", start_time, e)


@app.post("/analyze/url", response_model=NERResponse)
async def analyze_url(request: NERRequest, background_tasks: BackgroundTasks):
    """Analyze URL content for entities and relationships.

    Raises HTTP 400 when no URL is given or no text could be extracted from
    it. Any other failure is logged and reported as ``success=False``.
    """
    start_time = datetime.utcnow()
    analysis_id = _new_analysis_id("url", start_time)

    if not request.url:
        raise HTTPException(status_code=400, detail="URL is required")

    try:
        text = await get_text_from_url(str(request.url))

        if not text.strip():
            raise HTTPException(status_code=400, detail="No text extracted from URL")

        response_data = await _run_analysis_pipeline(
            analysis_id,
            text,
            "url_content",
            start_time,
            request.include_embeddings,
            request.generate_graph_files,
            request.export_formats,
        )
        return _success_response(response_data, background_tasks)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"URL analysis failed: {e}")
        return _failure_response(analysis_id, "url_content", start_time, e)
1519
+
1520
@app.get("/download/{analysis_id}/{file_type}")
async def download_export_file(analysis_id: str, file_type: str):
    """Download specific export file for an analysis.

    ``file_type`` must be one of ``neo4j_nodes``, ``neo4j_relationships``,
    ``json`` or ``graphml``.

    Raises:
        HTTPException 404: unknown analysis, or the file was not generated.
        HTTPException 400: unknown ``file_type``.
        HTTPException 500: any other failure while preparing the download.
    """
    try:
        export_root = EXPORT_DIR.resolve()
        analysis_dir = (export_root / analysis_id).resolve()

        # analysis_id comes straight from the URL. Only a direct child
        # directory of the export root is accepted, so values such as ".."
        # cannot reach files outside it.
        if analysis_dir.parent != export_root or not analysis_dir.is_dir():
            raise HTTPException(status_code=404, detail=f"Analysis {analysis_id} not found")

        file_mapping = {
            "neo4j_nodes": "neo4j_nodes.csv",
            "neo4j_relationships": "neo4j_relationships.csv",
            "json": "analysis_export.json",
            "graphml": "graph_export.graphml"
        }

        if file_type not in file_mapping:
            raise HTTPException(status_code=400, detail=f"Invalid file type: {file_type}")

        file_path = analysis_dir / file_mapping[file_type]

        if not file_path.exists():
            raise HTTPException(status_code=404, detail=f"File {file_type} not found")

        return FileResponse(path=file_path, filename=file_mapping[file_type])

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Download failed for {analysis_id}/{file_type}: {e}")
        raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
1551
+
1552
@app.get("/entity-types")
async def get_entity_types():
    """List every entity type the service is configured with, plus the count."""
    known_types = config.ENTITY_TYPES
    payload = {"success": True}
    payload["entity_types"] = known_types
    payload["total_count"] = len(known_types)
    return payload
1560
+
1561
@app.get("/relationship-types")
async def get_relationship_types():
    """List every relationship type the service is configured with, plus the count."""
    known_types = config.RELATIONSHIP_TYPES
    payload = {"success": True}
    payload["relationship_types"] = known_types
    payload["total_count"] = len(known_types)
    return payload
1569
+
1570
# Script entry point: print a short configuration banner, then serve the app.
if __name__ == "__main__":
    print("πŸ”§ Loading enhanced NER configuration...")
    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
    print(f"🏷️ Enhanced with {len(config.ENTITY_TYPES)} entity types")
    print(f"πŸ”— Enhanced with {len(config.RELATIONSHIP_TYPES)} relationship types")

    # The app is passed as an import string ("module:attribute"), not as the
    # object itself. NOTE(review): this presumably is what lets `reload` work
    # — confirm against the uvicorn docs.
    uvicorn.run(
        "ner_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )
service/ocr_service.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCR Backend API with Azure Document Intelligence - Cleaned and Optimized
4
+ Supports file uploads, URL processing, and web scraping fallback
5
+ """
6
+
7
+ import os
8
+ import io
9
+ import requests
10
+ import numpy as np
11
+ import logging
12
+ from typing import Optional, List, Dict, Any
13
+ from urllib.parse import urlparse, urljoin
14
+ from pathlib import Path
15
+ import mimetypes
16
+
17
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel, HttpUrl
20
+ import uvicorn
21
+
22
+ # Import unified configuration
23
# Prefer the project's unified configuration; when the `configs` module cannot
# be imported, fall back to settings read from the environment (and .env).
try:
    from configs import get_config
    config = get_config().ocr
    print("βœ… Using unified configuration")
except ImportError:
    print("⚠️ Unified config not available, using fallback configuration")
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        # Environment-driven settings used when the unified config is missing.
        # NOTE(review): DEBUG defaults to "True" and HOST to 0.0.0.0 when the
        # variables are unset — confirm these defaults are wanted in production.
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("OCR_PORT", "8400"))
        DEBUG = os.getenv("DEBUG", "True").lower() == "true"

        # Azure Document Intelligence configuration
        AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web scraping configuration
        MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE", "10"))
        REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
        USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        # File size limits
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB

    config = FallbackConfig()
50
+
51
+ from azure.core.credentials import AzureKeyCredential
52
+ from azure.ai.documentintelligence import DocumentIntelligenceClient
53
+ from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
54
+ from azure.core.exceptions import HttpResponseError
55
+
56
+ from bs4 import BeautifulSoup
57
+ from PIL import Image
58
+
59
+ # Configure logging
60
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app (debug mode follows the loaded configuration)
app = FastAPI(
    title="OCR Backend API",
    description="OCR service with Azure Document Intelligence, supporting file uploads, URLs, and web scraping",
    version="2.0.0",
    debug=config.DEBUG,
)

# CORS configuration: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
79
+
80
+ # Pydantic models
81
class URLRequest(BaseModel):
    # JSON body of POST /ocr/url.
    url: HttpUrl                 # page or file to process
    extract_images: bool = True  # when scraping a page, also OCR its <img> tags
84
+
85
class OCRResponse(BaseModel):
    # Standard response envelope returned by the OCR endpoints.
    success: bool
    content: str                     # full extracted text
    pages: List[Dict[str, Any]]      # per-page details (layout depends on source_type)
    source_type: str  # 'file_upload', 'direct_url', 'web_scraped'
    source_url: Optional[str] = None
    # format_ocr_result() reports whether handwriting was seen; without this
    # field the value was silently dropped when building OCRResponse(**result).
    # None means "not evaluated" (e.g. for web-scraped responses).
    handwritten_detected: Optional[bool] = None
    error: Optional[str] = None
92
+
93
class WebScrapingResult(BaseModel):
    # Outcome of scraping one web page.
    text_content: str                  # visible text extracted from the HTML
    images_found: List[str]            # absolute URLs of the <img> tags considered
    ocr_results: List[Dict[str, Any]]  # one entry per image that yielded OCR text
97
+
98
+ # Utility functions
99
def format_bounding_box(bounding_box):
    """Render a flat coordinate sequence as "[x, y]" pairs joined by ", ".

    Returns "N/A" when no coordinates are supplied (None or empty).
    """
    if not bounding_box:
        return "N/A"
    # Group consecutive values into (x, y) rows.
    points = np.array(bounding_box).reshape(-1, 2)
    return ", ".join(f"[{x}, {y}]" for x, y in points)
105
+
106
def is_supported_file_type(content_type: str, filename: str = "") -> bool:
    """Check if the file type is supported for OCR.

    Args:
        content_type: MIME type, e.g. from an upload or a Content-Type header.
            Parameters such as "; charset=binary" and letter case are ignored.
            May be empty or None.
        filename: Optional file name used as a fallback when the MIME type
            does not match.

    Returns:
        True when either the MIME type or the file extension is supported.
    """
    supported_types = {
        'application/pdf',
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/tiff',
        'image/bmp',
        'image/gif'
    }

    if content_type:
        # HTTP headers may carry parameters ("image/png; charset=binary");
        # only the bare media type is relevant for the comparison.
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type in supported_types:
            return True

    # Check by file extension if content type is unclear
    if filename:
        supported_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp', '.gif'}
        return Path(filename).suffix.lower() in supported_extensions

    return False
128
+
129
def get_document_intelligence_client():
    """Build an Azure Document Intelligence client from the loaded config.

    Raises:
        HTTPException: 500 when the endpoint or key is empty or still set to
            its placeholder value.
    """
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    endpoint_unset = endpoint in ("", "YOUR_FORM_RECOGNIZER_ENDPOINT")
    key_unset = key in ("", "YOUR_FORM_RECOGNIZER_KEY")
    if endpoint_unset or key_unset:
        raise HTTPException(
            status_code=500,
            detail="Azure Document Intelligence credentials not configured"
        )

    return DocumentIntelligenceClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key)
    )
144
+
145
async def process_ocr_from_url(url: str) -> Dict[str, Any]:
    """Process OCR from a direct URL.

    Runs the "prebuilt-read" model against the document at `url` and returns
    the dictionary produced by format_ocr_result().

    Raises:
        HTTPException: 400 when Azure rejects the request, 500 for missing
            credentials or any other failure.
    """
    try:
        client = get_document_intelligence_client()

        logger.info(f"Processing OCR from URL: {url}")
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(url_source=url)
        )
        # NOTE(review): poller.result() blocks inside an async function -
        # consider off-loading it to a worker thread.
        result = poller.result()

        return format_ocr_result(result, "direct_url", url)

    except HTTPException:
        # Already a well-formed API error (e.g. credentials not configured);
        # do not re-wrap it as "Unexpected error".
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
165
+
166
async def process_ocr_from_bytes(file_bytes: bytes, filename: str = "") -> Dict[str, Any]:
    """Process OCR from file bytes.

    Runs the "prebuilt-read" model on the raw document bytes and returns the
    dictionary produced by format_ocr_result(). `filename` is used for
    logging and passed on as the source identifier.

    Raises:
        HTTPException: 400 when Azure rejects the document, 500 for missing
            credentials or any other failure.
    """
    try:
        client = get_document_intelligence_client()

        logger.info(f"Processing OCR from file: {filename} ({len(file_bytes)} bytes)")
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(bytes_source=file_bytes)
        )
        # NOTE(review): poller.result() blocks inside an async function -
        # consider off-loading it to a worker thread.
        result = poller.result()

        return format_ocr_result(result, "file_upload", filename)

    except HTTPException:
        # Already a well-formed API error (e.g. credentials not configured);
        # do not re-wrap it as "Unexpected error".
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for file {filename}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing file {filename}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
186
+
187
def format_ocr_result(result, source_type: str, source_identifier: str = "") -> Dict[str, Any]:
    """Convert an analysis result into the service's response dictionary.

    Args:
        result: Object exposing `pages` and optionally `content` / `styles`.
        source_type: Label copied into the response ("direct_url", ...).
        source_identifier: Reported as `source_url` only for "direct_url".

    Returns:
        Dict with success flag, full text, per-page lines/words, source
        information, a handwriting flag and `error` set to None.
    """
    pages_data = []

    for page in result.pages:
        # Lines carry their index on the page and a printable polygon.
        line_entries = [
            {
                "line_number": index,
                "content": line.content,
                "bounding_box": format_bounding_box(line.polygon) if hasattr(line, 'polygon') else "N/A"
            }
            for index, line in enumerate(getattr(page, 'lines', None) or [])
        ]

        # Words keep only their text and (when present) confidence.
        word_entries = [
            {
                "content": word.content,
                "confidence": getattr(word, 'confidence', None)
            }
            for word in (getattr(page, 'words', None) or [])
        ]

        pages_data.append({
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": line_entries,
            "words": word_entries
        })

    # Any style flagged as handwritten marks the whole document.
    styles = getattr(result, 'styles', None) or []
    handwritten_detected = any(getattr(style, 'is_handwritten', False) for style in styles)

    return {
        "success": True,
        "content": getattr(result, 'content', ""),
        "pages": pages_data,
        "source_type": source_type,
        "source_url": source_identifier if source_type == "direct_url" else None,
        "handwritten_detected": handwritten_detected,
        "error": None
    }
237
+
238
async def scrape_web_content(url: str, extract_images: bool = True) -> WebScrapingResult:
    """Scrape web content and extract text and images.

    Downloads the page, extracts its visible text and, when `extract_images`
    is set, runs OCR on up to config.MAX_IMAGES_PER_PAGE <img> sources whose
    Content-Type is a supported image/PDF type.

    Raises:
        HTTPException: 400 when the page cannot be fetched, 500 otherwise.
            Failures on individual images are logged and skipped.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        logger.info(f"Scraping web content from: {url}")
        response = requests.get(url, headers=headers, timeout=config.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text content
        text_content = soup.get_text(separator=' ', strip=True)

        images_found = []
        ocr_results = []

        if extract_images:
            for img in soup.find_all('img')[:config.MAX_IMAGES_PER_PAGE]:
                img_src = img.get('src')
                if not img_src:
                    continue

                # Make absolute URL
                img_url = urljoin(url, img_src)
                images_found.append(img_url)

                # Try to process image with OCR
                try:
                    # Follow redirects (as check_url_is_direct_file does) so the
                    # Content-Type describes the image, not the redirect response.
                    img_response = requests.head(
                        img_url, headers=headers, timeout=10, allow_redirects=True
                    )
                    content_type = img_response.headers.get('content-type', '')
                    # Drop header parameters such as "; charset=binary".
                    media_type = content_type.split(';', 1)[0].strip()

                    if is_supported_file_type(media_type):
                        ocr_result = await process_ocr_from_url(img_url)
                        if ocr_result['content'].strip():  # Only add if there's actual text
                            ocr_results.append({
                                "image_url": img_url,
                                "ocr_content": ocr_result['content'],
                                "pages": ocr_result['pages']
                            })
                except Exception as e:
                    # Best effort: one bad image must not fail the whole page.
                    logger.warning(f"Failed to process image {img_url}: {e}")
                    continue

        return WebScrapingResult(
            text_content=text_content,
            images_found=images_found,
            ocr_results=ocr_results
        )

    except requests.RequestException as e:
        logger.error(f"Failed to scrape URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to scrape URL: {e}")
    except Exception as e:
        logger.error(f"Unexpected error scraping URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error during web scraping: {e}")
298
+
299
def check_url_is_direct_file(url: str) -> tuple[bool, str]:
    """Check if URL points directly to a file.

    Issues a HEAD request (following redirects) and decides from the
    Content-Type, the Content-Disposition filename, or the URL path whether
    the target is a supported document/image.

    Returns:
        (is_file, content_type) where content_type is the lower-cased header
        value; (False, "") when the probe fails for any reason.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        content_type = response.headers.get('content-type', '').lower()

        # Check content disposition for filename
        content_disposition = response.headers.get('content-disposition', '')
        filename = ""
        if 'filename=' in content_disposition:
            # Keep only the filename value: stop at the next ";" parameter and
            # remove surrounding whitespace/quotes.
            raw_value = content_disposition.split('filename=', 1)[1]
            filename = raw_value.split(';', 1)[0].strip().strip('"\'')

        # Parse URL for filename
        if not filename:
            filename = Path(urlparse(url).path).name

        # Compare the bare media type; parameters like "; charset=utf-8"
        # would otherwise defeat the match.
        media_type = content_type.split(';', 1)[0].strip()
        is_file = is_supported_file_type(media_type, filename)
        return is_file, content_type

    except Exception as e:
        # Probe failures are not fatal: the caller falls back to scraping.
        logger.warning(f"Failed to check URL {url}: {e}")
        return False, ""
326
+
327
+ # API Endpoints
328
@app.get("/")
async def root():
    # Service banner: feature flags plus the limits currently in force.
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    # Credentials count as available only when set and not left at placeholders.
    azure_di_available = bool(
        endpoint
        and key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    features = {
        "file_upload": True,
        "url_processing": True,
        "web_scraping": True,
        "azure_document_intelligence": azure_di_available,
        "supported_formats": ["PDF", "JPEG", "PNG", "TIFF", "BMP", "GIF"]
    }
    limits = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout_seconds": config.REQUEST_TIMEOUT
    }

    return {
        "message": "OCR Backend API",
        "version": "2.0.0",
        "status": "operational",
        "features": features,
        "limits": limits
    }
354
+
355
@app.get("/health")
async def health_check():
    # Liveness report including whether an Azure DI client can be constructed.
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    azure_di_available = bool(
        endpoint
        and key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    # Test Azure DI client initialization if configured (no request is sent).
    if not azure_di_available:
        azure_di_status = "not_configured"
    else:
        try:
            get_document_intelligence_client()
            azure_di_status = "configured"
        except Exception as e:
            # Keep the reported message short.
            azure_di_status = f"error: {str(e)[:100]}"

    configuration = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "endpoint_configured": bool(endpoint),
        "key_configured": bool(key)
    }

    return {
        "status": "healthy",
        "service": "OCR Backend API",
        "version": "2.0.0",
        "azure_document_intelligence": azure_di_status,
        "configuration": configuration
    }
387
+
388
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload_file(file: UploadFile = File(...)):
    """Upload a file for OCR processing"""

    # Reject unsupported uploads before reading the payload.
    type_ok = is_supported_file_type(file.content_type, file.filename)
    if not type_ok:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file.content_type}. Supported types: PDF, JPEG, PNG, TIFF, BMP, GIF"
        )

    try:
        payload = await file.read()

        # Enforce the configured size limit on the bytes actually received.
        if len(payload) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        ocr_result = await process_ocr_from_bytes(payload, file.filename)
        return OCRResponse(**ocr_result)

    except HTTPException:
        # Deliberate API errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Unexpected error processing uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
420
+
421
@app.post("/ocr/url", response_model=OCRResponse)
async def ocr_from_url(request: URLRequest):
    """Process OCR from URL - either direct file or web scraping"""

    url_str = str(request.url)

    # A HEAD probe decides between direct OCR and page scraping.
    is_direct_file, _ = check_url_is_direct_file(url_str)

    if is_direct_file:
        try:
            direct_result = await process_ocr_from_url(url_str)
            return OCRResponse(**direct_result)
        except HTTPException:
            # NOTE(review): process_ocr_from_url reports its failures as
            # HTTPException, so the scraping fallback below is rarely reached.
            raise
        except Exception as e:
            logger.error(f"Failed to process direct file URL: {e}")
            # Fall back to web scraping

    # Web scraping approach
    try:
        scraping_result = await scrape_web_content(url_str, request.extract_images)

        # Page text first, then an appendix with the text found in images.
        combined_content = scraping_result.text_content

        if scraping_result.ocr_results:
            sections = ["\n\n--- OCR from Images ---\n"]
            for item in scraping_result.ocr_results:
                sections.append(f"\nImage: {item['image_url']}\n")
                sections.append(item['ocr_content'] + "\n")
            combined_content += "".join(sections)

        # A scraped page is reported as a single synthetic "page".
        summary_page = {
            "page_number": 1,
            "content_type": "web_scraped",
            "text_content": scraping_result.text_content,
            "images_found": len(scraping_result.images_found),
            "ocr_results": len(scraping_result.ocr_results)
        }

        return OCRResponse(
            success=True,
            content=combined_content,
            pages=[summary_page],
            source_type="web_scraped",
            source_url=url_str,
            error=None
        )

    except HTTPException:
        raise
    except Exception as e:
        # Unexpected failures are reported in-band rather than as an HTTP error.
        logger.error(f"Failed to process URL {url_str}: {e}")
        return OCRResponse(
            success=False,
            content="",
            pages=[],
            source_type="web_scraped",
            source_url=url_str,
            error=str(e)
        )
486
+
487
@app.post("/ocr/analyze")
async def analyze_document(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    extract_images: bool = Form(True)
):
    """Unified endpoint for document analysis - accepts either file upload or URL"""

    # Exactly one of the two inputs must be supplied.
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")

    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    try:
        if not file:
            # URL branch: reuse the /ocr/url handler and return its fields as a dict.
            url_request = URLRequest(url=url, extract_images=extract_images)
            response = await ocr_from_url(url_request)
            return response.dict()

        # Upload branch: same validation as /ocr/upload.
        if not is_supported_file_type(file.content_type, file.filename):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file.content_type}"
            )

        payload = await file.read()

        if len(payload) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        # The raw result dict is returned as-is (no response model here).
        return await process_ocr_from_bytes(payload, file.filename)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error in analyze_document: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
533
+
534
+ # Additional utility endpoints
535
@app.get("/supported-formats")
async def get_supported_formats():
    """Get list of supported file formats"""
    # Static catalogue plus the two limits taken from the live configuration.
    formats = {
        "documents": ["PDF"],
        "images": ["JPEG", "JPG", "PNG", "TIFF", "TIF", "BMP", "GIF"]
    }
    mime_types = [
        "application/pdf",
        "image/jpeg",
        "image/jpg",
        "image/png",
        "image/tiff",
        "image/bmp",
        "image/gif"
    ]
    return {
        "supported_formats": formats,
        "content_types": mime_types,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE
    }
555
+
556
@app.get("/config")
async def get_configuration():
    """Get current service configuration (for debugging)"""
    # Credentials are reported only as a boolean, never echoed back.
    azure_di_configured = bool(
        config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
        config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    )
    settings = {
        "host": config.HOST,
        "port": config.PORT,
        "debug": config.DEBUG,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "azure_di_configured": azure_di_configured
    }
    return {
        "service": "OCR Backend API",
        "version": "2.0.0",
        "configuration": settings
    }
575
+
576
if __name__ == "__main__":
    # Startup banner summarising the effective configuration.
    print("🔧 Loading OCR service configuration...")
    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
    print(f"📄 Azure Document Intelligence: {'✅ Configured' if config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT else '❌ Not configured'}")
    print(f"📊 Max file size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB")

    # The app is passed as an import string so that reload (enabled in debug
    # mode) can re-import the module.
    uvicorn.run(
        "ocr_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )
service/rag_service.py ADDED
@@ -0,0 +1,1367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ RAG (Retrieval-Augmented Generation) Backend API - Cleaned and Optimized
4
+ Integrates OCR, Azure OpenAI embeddings, and PostgreSQL vector storage
5
+ """
6
+
7
+ import os
8
+ import uuid
9
+ import asyncio
10
+ import requests
11
+ import json
12
+ import tempfile
13
+ import traceback
14
+ import logging
15
+ from typing import Optional, List, Dict, Any, Union
16
+ from datetime import datetime
17
+
18
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Query, Depends
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from pydantic import BaseModel, HttpUrl
21
+ import uvicorn
22
+
23
+ # Import unified configuration
24
try:
    from configs import get_config
    config = get_config().rag
    unified_config = get_config()
    print("✅ Using unified configuration")
except ImportError:
    # The shared `configs` package could not be imported: build an equivalent
    # settings object from environment variables (optionally loaded from .env).
    # NOTE(review): `unified_config` is only defined on the success path above.
    print("⚠️ Unified config not available, using fallback configuration")
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        """Environment-driven settings used when the unified config is unavailable."""

        # Server binding and debug switch
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("RAG_PORT", "8401"))
        DEBUG = os.getenv("DEBUG", "True").lower() == "true"

        # OCR Service Configuration
        OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration
        PG_HOST = os.getenv("POSTGRES_HOST", "")
        PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
        PG_DATABASE = os.getenv("PG_DATABASE", "vectorsearch")
        PG_USER = os.getenv("POSTGRES_USER", "")
        PG_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        PG_SSL_MODE = os.getenv("PG_SSL_MODE", "require")

        # Azure OpenAI Configuration
        AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

        # Chunking Configuration (sizes are in characters)
        CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
        CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
        MIN_CHUNK_SIZE = int(os.getenv("MIN_CHUNK_SIZE", "50"))

        # Processing limits
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
        REQUEST_TIMEOUT = 300

    config = FallbackConfig()
66
+
67
+ import asyncpg
68
+ import numpy as np
69
+ from openai import AzureOpenAI
70
+ import re
71
+ from pathlib import Path
72
+ from urllib.parse import urlparse
73
+
74
+ # Configure logging
75
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize FastAPI app (debug mode follows the loaded configuration)
app = FastAPI(
    title="RAG Backend API",
    description="Retrieval-Augmented Generation service with OCR, embeddings, and vector search",
    version="2.0.0",
    debug=config.DEBUG,
)

# CORS configuration: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
97
+
98
+ # Pydantic Models
99
class DocumentUploadRequest(BaseModel):
    # Optional descriptive fields and chunking overrides for an upload.
    title: Optional[str] = None
    keywords: Optional[List[str]] = None
    metadata: Optional[Dict[str, Any]] = None
    chunk_size: Optional[int] = None     # None presumably selects the configured default - confirm at call site
    chunk_overlap: Optional[int] = None  # None presumably selects the configured default - confirm at call site
105
+
106
class URLProcessRequest(BaseModel):
    # Request to ingest a document or web page addressed by URL.
    url: HttpUrl
    title: Optional[str] = None
    keywords: Optional[List[str]] = None
    metadata: Optional[Dict[str, Any]] = None
    extract_images: bool = True          # defaults to True
    chunk_size: Optional[int] = None
    chunk_overlap: Optional[int] = None
114
+
115
class SearchRequest(BaseModel):
    # Parameters of a search request.
    query: str
    limit: int = 10                      # defaults to 10
    similarity_threshold: float = 0.2    # defaults to 0.2
    filter_metadata: Optional[Dict[str, Any]] = None
120
+
121
class DocumentChunk(BaseModel):
    # One stored piece of a document's text.
    id: str
    document_id: str                         # id of the owning document
    content: str
    chunk_index: int                         # index of the chunk within its document
    embedding: Optional[List[float]] = None  # optional; None when not supplied
    metadata: Dict[str, Any]
    created_at: datetime
129
+
130
class DocumentResponse(BaseModel):
    # Summary of an ingested document as returned by the API.
    id: str
    title: str
    source_type: str
    source_url: Optional[str]
    total_chunks: int
    keywords: List[str]
    metadata: Dict[str, Any]
    created_at: datetime
    processing_status: str
140
+
141
class SearchResult(BaseModel):
    # A single search hit: the chunk, its score and its parent document.
    chunk: DocumentChunk
    similarity_score: float
    document_info: Dict[str, Any]
145
+
146
class SearchResponse(BaseModel):
    # Envelope for search results.
    query: str                   # the query string
    results: List[SearchResult]
    total_results: int
    processing_time: float       # unit not stated here - TODO confirm (seconds?)
151
+
152
+ # Database connection pool
153
# Process-wide asyncpg pool; created lazily by get_db_pool().
db_pool = None

# Memoised answer of detect_uuid_method(); None until the first probe has run.
_uuid_method = None
157
+
158
async def detect_uuid_method(conn) -> str:
    """Probe the database once for a UUID generator and remember the answer.

    Tries, in order, the built-in gen_random_uuid(), the uuid-ossp extension,
    and finally falls back to generating UUIDs in Python.

    Returns:
        "built-in", "uuid-ossp" or "python".
    """
    global _uuid_method

    # Already probed during this process lifetime.
    if _uuid_method is not None:
        return _uuid_method

    # 1) Built-in gen_random_uuid() (PostgreSQL 13+)
    try:
        await conn.fetchval("SELECT gen_random_uuid()")
        _uuid_method = "built-in"
        logger.info("Using built-in gen_random_uuid() for UUID generation")
        return _uuid_method
    except Exception:
        # Not available - try the next option.
        pass

    # 2) uuid-ossp extension
    try:
        await conn.execute("CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"")
        await conn.fetchval("SELECT uuid_generate_v4()")
        _uuid_method = "uuid-ossp"
        logger.info("Using uuid-ossp extension for UUID generation")
        return _uuid_method
    except Exception as e:
        message = str(e)
        not_allowlisted = "not allow-listed" in message or "not allowlisted" in message.lower()
        if not_allowlisted:
            logger.info("uuid-ossp extension not allowlisted (normal for Azure PostgreSQL)")
        else:
            logger.warning(f"uuid-ossp extension not available: {e}")

    # 3) Fall back to Python UUID generation
    _uuid_method = "python"
    logger.info("Using Python-generated UUIDs")
    return _uuid_method
191
+
192
async def get_db_pool():
    """Return the shared asyncpg pool, creating it on first use.

    Raises:
        Exception: whatever asyncpg.create_pool raised (logged, then re-raised).
    """
    global db_pool

    # Fast path: the pool already exists.
    if db_pool is not None:
        return db_pool

    try:
        logger.info(f"Creating database pool with host: {config.PG_HOST}:{config.PG_PORT}")
        db_pool = await asyncpg.create_pool(
            host=config.PG_HOST,
            port=config.PG_PORT,
            database=config.PG_DATABASE,
            user=config.PG_USER,
            password=config.PG_PASSWORD,
            ssl=config.PG_SSL_MODE,
            min_size=1,
            max_size=10,
            command_timeout=60
        )
    except Exception as e:
        logger.error(f"Failed to create database pool: {e}")
        raise

    return db_pool
213
+
214
async def get_db_connection():
    """Acquire a connection from the shared pool.

    The caller is responsible for handing it back via release_db_connection().
    """
    return await (await get_db_pool()).acquire()
218
+
219
async def release_db_connection(connection):
    """Hand a previously acquired connection back to the shared pool."""
    await (await get_db_pool()).release(connection)
223
+
224
+ # Azure OpenAI Client
225
def get_openai_client():
    """Build an Azure OpenAI client from the loaded config.

    Raises:
        HTTPException: 500 when the endpoint or API key is empty or still set
            to its placeholder value.
    """
    endpoint = config.AZURE_OPENAI_ENDPOINT
    api_key = config.AZURE_OPENAI_API_KEY

    endpoint_unset = endpoint in ("", "YOUR_AZURE_OPENAI_ENDPOINT")
    key_unset = api_key in ("", "YOUR_AZURE_OPENAI_KEY")
    if endpoint_unset or key_unset:
        raise HTTPException(
            status_code=500,
            detail="Azure OpenAI credentials not configured"
        )

    return AzureOpenAI(
        api_version=config.AZURE_OPENAI_API_VERSION,
        azure_endpoint=endpoint,
        api_key=api_key
    )
241
+
242
+ # Text Processing Functions
243
def clean_text(text: str) -> str:
    """Normalise text: collapse whitespace runs and drop unusual symbols.

    Word characters, whitespace and the punctuation . , ! ? ; : - ( ) are
    kept; everything else is removed. Leading/trailing blanks are stripped.
    """
    # Every run of whitespace (including newlines/tabs) becomes one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation.
    filtered = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', collapsed)
    return filtered.strip()
250
+
251
def embedding_to_vector_string(embedding: List[float]) -> str:
    """Serialise an embedding as a bracketed, comma-separated vector literal.

    Each value is converted with float() and the values are joined without
    spaces, e.g. '[1.0,2.0,3.0]'.

    Raises:
        ValueError: if the embedding is empty or None.
    """
    if not embedding:
        raise ValueError("Embedding cannot be empty")

    components = ','.join(str(float(value)) for value in embedding)
    return f"[{components}]"
259
+
260
def create_text_chunks(text: str, chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
    """Split text into overlapping chunks.

    Chunks are at most `chunk_size` characters long and, where possible, end
    on a sentence boundary ('. ', '! ', '? ' or a blank line). Consecutive
    chunks overlap by `chunk_overlap` characters. Chunks shorter than
    config.MIN_CHUNK_SIZE (after stripping) are dropped.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length; defaults to config.CHUNK_SIZE.
        chunk_overlap: Overlap between chunks; defaults to config.CHUNK_OVERLAP.

    Returns:
        List of chunk strings; `[text]` when the text fits in one chunk.

    Raises:
        ValueError: if the text needs splitting but chunk_size is not positive.
    """
    if chunk_size is None:
        chunk_size = config.CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = config.CHUNK_OVERLAP

    if len(text) <= chunk_size:
        return [text]

    if chunk_size <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("chunk_size must be a positive integer")

    sentence_endings = ('. ', '! ', '? ', '\n\n')
    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = start + chunk_size

        # Try to break at sentence boundary
        if end < text_length:
            for ending in sentence_endings:
                last_ending = text.rfind(ending, start, end)
                if last_ending == -1:
                    continue
                candidate_end = last_ending + len(ending)
                # Only accept a boundary that still moves the window forward
                # once the overlap is subtracted; a boundary too close to
                # `start` previously caused an endless loop (or, via a negative
                # start index, skipped text).
                if candidate_end - chunk_overlap > start:
                    end = candidate_end
                    break

        chunk = text[start:end].strip()
        if len(chunk) >= config.MIN_CHUNK_SIZE:
            chunks.append(chunk)

        # The window reached the end of the text: a further iteration would
        # only repeat the tail of the chunk just emitted.
        if end >= text_length:
            break

        # Calculate next start position with overlap, guaranteeing progress
        # even when chunk_overlap >= chunk_size.
        next_start = end - chunk_overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks
296
+
297
async def generate_embedding(text: str) -> List[float]:
    """Generate embedding using Azure OpenAI.

    Text longer than 8000 characters is truncated before being sent.

    Args:
        text: Non-empty text to embed.

    Returns:
        The embedding vector returned for the text.

    Raises:
        HTTPException: 500 when credentials are missing, the text is empty,
            or the embedding request fails or returns no data.
    """
    try:
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Truncate text if it's too long
        if len(text) > 8000:
            text = text[:8000]
            logger.warning("Truncated text for embedding generation")

        client = get_openai_client()

        # NOTE(review): this is a synchronous client call inside an async
        # function - consider off-loading it to a worker thread.
        response = client.embeddings.create(
            input=[text.strip()],
            model=config.AZURE_OPENAI_DEPLOYMENT
        )

        if not response.data or len(response.data) == 0:
            raise ValueError("No embedding data returned from Azure OpenAI")

        embedding = response.data[0].embedding

        if not embedding or len(embedding) == 0:
            raise ValueError("Empty embedding returned from Azure OpenAI")

        logger.debug(f"Generated embedding with {len(embedding)} dimensions")
        return embedding

    except HTTPException:
        # Already a well-formed API error (e.g. credentials not configured);
        # do not re-wrap it with a second "Embedding generation failed" prefix.
        raise
    except Exception as e:
        logger.error(f"Failed to generate embedding: {e}")
        logger.error(f"Text length: {len(text) if text else 0}")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {e}") from e
330
+
331
+ # OCR Integration
332
async def process_with_ocr(file_bytes: bytes = None, url: str = None, extract_images: bool = True, filename: str = None) -> Dict[str, Any]:
    """Extract text from a document via the OCR service.

    Plain-text uploads (``.txt``, ``.md``, ``.rst``, ``.log``) that decode as
    UTF-8 are returned directly without contacting the OCR service. Every
    other upload, and every URL, is posted to ``{OCR_SERVICE_URL}/ocr/analyze``.

    Args:
        file_bytes: Raw content of an uploaded file. Takes precedence over
            ``url`` when both are given.
        url: Location of a document the OCR service should fetch itself.
        extract_images: Forwarded to the OCR service as a lowercase string.
        filename: Original file name; used to detect plain-text files and as
            the multipart file name.

    Returns:
        The OCR service's JSON response, or an equivalent dictionary built
        locally for plain-text files.

    Raises:
        HTTPException: Status 500 when neither input is given, the OCR
            service cannot be reached, or it answers with a non-200 status.
    """
    try:
        logger.info(f"Processing with OCR service at {config.OCR_SERVICE_URL}")
        analyze_endpoint = f"{config.OCR_SERVICE_URL}/ocr/analyze"

        if file_bytes:
            # Check if it's a plain text file
            text_extensions = ('.txt', '.md', '.rst', '.log')
            is_text_file = bool(filename) and filename.lower().endswith(text_extensions)

            # For plain text files, bypass OCR
            if is_text_file:
                try:
                    content = file_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    logger.warning(f"Failed to decode {filename} as UTF-8, sending to OCR service")
                else:
                    logger.info(f"Processing plain text file directly: {filename}")

                    if len(content.strip()) < config.MIN_CHUNK_SIZE:
                        logger.info(f"Text file {filename} is short ({len(content)} chars) but will process anyway")

                    return {
                        'success': True,
                        'content': content,
                        'pages': [{
                            'page_number': 1,
                            'content_type': 'text',
                            'text_content': content,
                            'source': 'direct_text',
                            'character_count': len(content)
                        }],
                        'source_type': 'text_file',
                        'source_url': None,
                        'error': None
                    }

            # Use OCR service
            logger.info(f"Uploading file for OCR processing ({len(file_bytes)} bytes)")

            # The bytes are already in memory, so they are sent as the
            # multipart payload directly instead of via a temporary file.
            response = requests.post(
                analyze_endpoint,
                files={
                    'file': (filename or 'document.pdf', file_bytes, 'application/octet-stream')
                },
                data={
                    'extract_images': str(extract_images).lower()
                },
                timeout=config.REQUEST_TIMEOUT
            )

        elif url:
            # Process URL with OCR service
            logger.info(f"Processing URL for OCR: {url}")

            response = requests.post(
                analyze_endpoint,
                data={
                    'url': url,
                    'extract_images': str(extract_images).lower()
                },
                timeout=config.REQUEST_TIMEOUT
            )
        else:
            raise ValueError("Either file_bytes or url must be provided")

        # Check response
        logger.info(f"OCR service response status: {response.status_code}")

        if response.status_code != 200:
            logger.error(f"OCR service error: {response.status_code} - {response.text}")
            raise HTTPException(
                status_code=500,
                detail=f"OCR processing failed: {response.status_code} {response.reason}"
            )

        result = response.json()
        logger.info(f"OCR processing completed successfully. Success: {result.get('success', False)}")

        return result

    except HTTPException:
        # Already carries the intended status and detail; do not re-wrap it.
        raise
    except requests.RequestException as e:
        logger.error(f"OCR service request error: {e}")
        raise HTTPException(status_code=500, detail=f"OCR service connection failed: {e}")
    except Exception as e:
        logger.error(f"OCR processing error: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"OCR processing failed: {e}")
438
+
439
+ # UUID Generation Helper
440
async def generate_uuid(conn) -> str:
    """Return a new UUID string, generated by the database when possible.

    The method reported by ``detect_uuid_method`` selects the SQL function to
    call; any other method, or any error, falls back to Python's
    ``uuid.uuid4()``.
    """
    sql_by_method = {
        "built-in": "SELECT gen_random_uuid()",
        "uuid-ossp": "SELECT uuid_generate_v4()",
    }
    try:
        method = await detect_uuid_method(conn)
        statement = sql_by_method.get(method)
        if statement is None:
            return str(uuid.uuid4())

        generated = await conn.fetchval(statement)
        return str(generated)

    except Exception as e:
        logger.warning(f"Database UUID generation failed, using Python fallback: {e}")
        return str(uuid.uuid4())
457
+
458
+ # Database Operations
459
async def create_document_record(
    title: str,
    source_type: str,
    source_url: str = None,
    keywords: List[str] = None,
    metadata: Dict[str, Any] = None
) -> str:
    """Insert a new row into ``documents`` and return its id.

    The row is created with ``processing_status`` set to ``"processing"``;
    missing keywords and metadata are stored as an empty list and an empty
    JSON object respectively.
    """
    insert_sql = """
        INSERT INTO documents (id, title, source_type, source_url, keywords, metadata, created_at, processing_status)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    """
    conn = await get_db_connection()
    try:
        new_id = await generate_uuid(conn)
        row_values = (
            new_id,
            title,
            source_type,
            source_url,
            keywords or [],
            json.dumps(metadata or {}),
            datetime.utcnow(),
            "processing",
        )
        await conn.execute(insert_sql, *row_values)
        return new_id
    finally:
        # Hand the connection back whether or not the insert succeeded
        await release_db_connection(conn)
481
+
482
async def store_document_chunk(
    document_id: str,
    content: str,
    chunk_index: int,
    embedding: List[float],
    metadata: Dict[str, Any] = None
) -> str:
    """Insert one chunk and its embedding into ``document_chunks``.

    Returns the id of the new chunk row. The embedding is passed as text and
    cast to ``vector`` inside the SQL statement.
    """
    insert_sql = """
        INSERT INTO document_chunks (id, document_id, content, chunk_index, embedding, metadata, created_at)
        VALUES ($1, $2, $3, $4, $5::vector, $6, $7)
    """
    conn = await get_db_connection()
    try:
        new_chunk_id = await generate_uuid(conn)

        # Text form expected by the ::vector cast in the statement above
        vector_literal = embedding_to_vector_string(embedding)

        row_values = (
            new_chunk_id,
            document_id,
            content,
            chunk_index,
            vector_literal,
            json.dumps(metadata or {}),
            datetime.utcnow(),
        )
        await conn.execute(insert_sql, *row_values)
        return new_chunk_id
    finally:
        await release_db_connection(conn)
507
+
508
async def update_document_status(document_id: str, status: str, total_chunks: int = None):
    """Set a document's ``processing_status`` and, when given, its chunk count.

    ``total_chunks`` is only written when it is not ``None``.
    """
    conn = await get_db_connection()
    try:
        if total_chunks is None:
            # Status-only update; the stored chunk count is left untouched
            await conn.execute("""
                UPDATE documents SET processing_status = $1 WHERE id = $2
            """, status, document_id)
            return

        await conn.execute("""
            UPDATE documents SET processing_status = $1, total_chunks = $2 WHERE id = $3
        """, status, total_chunks, document_id)
    finally:
        await release_db_connection(conn)
523
+
524
async def search_similar_chunks(
    query_embedding: List[float],
    limit: int = 10,
    similarity_threshold: float = 0.2,
    filter_metadata: Dict[str, Any] = None
) -> List[Dict[str, Any]]:
    """Search for similar document chunks using vector similarity.

    Only chunks of documents whose ``processing_status`` is ``'completed'``
    are considered. The score is ``1 - (embedding <=> query)``, ordered from
    highest to lowest.

    Args:
        query_embedding: Embedding of the search query; must not be empty.
        limit: Maximum number of rows to return.
        similarity_threshold: Minimum score a chunk must reach. Values of 0
            or below disable the threshold.
        filter_metadata: Optional key/value pairs. Every pair must match the
            document metadata, compared as text.

    Returns:
        One dictionary per matching chunk with chunk fields, document fields
        and ``similarity_score``. Rows that cannot be converted are logged
        and skipped.

    Raises:
        HTTPException: Status 500 on invalid input or any database failure.
    """

    def _parse_metadata(raw, description):
        """Return metadata as a dict, accepting JSON text or a decoded dict."""
        if not raw:
            return {}
        if isinstance(raw, dict):
            return raw
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            logger.warning(f"Invalid {description}")
            return {}

    conn = await get_db_connection()
    try:
        logger.info(f"Searching for similar chunks with threshold {similarity_threshold}, limit {limit}")

        # Validate inputs
        if not query_embedding:
            raise ValueError("Query embedding cannot be empty")

        logger.info(f"Query embedding dimensions: {len(query_embedding)}")

        # Convert query embedding to PostgreSQL vector format
        query_vector = embedding_to_vector_string(query_embedding)

        # Check if we have any chunks
        total_chunks = await conn.fetchval("""
            SELECT COUNT(*) FROM document_chunks dc
            JOIN documents d ON dc.document_id = d.id
            WHERE d.processing_status = 'completed' AND dc.embedding IS NOT NULL
        """)

        logger.info(f"Total available chunks for search: {total_chunks}")

        if total_chunks == 0:
            logger.warning("No chunks available for search")
            return []

        # Build the query. The stored embedding itself is not selected
        # because nothing below reads it.
        base_query = """
            SELECT
                dc.id, dc.document_id, dc.content, dc.chunk_index,
                dc.metadata as chunk_metadata, dc.created_at,
                d.title, d.source_type, d.source_url, d.keywords, d.metadata as doc_metadata,
                1 - (dc.embedding <=> $1::vector) as similarity_score
            FROM document_chunks dc
            JOIN documents d ON dc.document_id = d.id
            WHERE d.processing_status = 'completed'
              AND dc.embedding IS NOT NULL
        """

        # Placeholders are numbered from the current length of this list
        params = [query_vector]

        # Add similarity threshold
        if similarity_threshold > 0:
            params.append(similarity_threshold)
            base_query += f" AND 1 - (dc.embedding <=> $1::vector) >= ${len(params)}"

        # Add metadata filtering: every requested pair has to match
        if filter_metadata:
            for key, value in filter_metadata.items():
                params.extend([key, str(value)])
                base_query += f" AND d.metadata->>${len(params) - 1} = ${len(params)}"

        params.append(limit)
        base_query += f" ORDER BY similarity_score DESC LIMIT ${len(params)}"

        logger.info(f"Executing vector search query with {len(params)} parameters")

        try:
            rows = await conn.fetch(base_query, *params)
            logger.info(f"Vector search query returned {len(rows)} rows")
        except Exception as db_error:
            logger.error(f"Database query error: {db_error}")
            raise HTTPException(status_code=500, detail=f"Vector search query failed: {db_error}")

        # Debug: show similarity scores if no results
        if len(rows) == 0 and similarity_threshold > 0:
            logger.warning(f"No results found with threshold {similarity_threshold}, trying without threshold")
            debug_query = """
                SELECT
                    dc.id, dc.content,
                    1 - (dc.embedding <=> $1::vector) as similarity_score
                FROM document_chunks dc
                JOIN documents d ON dc.document_id = d.id
                WHERE d.processing_status = 'completed'
                  AND dc.embedding IS NOT NULL
                ORDER BY similarity_score DESC
                LIMIT 3
            """
            debug_rows = await conn.fetch(debug_query, query_vector)
            logger.info(f"Debug: Top 3 similarity scores: {[(r['similarity_score'], r['content'][:50]) for r in debug_rows]}")

        results = []
        for row in rows:
            try:
                chunk_metadata = _parse_metadata(
                    row['chunk_metadata'],
                    f"chunk metadata JSON for chunk {row['id']}"
                )
                doc_metadata = _parse_metadata(
                    row['doc_metadata'],
                    f"document metadata JSON for document {row['document_id']}"
                )

                # Convert UUID objects to strings
                chunk_id = str(row['id']) if row['id'] else None
                document_id = str(row['document_id']) if row['document_id'] else None

                results.append({
                    'chunk_id': chunk_id,
                    'document_id': document_id,
                    'content': row['content'],
                    'chunk_index': row['chunk_index'],
                    'chunk_metadata': chunk_metadata,
                    'created_at': row['created_at'],
                    'document_title': row['title'],
                    'source_type': row['source_type'],
                    'source_url': row['source_url'],
                    'keywords': row['keywords'] or [],
                    'document_metadata': doc_metadata,
                    'similarity_score': float(row['similarity_score'])
                })
            except Exception as row_error:
                logger.error(f"Error processing search result row: {row_error}")
                continue

        logger.info(f"Vector search returned {len(results)} results")
        if results:
            logger.info(f"Top result similarity: {results[0]['similarity_score']:.4f}")

        return results

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Vector search failed: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")
    finally:
        await release_db_connection(conn)
671
+
672
+ # Database initialization
673
async def init_database():
    """Initialize database tables and indexes.

    Creates the ``documents`` and ``document_chunks`` tables if they do not
    exist, then creates each index on its own. A failure to create an index
    is logged as a warning and does not prevent the remaining indexes from
    being created.
    """
    conn = await get_db_connection()
    try:
        logger.info("πŸ”„ Initializing database tables...")

        # Create documents table
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                title VARCHAR(500) NOT NULL,
                source_type VARCHAR(50) NOT NULL,
                source_url TEXT,
                keywords TEXT[] DEFAULT '{}',
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                processing_status VARCHAR(20) DEFAULT 'processing',
                total_chunks INTEGER DEFAULT 0
            );
        """)

        # Create document_chunks table
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS document_chunks (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
                content TEXT NOT NULL,
                chunk_index INTEGER NOT NULL,
                embedding vector(1536),
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        """)

        # Create indexes one statement at a time. Statements sent together in
        # a single call succeed or fail as a unit, so a problem with the
        # vector index would otherwise discard the two plain indexes as well.
        index_statements = (
            "CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(processing_status);",
            "CREATE INDEX IF NOT EXISTS idx_chunks_document ON document_chunks(document_id);",
            "CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops);",
        )
        for statement in index_statements:
            try:
                await conn.execute(statement)
            except Exception as e:
                logger.warning(f"Could not create some indexes (vector extension may not be available): {e}")

        logger.info("βœ… Database tables initialized")

    finally:
        await release_db_connection(conn)
721
+
722
+ # App Lifecycle
723
@app.on_event("startup")
async def startup_event():
    """Prepare the service on application startup.

    Opens the database pool and initializes the tables; a failure in either
    step is logged and re-raised. A problem obtaining the Azure OpenAI client
    is only logged as a warning.
    """
    logger.info("πŸš€ Starting RAG Backend API...")

    try:
        # Database first: pool, then schema
        await get_db_pool()
        logger.info("βœ… Database connection established")
        await init_database()

        # The OpenAI client is checked but is not required for startup
        try:
            get_openai_client()
        except Exception as openai_issue:
            logger.warning(f"⚠️ Azure OpenAI client configuration issue: {openai_issue}")
        else:
            logger.info("βœ… Azure OpenAI client configured")

        logger.info("πŸŽ‰ RAG Backend API is ready!")

    except Exception as startup_error:
        logger.error(f"❌ Startup failed: {startup_error}")
        raise
748
+
749
@app.on_event("shutdown")
async def shutdown_event():
    """Close the database pool, if one exists, on application shutdown."""
    logger.info("πŸ›‘ Shutting down RAG Backend API...")

    # Nothing to release when no pool is set
    if not db_pool:
        return

    await db_pool.close()
    logger.info("βœ… Database connections closed")
757
+
758
+ # API Endpoints
759
@app.get("/")
async def root():
    """Describe the service: version, features, chunking settings, endpoints."""
    features = {
        "document_upload": True,
        "url_processing": True,
        "vector_search": True,
        "ocr_integration": True,
        "azure_openai_embeddings": True,
        "postgresql_vector_storage": True,
    }
    configuration = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "min_chunk_size": config.MIN_CHUNK_SIZE,
        # Reported in megabytes
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
    }
    endpoints = {
        "health": "/health",
        "docs": "/docs",
        "upload": "/documents/upload",
        "url_process": "/documents/url",
        "search": "/search",
        "list_documents": "/documents",
    }
    return {
        "message": "RAG Backend API",
        "version": "2.0.0",
        "status": "running",
        "features": features,
        "configuration": configuration,
        "endpoints": endpoints,
    }
788
+
789
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Probes the database (over a dedicated connection), the Azure OpenAI
    embedding deployment and the OCR service, and reports each result.

    Returns:
        A status dictionary. ``status`` is ``"healthy"`` when the database is
        connected and OpenAI is either configured or deliberately not
        configured, ``"degraded"`` when only the database is connected, and
        ``"unhealthy"`` otherwise. Probe failures are collected in ``errors``
        rather than raised.
    """
    health_status = {
        "status": "unknown",
        "service": "RAG Backend API",
        "version": "2.0.0",
        "timestamp": datetime.utcnow().isoformat(),
        "database": "unknown",
        "openai": "unknown",
        "uuid_method": "unknown",
        "ocr_service": "unknown",
        "configuration": {
            "pg_host": config.PG_HOST,
            "pg_port": config.PG_PORT,
            "pg_database": config.PG_DATABASE,
            "ocr_service_url": config.OCR_SERVICE_URL,
            "chunk_size": config.CHUNK_SIZE
        },
        "errors": []
    }

    # Test database connection
    try:
        test_conn = await asyncpg.connect(
            host=config.PG_HOST,
            port=config.PG_PORT,
            database=config.PG_DATABASE,
            user=config.PG_USER,
            password=config.PG_PASSWORD,
            ssl=config.PG_SSL_MODE,
            timeout=10
        )
        try:
            db_version = await test_conn.fetchval("SELECT version()")
            health_status["database"] = "connected"
            health_status["database_version"] = db_version

            # Check UUID generation method
            uuid_method = await detect_uuid_method(test_conn)
            health_status["uuid_method"] = uuid_method
        finally:
            # Always release the probe connection, even if a query failed
            await test_conn.close()

    except Exception as db_error:
        health_status["database"] = "failed"
        health_status["errors"].append(f"Database connection failed: {db_error}")

    # Test OpenAI
    try:
        if (config.AZURE_OPENAI_ENDPOINT == "" or
                config.AZURE_OPENAI_API_KEY == ""):
            health_status["openai"] = "not_configured"
        else:
            client = get_openai_client()
            # Test with a simple embedding request
            test_response = client.embeddings.create(
                input=["Health check test"],
                model=config.AZURE_OPENAI_DEPLOYMENT
            )
            if test_response.data:
                health_status["openai"] = "configured"
                health_status["embedding_dimensions"] = len(test_response.data[0].embedding)
            else:
                health_status["openai"] = "failed"
                health_status["errors"].append("OpenAI embedding test failed")
    except Exception as openai_error:
        health_status["openai"] = "failed"
        health_status["errors"].append(f"OpenAI configuration failed: {openai_error}")

    # Test OCR service
    try:
        ocr_response = requests.get(f"{config.OCR_SERVICE_URL}/health", timeout=5)
        if ocr_response.status_code == 200:
            health_status["ocr_service"] = "available"
        else:
            health_status["ocr_service"] = "unavailable"
    except Exception:
        # Any failure to reach the service counts as unavailable; task
        # cancellation is deliberately not intercepted here.
        health_status["ocr_service"] = "unavailable"

    # Determine overall status
    if health_status["database"] == "connected" and health_status["openai"] in ["configured", "not_configured"]:
        health_status["status"] = "healthy"
    elif health_status["database"] == "connected":
        health_status["status"] = "degraded"
    else:
        health_status["status"] = "unhealthy"

    return health_status
878
+
879
@app.post("/documents/upload")
async def upload_document(
    file: UploadFile = File(...),
    title: str = Form(None),
    keywords: str = Form(None),  # JSON string of list
    metadata: str = Form(None),  # JSON string
    chunk_size: int = Form(None),
    chunk_overlap: int = Form(None)
):
    """Upload and process a document.

    The file is converted to text (via ``process_with_ocr``), cleaned, split
    into chunks, embedded and stored. The document row is created with status
    "processing" and finishes as "completed" or "failed".

    Args:
        file: The uploaded file.
        title: Document title; defaults to the file name.
        keywords: JSON-encoded list of keywords.
        metadata: JSON-encoded object merged into the stored metadata.
        chunk_size: Optional chunk size passed to ``create_text_chunks``.
        chunk_overlap: Optional overlap passed to ``create_text_chunks``.

    Returns:
        A dictionary with ``success``, ``document_id``, ``title``,
        ``total_chunks`` and ``message``.

    Raises:
        HTTPException: 400 for malformed form JSON, an empty or oversized
            file, failed OCR or missing text; 500 when no chunk could be
            stored or on any unexpected error.
    """
    document_id = None
    try:
        # Parse form data; malformed input is the client's error, not ours
        try:
            keywords_list = json.loads(keywords) if keywords else []
            metadata_dict = json.loads(metadata) if metadata else {}
        except json.JSONDecodeError as parse_error:
            raise HTTPException(status_code=400, detail=f"Invalid JSON in form data: {parse_error}")

        if not isinstance(keywords_list, list):
            raise HTTPException(status_code=400, detail="keywords must be a JSON list")
        if not isinstance(metadata_dict, dict):
            raise HTTPException(status_code=400, detail="metadata must be a JSON object")

        # Set default title
        if not title:
            title = file.filename or "Untitled Document"

        # Read file content
        logger.info(f"Processing uploaded file: {file.filename} ({file.content_type})")
        file_bytes = await file.read()

        if not file_bytes:
            raise HTTPException(status_code=400, detail="Empty file uploaded")

        if len(file_bytes) > config.MAX_FILE_SIZE:
            raise HTTPException(status_code=400, detail="File too large")

        # Process with OCR
        logger.info(f"Processing document with OCR: {title}")
        ocr_result = await process_with_ocr(file_bytes=file_bytes, filename=file.filename)

        if not ocr_result.get('success', False):
            error_msg = ocr_result.get('error', 'Unknown OCR error')
            logger.error(f"OCR processing failed: {error_msg}")
            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")

        # Extract text content
        content = ocr_result.get('content', '')
        if not content or not content.strip():
            raise HTTPException(status_code=400, detail="No text content extracted from document")

        # Clean the text
        cleaned_content = clean_text(content)

        if not cleaned_content or len(cleaned_content.strip()) == 0:
            raise HTTPException(status_code=400, detail="No text content after cleaning")

        # Allow shorter content for testing
        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
            logger.warning(f"Content is short ({len(cleaned_content)} chars) but processing anyway")

        # Create document record
        document_id = await create_document_record(
            title=title,
            source_type='file_upload',
            keywords=keywords_list,
            metadata={
                **metadata_dict,
                'filename': file.filename,
                'content_type': file.content_type,
                'file_size': len(file_bytes),
                'ocr_pages': len(ocr_result.get('pages', []))
            }
        )

        # Create text chunks
        chunks = create_text_chunks(
            cleaned_content,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        if not chunks:
            raise HTTPException(status_code=400, detail="No valid chunks created from document")

        # Process chunks and generate embeddings
        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")

        successful_chunks = 0
        for i, chunk_content in enumerate(chunks):
            try:
                if not chunk_content or len(chunk_content.strip()) < 10:
                    logger.warning(f"Skipping chunk {i} - too small")
                    continue

                # Generate embedding
                embedding = await generate_embedding(chunk_content)

                # Store chunk
                await store_document_chunk(
                    document_id=document_id,
                    content=chunk_content,
                    chunk_index=i,
                    embedding=embedding,
                    metadata={
                        'chunk_size': len(chunk_content),
                        'position': i
                    }
                )

                successful_chunks += 1

            except Exception as e:
                # One bad chunk must not abort the whole document
                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
                continue

        if successful_chunks == 0:
            # The handler below marks the document as failed
            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")

        # Update document status
        await update_document_status(document_id, "completed", successful_chunks)

        logger.info(f"Document {document_id} processed successfully with {successful_chunks} chunks")

        return {
            "success": True,
            "document_id": document_id,
            "title": title,
            "total_chunks": successful_chunks,
            "message": "Document processed successfully"
        }

    except HTTPException:
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except Exception as status_error:
                # Best effort: the original error is the one worth reporting
                logger.warning(f"Could not mark document {document_id} as failed: {status_error}")
        raise
    except Exception as e:
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except Exception as status_error:
                logger.warning(f"Could not mark document {document_id} as failed: {status_error}")

        logger.error(f"Unexpected error processing document: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Document processing failed: {e}")
1022
+
1023
@app.post("/documents/url")
async def process_url(request: URLProcessRequest):
    """Process document from URL.

    The URL is handed to ``process_with_ocr``; the extracted text is cleaned,
    split into chunks, embedded and stored. The document row is created with
    status "processing" and finishes as "completed" or "failed".

    Returns:
        A dictionary with ``success``, ``document_id``, ``title``,
        ``total_chunks``, ``source_url`` and ``message``.

    Raises:
        HTTPException: 400 when OCR fails or yields no usable text; 500 when
            no chunk could be stored or on any unexpected error.
    """
    document_id = None
    try:
        url_str = str(request.url)

        # Set default title
        title = request.title or f"Document from {urlparse(url_str).netloc}"

        # Process with OCR
        logger.info(f"Processing URL with OCR: {url_str}")
        ocr_result = await process_with_ocr(url=url_str, extract_images=request.extract_images)

        if not ocr_result.get('success', False):
            error_msg = ocr_result.get('error', 'Unknown OCR error')
            logger.error(f"OCR processing failed for URL: {error_msg}")
            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")

        # Extract text content
        content = ocr_result.get('content', '')
        if not content or not content.strip():
            raise HTTPException(status_code=400, detail="No text content extracted from URL")

        # Clean the text
        cleaned_content = clean_text(content)

        if not cleaned_content or len(cleaned_content.strip()) == 0:
            raise HTTPException(status_code=400, detail="No text content after cleaning")

        # Allow shorter content for testing
        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
            logger.warning(f"URL content is short ({len(cleaned_content)} chars) but processing anyway")

        # Create document record
        document_id = await create_document_record(
            title=title,
            source_type=ocr_result.get('source_type', 'url'),
            source_url=url_str,
            keywords=request.keywords or [],
            metadata={
                **(request.metadata or {}),
                'url': url_str,
                'extract_images': request.extract_images,
                'ocr_pages': len(ocr_result.get('pages', []))
            }
        )

        # Create text chunks
        chunks = create_text_chunks(
            cleaned_content,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap
        )

        if not chunks:
            raise HTTPException(status_code=400, detail="No valid chunks created from URL content")

        # Process chunks and generate embeddings
        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")

        successful_chunks = 0
        for i, chunk_content in enumerate(chunks):
            try:
                if not chunk_content or len(chunk_content.strip()) < 10:
                    logger.warning(f"Skipping chunk {i} - too small")
                    continue

                # Generate embedding
                embedding = await generate_embedding(chunk_content)

                # Store chunk
                await store_document_chunk(
                    document_id=document_id,
                    content=chunk_content,
                    chunk_index=i,
                    embedding=embedding,
                    metadata={
                        'chunk_size': len(chunk_content),
                        'position': i
                    }
                )

                successful_chunks += 1

            except Exception as e:
                # One bad chunk must not abort the whole document
                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
                continue

        if successful_chunks == 0:
            # The handler below marks the document as failed
            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")

        # Update document status
        await update_document_status(document_id, "completed", successful_chunks)

        logger.info(f"URL document {document_id} processed successfully with {successful_chunks} chunks")

        return {
            "success": True,
            "document_id": document_id,
            "title": title,
            "total_chunks": successful_chunks,
            "source_url": url_str,
            "message": "URL processed successfully"
        }

    except HTTPException:
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except Exception as status_error:
                # Best effort: the original error is the one worth reporting
                logger.warning(f"Could not mark document {document_id} as failed: {status_error}")
        raise
    except Exception as e:
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except Exception as status_error:
                logger.warning(f"Could not mark document {document_id} as failed: {status_error}")

        logger.error(f"Unexpected error processing URL: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"URL processing failed: {e}")
1147
+
1148
@app.post("/search", response_model=SearchResponse)
async def search_documents(request: SearchRequest):
    """Answer a search request with the most similar stored chunks.

    The query is embedded, matched against stored chunk embeddings and the
    hits are returned together with the elapsed processing time. An empty
    query yields a 400; embedding, search or unexpected failures yield a 500.
    """
    try:
        import time
        started_at = time.time()

        # Reject blank queries up front
        if not (request.query and request.query.strip()):
            raise HTTPException(status_code=400, detail="Query cannot be empty")

        query_text = request.query.strip()
        logger.info(f"Performing vector search for query: '{query_text}'")

        # Step 1: embed the query
        try:
            query_embedding = await generate_embedding(query_text)
        except Exception as e:
            logger.error(f"Failed to generate query embedding: {e}")
            raise HTTPException(status_code=500, detail=f"Query embedding generation failed: {e}")

        # Step 2: look up the nearest chunks
        try:
            hits = await search_similar_chunks(
                query_embedding=query_embedding,
                limit=request.limit,
                similarity_threshold=request.similarity_threshold,
                filter_metadata=request.filter_metadata
            )
        except Exception as e:
            logger.error(f"Vector search failed: {e}")
            raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")

        # Step 3: convert hits to response models, skipping any that fail
        search_results = []
        for hit in hits:
            try:
                document_info = {
                    'title': hit['document_title'],
                    'source_type': hit['source_type'],
                    'source_url': hit['source_url'],
                    'keywords': hit['keywords'],
                    'metadata': hit['document_metadata']
                }
                chunk_model = DocumentChunk(
                    id=hit['chunk_id'],
                    document_id=hit['document_id'],
                    content=hit['content'],
                    chunk_index=hit['chunk_index'],
                    metadata=hit['chunk_metadata'],
                    created_at=hit['created_at']
                )
                search_results.append(SearchResult(
                    chunk=chunk_model,
                    similarity_score=hit['similarity_score'],
                    document_info=document_info
                ))
            except Exception as result_error:
                logger.error(f"Error formatting search result: {result_error}")
                continue

        processing_time = time.time() - started_at

        logger.info(f"Search completed: {len(search_results)} results in {processing_time:.3f}s")

        return SearchResponse(
            query=request.query,
            results=search_results,
            total_results=len(search_results),
            processing_time=processing_time
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Search failed with unexpected error: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Search failed: {e}")
1226
+
1227
@app.get("/documents")
async def list_documents(
    limit: int = Query(10, ge=1, le=100),
    offset: int = Query(0, ge=0),
    status: str = Query(None)
):
    """Return one page of documents, newest first, plus the total count.

    When ``status`` is given, both the page and the total are restricted to
    documents with that ``processing_status``.
    """
    conn = await get_db_connection()
    try:
        # The same optional filter serves the page query and the count query
        where_clause = " WHERE processing_status = $1" if status else ""
        filter_params = [status] if status else []
        next_index = len(filter_params) + 1

        page_query = """
            SELECT id, title, source_type, source_url, keywords, metadata,
                   created_at, processing_status, total_chunks
            FROM documents
        """
        page_query += where_clause
        page_query += " ORDER BY created_at DESC LIMIT $" + str(next_index) + " OFFSET $" + str(next_index + 1)

        rows = await conn.fetch(page_query, *filter_params, limit, offset)

        documents = [
            {
                'id': str(row['id']),
                'title': row['title'],
                'source_type': row['source_type'],
                'source_url': row['source_url'],
                'keywords': row['keywords'],
                'metadata': json.loads(row['metadata']) if row['metadata'] else {},
                'created_at': row['created_at'].isoformat(),
                'processing_status': row['processing_status'],
                'total_chunks': row['total_chunks']
            }
            for row in rows
        ]

        # Get total count
        count_query = "SELECT COUNT(*) FROM documents" + where_clause
        total_count = await conn.fetchval(count_query, *filter_params)

        return {
            "documents": documents,
            "total": total_count,
            "limit": limit,
            "offset": offset
        }

    finally:
        await release_db_connection(conn)
1284
+
1285
@app.get("/documents/{document_id}")
async def get_document(document_id: str):
    """Return a single document together with all of its chunks.

    Args:
        document_id: Identifier matched against ``documents.id`` and
            ``document_chunks.document_id``.

    Returns:
        A dict of the document's fields plus a ``chunks`` list ordered by
        ``chunk_index``.

    Raises:
        HTTPException: 404 when no document row matches ``document_id``.
    """
    conn = await get_db_connection()
    try:
        document = await conn.fetchrow("""
            SELECT id, title, source_type, source_url, keywords, metadata,
                   created_at, processing_status, total_chunks
            FROM documents WHERE id = $1
        """, document_id)

        # Guard clause: nothing further to load for an unknown document.
        if not document:
            raise HTTPException(status_code=404, detail="Document not found")

        chunk_records = await conn.fetch("""
            SELECT id, content, chunk_index, metadata, created_at
            FROM document_chunks
            WHERE document_id = $1
            ORDER BY chunk_index
        """, document_id)

        payload = {
            'id': str(document['id']),
            'title': document['title'],
            'source_type': document['source_type'],
            'source_url': document['source_url'],
            'keywords': document['keywords'],
            # metadata is decoded with json.loads; NULL/empty becomes {}
            'metadata': json.loads(document['metadata']) if document['metadata'] else {},
            'created_at': document['created_at'].isoformat(),
            'processing_status': document['processing_status'],
            'total_chunks': document['total_chunks'],
        }

        # Chunks are serialised last so they stay the final key of the payload.
        serialised_chunks = []
        for piece in chunk_records:
            serialised_chunks.append({
                'id': str(piece['id']),
                'content': piece['content'],
                'chunk_index': piece['chunk_index'],
                'metadata': json.loads(piece['metadata']) if piece['metadata'] else {},
                'created_at': piece['created_at'].isoformat()
            })
        payload['chunks'] = serialised_chunks

        return payload

    finally:
        await release_db_connection(conn)
1332
+
1333
@app.delete("/documents/{document_id}")
async def delete_document(document_id: str):
    """Delete a document and all of its chunks atomically.

    Args:
        document_id: Identifier matched against ``documents.id`` and
            ``document_chunks.document_id``.

    Returns:
        ``{"message": "Document deleted successfully"}`` on success.

    Raises:
        HTTPException: 404 when no document row matches ``document_id``.
    """
    conn = await get_db_connection()
    try:
        # Check if document exists
        exists = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM documents WHERE id = $1)",
            document_id,
        )
        if not exists:
            raise HTTPException(status_code=404, detail="Document not found")

        # Run both deletes in one transaction so a failure on the second
        # statement cannot leave a document row stripped of its chunks.
        # NOTE(review): relies on conn being an asyncpg connection
        # (conn.transaction()); confirm against get_db_connection().
        async with conn.transaction():
            # Delete chunks first (foreign key constraint)
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1",
                document_id,
            )

            # Delete document
            await conn.execute("DELETE FROM documents WHERE id = $1", document_id)

        return {"message": "Document deleted successfully"}

    finally:
        await release_db_connection(conn)
1353
+
1354
if __name__ == "__main__":
    # Startup banner: echo the effective configuration before launching.
    # The emoji are written as real Unicode characters; the previous text was
    # mojibake (UTF-8 bytes decoded through a single-byte code page).
    print("🔧 Loading RAG service configuration...")
    print(f"🌐 Will start server on {config.HOST}:{config.PORT}")
    print(f"🗄️ Database: {config.PG_HOST}:{config.PG_PORT}/{config.PG_DATABASE}")
    print(f"🤖 Azure OpenAI: {'✅ Configured' if config.AZURE_OPENAI_ENDPOINT else '❌ Not configured'}")
    print(f"🔍 OCR Service: {config.OCR_SERVICE_URL}")

    # The app is passed as an import string ("module:attribute") rather than
    # as an object; auto-reload is switched on by config.DEBUG.
    uvicorn.run(
        "rag_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )