File size: 17,869 Bytes
963ae98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python3
"""

Centralized Configuration Management for Unified AI Services

Manages configuration for NER, OCR, and RAG services

"""

import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List
from dotenv import load_dotenv

# Load environment variables: prefer a .env file sitting next to this module,
# otherwise call load_dotenv() with no path so it uses its default lookup.
env_path = Path(__file__).parent / '.env'
if env_path.exists():
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv()  # Load from default location

# Setup logging.
# LOG_LEVEL must name one of the logging module's integer level constants
# (DEBUG, INFO, WARNING, ...). An empty or unrecognised value falls back to
# INFO instead of raising AttributeError while this module is being imported.
_requested_log_level = os.getenv("LOG_LEVEL", "INFO")
_resolved_log_level = getattr(logging, _requested_log_level.strip().upper(), None)
_log_level_is_valid = isinstance(_resolved_log_level, int)
logging.basicConfig(
    level=_resolved_log_level if _log_level_is_valid else logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
if not _log_level_is_valid:
    # Reported only after basicConfig so the message uses the configured format.
    logger.warning("Ignoring invalid LOG_LEVEL %r; using INFO", _requested_log_level)

class BaseConfig:
    """Settings shared by every service, read from environment variables.

    All values are read with ``os.getenv`` when the instance is constructed;
    an existing instance does not pick up later changes to the environment.

    Integer settings fall back to their default when the variable is unset
    *or* set to an empty string (for example ``POSTGRES_PORT=`` left blank in
    a ``.env`` template). A non-empty, non-numeric value still raises
    ``ValueError`` from ``int()``.
    """

    def __init__(self):
        # Server Configuration
        self.HOST = os.getenv("HOST", "0.0.0.0")
        # Only the string "true" (case-insensitive) enables debug mode.
        self.DEBUG = os.getenv("DEBUG", "False").lower() == "true"

        # Database Configuration (shared by NER and RAG)
        self.POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        # "or" makes an empty value behave like an unset one instead of
        # crashing in int("").
        self.POSTGRES_PORT = int(os.getenv("POSTGRES_PORT") or "5432")
        self.POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        self.POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        self.POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # Azure OpenAI Configuration (shared by NER and RAG)
        self.AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        self.AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
        self.AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-3-large")

        # Azure Storage Configuration (shared by NER and RAG)
        self.AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        self.AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        self.BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # Processing Configuration
        # MAX_FILE_SIZE is given in MB in the environment and stored in bytes.
        self.MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE") or "50") * 1024 * 1024
        self.REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT") or "300")

        # CORS Configuration
        self.ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*")

    def validate_azure_openai(self) -> bool:
        """Return True when the Azure OpenAI endpoint and key look configured.

        Both values must be non-empty and must differ from the template
        placeholders ``YOUR_AZURE_OPENAI_ENDPOINT`` / ``YOUR_AZURE_OPENAI_KEY``.
        No network call is made.
        """
        return bool(
            self.AZURE_OPENAI_ENDPOINT and
            self.AZURE_OPENAI_API_KEY and
            self.AZURE_OPENAI_ENDPOINT != "YOUR_AZURE_OPENAI_ENDPOINT" and
            self.AZURE_OPENAI_API_KEY != "YOUR_AZURE_OPENAI_KEY"
        )

    def validate_postgres(self) -> bool:
        """Return True when host, user, password and database are all non-empty.

        The port is not checked here; it always has a value after ``__init__``.
        """
        return bool(
            self.POSTGRES_HOST and
            self.POSTGRES_USER and
            self.POSTGRES_PASSWORD and
            self.POSTGRES_DATABASE
        )

    def validate_azure_storage(self) -> bool:
        """Return True when the storage account URL and SAS token are both non-empty."""
        return bool(
            self.AZURE_STORAGE_ACCOUNT_URL and
            self.AZURE_BLOB_SAS_TOKEN
        )

class NERConfig(BaseConfig):
    """Configuration for the NER service.

    Extends ``BaseConfig`` with the service port, DeepSeek credentials, the
    OCR service URL, and the fixed entity / relationship vocabularies.
    """

    def __init__(self):
        super().__init__()
        # An empty NER_PORT (e.g. a blank line in a .env template) is treated
        # as unset rather than crashing in int("").
        self.PORT = int(os.getenv("NER_PORT") or "8500")

        # DeepSeek Configuration
        self.DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
        self.DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
        self.DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # NER Specific Settings (hard-coded, not read from the environment)
        self.MAX_TEXT_LENGTH = 100000  # 100KB
        self.SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        self.SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}

        # Entity and Relationship Types
        self.ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PRODUCT", "EVENT",
            "VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM",
            "WEAPON", "DRUG", "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL"
        ]

        # NOTE(review): "ยึดจาก" appears twice in the law-enforcement Thai group
        # below; the list is kept as-is because consumers of it are not visible
        # here — confirm whether the duplicate is intentional.
        self.RELATIONSHIP_TYPES = [
            # Standard relationships
            "works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
            "leads", "reports_to", "collaborates_with", "partners_with", "supplies_to", "acquires",
            "invests_in", "headquartered_in", "operates_in", "born_in", "lives_in", "studied_at",
            "graduated_from", "worked_at", "visited", "attended", "participated_in", "sponsored",
            "developed", "created", "invented", "discovered", "published", "authored", "edited",
            # Thai relationships
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เป็นส่วนหนึ่งของ", "เกี่ยวข้องกับ", "เป็นเจ้าของ", "จัดการ",
            "นำโดย", "รายงานต่อ", "ร่วมงานกับ", "เป็นพันธมิตรกับ", "จัดหาให้", "ซื้อกิจการ", "ลงทุนใน",
            "สำนักงานใหญ่ที่", "ดำเนินการใน", "เกิดที่", "อาศัยอยู่ที่", "ศึกษาที่", "จบการศึกษาจาก",
            # Law enforcement relationships
            "arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of", "witness_of",
            "victim_of", "suspect_in", "charged_with", "convicted_of", "sentenced_by", "defended_by",
            "prosecuted_by", "testified_against", "alibi_for", "found_at", "seized_from", "linked_to",
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "ใช้ในการ", "หลักฐานของ", "พยานใน", "เหยื่อของ",
            "ผู้ต้องสงสัยใน", "ถูกตั้งข้อหา", "ถูกตัดสิน", "ถูกพิพากษาโดย", "ต่อสู้คดีโดย", "ฟ้องร้องโดย",
            "ให้การต่อต้าน", "เป็นข้อแก้ตัวสำหรับ", "พบที่", "ยึดจาก", "เชื่อมโยงกับ",
            # Criminal relationships
            "possess_illegal", "transport_illegal", "sell_illegal", "buy_illegal", "hide_evidence",
            "plan_crime", "commit_crime", "flee_from", "escape_from", "hide_at", "meet_with",
            "communicate_with", "threaten", "blackmail", "bribe", "corrupt", "money_launder",
            "ครอบครองของผิดกฎหมาย", "ขนส่งของผิดกฎหมาย", "ขายของผิดกฎหมาย", "ซื้อของผิดกฎหมาย",
            "ซ่อนหลักฐาน", "วางแผนอาชญากรรม", "กระทำอาชญากรรม", "หลบหนีจาก", "แอบซ่อนที่",
            "พบปะกับ", "ติดต่อกับ", "ข่มขู่", "แบล็คเมล์", "ให้สินบน", "ทุจริต", "ฟอกเงิน"
        ]

    def validate_deepseek(self) -> bool:
        """Return True when the DeepSeek endpoint and key look configured.

        Both values must be non-empty and must differ from the template
        placeholders ``YOUR_DEEPSEEK_ENDPOINT`` / ``YOUR_DEEPSEEK_API_KEY``.
        No network call is made.
        """
        return bool(
            self.DEEPSEEK_ENDPOINT and
            self.DEEPSEEK_API_KEY and
            self.DEEPSEEK_ENDPOINT != "YOUR_DEEPSEEK_ENDPOINT" and
            self.DEEPSEEK_API_KEY != "YOUR_DEEPSEEK_API_KEY"
        )

class OCRConfig(BaseConfig):
    """Configuration for the OCR service.

    Extends ``BaseConfig`` with the service port, Azure Document Intelligence
    credentials, and web-scraping settings.
    """

    def __init__(self):
        super().__init__()
        # Empty integer variables are treated as unset rather than crashing
        # in int("").
        self.PORT = int(os.getenv("OCR_PORT") or "8400")

        # Azure Document Intelligence Configuration
        self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        self.AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web scraping configuration
        self.MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE") or "10")
        self.USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    def validate_azure_document_intelligence(self) -> bool:
        """Return True when the Document Intelligence endpoint and key look configured.

        Both values must be non-empty and must differ from the template
        placeholders ``YOUR_FORM_RECOGNIZER_ENDPOINT`` /
        ``YOUR_FORM_RECOGNIZER_KEY``. No network call is made.
        """
        return bool(
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY and
            self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT != "YOUR_FORM_RECOGNIZER_ENDPOINT" and
            self.AZURE_DOCUMENT_INTELLIGENCE_KEY != "YOUR_FORM_RECOGNIZER_KEY"
        )

class RAGConfig(BaseConfig):
    """Configuration for the RAG service.

    Extends ``BaseConfig`` with the service port, the OCR service URL,
    ``PG_*`` aliases for the shared PostgreSQL settings, chunking parameters,
    and RAG-specific Azure OpenAI settings.
    """

    def __init__(self):
        super().__init__()
        # Empty integer variables are treated as unset rather than crashing
        # in int("").
        self.PORT = int(os.getenv("RAG_PORT") or "8401")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration (specific to RAG)
        # Host, port, user and password mirror the shared POSTGRES_* values;
        # only the database name and SSL mode have their own variables.
        self.PG_HOST = self.POSTGRES_HOST
        self.PG_PORT = self.POSTGRES_PORT
        self.PG_DATABASE = os.getenv("PG_DATABASE", "vectorsearch")  # RAG uses different default DB
        self.PG_USER = self.POSTGRES_USER
        self.PG_PASSWORD = self.POSTGRES_PASSWORD
        self.PG_SSL_MODE = os.getenv("PG_SSL_MODE", "require")

        # Chunking Configuration
        self.CHUNK_SIZE = int(os.getenv("CHUNK_SIZE") or "1000")
        self.CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP") or "200")
        self.MIN_CHUNK_SIZE = int(os.getenv("MIN_CHUNK_SIZE") or "50")

        # Azure OpenAI Configuration (RAG specific)
        self.AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        self.AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

class UnifiedConfig:
    """Aggregate configuration for the NER, OCR and RAG services and the main app.

    Holds one config object per service (``ner``, ``ocr``, ``rag``) plus the
    main app's host/port, localhost service URLs, and health-check settings.
    """

    def __init__(self):
        self.ner = NERConfig()
        self.ocr = OCRConfig()
        self.rag = RAGConfig()

        # Main app configuration
        # An empty MAIN_PORT is treated as unset rather than crashing in int("").
        self.MAIN_PORT = int(os.getenv("MAIN_PORT") or "8000")
        self.MAIN_HOST = os.getenv("MAIN_HOST", "0.0.0.0")

        # Service URLs (for inter-service communication).
        # These are always built against localhost from each service's port.
        self.NER_SERVICE_URL = f"http://localhost:{self.ner.PORT}"
        self.OCR_SERVICE_URL = f"http://localhost:{self.ocr.PORT}"
        self.RAG_SERVICE_URL = f"http://localhost:{self.rag.PORT}"

        # Service Health Check Configuration
        self.HEALTH_CHECK_TIMEOUT = 30
        self.HEALTH_CHECK_RETRIES = 3
        self.HEALTH_CHECK_INTERVAL = 5

        # Load balancing and routing
        self.SERVICE_WEIGHTS = {
            "ner": 1.0,
            "ocr": 1.0,
            "rag": 1.0
        }

    def validate_all(self) -> Dict[str, Dict[str, bool]]:
        """Run every per-service validator.

        Returns:
            A mapping ``service name -> {component name -> is_valid}`` for the
            ``ner``, ``ocr`` and ``rag`` services.
        """
        return {
            "ner": {
                "deepseek": self.ner.validate_deepseek(),
                "azure_openai": self.ner.validate_azure_openai(),
                "postgres": self.ner.validate_postgres(),
                "azure_storage": self.ner.validate_azure_storage()
            },
            "ocr": {
                "azure_document_intelligence": self.ocr.validate_azure_document_intelligence()
            },
            "rag": {
                "azure_openai": self.rag.validate_azure_openai(),
                "postgres": self.rag.validate_postgres()
            }
        }

    def get_service_config(self, service_name: str) -> Optional["BaseConfig"]:
        """Look up a service's config object by name.

        Args:
            service_name: ``"ner"``, ``"ocr"`` or ``"rag"``; matching ignores
                case and surrounding whitespace.

        Returns:
            The matching config object, or ``None`` when the name is unknown
            or is not a string.
        """
        if not isinstance(service_name, str):
            # Behave like any other unknown name instead of raising
            # AttributeError on .lower().
            return None
        service_configs = {
            "ner": self.ner,
            "ocr": self.ocr,
            "rag": self.rag
        }
        return service_configs.get(service_name.strip().lower())

    def get_database_config(self) -> Dict[str, str]:
        """Return the shared PostgreSQL settings as a dict of strings.

        Connection values come from the NER config's ``POSTGRES_*`` attributes;
        ``ssl_mode`` comes from the RAG config's ``PG_SSL_MODE`` and defaults
        to ``"require"`` when that attribute is absent.
        """
        return {
            "host": self.ner.POSTGRES_HOST,
            "port": str(self.ner.POSTGRES_PORT),
            "user": self.ner.POSTGRES_USER,
            "password": self.ner.POSTGRES_PASSWORD,
            "database": self.ner.POSTGRES_DATABASE,
            "ssl_mode": getattr(self.rag, 'PG_SSL_MODE', 'require')
        }

    def get_azure_openai_config(self) -> Dict[str, str]:
        """Return the Azure OpenAI settings held by the NER config."""
        return {
            "endpoint": self.ner.AZURE_OPENAI_ENDPOINT,
            "api_key": self.ner.AZURE_OPENAI_API_KEY,
            "embedding_model": self.ner.EMBEDDING_MODEL,
            "deployment_name": self.ner.AZURE_OPENAI_DEPLOYMENT_NAME
        }

    def print_configuration_summary(self):
        """Print a human-readable summary of all configurations to stdout.

        Runs ``validate_all`` once, prints a per-service status section, and
        ends with either the list of components that failed validation or a
        success line. The password and API keys are not printed.
        """
        def status(is_valid):
            # Check mark for a passing validator, cross for a failing one.
            return '✅' if is_valid else '❌'

        print("🔧 Configuration Summary")
        print("=" * 50)

        # Validate all configurations
        validation_results = self.validate_all()
        ner_checks = validation_results['ner']
        ocr_checks = validation_results['ocr']
        rag_checks = validation_results['rag']

        # NER Service
        print(f"📝 NER Service (Port {self.ner.PORT}):")
        print(f"   DeepSeek: {status(ner_checks['deepseek'])}")
        print(f"   Azure OpenAI: {status(ner_checks['azure_openai'])}")
        print(f"   PostgreSQL: {status(ner_checks['postgres'])}")
        print(f"   Azure Storage: {status(ner_checks['azure_storage'])}")
        print(f"   OCR Service URL: {self.ner.OCR_SERVICE_URL}")

        # OCR Service
        print(f"\n🔍 OCR Service (Port {self.ocr.PORT}):")
        print(f"   Azure Document Intelligence: {status(ocr_checks['azure_document_intelligence'])}")
        print(f"   Max File Size: {self.ocr.MAX_FILE_SIZE / (1024*1024):.0f} MB")

        # RAG Service
        print(f"\n🧠 RAG Service (Port {self.rag.PORT}):")
        print(f"   Azure OpenAI: {status(rag_checks['azure_openai'])}")
        print(f"   PostgreSQL: {status(rag_checks['postgres'])}")
        print(f"   OCR Service URL: {self.rag.OCR_SERVICE_URL}")
        print(f"   Chunk Size: {self.rag.CHUNK_SIZE}")

        # Main App
        print(f"\n🌐 Main App (Port {self.MAIN_PORT}):")
        print(f"   NER Service: {self.NER_SERVICE_URL}")
        print(f"   OCR Service: {self.OCR_SERVICE_URL}")
        print(f"   RAG Service: {self.RAG_SERVICE_URL}")

        # Database Configuration
        print("\n🗄️  Database Configuration:")
        print(f"   Host: {self.ner.POSTGRES_HOST}")
        print(f"   Port: {self.ner.POSTGRES_PORT}")
        print(f"   User: {self.ner.POSTGRES_USER}")
        print(f"   NER Database: {self.ner.POSTGRES_DATABASE}")
        print(f"   RAG Database: {self.rag.PG_DATABASE}")

        # Critical Issues: list every component whose validator returned False.
        failures = [
            (service, component)
            for service, validations in validation_results.items()
            for component, is_valid in validations.items()
            if not is_valid
        ]

        if failures:
            print("\n⚠️  CONFIGURATION ISSUES DETECTED:")
            for service, component in failures:
                print(f"   ❌ {service.upper()}: {component} not configured")
        else:
            print("\n✅ All configurations are valid!")

# Global configuration instance.
# Built once, when this module is imported; constructing it creates the NER,
# OCR and RAG config objects, which read their values from the environment.
config = UnifiedConfig()

def get_config() -> UnifiedConfig:
    """Return the module-level ``config`` object.

    Every call returns the same ``UnifiedConfig`` instance; no new instance
    is created here.
    """
    return config

def validate_environment() -> bool:
    """Report whether every critical component of the global config is valid.

    The critical set is Azure OpenAI and PostgreSQL for both NER and RAG,
    plus Azure Document Intelligence for OCR. DeepSeek and Azure Storage are
    validated by ``validate_all`` but are not part of this check.

    Returns:
        True only when all critical validators passed.
    """
    results = config.validate_all()

    # (service, component) pairs that must all be valid.
    critical_keys = (
        ('ner', 'azure_openai'),
        ('ner', 'postgres'),
        ('ocr', 'azure_document_intelligence'),
        ('rag', 'azure_openai'),
        ('rag', 'postgres'),
    )

    # Materialise the lookups first so every key is read, as before.
    statuses = [results[service][component] for service, component in critical_keys]
    return all(statuses)

if __name__ == "__main__":
    # Manual smoke test: print the configuration summary, then report whether
    # the critical components validate. Any exception is reported and logged
    # rather than propagated.
    print("🧪 Testing Configuration Loading")
    print("=" * 40)

    try:
        config.print_configuration_summary()

        # Pick the block of messages matching the validation outcome.
        outcome_lines = (
            (
                "\n🎉 Environment validation passed!",
                "All critical services are properly configured.",
            )
            if validate_environment()
            else (
                "\n❌ Environment validation failed!",
                "Some critical services are not properly configured.",
                "Please check your .env file and update missing values.",
            )
        )
        for line in outcome_lines:
            print(line)

    except Exception as e:
        print(f"\n❌ Configuration loading failed: {e}")
        logger.error(f"Configuration error: {e}")