File size: 17,869 Bytes
963ae98 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
#!/usr/bin/env python3
"""
Centralized Configuration Management for Unified AI Services
Manages configuration for NER, OCR, and RAG services
"""
import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List
from dotenv import load_dotenv
# Load environment variables: prefer a .env file located next to this module,
# otherwise fall back to load_dotenv()'s own default lookup.
env_path = Path(__file__).parent / '.env'
if env_path.exists():
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv()  # Load from default location

# Setup logging.
# LOG_LEVEL comes from the environment, so resolve it defensively: an unknown
# or misspelled level must not abort the import of this module. Anything that
# does not resolve to one of logging's integer level constants falls back to
# INFO, and a warning is emitted once logging is configured.
_raw_log_level = os.getenv("LOG_LEVEL", "INFO")
_log_level = getattr(logging, _raw_log_level.strip().upper(), None)
_log_level_is_valid = isinstance(_log_level, int)
logging.basicConfig(
    level=_log_level if _log_level_is_valid else logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
if not _log_level_is_valid:
    logger.warning(
        "Unrecognized LOG_LEVEL %r; falling back to INFO", _raw_log_level
    )
class BaseConfig:
    """Base configuration class with settings common to every service.

    All values are read from environment variables when the instance is
    created. Integer settings that are unset or blank use their defaults;
    a non-numeric value raises ``ValueError`` naming the offending variable.
    """

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Read environment variable *name* as an integer.

        Args:
            name: Name of the environment variable.
            default: Value returned when the variable is unset or blank.

        Returns:
            The parsed integer, or *default*.

        Raises:
            ValueError: If the variable is set to a non-integer value. The
                message includes the variable name so the bad setting can
                be located.
        """
        raw = os.getenv(name)
        # A blank entry (e.g. "POSTGRES_PORT=" in a .env file) is treated
        # the same as an unset variable instead of crashing in int("").
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError:
            raise ValueError(
                f"Environment variable {name} must be an integer, got {raw!r}"
            ) from None

    def __init__(self):
        # Server Configuration
        self.HOST = os.getenv("HOST", "0.0.0.0")
        # Only the literal "true" (case-insensitive) enables debug mode.
        self.DEBUG = os.getenv("DEBUG", "False").lower() == "true"

        # Database Configuration (shared by NER and RAG)
        self.POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
        self.POSTGRES_PORT = self._env_int("POSTGRES_PORT", 5432)
        self.POSTGRES_USER = os.getenv("POSTGRES_USER", "")
        self.POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        self.POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")

        # Azure OpenAI Configuration (shared by NER and RAG)
        self.AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        self.AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
        self.AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv(
            "AZURE_OPENAI_DEPLOYMENT_NAME", "text-embedding-3-large"
        )

        # Azure Storage Configuration (shared by NER and RAG)
        self.AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
        self.AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
        self.BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")

        # Processing Configuration
        # MAX_FILE_SIZE is supplied in MB and stored in bytes.
        self.MAX_FILE_SIZE = self._env_int("MAX_FILE_SIZE", 50) * 1024 * 1024
        self.REQUEST_TIMEOUT = self._env_int("REQUEST_TIMEOUT", 300)

        # CORS Configuration
        self.ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*")

    def validate_azure_openai(self) -> bool:
        """Return True when the Azure OpenAI endpoint and key are both set
        and neither still holds its ``YOUR_...`` placeholder value."""
        return bool(
            self.AZURE_OPENAI_ENDPOINT and
            self.AZURE_OPENAI_API_KEY and
            self.AZURE_OPENAI_ENDPOINT != "YOUR_AZURE_OPENAI_ENDPOINT" and
            self.AZURE_OPENAI_API_KEY != "YOUR_AZURE_OPENAI_KEY"
        )

    def validate_postgres(self) -> bool:
        """Return True when host, user, password and database are all non-empty."""
        return bool(
            self.POSTGRES_HOST and
            self.POSTGRES_USER and
            self.POSTGRES_PASSWORD and
            self.POSTGRES_DATABASE
        )

    def validate_azure_storage(self) -> bool:
        """Return True when the storage account URL and SAS token are both set."""
        return bool(
            self.AZURE_STORAGE_ACCOUNT_URL and
            self.AZURE_BLOB_SAS_TOKEN
        )
class NERConfig(BaseConfig):
    """Settings for the NER service.

    Extends the shared base settings with the service port, DeepSeek
    credentials, the OCR service URL, supported file extensions and the
    entity / relationship vocabularies.
    """

    def __init__(self):
        super().__init__()
        env = os.getenv

        self.PORT = int(env("NER_PORT", "8500"))

        # DeepSeek Configuration
        self.DEEPSEEK_ENDPOINT = env("DEEPSEEK_ENDPOINT", "")
        self.DEEPSEEK_API_KEY = env("DEEPSEEK_API_KEY", "")
        self.DEEPSEEK_MODEL = env("DEEPSEEK_MODEL", "DeepSeek-R1-0528")

        # OCR Service Configuration
        self.OCR_SERVICE_URL = env("OCR_SERVICE_URL", "http://localhost:8400")

        # NER Specific Settings (fixed values, not read from the environment)
        self.MAX_TEXT_LENGTH = 100_000  # 100KB
        self.SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
        self.SUPPORTED_OCR_FORMATS = {
            '.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif',
        }

        # Entity types
        self.ENTITY_TYPES = [
            "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY",
            "PRODUCT", "EVENT", "VEHICLE", "SUSPICIOUS_OBJECT",
            "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM", "WEAPON", "DRUG",
            "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL",
        ]

        # Relationship types, assembled from named groups; the final list
        # preserves the group order below.
        standard = [
            "works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
            "leads", "reports_to", "collaborates_with", "partners_with", "supplies_to", "acquires",
            "invests_in", "headquartered_in", "operates_in", "born_in", "lives_in", "studied_at",
            "graduated_from", "worked_at", "visited", "attended", "participated_in", "sponsored",
            "developed", "created", "invented", "discovered", "published", "authored", "edited",
        ]
        thai = [
            "ทำงานที่", "ก่อตั้ง", "ตั้งอยู่ที่", "เป็นส่วนหนึ่งของ", "เกี่ยวข้องกับ", "เป็นเจ้าของ", "จัดการ",
            "นำโดย", "รายงานต่อ", "ร่วมงานกับ", "เป็นพันธมิตรกับ", "จัดหาให้", "ซื้อกิจการ", "ลงทุนใน",
            "สำนักงานใหญ่ที่", "ดำเนินการใน", "เกิดที่", "อาศัยอยู่ที่", "ศึกษาที่", "จบการศึกษาจาก",
        ]
        law_enforcement = [
            "arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of", "witness_of",
            "victim_of", "suspect_in", "charged_with", "convicted_of", "sentenced_by", "defended_by",
            "prosecuted_by", "testified_against", "alibi_for", "found_at", "seized_from", "linked_to",
        ]
        # NOTE: "ยึดจาก" occurs twice in this group; the duplicate is kept
        # so the resulting list is unchanged.
        law_enforcement_thai = [
            "จับกุมโดย", "สอบสวนโดย", "ยึดจาก", "ใช้ในการ", "หลักฐานของ", "พยานใน", "เหยื่อของ",
            "ผู้ต้องสงสัยใน", "ถูกตั้งข้อหา", "ถูกตัดสิน", "ถูกพิพากษาโดย", "ต่อสู้คดีโดย", "ฟ้องร้องโดย",
            "ให้การต่อต้าน", "เป็นข้อแก้ตัวสำหรับ", "พบที่", "ยึดจาก", "เชื่อมโยงกับ",
        ]
        criminal = [
            "possess_illegal", "transport_illegal", "sell_illegal", "buy_illegal", "hide_evidence",
            "plan_crime", "commit_crime", "flee_from", "escape_from", "hide_at", "meet_with",
            "communicate_with", "threaten", "blackmail", "bribe", "corrupt", "money_launder",
        ]
        criminal_thai = [
            "ครอบครองของผิดกฎหมาย", "ขนส่งของผิดกฎหมาย", "ขายของผิดกฎหมาย", "ซื้อของผิดกฎหมาย",
            "ซ่อนหลักฐาน", "วางแผนอาชญากรรม", "กระทำอาชญากรรม", "หลบหนีจาก", "แอบซ่อนที่",
            "พบปะกับ", "ติดต่อกับ", "ข่มขู่", "แบล็คเมล์", "ให้สินบน", "ทุจริต", "ฟอกเงิน",
        ]
        self.RELATIONSHIP_TYPES = [
            *standard,
            *thai,
            *law_enforcement,
            *law_enforcement_thai,
            *criminal,
            *criminal_thai,
        ]

    def validate_deepseek(self) -> bool:
        """Return True when the DeepSeek endpoint and key are both set and
        neither still holds its ``YOUR_...`` placeholder value."""
        endpoint = self.DEEPSEEK_ENDPOINT
        api_key = self.DEEPSEEK_API_KEY
        if not (endpoint and api_key):
            return False
        return (
            endpoint != "YOUR_DEEPSEEK_ENDPOINT"
            and api_key != "YOUR_DEEPSEEK_API_KEY"
        )
class OCRConfig(BaseConfig):
    """Settings for the OCR service.

    Extends the shared base settings with the service port, Azure Document
    Intelligence credentials and web-scraping options.
    """

    def __init__(self):
        super().__init__()
        env = os.getenv

        self.PORT = int(env("OCR_PORT", "8400"))

        # Azure Document Intelligence Configuration
        self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = env(
            "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", ""
        )
        self.AZURE_DOCUMENT_INTELLIGENCE_KEY = env(
            "AZURE_DOCUMENT_INTELLIGENCE_KEY", ""
        )

        # Web scraping configuration
        self.MAX_IMAGES_PER_PAGE = int(env("MAX_IMAGES_PER_PAGE", "10"))
        default_user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
        self.USER_AGENT = env("USER_AGENT", default_user_agent)

    def validate_azure_document_intelligence(self) -> bool:
        """Return True when the Document Intelligence endpoint and key are
        both set and neither still holds its ``YOUR_...`` placeholder value."""
        endpoint = self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
        key = self.AZURE_DOCUMENT_INTELLIGENCE_KEY
        if not (endpoint and key):
            return False
        return (
            endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
            and key != "YOUR_FORM_RECOGNIZER_KEY"
        )
class RAGConfig(BaseConfig):
    """Settings for the RAG service.

    Extends the shared base settings with the service port, the OCR service
    URL, ``PG_*`` database attributes, chunking parameters and RAG-specific
    Azure OpenAI options.
    """

    def __init__(self):
        super().__init__()
        env = os.getenv

        self.PORT = int(env("RAG_PORT", "8401"))

        # OCR Service Configuration
        self.OCR_SERVICE_URL = env("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration (specific to RAG).
        # Host, port, user and password mirror the shared POSTGRES_* values;
        # only the database name and SSL mode are read separately.
        self.PG_HOST, self.PG_PORT = self.POSTGRES_HOST, self.POSTGRES_PORT
        self.PG_DATABASE = env("PG_DATABASE", "vectorsearch")  # RAG uses different default DB
        self.PG_USER, self.PG_PASSWORD = self.POSTGRES_USER, self.POSTGRES_PASSWORD
        self.PG_SSL_MODE = env("PG_SSL_MODE", "require")

        # Chunking Configuration
        self.CHUNK_SIZE = int(env("CHUNK_SIZE", "1000"))
        self.CHUNK_OVERLAP = int(env("CHUNK_OVERLAP", "200"))
        self.MIN_CHUNK_SIZE = int(env("MIN_CHUNK_SIZE", "50"))

        # Azure OpenAI Configuration (RAG specific)
        self.AZURE_OPENAI_DEPLOYMENT = env(
            "AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small"
        )
        self.AZURE_OPENAI_API_VERSION = env(
            "AZURE_OPENAI_API_VERSION", "2024-12-01-preview"
        )
class UnifiedConfig:
    """Unified configuration for all services.

    Holds one configuration object per service (``ner``, ``ocr``, ``rag``)
    together with the main-app settings, the localhost service URLs derived
    from each service's port, health-check constants and service weights.
    """

    def __init__(self):
        self.ner = NERConfig()
        self.ocr = OCRConfig()
        self.rag = RAGConfig()

        # Main app configuration
        self.MAIN_PORT = int(os.getenv("MAIN_PORT", "8000"))
        self.MAIN_HOST = os.getenv("MAIN_HOST", "0.0.0.0")

        # Service URLs (for inter-service communication); always localhost,
        # built from each service's configured port.
        self.NER_SERVICE_URL = f"http://localhost:{self.ner.PORT}"
        self.OCR_SERVICE_URL = f"http://localhost:{self.ocr.PORT}"
        self.RAG_SERVICE_URL = f"http://localhost:{self.rag.PORT}"

        # Service Health Check Configuration
        self.HEALTH_CHECK_TIMEOUT = 30
        self.HEALTH_CHECK_RETRIES = 3
        self.HEALTH_CHECK_INTERVAL = 5

        # Load balancing and routing
        self.SERVICE_WEIGHTS = {
            "ner": 1.0,
            "ocr": 1.0,
            "rag": 1.0,
        }

    @staticmethod
    def _status_icon(is_valid: bool) -> str:
        """Return the check-mark or cross icon used in the printed summary."""
        return '✅' if is_valid else '❌'

    def validate_all(self) -> Dict[str, Dict[str, bool]]:
        """Validate all service configurations.

        Returns:
            A mapping of service name (``"ner"``, ``"ocr"``, ``"rag"``) to a
            mapping of component name to its validation result.
        """
        return {
            "ner": {
                "deepseek": self.ner.validate_deepseek(),
                "azure_openai": self.ner.validate_azure_openai(),
                "postgres": self.ner.validate_postgres(),
                "azure_storage": self.ner.validate_azure_storage(),
            },
            "ocr": {
                "azure_document_intelligence": self.ocr.validate_azure_document_intelligence(),
            },
            "rag": {
                "azure_openai": self.rag.validate_azure_openai(),
                "postgres": self.rag.validate_postgres(),
            },
        }

    def get_service_config(self, service_name: str) -> Optional[BaseConfig]:
        """Get configuration for a specific service.

        Args:
            service_name: ``"ner"``, ``"ocr"`` or ``"rag"`` (case-insensitive).

        Returns:
            The matching service configuration, or ``None`` when the name is
            unknown or is not a string.
        """
        if not isinstance(service_name, str):
            return None
        service_configs = {
            "ner": self.ner,
            "ocr": self.ocr,
            "rag": self.rag,
        }
        return service_configs.get(service_name.lower())

    def get_database_config(self) -> Dict[str, str]:
        """Get database configuration for services that need it.

        Connection values are taken from the NER configuration; the SSL mode
        is taken from the RAG configuration, defaulting to ``"require"``.
        The port is returned as a string.
        """
        return {
            "host": self.ner.POSTGRES_HOST,
            "port": str(self.ner.POSTGRES_PORT),
            "user": self.ner.POSTGRES_USER,
            "password": self.ner.POSTGRES_PASSWORD,
            "database": self.ner.POSTGRES_DATABASE,
            "ssl_mode": getattr(self.rag, 'PG_SSL_MODE', 'require'),
        }

    def get_azure_openai_config(self) -> Dict[str, str]:
        """Get Azure OpenAI configuration (read from the NER configuration)."""
        return {
            "endpoint": self.ner.AZURE_OPENAI_ENDPOINT,
            "api_key": self.ner.AZURE_OPENAI_API_KEY,
            "embedding_model": self.ner.EMBEDDING_MODEL,
            "deployment_name": self.ner.AZURE_OPENAI_DEPLOYMENT_NAME,
        }

    def print_configuration_summary(self):
        """Print a summary of all configurations to stdout.

        Runs ``validate_all()`` once, prints a per-service status section,
        then lists every component that failed validation (or a success
        line when none failed).
        """
        icon = self._status_icon

        print("🔧 Configuration Summary")
        print("=" * 50)

        # Validate all configurations
        validation_results = self.validate_all()
        ner_checks = validation_results['ner']
        ocr_checks = validation_results['ocr']
        rag_checks = validation_results['rag']

        # NER Service
        print(f"📝 NER Service (Port {self.ner.PORT}):")
        print(f" DeepSeek: {icon(ner_checks['deepseek'])}")
        print(f" Azure OpenAI: {icon(ner_checks['azure_openai'])}")
        print(f" PostgreSQL: {icon(ner_checks['postgres'])}")
        print(f" Azure Storage: {icon(ner_checks['azure_storage'])}")
        print(f" OCR Service URL: {self.ner.OCR_SERVICE_URL}")

        # OCR Service
        print(f"\n🔍 OCR Service (Port {self.ocr.PORT}):")
        print(f" Azure Document Intelligence: {icon(ocr_checks['azure_document_intelligence'])}")
        print(f" Max File Size: {self.ocr.MAX_FILE_SIZE / (1024*1024):.0f} MB")

        # RAG Service
        print(f"\n🧠 RAG Service (Port {self.rag.PORT}):")
        print(f" Azure OpenAI: {icon(rag_checks['azure_openai'])}")
        print(f" PostgreSQL: {icon(rag_checks['postgres'])}")
        print(f" OCR Service URL: {self.rag.OCR_SERVICE_URL}")
        print(f" Chunk Size: {self.rag.CHUNK_SIZE}")

        # Main App
        print(f"\n🌐 Main App (Port {self.MAIN_PORT}):")
        print(f" NER Service: {self.NER_SERVICE_URL}")
        print(f" OCR Service: {self.OCR_SERVICE_URL}")
        print(f" RAG Service: {self.RAG_SERVICE_URL}")

        # Database Configuration (the password is intentionally not printed)
        print("\n🗄️ Database Configuration:")
        print(f" Host: {self.ner.POSTGRES_HOST}")
        print(f" Port: {self.ner.POSTGRES_PORT}")
        print(f" User: {self.ner.POSTGRES_USER}")
        print(f" NER Database: {self.ner.POSTGRES_DATABASE}")
        print(f" RAG Database: {self.rag.PG_DATABASE}")

        # Critical Issues: collect every failed (service, component) pair.
        failures = []
        for service, validations in validation_results.items():
            for component, is_valid in validations.items():
                if not is_valid:
                    failures.append((service, component))

        if failures:
            print("\n⚠️ CONFIGURATION ISSUES DETECTED:")
            for service, component in failures:
                print(f" ❌ {service.upper()}: {component} not configured")
        else:
            print("\n✅ All configurations are valid!")
# Shared module-level configuration object, created once when this module
# is imported.
config: UnifiedConfig = UnifiedConfig()


def get_config() -> UnifiedConfig:
    """Return the module-level ``config`` instance."""
    return config
def validate_environment() -> bool:
    """Report whether every critical component passes validation.

    Uses the module-level ``config``. The critical set is Azure OpenAI and
    PostgreSQL for NER and RAG, plus Azure Document Intelligence for OCR;
    the other results from ``validate_all()`` are not consulted.
    """
    results = config.validate_all()

    # (service, component) pairs that must all be valid.
    critical_keys = (
        ('ner', 'azure_openai'),
        ('ner', 'postgres'),
        ('ocr', 'azure_document_intelligence'),
        ('rag', 'azure_openai'),
        ('rag', 'postgres'),
    )
    statuses = [results[service][component] for service, component in critical_keys]
    return all(statuses)
if __name__ == "__main__":
    # Test configuration loading and validation when run as a script.
    print("🧪 Testing Configuration Loading")
    print("=" * 40)
    try:
        config.print_configuration_summary()
        if validate_environment():
            print("\n🎉 Environment validation passed!")
            print("All critical services are properly configured.")
        else:
            print("\n❌ Environment validation failed!")
            print("Some critical services are not properly configured.")
            print("Please check your .env file and update missing values.")
    except Exception as e:
        # Top-level boundary of the script: report the failure to the user
        # and log it with the traceback so the cause can be diagnosed.
        print(f"\n❌ Configuration loading failed: {e}")
        logger.exception("Configuration error: %s", e)