Upload 3 files
Browse files- service/ner_service.py +1582 -0
- service/ocr_service.py +588 -0
- service/rag_service.py +1367 -0
service/ner_service.py
ADDED
|
@@ -0,0 +1,1582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Enhanced NER Analysis Service - Cleaned and Optimized
|
| 4 |
+
Advanced Named Entity Recognition with Thai language support,
|
| 5 |
+
relationship extraction, and graph database exports
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import io
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import re
|
| 13 |
+
import csv
|
| 14 |
+
import tempfile
|
| 15 |
+
import zipfile
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
from typing import Optional, List, Dict, Any, Union, Tuple
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from contextlib import asynccontextmanager
|
| 20 |
+
from collections import defaultdict
|
| 21 |
+
import xml.etree.ElementTree as ET
|
| 22 |
+
|
| 23 |
+
import httpx
|
| 24 |
+
import asyncpg
|
| 25 |
+
from azure.storage.blob import BlobServiceClient
|
| 26 |
+
from azure.core.credentials import AzureKeyCredential
|
| 27 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, Form, BackgroundTasks
|
| 28 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 29 |
+
from fastapi.responses import FileResponse
|
| 30 |
+
from pydantic import BaseModel, HttpUrl, field_validator
|
| 31 |
+
import uvicorn
|
| 32 |
+
import docx
|
| 33 |
+
from azure.ai.inference import ChatCompletionsClient
|
| 34 |
+
from azure.ai.inference.models import SystemMessage, UserMessage
|
| 35 |
+
from openai import AzureOpenAI
|
| 36 |
+
|
| 37 |
+
# Import unified configuration
|
| 38 |
+
try:
|
| 39 |
+
from configs import get_config
|
| 40 |
+
config = get_config().ner
|
| 41 |
+
unified_config = get_config()
|
| 42 |
+
print("β
Using unified configuration")
|
| 43 |
+
except ImportError:
|
| 44 |
+
print("β οΈ Unified config not available, using fallback configuration")
|
| 45 |
+
# Fallback configuration
|
| 46 |
+
from dotenv import load_dotenv
|
| 47 |
+
load_dotenv()
|
| 48 |
+
|
| 49 |
+
class FallbackConfig:
|
| 50 |
+
HOST = os.getenv("HOST", "0.0.0.0")
|
| 51 |
+
PORT = int(os.getenv("NER_PORT", "8500"))
|
| 52 |
+
DEBUG = os.getenv("DEBUG", "False").lower() == "true"
|
| 53 |
+
|
| 54 |
+
# Database
|
| 55 |
+
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "")
|
| 56 |
+
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
|
| 57 |
+
POSTGRES_USER = os.getenv("POSTGRES_USER", "")
|
| 58 |
+
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
|
| 59 |
+
POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE", "postgres")
|
| 60 |
+
|
| 61 |
+
# APIs
|
| 62 |
+
OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")
|
| 63 |
+
DEEPSEEK_ENDPOINT = os.getenv("DEEPSEEK_ENDPOINT", "")
|
| 64 |
+
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
| 65 |
+
DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "DeepSeek-R1-0528")
|
| 66 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
|
| 67 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
|
| 68 |
+
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")
|
| 69 |
+
|
| 70 |
+
# Storage
|
| 71 |
+
AZURE_STORAGE_ACCOUNT_URL = os.getenv("AZURE_STORAGE_ACCOUNT_URL", "")
|
| 72 |
+
AZURE_BLOB_SAS_TOKEN = os.getenv("AZURE_BLOB_SAS_TOKEN", "")
|
| 73 |
+
BLOB_CONTAINER = os.getenv("BLOB_CONTAINER", "historylog")
|
| 74 |
+
|
| 75 |
+
# Limits
|
| 76 |
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
| 77 |
+
MAX_TEXT_LENGTH = 100000 # 100KB
|
| 78 |
+
|
| 79 |
+
SUPPORTED_TEXT_FORMATS = {'.txt', '.doc', '.docx', '.rtf'}
|
| 80 |
+
SUPPORTED_OCR_FORMATS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'}
|
| 81 |
+
|
| 82 |
+
ENTITY_TYPES = [
|
| 83 |
+
"PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PRODUCT", "EVENT",
|
| 84 |
+
"VEHICLE", "SUSPICIOUS_OBJECT", "ILLEGAL_ACTIVITY", "EVIDENCE", "ILLEGAL_ITEM",
|
| 85 |
+
"WEAPON", "DRUG", "CHEMICAL", "DOCUMENT", "PHONE_NUMBER", "ADDRESS", "EMAIL"
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
RELATIONSHIP_TYPES = [
|
| 89 |
+
"works_for", "founded", "located_in", "part_of", "associated_with", "owns", "manages",
|
| 90 |
+
"ΰΈΰΈ³ΰΈΰΈ²ΰΈΰΈΰΈ΅ΰΉ", "ΰΈΰΉΰΈΰΈΰΈ±ΰΉΰΈ", "ΰΈΰΈ±ΰΉΰΈΰΈΰΈ’ΰΈΉΰΉΰΈΰΈ΅ΰΉ", "ΰΉΰΈΰΈ΅ΰΉΰΈ’ΰΈ§ΰΈΰΉΰΈΰΈΰΈΰΈ±ΰΈ", "ΰΉΰΈΰΉΰΈΰΉΰΈΰΉΰΈ²ΰΈΰΈΰΈ",
|
| 91 |
+
"arrested_by", "investigated_by", "confiscated_from", "used_in", "evidence_of",
|
| 92 |
+
"ΰΈΰΈ±ΰΈΰΈΰΈΈΰΈ‘ΰΉΰΈΰΈ’", "ΰΈͺΰΈΰΈΰΈͺΰΈ§ΰΈΰΉΰΈΰΈ’", "ΰΈ’ΰΈΆΰΈΰΈΰΈ²ΰΈ", "ΰΈ«ΰΈ₯ΰΈ±ΰΈΰΈΰΈ²ΰΈΰΈΰΈΰΈ"
|
| 93 |
+
]
|
| 94 |
+
|
| 95 |
+
config = FallbackConfig()
|
| 96 |
+
|
| 97 |
+
# Setup logging
|
| 98 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 99 |
+
logger = logging.getLogger(__name__)
|
| 100 |
+
|
| 101 |
+
# Export directories
|
| 102 |
+
EXPORT_DIR = Path("exports")
|
| 103 |
+
EXPORT_DIR.mkdir(exist_ok=True)
|
| 104 |
+
|
| 105 |
+
# Global variables
|
| 106 |
+
pg_pool = None
|
| 107 |
+
vector_available = False
|
| 108 |
+
clients = {}
|
| 109 |
+
|
| 110 |
+
# Pydantic Models
|
| 111 |
+
class NERRequest(BaseModel):
    """Request payload for a single-source NER analysis.

    Exactly one of `text` or `url` is expected by the caller; a `url`
    source is fetched and converted to text via the OCR service.
    """
    # Raw text to analyze; capped at config.MAX_TEXT_LENGTH by the validator.
    text: Optional[str] = None
    # Remote document to fetch/OCR instead of supplying `text` directly.
    url: Optional[HttpUrl] = None
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True
    generate_graph_files: bool = True
    # Which graph export artifacts to produce (see ExportFiles).
    export_formats: List[str] = ["neo4j", "json", "graphml"]

    @field_validator('text')
    @classmethod
    def validate_text_length(cls, v):
        """Reject text longer than the configured size limit."""
        if v and len(v) > config.MAX_TEXT_LENGTH:
            raise ValueError(f"Text too long (max {config.MAX_TEXT_LENGTH} characters)")
        return v
| 126 |
+
|
| 127 |
+
class MultiInputRequest(BaseModel):
    """Request payload for analyzing multiple texts and/or URLs in one call."""
    texts: Optional[List[str]] = None
    urls: Optional[List[HttpUrl]] = None
    extract_relationships: bool = True
    include_embeddings: bool = True
    include_summary: bool = True
    # When True, individual results are merged into one combined analysis.
    combine_results: bool = True
    generate_graph_files: bool = True
    export_formats: List[str] = ["neo4j", "json", "graphml"]
| 136 |
+
|
| 137 |
+
class EntityResult(BaseModel):
    """A single extracted named entity with position and scoring info."""
    id: str
    # Surface form of the entity as found in the source text.
    text: str
    # Entity type label (one of the configured ENTITY_TYPES).
    label: str
    confidence: float
    # Character offsets into the analyzed text.
    start_pos: int
    end_pos: int
    # For multi-source analyses: which input produced this entity.
    source_type: Optional[str] = None
    source_index: Optional[int] = None
    # How many times the entity occurred across the analyzed text(s).
    frequency: int = 1
    importance_score: float = 0.0
    metadata: Optional[Dict[str, Any]] = None
| 149 |
+
|
| 150 |
+
class RelationshipResult(BaseModel):
    """A directed relationship between two extracted entities."""
    id: str
    # IDs referencing EntityResult.id values.
    source_entity_id: str
    target_entity_id: str
    # Denormalized entity surface forms for convenience.
    source_entity: str
    target_entity: str
    # One of the configured RELATIONSHIP_TYPES.
    relationship_type: str
    confidence: float
    strength: float
    # Text snippet that evidences the relationship.
    context: str
    evidence_count: int = 1
    # True when the relation holds in both directions.
    bidirectional: bool = False
    metadata: Optional[Dict[str, Any]] = None
| 163 |
+
|
| 164 |
+
class NodeResult(BaseModel):
    """A graph node derived from an entity, for graph exports."""
    id: str
    # Display label (entity text).
    label: str
    # Entity type (node category in the graph).
    type: str
    confidence: float
    frequency: int = 1
    importance_score: float = 0.0
    # Arbitrary extra attributes carried into the export formats.
    properties: Dict[str, Any]
| 172 |
+
|
| 173 |
+
class LinkResult(BaseModel):
    """A graph edge derived from a relationship, for graph exports."""
    id: str
    # Node IDs of the endpoints.
    source: str
    target: str
    # Relationship type used as the edge label.
    relationship: str
    confidence: float
    strength: float
    evidence_count: int = 1
    properties: Dict[str, Any]
| 182 |
+
|
| 183 |
+
class GraphData(BaseModel):
    """Complete node-link graph produced by an analysis."""
    nodes: List[NodeResult]
    links: List[LinkResult]
    # Graph-level statistics and provenance information.
    metadata: Dict[str, Any]
| 187 |
+
|
| 188 |
+
class ExportFiles(BaseModel):
    """Paths/URLs of generated export artifacts; None when not produced."""
    # Neo4j bulk-import CSVs.
    neo4j_nodes: Optional[str] = None
    neo4j_relationships: Optional[str] = None
    json_export: Optional[str] = None
    graphml_export: Optional[str] = None
    # Generic CSV node/edge lists.
    csv_nodes: Optional[str] = None
    csv_edges: Optional[str] = None
    # Gephi GEXF format.
    gexf_export: Optional[str] = None
    analysis_report: Optional[str] = None
    # ZIP bundle containing all of the above.
    download_bundle: Optional[str] = None
| 198 |
+
|
| 199 |
+
class NERResponse(BaseModel):
    """Full result of a single NER analysis run."""
    success: bool
    analysis_id: str
    source_text: str
    # How the text was obtained (e.g. direct text, file, url/OCR).
    source_type: str
    # Detected language code: 'th', 'en', or 'mixed'.
    language: str
    entities: List[EntityResult]
    keywords: List[str]
    relationships: List[RelationshipResult]
    summary: str
    # Document-level embedding vector, when requested.
    embeddings: Optional[List[float]] = None
    graph_data: GraphData
    export_files: ExportFiles
    # Wall-clock processing time in seconds.
    processing_time: float
    character_count: int
    word_count: int
    sentence_count: int
    # Aggregate entity/relationship statistics (counts per type, etc.).
    entity_relationship_stats: Dict[str, Any]
    error: Optional[str] = None
| 218 |
+
|
| 219 |
+
class MultiNERResponse(BaseModel):
    """Result of a multi-source analysis: one combined plus per-source results."""
    success: bool
    analysis_id: str
    # Merged analysis across all inputs.
    combined_analysis: NERResponse
    # One NERResponse per input text/URL.
    individual_analyses: List[NERResponse]
    processing_time: float
    total_sources: int
    error: Optional[str] = None
| 227 |
+
|
| 228 |
+
# Utility Functions
|
| 229 |
+
def generate_unique_id(prefix: str = "item") -> str:
    """Return a unique ID of the form ``<prefix>_<epoch-milliseconds>``.

    Bug fix: the original used ``datetime.utcnow().timestamp()``. ``utcnow()``
    returns a *naive* datetime, and ``.timestamp()`` interprets naive values
    in the local timezone, so the produced value was offset from the real
    Unix epoch by the machine's UTC offset. Using a timezone-aware "now"
    yields correct epoch milliseconds (and avoids the 3.12 utcnow()
    deprecation).
    """
    from datetime import timezone  # module level only imports `datetime`
    return f"{prefix}_{int(datetime.now(timezone.utc).timestamp() * 1000)}"
| 232 |
+
|
| 233 |
+
def normalize_text(text: str) -> str:
    """Return *text* lowercased, trimmed, with inner whitespace collapsed.

    Used to put entity strings into a canonical form before comparison.
    """
    # str.split() with no args splits on (and strips) any whitespace run,
    # which is equivalent to trimming plus collapsing \s+ to single spaces.
    return " ".join(text.lower().split())
| 236 |
+
|
| 237 |
+
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Jaccard word-overlap similarity between two texts, in [0.0, 1.0].

    Both inputs are lowercased and whitespace-normalized first; texts that
    are identical after normalization score exactly 1.0.
    """
    def _canon(s: str) -> str:
        # Same canonical form normalize_text() produces.
        return re.sub(r'\s+', ' ', s.strip().lower())

    left, right = _canon(text1), _canon(text2)
    if left == right:
        return 1.0

    left_words, right_words = set(left.split()), set(right.split())

    if not left_words and not right_words:
        return 1.0
    if not left_words or not right_words:
        return 0.0

    shared = left_words & right_words
    combined = left_words | right_words
    return len(shared) / len(combined) if combined else 0.0
| 257 |
+
|
| 258 |
+
def deduplicate_entities(entities: List[Dict[str, Any]], similarity_threshold: float = 0.8) -> List[Dict[str, Any]]:
    """Remove duplicate entities based on text similarity.

    Keeps the higher-confidence member of each near-duplicate pair.
    NOTE: this mutates the kept input dicts (assigns an 'id' key) and only
    compares a candidate against the *first* sufficiently similar entity
    already kept — later near-duplicates in `deduplicated` are not checked.
    """
    if not entities:
        return []

    deduplicated = []
    # Normalized texts already accepted; exact re-occurrences are skipped fast.
    processed_texts = set()

    for entity in entities:
        entity_text = entity.get('text', '').strip()
        normalized_text = normalize_text(entity_text)

        # Skip empties and exact (normalized) repeats.
        if not entity_text or normalized_text in processed_texts:
            continue

        is_duplicate = False
        for existing_entity in deduplicated:
            existing_text = existing_entity.get('text', '')
            similarity = calculate_text_similarity(entity_text, existing_text)

            if similarity >= similarity_threshold:
                if entity.get('confidence', 0) > existing_entity.get('confidence', 0):
                    # New entity wins: drop the weaker kept one, then fall
                    # through (is_duplicate stays False) so it gets appended.
                    deduplicated.remove(existing_entity)
                    break
                else:
                    # Existing entity wins: discard the candidate.
                    is_duplicate = True
                    break

        if not is_duplicate:
            # Ensure every kept entity has a stable id (mutates the dict).
            entity['id'] = entity.get('id', generate_unique_id('ent'))
            deduplicated.append(entity)
            processed_texts.add(normalized_text)

    return deduplicated
| 292 |
+
|
| 293 |
+
def detect_language(text: str) -> str:
    """Detect the dominant language of *text*: 'th', 'en', or 'mixed'.

    Counts Thai vs. Latin letters and classifies by the Thai ratio:
    > 30% -> 'th', 10-30% -> 'mixed', otherwise 'en'. Empty or
    symbol/digit-only text defaults to 'en'.

    Bug fix: the Thai character class had been mojibake-corrupted and no
    longer matched any Thai character; \u0E00-\u0E7F is the Unicode Thai
    block.
    """
    if not text:
        return "en"

    thai_chars = len(re.findall(r'[\u0E00-\u0E7F]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    total_chars = thai_chars + english_chars

    # No letters at all (digits/punctuation only): fall back to English.
    if total_chars == 0:
        return "en"

    thai_ratio = thai_chars / total_chars

    if thai_ratio > 0.3:
        return "th"
    elif thai_ratio > 0.1:
        return "mixed"
    else:
        return "en"
| 313 |
+
|
| 314 |
+
def get_text_stats(text: str) -> Dict[str, int]:
    """Compute basic size metrics for *text*.

    Returns counts of characters, whitespace-separated words, sentence
    terminators ([.!?] runs), non-empty blank-line-separated paragraphs,
    and newline-delimited lines.
    """
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]
    sentence_marks = re.findall(r'[.!?]+', text)

    return {
        "character_count": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(sentence_marks),
        "paragraph_count": len(paragraphs),
        "line_count": len(text.split('\n')),
    }
| 323 |
+
|
| 324 |
+
# Client Management
|
| 325 |
+
def get_blob_client():
    """Lazily build and cache the Azure Blob service client.

    Returns None when storage credentials are not configured or when
    initialization fails (the error is logged, not raised).
    """
    not_cached = clients.get('blob') is None
    configured = bool(config.AZURE_STORAGE_ACCOUNT_URL) and bool(config.AZURE_BLOB_SAS_TOKEN)
    if not_cached and configured:
        try:
            clients['blob'] = BlobServiceClient(
                account_url=config.AZURE_STORAGE_ACCOUNT_URL,
                credential=config.AZURE_BLOB_SAS_TOKEN,
            )
        except Exception as exc:
            logger.error(f"Failed to initialize blob client: {exc}")
    return clients.get('blob')
| 335 |
+
|
| 336 |
+
def get_deepseek_client():
    """Lazily build and cache the DeepSeek chat-completions client.

    Returns None when the endpoint/key are not configured or when
    initialization fails (the error is logged, not raised).
    """
    not_cached = clients.get('deepseek') is None
    configured = bool(config.DEEPSEEK_ENDPOINT) and bool(config.DEEPSEEK_API_KEY)
    if not_cached and configured:
        try:
            clients['deepseek'] = ChatCompletionsClient(
                endpoint=config.DEEPSEEK_ENDPOINT,
                credential=AzureKeyCredential(config.DEEPSEEK_API_KEY),
                api_version="2024-05-01-preview",
            )
        except Exception as exc:
            logger.error(f"Failed to initialize DeepSeek client: {exc}")
    return clients.get('deepseek')
| 347 |
+
|
| 348 |
+
def get_openai_client():
    """Lazily build and cache the Azure OpenAI client (used for embeddings).

    Returns None when the endpoint/key are not configured or when
    initialization fails (the error is logged, not raised).
    """
    not_cached = clients.get('openai') is None
    configured = bool(config.AZURE_OPENAI_ENDPOINT) and bool(config.AZURE_OPENAI_API_KEY)
    if not_cached and configured:
        try:
            clients['openai'] = AzureOpenAI(
                api_version="2024-12-01-preview",
                azure_endpoint=config.AZURE_OPENAI_ENDPOINT,
                api_key=config.AZURE_OPENAI_API_KEY,
            )
        except Exception as exc:
            logger.error(f"Failed to initialize OpenAI client: {exc}")
    return clients.get('openai')
| 359 |
+
|
| 360 |
+
# Database Operations
|
| 361 |
+
async def init_database():
    """Create the asyncpg connection pool, probe pgvector, and ensure tables.

    Sets the module globals ``pg_pool`` and ``vector_available``.
    Returns True on success, False when connection or setup fails.

    Fixes: the pgvector probe used a bare ``except:``, which in async code
    also swallows ``asyncio.CancelledError`` (a BaseException) — narrowed
    to ``except Exception``. Mojibake-corrupted emoji in log messages
    restored.
    """
    global pg_pool, vector_available

    logger.info("🔌 Connecting to database...")
    try:
        pg_pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            database=config.POSTGRES_DATABASE,
            ssl='require',
            min_size=2,
            max_size=10,
            command_timeout=60
        )

        async with pg_pool.acquire() as conn:
            logger.info("✅ Database connected")

            # Probe for the pgvector extension; fall back to JSONB storage
            # of embeddings when it cannot be created or used.
            try:
                await conn.execute("CREATE EXTENSION IF NOT EXISTS vector;")
                await conn.fetchval("SELECT '[1,2,3]'::vector(3)")
                vector_available = True
                logger.info("✅ Vector extension available")
            except Exception:
                vector_available = False
                logger.info("⚠️ Vector extension not available (using JSONB)")

            # Create tables/indexes if they do not exist yet.
            await create_tables(conn)
            logger.info("✅ Database setup complete")

        return True
    except Exception as e:
        logger.error(f"❌ Database init failed: {e}")
        return False
| 399 |
+
|
| 400 |
+
async def create_tables(conn):
    """Create enhanced database tables for the entity-relationship model.

    Idempotent: all DDL uses IF NOT EXISTS. Creates three tables —
    ner_analyses (one row per analysis run, with denormalized JSONB
    copies of results), entities, and relationships (both keyed back to
    ner_analyses.analysis_id with ON DELETE CASCADE) — plus lookup
    indexes on analysis_id.
    """

    # Master table: one row per analysis; entities/relationships/graph data
    # are also stored denormalized as JSONB for fast single-row retrieval.
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS ner_analyses (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            analysis_id VARCHAR(255) UNIQUE NOT NULL,
            source_text TEXT NOT NULL,
            source_type VARCHAR(50) NOT NULL,
            language VARCHAR(10) DEFAULT 'en',
            entities JSONB NOT NULL DEFAULT '[]',
            keywords JSONB NOT NULL DEFAULT '[]',
            relationships JSONB NOT NULL DEFAULT '[]',
            summary TEXT DEFAULT '',
            embeddings JSONB DEFAULT '[]',
            graph_data JSONB DEFAULT '{}',
            export_files JSONB DEFAULT '{}',
            text_stats JSONB DEFAULT '{}',
            er_stats JSONB DEFAULT '{}',
            processing_time FLOAT DEFAULT 0,
            entity_types JSONB DEFAULT '[]',
            relationship_types JSONB DEFAULT '[]',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
    """)

    # Normalized per-entity rows, cascade-deleted with their analysis.
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS entities (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            entity_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            text VARCHAR(1000) NOT NULL,
            label VARCHAR(100) NOT NULL,
            confidence FLOAT DEFAULT 0,
            start_pos INTEGER DEFAULT 0,
            end_pos INTEGER DEFAULT 0,
            frequency INTEGER DEFAULT 1,
            importance_score FLOAT DEFAULT 0,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)

    # Normalized per-relationship rows, cascade-deleted with their analysis.
    await conn.execute("""
        CREATE TABLE IF NOT EXISTS relationships (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            relationship_id VARCHAR(255) NOT NULL,
            analysis_id VARCHAR(255) NOT NULL,
            source_entity_id VARCHAR(255) NOT NULL,
            target_entity_id VARCHAR(255) NOT NULL,
            source_entity VARCHAR(1000) NOT NULL,
            target_entity VARCHAR(1000) NOT NULL,
            relationship_type VARCHAR(200) NOT NULL,
            confidence FLOAT DEFAULT 0,
            strength FLOAT DEFAULT 0,
            context TEXT DEFAULT '',
            evidence_count INTEGER DEFAULT 1,
            bidirectional BOOLEAN DEFAULT FALSE,
            metadata JSONB DEFAULT '{}',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (analysis_id) REFERENCES ner_analyses(analysis_id) ON DELETE CASCADE
        );
    """)

    # Create indexes — best-effort; failures are deliberately ignored.
    # NOTE(review): this bare `except:` also swallows BaseException
    # (e.g. CancelledError); consider narrowing to `except Exception`.
    try:
        await conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_analysis_id ON ner_analyses(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_entities_analysis ON entities(analysis_id);
            CREATE INDEX IF NOT EXISTS idx_relationships_analysis ON relationships(analysis_id);
        """)
    except:
        pass
| 474 |
+
|
| 475 |
+
# Text Extraction
|
| 476 |
+
def extract_text_from_file(file_content: bytes, filename: str) -> str:
    """Decode raw file bytes to text based on the filename extension.

    ``.docx`` files are parsed with python-docx (paragraphs joined with
    newlines); everything else — including ``.txt`` — is decoded as UTF-8
    with undecodable bytes silently dropped.
    """
    suffix = Path(filename).suffix.lower()

    if suffix == '.docx':
        document = docx.Document(io.BytesIO(file_content))
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)

    # .txt and any unrecognized extension: best-effort UTF-8 decode.
    return file_content.decode('utf-8', errors='ignore')
| 486 |
+
|
| 487 |
+
async def get_text_from_ocr(file_content: bytes, filename: str) -> str:
    """Send a binary document to the OCR microservice and return its text.

    Raises HTTPException(500) when the service is unreachable, errors out,
    or answers with a non-200 status.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            upload = {'file': (filename, file_content)}
            resp = await client.post(f"{config.OCR_SERVICE_URL}/ocr/upload", files=upload)
            if resp.status_code == 200:
                return resp.json().get('content', '')
    except Exception as exc:
        logger.error(f"OCR service error: {exc}")
    # Reached on exception or any non-200 response.
    raise HTTPException(status_code=500, detail="OCR processing failed")
| 498 |
+
|
| 499 |
+
async def get_text_from_url(url: str) -> str:
    """Fetch a URL through the OCR microservice and return extracted text.

    Raises HTTPException(500) when the service is unreachable, errors out,
    or answers with a non-200 status.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            payload = {"url": str(url), "extract_images": True}
            resp = await client.post(f"{config.OCR_SERVICE_URL}/ocr/url", json=payload)
            if resp.status_code == 200:
                return resp.json().get('content', '')
    except Exception as exc:
        logger.error(f"URL processing error: {exc}")
    # Reached on exception or any non-200 response.
    raise HTTPException(status_code=500, detail="URL processing failed")
| 510 |
+
|
| 511 |
+
# Enhanced NER and Relationship Analysis
|
| 512 |
+
async def analyze_with_deepseek(text: str, language: str = None) -> Dict[str, Any]:
    """Enhanced analysis with improved relationship extraction.

    Sends the text to the DeepSeek chat model with a language-specific prompt
    (Thai or default/English) and parses the JSON reply. Falls back to the
    rule-based manual extractor whenever the client is unconfigured, the
    reply cannot be parsed as JSON, or the call raises.

    Args:
        text: Source text; only the first 8000 characters are sent to the model.
        language: Optional language code ("th" selects the Thai prompt);
            auto-detected when omitted.

    Returns:
        Dict with keys "entities", "keywords" (max 20), "relationships"
        and "summary".
    """
    deepseek_client = get_deepseek_client()
    if not deepseek_client:
        logger.warning("DeepSeek not configured, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)

    try:
        if not language:
            language = detect_language(text)

        if language == "th":
            system_prompt = """ΰΈΰΈΈΰΈΰΉΰΈΰΉΰΈΰΈΰΈΉΰΉΰΉΰΈΰΈ΅ΰΉΰΈ’ΰΈ§ΰΈΰΈ²ΰΈΰΉΰΈΰΈΰΈ²ΰΈ£ΰΈΰΈΰΈΰΈ³ΰΈΰΈ²ΰΈ‘ΰΉΰΈΰΈΰΈ₯ΰΈ±ΰΈΰΈ©ΰΈΰΉΰΉΰΈ₯ΰΈ°ΰΈΰΈ²ΰΈ£ΰΈͺΰΈΰΈ±ΰΈΰΈΰΈ§ΰΈ²ΰΈ‘ΰΈͺΰΈ±ΰΈ‘ΰΈΰΈ±ΰΈΰΈΰΉΰΈͺΰΈ³ΰΈ«ΰΈ£ΰΈ±ΰΈΰΈ ΰΈ²ΰΈ©ΰΈ²ΰΉΰΈΰΈ’

ΰΈ§ΰΈ΄ΰΉΰΈΰΈ£ΰΈ²ΰΈ°ΰΈ«ΰΉΰΈΰΉΰΈΰΈΰΈ§ΰΈ²ΰΈ‘ΰΉΰΈ₯ΰΈ°ΰΈͺΰΈΰΈ±ΰΈΰΈΰΉΰΈΰΈ‘ΰΈΉΰΈ₯ΰΈΰΈ±ΰΈΰΈΰΈ΅ΰΉ:
1. ΰΈΰΈ²ΰΈ‘ΰΉΰΈΰΈΰΈ₯ΰΈ±ΰΈΰΈ©ΰΈΰΉΰΈΰΈΈΰΈΰΈΰΈ£ΰΈ°ΰΉΰΈ ΰΈ (ΰΈΰΈΈΰΈΰΈΰΈ₯ ΰΈΰΈΰΈΰΉΰΈΰΈ£ ΰΈͺΰΈΰΈ²ΰΈΰΈΰΈ΅ΰΉ ΰΈ§ΰΈ±ΰΈΰΈΰΈ΅ΰΉ ΰΉΰΈ§ΰΈ₯ΰΈ² ΰΉΰΈΰΈ΄ΰΈ ΰΈ―ΰΈ₯ΰΈ―)
2. ΰΈΰΈ§ΰΈ²ΰΈ‘ΰΈͺΰΈ±ΰΈ‘ΰΈΰΈ±ΰΈΰΈΰΉΰΈ£ΰΈ°ΰΈ«ΰΈ§ΰΉΰΈ²ΰΈΰΈΰΈ²ΰΈ‘ΰΉΰΈΰΈΰΈ₯ΰΈ±ΰΈΰΈ©ΰΈΰΉ - ΰΈΰΉΰΈΰΈΰΈͺΰΈΰΈ±ΰΈΰΈΰΈΈΰΈΰΈΰΈ§ΰΈ²ΰΈ‘ΰΈͺΰΈ±ΰΈ‘ΰΈΰΈ±ΰΈΰΈΰΉΰΈΰΈ΅ΰΉΰΈΰΈ
3. ΰΈΰΈ³ΰΈ«ΰΈ₯ΰΈ±ΰΈΰΈͺΰΈ³ΰΈΰΈ±ΰΈΰΈΰΈ²ΰΈΰΈΰΉΰΈΰΈΰΈ§ΰΈ²ΰΈ‘
4. ΰΈͺΰΈ£ΰΈΈΰΈΰΈΰΈ΅ΰΉΰΈΰΈ£ΰΈΰΈΰΈΰΈ₯ΰΈΈΰΈ‘

ΰΉΰΈ«ΰΉΰΈΰΈ₯ΰΈ₯ΰΈ±ΰΈΰΈΰΉΰΉΰΈΰΉΰΈ JSON:
{
"entities": [{"text": "ΰΈΰΉΰΈΰΈΰΈ§ΰΈ²ΰΈ‘", "label": "ΰΈΰΈ£ΰΈ°ΰΉΰΈ ΰΈ", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
"keywords": ["ΰΈΰΈ³ΰΈ«ΰΈ₯ΰΈ±ΰΈ1", "ΰΈΰΈ³ΰΈ«ΰΈ₯ΰΈ±ΰΈ2"],
"relationships": [{"source_entity": "A", "target_entity": "B", "relationship_type": "ΰΈΰΈ£ΰΈ°ΰΉΰΈ ΰΈ", "confidence": 0.9, "context": "ΰΈΰΈ£ΰΈ΄ΰΈΰΈ"}],
"summary": "ΰΈͺΰΈ£ΰΈΈΰΈ"
}"""
        else:
            system_prompt = """You are an expert in Named Entity Recognition and relationship extraction.

Analyze the text and extract:
1. All named entities (people, organizations, locations, dates, money, etc.)
2. ALL relationships between entities - extract every relationship found
3. Important keywords from the text
4. Comprehensive summary

Return ONLY valid JSON:
{
"entities": [{"text": "entity text", "label": "TYPE", "confidence": 0.95, "start_pos": 0, "end_pos": 10}],
"keywords": ["keyword1", "keyword2"],
"relationships": [{"source_entity": "Entity A", "target_entity": "Entity B", "relationship_type": "relationship_type", "confidence": 0.9, "context": "context"}],
"summary": "Comprehensive summary"
}"""

        user_prompt = f"ΰΈ§ΰΈ΄ΰΉΰΈΰΈ£ΰΈ²ΰΈ°ΰΈ«ΰΉΰΈΰΉΰΈΰΈΰΈ§ΰΈ²ΰΈ‘ΰΈΰΈ΅ΰΉ:\n\n{text[:8000]}" if language == "th" else f"Analyze this text:\n\n{text[:8000]}"

        response = deepseek_client.complete(
            messages=[
                SystemMessage(content=system_prompt),
                UserMessage(content=user_prompt)
            ],
            max_tokens=6000,
            model=config.DEEPSEEK_MODEL,
            temperature=0.1
        )

        result_text = response.choices[0].message.content.strip()

        # Pull the outermost {...} span out of the reply; models often wrap
        # the JSON in prose or markdown fences.
        start_idx = result_text.find('{')
        end_idx = result_text.rfind('}') + 1
        if start_idx != -1 and end_idx > start_idx:
            json_text = result_text[start_idx:end_idx]
            try:
                json_result = json.loads(json_text)
                logger.info("✅ Successfully parsed JSON from DeepSeek")
            except json.JSONDecodeError:
                # Narrowed from a bare `except:` — only parse failures should
                # trigger the repair path, not KeyboardInterrupt/SystemExit.
                try:
                    # Repair the most common model mistakes (single quotes,
                    # Python booleans) and retry once.
                    fixed_json = json_text.replace("'", '"').replace('True', 'true').replace('False', 'false')
                    json_result = json.loads(fixed_json)
                    logger.info("✅ Successfully parsed fixed JSON")
                except json.JSONDecodeError:
                    json_result = None
        else:
            json_result = None

        if json_result:
            entities = deduplicate_entities(json_result.get('entities', []))
            keywords = json_result.get('keywords', [])
            relationships = json_result.get('relationships', [])
            summary = json_result.get('summary', '')

            # Ensure relationships exist even when the model omits them.
            if len(relationships) == 0 and len(entities) >= 2:
                logger.warning("No relationships found by DeepSeek, applying rule-based extraction")
                rule_based_relationships = extract_rule_based_relationships(entities, text, language)
                relationships.extend(rule_based_relationships)

            # Backfill fields downstream consumers expect on every relationship.
            for rel in relationships:
                if 'id' not in rel:
                    rel['id'] = generate_unique_id('rel')
                if 'strength' not in rel:
                    rel['strength'] = rel.get('confidence', 0.8)
                if 'evidence_count' not in rel:
                    rel['evidence_count'] = 1
                if 'bidirectional' not in rel:
                    rel['bidirectional'] = False

            return {
                "entities": entities,
                "keywords": keywords[:20],
                "relationships": relationships,
                "summary": summary or f"Analysis of {len(text)} characters"
            }

        logger.warning("JSON parsing failed, using manual extraction")
        return extract_manual_entities_and_relationships(text, language)

    except Exception as e:
        logger.error(f"DeepSeek analysis error: {e}")
        return extract_manual_entities_and_relationships(text, language)
|
| 624 |
+
|
| 625 |
+
def extract_rule_based_relationships(entities: List[Dict], text: str, language: str) -> List[Dict]:
    """Extract relationships using rule-based approach.

    Scans the raw text with language-specific regex patterns and maps each
    matched subject/object span back onto a known entity via fuzzy matching.
    Returns a list of relationship dicts (confidence fixed at 0.7, tagged
    with extraction_method 'rule_based'); empty when fewer than 2 entities.
    """
    found = []

    if len(entities) < 2:
        return found

    # (regex with two capture groups, relationship label) per language.
    if language == "th":
        pattern_table = [
            (r'(.+?)\s*ΰΈΰΈ³ΰΈΰΈ²ΰΈ(?:ΰΈΰΈ΅ΰΉ|ΰΉΰΈ|ΰΈΰΈ±ΰΈ)\s*(.+)', 'ΰΈΰΈ³ΰΈΰΈ²ΰΈΰΈΰΈ΅ΰΉ'),
            (r'(.+?)\s*ΰΉΰΈΰΉΰΈ(?:ΰΉΰΈΰΉΰΈ²ΰΈΰΈΰΈ|ΰΈΰΈΰΈ)\s*(.+)', 'ΰΉΰΈΰΉΰΈΰΉΰΈΰΉΰΈ²ΰΈΰΈΰΈ'),
            (r'(.+?)\s*ΰΈΰΈ±ΰΉΰΈΰΈΰΈ’ΰΈΉΰΉ(?:ΰΈΰΈ΅ΰΉ|ΰΉΰΈ)\s*(.+)', 'ΰΈΰΈ±ΰΉΰΈΰΈΰΈ’ΰΈΉΰΉΰΈΰΈ΅ΰΉ'),
            (r'(.+?)\s*(?:ΰΈΰΈ±ΰΈΰΈΰΈΈΰΈ‘|ΰΈΰΈ±ΰΈ)\s*(.+)', 'ΰΈΰΈ±ΰΈΰΈΰΈΈΰΈ‘ΰΉΰΈΰΈ’'),
        ]
    else:
        pattern_table = [
            (r'(.+?)\s*(?:works?\s+(?:for|at|in)|employed\s+by)\s*(.+)', 'works_for'),
            (r'(.+?)\s*(?:owns?|possesses?)\s*(.+)', 'owns'),
            (r'(.+?)\s*(?:located\s+(?:in|at)|based\s+in)\s*(.+)', 'located_in'),
            (r'(.+?)\s*(?:arrested\s+by|detained\s+by)\s*(.+)', 'arrested_by'),
        ]

    for pattern, rel_label in pattern_table:
        for hit in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE):
            subject = find_best_entity_match(hit.group(1).strip(), entities)
            obj = find_best_entity_match(hit.group(2).strip(), entities)

            # Both spans must resolve to distinct known entities.
            if subject and obj and subject != obj:
                found.append({
                    'id': generate_unique_id('rel'),
                    'source_entity': subject['text'],
                    'target_entity': obj['text'],
                    'relationship_type': rel_label,
                    'confidence': 0.7,
                    'strength': 0.7,
                    'context': hit.group(0),
                    'evidence_count': 1,
                    'bidirectional': False,
                    'metadata': {'extraction_method': 'rule_based'}
                })

    return found
|
| 672 |
+
|
| 673 |
+
def find_best_entity_match(text: str, entities: List[Dict]) -> Optional[Dict]:
    """Find the best matching entity for the given text, or None.

    An exact match after normalization wins outright; otherwise the entity
    with the highest similarity score strictly above 0.6 is returned.
    """
    normalized = normalize_text(text)

    # Fast path: exact normalized match.
    for candidate in entities:
        if normalize_text(candidate['text']) == normalized:
            return candidate

    # Fuzzy path: keep the highest-scoring candidate above the threshold.
    winner = None
    top_score = 0
    for candidate in entities:
        similarity = calculate_text_similarity(text, candidate['text'])
        if similarity > top_score and similarity > 0.6:
            top_score = similarity
            winner = candidate

    return winner
|
| 691 |
+
|
| 692 |
+
def extract_manual_entities_and_relationships(text: str, language: str = None) -> Dict[str, Any]:
    """Enhanced manual extraction with relationship detection.

    Regex-only fallback used when the DeepSeek model is unavailable or its
    output cannot be parsed. Entity patterns and stop-word lists are chosen
    per language ("th" vs. default/English). Returns a dict with "entities",
    "keywords" (max 20, deduplicated), "relationships" and "summary".
    """
    if not language:
        language = detect_language(text)

    entities = []
    keywords = []

    # Enhanced patterns for different languages
    if language == "th":
        # Thai patterns: honorific-prefixed names, company/police-station
        # names, provinces/Bangkok, baht amounts, slash dates.
        patterns = {
            'PERSON': [r'(?:ΰΈΰΈΈΰΈ|ΰΈΰΈ²ΰΈ’|ΰΈΰΈ²ΰΈ|ΰΈΰΈ²ΰΈΰΈͺΰΈ²ΰΈ§|ΰΈΰΈ£\.?)\s*[ΰΈ-ΰΉ\w\s]+'],
            'ORGANIZATION': [r'ΰΈΰΈ£ΰΈ΄ΰΈ©ΰΈ±ΰΈ\s+[ΰΈ-ΰΉ\w\s]+(?:ΰΈΰΈ³ΰΈΰΈ±ΰΈ|ΰΈ‘ΰΈ«ΰΈ²ΰΈΰΈ)', r'ΰΈͺΰΈΰΈ²ΰΈΰΈ΅ΰΈΰΈ³ΰΈ£ΰΈ§ΰΈ[ΰΈ-ΰΉ\w\s]+'],
            'LOCATION': [r'ΰΈΰΈ±ΰΈΰΈ«ΰΈ§ΰΈ±ΰΈ[ΰΈ-ΰΉ\w\s]+', r'ΰΈΰΈ£ΰΈΈΰΈΰΉΰΈΰΈΰΈ‘ΰΈ«ΰΈ²ΰΈΰΈΰΈ£|ΰΈΰΈ£ΰΈΈΰΈΰΉΰΈΰΈΰΈ―?'],
            'MONEY': [r'\d+(?:,\d{3})*\s*(?:ΰΈΰΈ²ΰΈ|ΰΈ₯ΰΉΰΈ²ΰΈΰΈΰΈ²ΰΈ|ΰΈΰΈ±ΰΈΰΈΰΈ²ΰΈ)'],
            'DATE': [r'\d{1,2}\/\d{1,2}\/\d{4}'],
        }
        # Keywords: every Thai-script run of length > 2, minus stop words.
        words = re.findall(r'[ΰΈ-ΰΉ]+', text)
        thai_stop_words = {'ΰΉΰΈ₯ΰΈ°', 'ΰΈ«ΰΈ£ΰΈ·ΰΈ', 'ΰΉΰΈΰΉ', 'ΰΉΰΈ', 'ΰΈΰΈ΅ΰΉ', 'ΰΉΰΈΰΈ·ΰΉΰΈ', 'ΰΈΰΈ±ΰΈ', 'ΰΈΰΈ²ΰΈ', 'ΰΉΰΈΰΈ’', 'ΰΈΰΈΰΈ'}
        keywords = [word for word in words if word not in thai_stop_words and len(word) > 2]
    else:
        patterns = {
            'PERSON': [r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*'],
            'ORGANIZATION': [r'\b[A-Z][a-zA-Z]+\s+(?:Inc|Corp|Company|Ltd|Co|LLC|Corporation|Limited|University)\b'],
            'LOCATION': [r'\b(?:New York|Los Angeles|Chicago|Bangkok|London|Paris|Berlin)\b'],
            'MONEY': [r'\$[\d,]+\.?\d*', r'\b\d+(?:,\d{3})*\s*(?:dollars?|USD|million|billion)\b'],
            'DATE': [r'\b\d{1,2}\/\d{1,2}\/\d{4}\b'],
        }
        # Keywords: alphabetic words of length >= 3, lowercased, minus stop words.
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
        english_stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        keywords = [word.lower() for word in words if word.lower() not in english_stop_words]

    # Extract entities: every regex hit longer than one character becomes a
    # candidate with a flat 0.8 confidence / 0.7 importance.
    for label, pattern_list in patterns.items():
        for pattern in pattern_list:
            for match in re.finditer(pattern, text, re.UNICODE | re.IGNORECASE):
                entity_text = match.group().strip()
                if len(entity_text) > 1:
                    entities.append({
                        "id": generate_unique_id('ent'),
                        "text": entity_text,
                        "label": label,
                        "confidence": 0.8,
                        "start_pos": match.start(),
                        "end_pos": match.end(),
                        "frequency": 1,
                        "importance_score": 0.7,
                        "metadata": {"source": "manual_extraction"}
                    })

    # Deduplicate
    entities = deduplicate_entities(entities)
    # NOTE(review): set() loses the original keyword order before truncation
    # to 20 — presumably acceptable here; confirm if ranking matters.
    keywords = list(set(keywords))[:20]

    # Extract relationships
    relationships = []
    if len(entities) >= 2:
        relationships = extract_rule_based_relationships(entities, text, language)

    summary = f"Analysis of {len(text)} characters found {len(entities)} entities and {len(relationships)} relationships"

    return {
        "entities": entities,
        "keywords": keywords,
        "relationships": relationships,
        "summary": summary
    }
|
| 759 |
+
|
| 760 |
+
async def generate_embeddings(text: str) -> List[float]:
    """Return a 1536-dimension embedding for the text (first 8000 chars).

    Returns an empty list — never raises — when the OpenAI client is not
    configured or the API call fails.
    """
    client = get_openai_client()
    if not client:
        return []

    try:
        result = client.embeddings.create(
            input=[text[:8000]],
            model=config.EMBEDDING_MODEL,
            dimensions=1536,
        )
        return result.data[0].embedding
    except Exception as exc:
        logger.error(f"Embedding failed: {exc}")
        return []
|
| 775 |
+
|
| 776 |
+
def create_enhanced_graph_data(entities: List[Dict], relationships: List[Dict]) -> GraphData:
    """Create enhanced graph data with comprehensive ER model.

    Builds NodeResult/LinkResult objects from raw entity and relationship
    dicts and attaches aggregate metadata (type distributions, density,
    average confidences). Relationships whose source or target text does not
    map to a known entity are silently dropped.
    """
    nodes = []
    links = []
    # Maps entity text -> node id so links can reference nodes by id.
    entity_map = {}

    # Create nodes
    for entity in entities:
        node_id = entity.get('id', generate_unique_id('ent'))
        entity_map[entity['text']] = node_id

        node_properties = {
            "original_text": entity['text'],
            "entity_type": entity['label'],
            "confidence": entity.get('confidence', 0.0),
            "start_position": entity.get('start_pos', 0),
            "end_position": entity.get('end_pos', 0),
            "frequency": entity.get('frequency', 1),
            "importance_score": entity.get('importance_score', 0.0),
            "metadata": entity.get('metadata', {})
        }

        nodes.append(NodeResult(
            id=node_id,
            label=entity['text'],
            type=entity['label'],
            confidence=entity.get('confidence', 0.0),
            frequency=entity.get('frequency', 1),
            importance_score=entity.get('importance_score', 0.0),
            properties=node_properties
        ))

    # Create links (only between entities that produced nodes above)
    for rel in relationships:
        source_id = entity_map.get(rel['source_entity'])
        target_id = entity_map.get(rel['target_entity'])

        if source_id and target_id:
            link_id = rel.get('id', generate_unique_id('link'))

            link_properties = {
                "relationship_type": rel['relationship_type'],
                "confidence": rel.get('confidence', 0.0),
                # strength falls back to confidence when not provided
                "strength": rel.get('strength', rel.get('confidence', 0.0)),
                "context": rel.get('context', ''),
                "evidence_count": rel.get('evidence_count', 1),
                "bidirectional": rel.get('bidirectional', False),
                "metadata": rel.get('metadata', {})
            }

            links.append(LinkResult(
                id=link_id,
                source=source_id,
                target=target_id,
                relationship=rel['relationship_type'],
                confidence=rel.get('confidence', 0.0),
                strength=rel.get('strength', rel.get('confidence', 0.0)),
                evidence_count=rel.get('evidence_count', 1),
                properties=link_properties
            ))

    # Calculate metadata
    entity_types = defaultdict(int)
    relationship_types = defaultdict(int)

    for entity in entities:
        entity_types[entity['label']] += 1

    for rel in relationships:
        relationship_types[rel['relationship_type']] += 1

    # NOTE: density counts ALL input relationships, including ones dropped
    # above for missing endpoints, against undirected max edges n*(n-1)/2.
    metadata = {
        "total_entities": len(entities),
        "total_relationships": len(relationships),
        "entity_type_distribution": dict(entity_types),
        "relationship_type_distribution": dict(relationship_types),
        "graph_density": len(relationships) / (len(entities) * (len(entities) - 1) / 2) if len(entities) > 1 else 0,
        "average_entity_confidence": sum(entity.get('confidence', 0) for entity in entities) / len(entities) if entities else 0,
        "average_relationship_confidence": sum(rel.get('confidence', 0) for rel in relationships) / len(relationships) if relationships else 0,
        "unique_entity_types": len(entity_types),
        "unique_relationship_types": len(relationship_types)
    }

    return GraphData(
        nodes=nodes,
        links=links,
        metadata=metadata
    )
|
| 864 |
+
|
| 865 |
+
# Export Functions (simplified)
|
| 866 |
+
async def generate_export_files(analysis_id: str, entities: List[Dict], relationships: List[Dict],
                                graph_data: GraphData, formats: List[str]) -> ExportFiles:
    """Generate export files for the requested formats.

    Best-effort: any exporter failure is logged and leaves the corresponding
    path attributes unset instead of aborting the analysis.
    """
    result = ExportFiles()
    target_dir = EXPORT_DIR / analysis_id
    target_dir.mkdir(exist_ok=True)

    try:
        if "neo4j" in formats:
            nodes_path, rels_path = await generate_neo4j_csv(target_dir, entities, relationships)
            result.neo4j_nodes = str(nodes_path)
            result.neo4j_relationships = str(rels_path)

        if "json" in formats:
            json_path = await generate_json_export(target_dir, entities, relationships, graph_data)
            result.json_export = str(json_path)

        if "graphml" in formats:
            graphml_path = await generate_graphml_export(target_dir, entities, relationships)
            result.graphml_export = str(graphml_path)

        logger.info(f"✅ Generated export files for analysis {analysis_id}")

    except Exception as e:
        logger.error(f"❌ Export file generation failed: {e}")

    return result
|
| 894 |
+
|
| 895 |
+
async def generate_neo4j_csv(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Tuple[Path, Path]:
    """Generate Neo4j compatible CSV files (nodes + relationships).

    Bug fix: node IDs are now resolved exactly once, up front, and reused in
    both files. Previously an entity without a pre-assigned 'id' received one
    freshly generated ID in the nodes file and a *different* one in the
    relationships entity map, producing dangling :START_ID/:END_ID references
    that neo4j-admin import rejects.

    Returns:
        (nodes_file, relationships_file) paths inside export_dir.
    """
    # Resolve every entity's ID exactly once; reuse in both CSVs.
    entity_map = {}
    for entity in entities:
        entity_map[entity['text']] = entity.get('id') or generate_unique_id('ent')

    nodes_file = export_dir / "neo4j_nodes.csv"
    with open(nodes_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'nodeId:ID', 'text', 'label:LABEL', 'confidence:float',
            'frequency:int', 'importance:float'
        ])

        for entity in entities:
            writer.writerow([
                entity_map[entity['text']],
                entity['text'],
                entity['label'],
                entity.get('confidence', 0.0),
                entity.get('frequency', 1),
                entity.get('importance_score', 0.0)
            ])

    rels_file = export_dir / "neo4j_relationships.csv"
    with open(rels_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            ':START_ID', ':END_ID', ':TYPE', 'confidence:float',
            'strength:float', 'context'
        ])

        for rel in relationships:
            source_id = entity_map.get(rel['source_entity'])
            target_id = entity_map.get(rel['target_entity'])

            # Skip relationships whose endpoints are not known entities.
            if source_id and target_id:
                writer.writerow([
                    source_id,
                    target_id,
                    # Neo4j relationship types are conventionally UPPER_SNAKE.
                    rel['relationship_type'].upper().replace(' ', '_'),
                    rel.get('confidence', 0.0),
                    rel.get('strength', rel.get('confidence', 0.0)),
                    rel.get('context', '')
                ])

    return nodes_file, rels_file
|
| 941 |
+
|
| 942 |
+
async def generate_json_export(export_dir: Path, entities: List[Dict], relationships: List[Dict], graph_data: GraphData) -> Path:
    """Write a single comprehensive JSON export file and return its path."""
    out_path = export_dir / "analysis_export.json"

    average_confidence = (
        sum(e.get('confidence', 0) for e in entities) / len(entities)
        if entities else 0
    )

    payload = {
        "metadata": {
            "export_timestamp": datetime.utcnow().isoformat(),
            "format_version": "1.0",
            "total_entities": len(entities),
            "total_relationships": len(relationships),
        },
        "entities": entities,
        "relationships": relationships,
        "graph_data": graph_data.dict(),
        "statistics": {
            "entity_types": list(set(e['label'] for e in entities)),
            "relationship_types": list(set(r['relationship_type'] for r in relationships)),
            "average_confidence": average_confidence,
        },
    }

    with open(out_path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    return out_path
|
| 968 |
+
|
| 969 |
+
async def generate_graphml_export(export_dir: Path, entities: List[Dict], relationships: List[Dict]) -> Path:
    """Write the entity/relationship graph in GraphML format; return the path."""
    out_path = export_dir / "graph_export.graphml"

    root = ET.Element('graphml')
    root.set('xmlns', 'http://graphml.graphdrawing.org/xmlns')

    # GraphML requires attribute keys to be declared before the graph element.
    ET.SubElement(root, 'key', id='label', **{'for': 'node', 'attr.name': 'label', 'attr.type': 'string'})
    ET.SubElement(root, 'key', id='type', **{'for': 'node', 'attr.name': 'type', 'attr.type': 'string'})
    ET.SubElement(root, 'key', id='rel_type', **{'for': 'edge', 'attr.name': 'relationship', 'attr.type': 'string'})

    graph = ET.SubElement(root, 'graph', id='G', edgedefault='directed')

    # Nodes — remember each entity's id so edges can reference it.
    id_by_text = {}
    for entity in entities:
        node_id = entity.get('id', generate_unique_id('ent'))
        id_by_text[entity['text']] = node_id

        node = ET.SubElement(graph, 'node', id=node_id)
        ET.SubElement(node, 'data', key='label').text = entity['text']
        ET.SubElement(node, 'data', key='type').text = entity['label']

    # Edges — only between resolvable endpoints.
    for index, rel in enumerate(relationships):
        src = id_by_text.get(rel['source_entity'])
        dst = id_by_text.get(rel['target_entity'])

        if src and dst:
            edge = ET.SubElement(graph, 'edge', id=f"e{index}", source=src, target=dst)
            ET.SubElement(edge, 'data', key='rel_type').text = rel['relationship_type']

    ET.ElementTree(root).write(out_path, encoding='utf-8', xml_declaration=True)

    return out_path
|
| 1015 |
+
|
| 1016 |
+
def calculate_er_stats(entities: List[Dict], relationships: List[Dict]) -> Dict[str, Any]:
    """Calculate Entity-Relationship statistics.

    Returns an empty dict when there are no entities; otherwise totals,
    per-type distributions, undirected graph density and unique type counts.
    """
    if not entities:
        return {}

    entity_counts = defaultdict(int)
    for ent in entities:
        entity_counts[ent['label']] += 1

    relation_counts = defaultdict(int)
    for rel in relationships:
        relation_counts[rel['relationship_type']] += 1

    node_count = len(entities)
    # Density relative to the undirected complete graph on node_count nodes.
    density = (
        len(relationships) / (node_count * (node_count - 1) / 2)
        if node_count > 1 else 0
    )

    return {
        "total_entities": node_count,
        "total_relationships": len(relationships),
        "entity_type_distribution": dict(entity_counts),
        "relationship_type_distribution": dict(relation_counts),
        "graph_density": density,
        "unique_entity_types": len(entity_counts),
        "unique_relationship_types": len(relation_counts),
    }
|
| 1040 |
+
|
| 1041 |
+
async def save_to_database(data: Dict[str, Any]) -> bool:
    """Upsert one analysis record into the ner_analyses table.

    List/dict fields are serialized to JSON text before insertion; on a
    duplicate analysis_id only entities, relationships and summary are
    refreshed. Returns True on success, False on any failure (never raises).
    """
    if not pg_pool:
        logger.error("No database pool available")
        return False

    try:
        async with pg_pool.acquire() as conn:
            await conn.execute("""
                INSERT INTO ner_analyses (
                    analysis_id, source_text, source_type, language, entities, keywords,
                    relationships, summary, embeddings, graph_data, export_files, text_stats,
                    er_stats, processing_time, entity_types, relationship_types
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
                ON CONFLICT (analysis_id) DO UPDATE SET
                    entities = EXCLUDED.entities,
                    relationships = EXCLUDED.relationships,
                    summary = EXCLUDED.summary
            """,
            data['analysis_id'],
            data['source_text'][:10000],  # truncated to bound row size
            data['source_type'],
            data['language'],
            json.dumps(data['entities'], ensure_ascii=False),
            json.dumps(data['keywords'], ensure_ascii=False),
            json.dumps(data['relationships'], ensure_ascii=False),
            data['summary'],
            json.dumps(data.get('embeddings', [])),
            # default=str so non-JSON-native values (datetimes, Paths) serialize
            json.dumps(data.get('graph_data', {}), ensure_ascii=False, default=str),
            json.dumps(data.get('export_files', {}), ensure_ascii=False, default=str),
            json.dumps(data.get('text_stats', {})),
            json.dumps(data.get('er_stats', {})),
            float(data.get('processing_time', 0)),
            # Distinct labels / relationship types, stored as JSON arrays
            json.dumps(list(set(entity.get('label', '') for entity in data.get('entities', [])))),
            json.dumps(list(set(rel.get('relationship_type', '') for rel in data.get('relationships', []))))
            )

        logger.info(f"✅ Analysis {data['analysis_id']} saved to database")
        return True
    except Exception as e:
        logger.error(f"❌ DB save failed for {data.get('analysis_id', 'unknown')}: {e}")
        return False
|
| 1082 |
+
|
| 1083 |
+
async def save_to_blob(analysis_id: str, data: Dict[str, Any]) -> bool:
    """Archive the full analysis payload to Azure Blob Storage.

    Returns False — never raises — when the blob client is unavailable or
    the upload fails.
    """
    service_client = get_blob_client()
    if not service_client:
        return False

    try:
        stamp = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
        blob_name = f"ner_analysis/{analysis_id}_{stamp}.json"
        target = service_client.get_blob_client(container=config.BLOB_CONTAINER, blob=blob_name)
        target.upload_blob(json.dumps(data, indent=2, ensure_ascii=False, default=str), overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Blob save failed: {e}")
        return False
|
| 1096 |
+
|
| 1097 |
+
# App Lifecycle
|
| 1098 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: initialize dependencies at startup, tear down on exit.

    Startup aborts the whole service (RuntimeError) if the database cannot be
    initialized; API client construction is fire-and-forget so that
    misconfiguration is surfaced in logs at startup rather than on first request.
    """
    logger.info("🚀 Starting Enhanced NER Analysis Service...")

    logger.info("📊 Database initialization...")
    db_ok = await init_database()
    if not db_ok:
        logger.error("❌ Database initialization failed!")
        raise RuntimeError("Database initialization failed")

    # Warm the lazily-created singleton clients.
    logger.info("🔌 Initializing API clients...")
    get_deepseek_client()
    get_openai_client()
    get_blob_client()

    logger.info("📁 Creating export directories...")
    EXPORT_DIR.mkdir(exist_ok=True)

    logger.info("🎉 Enhanced NER Analysis Service is ready!")
    logger.info(f"📡 Server running on http://{config.HOST}:{config.PORT}")

    yield  # application serves requests here

    logger.info("🛑 Shutting down...")
    if pg_pool:
        await pg_pool.close()
        logger.info("✅ Database connections closed")
|
| 1125 |
+
|
| 1126 |
+
# FastAPI App
|
| 1127 |
+
# FastAPI App — module-level application object used by all route decorators below.
app = FastAPI(
    title="Enhanced NER Analysis Service",
    description="Advanced Named Entity Recognition with relationship extraction and graph exports",
    version="2.0.0",
    lifespan=lifespan  # startup/shutdown handled by the lifespan context manager above
)

# NOTE(review): wildcard origins combined with allow_credentials=True is the
# most permissive CORS posture — confirm this is intended outside development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 1141 |
+
|
| 1142 |
+
# API Endpoints
|
| 1143 |
+
@app.get("/")
async def root():
    """Landing endpoint: service identity, capabilities and feature availability."""
    # Feature availability is derived purely from configuration presence;
    # no live connectivity checks are performed here.
    has_deepseek = bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY)
    has_openai = bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY)
    has_blob = bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN)

    return {
        "message": "Enhanced NER Analysis Service",
        "version": "2.0.0",
        "status": "operational",
        "supported_entities": config.ENTITY_TYPES,
        "supported_relationships": config.RELATIONSHIP_TYPES[:10],
        "export_formats": ["neo4j", "json", "graphml"],
        "features": {
            "ner_analysis": True,
            "relationship_extraction": True,
            "thai_language_support": True,
            "graph_database_export": True,
            "embedding_generation": has_openai,
            "deepseek_analysis": has_deepseek,
            "blob_storage": has_blob,
        },
    }
|
| 1166 |
+
|
| 1167 |
+
@app.get("/health")
async def health():
    """Health probe: reports configuration/connectivity status per dependency.

    Availability flags are derived from configuration presence only; the
    database flag reflects whether the pool was created during startup.
    """
    deepseek_available = bool(config.DEEPSEEK_ENDPOINT and config.DEEPSEEK_API_KEY)
    openai_available = bool(config.AZURE_OPENAI_ENDPOINT and config.AZURE_OPENAI_API_KEY)
    blob_available = bool(config.AZURE_STORAGE_ACCOUNT_URL and config.AZURE_BLOB_SAS_TOKEN)

    return {
        "status": "healthy",
        "service": "NER Analysis Service",
        "version": "2.0.0",
        "database": pg_pool is not None,  # set by the lifespan startup path
        "vector_extension": vector_available,
        "deepseek": deepseek_available,
        "openai": openai_available,
        "blob_storage": blob_available,
        "supported_entity_count": len(config.ENTITY_TYPES),
        "supported_relationship_count": len(config.RELATIONSHIP_TYPES),
        "export_formats": ["neo4j", "json", "graphml"]
    }
|
| 1186 |
+
|
| 1187 |
+
@app.post("/analyze/text", response_model=NERResponse)
async def analyze_text(request: NERRequest, background_tasks: BackgroundTasks):
    """Analyze raw text for entities and relationships.

    Pipeline: detect language -> DeepSeek entity/relationship extraction ->
    optional embeddings -> build graph + ER statistics -> optional export-file
    generation. Persistence (database + blob) happens in background tasks so
    the response is not delayed by storage latency.

    Raises:
        HTTPException 400: if the request carries no non-whitespace text.

    On any other failure, returns a NERResponse with success=False and the
    error message instead of propagating a 500.
    """
    start_time = datetime.utcnow()
    # Analysis id is a timestamp-based slug; also used as the export directory name.
    analysis_id = f"text_{int(start_time.timestamp())}"

    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    try:
        language = detect_language(request.text)
        text_stats = get_text_stats(request.text)

        # Enhanced analysis (entities / keywords / relationships / summary)
        analysis_result = await analyze_with_deepseek(request.text, language)

        # Generate embeddings if requested
        embeddings = []
        if request.include_embeddings:
            embeddings = await generate_embeddings(request.text)

        # Create enhanced graph
        graph_data = create_enhanced_graph_data(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Calculate ER statistics
        er_stats = calculate_er_stats(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Generate export files if requested
        export_files = ExportFiles()
        if request.generate_graph_files:
            export_files = await generate_export_files(
                analysis_id,
                analysis_result.get('entities', []),
                analysis_result.get('relationships', []),
                graph_data,
                request.export_formats
            )

        processing_time = (datetime.utcnow() - start_time).total_seconds()

        # Field names here mirror the NERResponse model; the dict is also the
        # payload handed to the background persistence tasks.
        response_data = {
            "analysis_id": analysis_id,
            "source_text": request.text,
            "source_type": "text_input",
            "language": language,
            "entities": analysis_result.get('entities', []),
            "keywords": analysis_result.get('keywords', []),
            "relationships": analysis_result.get('relationships', []),
            "summary": analysis_result.get('summary', ''),
            "embeddings": embeddings,
            "graph_data": graph_data,
            "export_files": export_files,
            "text_stats": text_stats,
            "er_stats": er_stats,
            "processing_time": processing_time,
            "character_count": text_stats["character_count"],
            "word_count": text_stats["word_count"],
            "sentence_count": text_stats["sentence_count"]
        }

        # Save to database in background
        background_tasks.add_task(save_to_database, response_data)
        background_tasks.add_task(save_to_blob, analysis_id, response_data)

        return NERResponse(
            success=True,
            entity_relationship_stats=er_stats,
            **response_data
        )

    except HTTPException:
        # Re-raise client errors (e.g. 400s raised above) untouched.
        raise
    except Exception as e:
        logger.error(f"Text analysis failed: {e}")
        # Degrade to a structured failure response; source_text is truncated
        # to keep the error payload small.
        return NERResponse(
            success=False,
            analysis_id=analysis_id,
            source_text=request.text[:1000],
            source_type="text_input",
            language="unknown",
            entities=[],
            keywords=[],
            relationships=[],
            summary="",
            graph_data=GraphData(nodes=[], links=[], metadata={}),
            export_files=ExportFiles(),
            processing_time=(datetime.utcnow() - start_time).total_seconds(),
            character_count=0,
            word_count=0,
            sentence_count=0,
            entity_relationship_stats={},
            error=str(e)
        )
|
| 1286 |
+
|
| 1287 |
+
@app.post("/analyze/file", response_model=NERResponse)
async def analyze_file(
    file: UploadFile = File(...),
    extract_relationships: bool = Form(True),
    include_embeddings: bool = Form(True),
    include_summary: bool = Form(True),
    generate_graph_files: bool = Form(True),
    export_formats: str = Form("neo4j,json"),
    background_tasks: BackgroundTasks = None
):
    """Analyze an uploaded file for entities and relationships.

    Text files are read directly; image/scan formats are routed through the
    OCR service. The rest of the pipeline matches /analyze/text: DeepSeek
    analysis, optional embeddings, graph construction, ER stats, optional
    export files, then background persistence.

    Args:
        export_formats: comma-separated list (e.g. "neo4j, json"); entries
            are whitespace-trimmed and empties dropped.

    Raises:
        HTTPException 400: missing filename, oversized file, unsupported
            extension, or no extractable text.
    """
    start_time = datetime.utcnow()
    analysis_id = f"file_{int(start_time.timestamp())}"

    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename")

    try:
        file_content = await file.read()
        if len(file_content) > config.MAX_FILE_SIZE:
            raise HTTPException(status_code=400, detail="File too large")

        file_ext = Path(file.filename).suffix.lower()
        # FIX: trim whitespace around each format token ("neo4j, json" used to
        # produce " json", which never matched a known format) and drop empties.
        export_format_list = [fmt.strip() for fmt in export_formats.split(',') if fmt.strip()] if export_formats else []
        if not export_format_list:
            export_format_list = ["json"]

        if file_ext in config.SUPPORTED_TEXT_FORMATS:
            text = extract_text_from_file(file_content, file.filename)
            source_type = "text_file"
        elif file_ext in config.SUPPORTED_OCR_FORMATS:
            text = await get_text_from_ocr(file_content, file.filename)
            source_type = "ocr_file"
        else:
            raise HTTPException(status_code=400, detail=f"Unsupported format: {file_ext}")

        if not text.strip():
            raise HTTPException(status_code=400, detail="No text extracted")

        language = detect_language(text)
        text_stats = get_text_stats(text)

        # Enhanced analysis (entities / keywords / relationships / summary)
        analysis_result = await analyze_with_deepseek(text, language)

        # Generate embeddings
        embeddings = []
        if include_embeddings:
            embeddings = await generate_embeddings(text)

        # Create enhanced graph
        graph_data = create_enhanced_graph_data(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Calculate ER statistics
        er_stats = calculate_er_stats(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Generate export files
        export_files = ExportFiles()
        if generate_graph_files:
            export_files = await generate_export_files(
                analysis_id,
                analysis_result.get('entities', []),
                analysis_result.get('relationships', []),
                graph_data,
                export_format_list
            )

        processing_time = (datetime.utcnow() - start_time).total_seconds()

        # Mirrors the NERResponse model; also the payload persisted in background.
        response_data = {
            "analysis_id": analysis_id,
            "source_text": text,
            "source_type": source_type,
            "language": language,
            "entities": analysis_result.get('entities', []),
            "keywords": analysis_result.get('keywords', []),
            "relationships": analysis_result.get('relationships', []),
            "summary": analysis_result.get('summary', ''),
            "embeddings": embeddings,
            "graph_data": graph_data,
            "export_files": export_files,
            "text_stats": text_stats,
            "er_stats": er_stats,
            "processing_time": processing_time,
            "character_count": text_stats["character_count"],
            "word_count": text_stats["word_count"],
            "sentence_count": text_stats["sentence_count"]
        }

        # Save in background (BackgroundTasks may be absent when called directly)
        if background_tasks:
            background_tasks.add_task(save_to_database, response_data)
            background_tasks.add_task(save_to_blob, analysis_id, response_data)

        return NERResponse(
            success=True,
            entity_relationship_stats=er_stats,
            **response_data
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"File analysis failed: {e}")
        # Structured failure response instead of a 500.
        return NERResponse(
            success=False,
            analysis_id=analysis_id,
            source_text="",
            source_type="file_input",
            language="unknown",
            entities=[],
            keywords=[],
            relationships=[],
            summary="",
            graph_data=GraphData(nodes=[], links=[], metadata={}),
            export_files=ExportFiles(),
            processing_time=(datetime.utcnow() - start_time).total_seconds(),
            character_count=0,
            word_count=0,
            sentence_count=0,
            entity_relationship_stats={},
            error=str(e)
        )
|
| 1414 |
+
|
| 1415 |
+
@app.post("/analyze/url", response_model=NERResponse)
async def analyze_url(request: NERRequest, background_tasks: BackgroundTasks):
    """Analyze the content behind a URL for entities and relationships.

    Fetches/extracts text via get_text_from_url, then runs the same pipeline
    as /analyze/text: DeepSeek analysis, optional embeddings, graph build,
    ER statistics, optional export files, and background persistence.

    Raises:
        HTTPException 400: if the request has no URL or the URL yields no text.

    On any other failure, returns a NERResponse with success=False and the
    error message instead of propagating a 500.
    """
    start_time = datetime.utcnow()
    analysis_id = f"url_{int(start_time.timestamp())}"

    if not request.url:
        raise HTTPException(status_code=400, detail="URL is required")

    try:
        text = await get_text_from_url(str(request.url))

        if not text.strip():
            raise HTTPException(status_code=400, detail="No text extracted from URL")

        language = detect_language(text)
        text_stats = get_text_stats(text)

        # Enhanced analysis (entities / keywords / relationships / summary)
        analysis_result = await analyze_with_deepseek(text, language)

        # Generate embeddings
        embeddings = []
        if request.include_embeddings:
            embeddings = await generate_embeddings(text)

        # Create enhanced graph
        graph_data = create_enhanced_graph_data(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Calculate ER statistics
        er_stats = calculate_er_stats(
            analysis_result.get('entities', []),
            analysis_result.get('relationships', [])
        )

        # Generate export files
        export_files = ExportFiles()
        if request.generate_graph_files:
            export_files = await generate_export_files(
                analysis_id,
                analysis_result.get('entities', []),
                analysis_result.get('relationships', []),
                graph_data,
                request.export_formats
            )

        processing_time = (datetime.utcnow() - start_time).total_seconds()

        # Field names mirror the NERResponse model; also the persistence payload.
        response_data = {
            "analysis_id": analysis_id,
            "source_text": text,
            "source_type": "url_content",
            "language": language,
            "entities": analysis_result.get('entities', []),
            "keywords": analysis_result.get('keywords', []),
            "relationships": analysis_result.get('relationships', []),
            "summary": analysis_result.get('summary', ''),
            "embeddings": embeddings,
            "graph_data": graph_data,
            "export_files": export_files,
            "text_stats": text_stats,
            "er_stats": er_stats,
            "processing_time": processing_time,
            "character_count": text_stats["character_count"],
            "word_count": text_stats["word_count"],
            "sentence_count": text_stats["sentence_count"]
        }

        # Save in background
        background_tasks.add_task(save_to_database, response_data)
        background_tasks.add_task(save_to_blob, analysis_id, response_data)

        return NERResponse(
            success=True,
            entity_relationship_stats=er_stats,
            **response_data
        )

    except HTTPException:
        # Re-raise client errors untouched.
        raise
    except Exception as e:
        logger.error(f"URL analysis failed: {e}")
        # Degrade to a structured failure response.
        return NERResponse(
            success=False,
            analysis_id=analysis_id,
            source_text="",
            source_type="url_content",
            language="unknown",
            entities=[],
            keywords=[],
            relationships=[],
            summary="",
            graph_data=GraphData(nodes=[], links=[], metadata={}),
            export_files=ExportFiles(),
            processing_time=(datetime.utcnow() - start_time).total_seconds(),
            character_count=0,
            word_count=0,
            sentence_count=0,
            entity_relationship_stats={},
            error=str(e)
        )
|
| 1519 |
+
|
| 1520 |
+
@app.get("/download/{analysis_id}/{file_type}")
async def download_export_file(analysis_id: str, file_type: str):
    """Download a specific export file for a previous analysis.

    Args:
        analysis_id: id returned by an analyze endpoint (e.g. "text_1712345678").
        file_type: one of neo4j_nodes, neo4j_relationships, json, graphml.

    Raises:
        HTTPException 400: invalid/unsafe analysis id or unknown file type.
        HTTPException 404: analysis directory or requested file missing.
        HTTPException 500: unexpected filesystem failure.
    """
    try:
        # FIX: reject path-traversal attempts. Generated ids are
        # "<source>_<timestamp>" and never contain separators or "..",
        # so anything containing them is hostile or malformed.
        if ".." in analysis_id or "/" in analysis_id or "\\" in analysis_id:
            raise HTTPException(status_code=400, detail="Invalid analysis id")

        analysis_dir = EXPORT_DIR / analysis_id

        if not analysis_dir.exists():
            raise HTTPException(status_code=404, detail=f"Analysis {analysis_id} not found")

        # Logical file types -> on-disk names inside the analysis directory.
        file_mapping = {
            "neo4j_nodes": "neo4j_nodes.csv",
            "neo4j_relationships": "neo4j_relationships.csv",
            "json": "analysis_export.json",
            "graphml": "graph_export.graphml"
        }

        if file_type not in file_mapping:
            raise HTTPException(status_code=400, detail=f"Invalid file type: {file_type}")

        file_path = analysis_dir / file_mapping[file_type]

        if not file_path.exists():
            raise HTTPException(status_code=404, detail=f"File {file_type} not found")

        return FileResponse(path=file_path, filename=file_mapping[file_type])

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Download failed for {analysis_id}/{file_type}: {e}")
        raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
|
| 1551 |
+
|
| 1552 |
+
@app.get("/entity-types")
async def get_entity_types():
    """List every entity type this service can recognise."""
    catalogue = config.ENTITY_TYPES
    return {
        "success": True,
        "entity_types": catalogue,
        "total_count": len(catalogue),
    }
|
| 1560 |
+
|
| 1561 |
+
@app.get("/relationship-types")
async def get_relationship_types():
    """List every relationship type this service can extract."""
    catalogue = config.RELATIONSHIP_TYPES
    return {
        "success": True,
        "relationship_types": catalogue,
        "total_count": len(catalogue),
    }
|
| 1569 |
+
|
| 1570 |
+
if __name__ == "__main__":
    # Startup banner: surface the effective host/port and catalogue sizes
    # before handing control to uvicorn.
    # NOTE(review): the leading characters below look like mojibake of emoji
    # (likely from a lossy encoding round-trip) — kept verbatim; confirm
    # against the original file before "fixing".
    print("π§ Loading enhanced NER configuration...")
    print(f"π Will start server on {config.HOST}:{config.PORT}")
    print(f"π·οΈ Enhanced with {len(config.ENTITY_TYPES)} entity types")
    print(f"π Enhanced with {len(config.RELATIONSHIP_TYPES)} relationship types")

    # Run via the import string so uvicorn's auto-reload works when DEBUG is on.
    uvicorn.run(
        "ner_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )
|
service/ocr_service.py
ADDED
|
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OCR Backend API with Azure Document Intelligence - Cleaned and Optimized
|
| 4 |
+
Supports file uploads, URL processing, and web scraping fallback
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import io
|
| 9 |
+
import requests
|
| 10 |
+
import numpy as np
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Optional, List, Dict, Any
|
| 13 |
+
from urllib.parse import urlparse, urljoin
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import mimetypes
|
| 16 |
+
|
| 17 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 18 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
+
from pydantic import BaseModel, HttpUrl
|
| 20 |
+
import uvicorn
|
| 21 |
+
|
| 22 |
+
# Import unified configuration
|
| 23 |
+
# Import unified configuration; fall back to env-var driven defaults when the
# shared `configs` package is not importable (e.g. running this service alone).
try:
    from configs import get_config
    config = get_config().ocr
    print("✅ Using unified configuration")
except ImportError:
    print("⚠️ Unified config not available, using fallback configuration")
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        # Minimal stand-in mirroring the attributes the rest of this module reads.
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("OCR_PORT", "8400"))
        DEBUG = os.getenv("DEBUG", "True").lower() == "true"

        # Azure Document Intelligence configuration
        AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web scraping configuration
        MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE", "10"))
        REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
        USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        # File size limits
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB

    config = FallbackConfig()

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.core.exceptions import HttpResponseError

from bs4 import BeautifulSoup
from PIL import Image

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(
    title="OCR Backend API",
    description="OCR service with Azure Document Intelligence, supporting file uploads, URLs, and web scraping",
    version="2.0.0",
    debug=config.DEBUG
)

# CORS configuration — wide-open; tighten allow_origins for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 79 |
+
|
| 80 |
+
# Pydantic models
|
| 81 |
+
class URLRequest(BaseModel):
    # Request body for URL-based OCR: the page/file to fetch, and whether to
    # also OCR images discovered while scraping an HTML page.
    url: HttpUrl
    extract_images: bool = True
|
| 84 |
+
|
| 85 |
+
class OCRResponse(BaseModel):
    # Standardized OCR result envelope returned by the API endpoints.
    success: bool
    content: str                      # full recognised text
    pages: List[Dict[str, Any]]       # per-page lines/words detail
    source_type: str  # 'file_upload', 'direct_url', 'web_scraped'
    source_url: Optional[str] = None  # populated only for URL sources
    error: Optional[str] = None
|
| 92 |
+
|
| 93 |
+
class WebScrapingResult(BaseModel):
    # Outcome of scraping a web page: its visible text, every image URL found,
    # and per-image OCR results (only images that yielded actual text).
    text_content: str
    images_found: List[str]
    ocr_results: List[Dict[str, Any]]
|
| 97 |
+
|
| 98 |
+
# Utility functions
|
| 99 |
+
def format_bounding_box(bounding_box):
    """Render a flat polygon [x0, y0, x1, y1, ...] as "[x, y], [x, y], ..."."""
    if not bounding_box:
        return "N/A"
    point_pairs = np.array(bounding_box).reshape(-1, 2)
    return ", ".join(f"[{x}, {y}]" for x, y in point_pairs)
|
| 105 |
+
|
| 106 |
+
def is_supported_file_type(content_type: str, filename: str = "") -> bool:
    """Decide whether a document can be sent to Azure OCR.

    The MIME type is checked first; when it is missing or unrecognised, the
    filename extension is used as a fallback.
    """
    mime_whitelist = {
        'application/pdf',
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/tiff',
        'image/bmp',
        'image/gif',
    }

    if content_type and content_type.lower() in mime_whitelist:
        return True

    # MIME type inconclusive — fall back to the file extension.
    if filename:
        ext_whitelist = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp', '.gif'}
        return Path(filename).suffix.lower() in ext_whitelist

    return False
|
| 128 |
+
|
| 129 |
+
def get_document_intelligence_client():
    """Build an Azure Document Intelligence client, or fail with HTTP 500.

    Empty values and the template placeholders both count as "not configured".
    """
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    not_configured = (
        endpoint == "" or
        key == "" or
        endpoint == "YOUR_FORM_RECOGNIZER_ENDPOINT" or
        key == "YOUR_FORM_RECOGNIZER_KEY"
    )
    if not_configured:
        raise HTTPException(
            status_code=500,
            detail="Azure Document Intelligence credentials not configured"
        )

    return DocumentIntelligenceClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key)
    )
|
| 144 |
+
|
| 145 |
+
async def process_ocr_from_url(url: str) -> Dict[str, Any]:
    """Run Azure "prebuilt-read" OCR against a document reachable at *url*.

    Returns the standardized dict produced by format_ocr_result. Azure-side
    failures become HTTP 400; anything else becomes HTTP 500.
    """
    try:
        client = get_document_intelligence_client()

        logger.info(f"Processing OCR from URL: {url}")
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(url_source=url),
        )
        analysis = poller.result()
        return format_ocr_result(analysis, "direct_url", url)

    except HttpResponseError as e:
        logger.error(f"Azure OCR error for URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
|
| 165 |
+
|
| 166 |
+
async def process_ocr_from_bytes(file_bytes: bytes, filename: str = "") -> Dict[str, Any]:
    """Run Azure "prebuilt-read" OCR on in-memory file bytes.

    Args:
        file_bytes: raw document/image content.
        filename: original name, used for logging and result attribution.

    Returns the standardized dict produced by format_ocr_result. Azure-side
    failures become HTTP 400; anything else becomes HTTP 500.
    """
    try:
        client = get_document_intelligence_client()

        # FIX: log the actual filename (messages previously contained a
        # hard-coded "(unknown)" placeholder even when a name was supplied).
        logger.info(f"Processing OCR from file: {filename or '(unknown)'} ({len(file_bytes)} bytes)")
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(bytes_source=file_bytes)
        )
        result = poller.result()

        return format_ocr_result(result, "file_upload", filename)

    except HttpResponseError as e:
        logger.error(f"Azure OCR error for file {filename or '(unknown)'}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing file {filename or '(unknown)'}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
|
| 186 |
+
|
| 187 |
+
def format_ocr_result(result, source_type: str, source_identifier: str = "") -> Dict[str, Any]:
    """Normalise an Azure Document Intelligence analysis into the API's
    standard dict shape: per-page lines/words, full text, handwriting flag.

    source_identifier is exposed as source_url only for "direct_url" sources.
    """
    pages_data = []

    for page in result.pages:
        entry = {
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [],
            "words": [],
        }

        # Per-line text plus a printable bounding box (when the SDK exposes one).
        page_lines = page.lines if hasattr(page, 'lines') and page.lines else []
        for idx, line in enumerate(page_lines):
            box = format_bounding_box(line.polygon) if hasattr(line, 'polygon') else "N/A"
            entry["lines"].append({
                "line_number": idx,
                "content": line.content,
                "bounding_box": box,
            })

        # Per-word text with recognition confidence when available.
        page_words = page.words if hasattr(page, 'words') and page.words else []
        for word in page_words:
            entry["words"].append({
                "content": word.content,
                "confidence": word.confidence if hasattr(word, 'confidence') else None,
            })

        pages_data.append(entry)

    # Flag documents where any detected style is marked handwritten.
    styles = result.styles if hasattr(result, 'styles') and result.styles else []
    handwritten_detected = any(
        hasattr(style, 'is_handwritten') and style.is_handwritten
        for style in styles
    )

    return {
        "success": True,
        "content": result.content if hasattr(result, 'content') else "",
        "pages": pages_data,
        "source_type": source_type,
        "source_url": source_identifier if source_type == "direct_url" else None,
        "handwritten_detected": handwritten_detected,
        "error": None,
    }
|
| 237 |
+
|
| 238 |
+
async def scrape_web_content(url: str, extract_images: bool = True) -> WebScrapingResult:
    """Scrape a web page: extract its visible text and, optionally, OCR the
    first MAX_IMAGES_PER_PAGE images found on it.

    Per-image failures are logged and skipped; only images whose OCR yields
    non-empty text contribute to ocr_results. Page-level request failures
    become HTTP 400, anything else HTTP 500.

    NOTE(review): this async function uses blocking `requests` calls, which
    will stall the event loop during fetches — consider a thread executor or
    an async HTTP client; confirm against the service's concurrency needs.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        logger.info(f"Scraping web content from: {url}")
        response = requests.get(url, headers=headers, timeout=config.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text content (all visible text, whitespace-normalised)
        text_content = soup.get_text(separator=' ', strip=True)

        images_found = []
        ocr_results = []

        if extract_images:
            # Find all images; only the first MAX_IMAGES_PER_PAGE are processed.
            img_tags = soup.find_all('img')

            for img in img_tags[:config.MAX_IMAGES_PER_PAGE]:
                img_src = img.get('src')
                if img_src:
                    # Make absolute URL (src may be relative to the page)
                    img_url = urljoin(url, img_src)
                    images_found.append(img_url)

                    # Try to process image with OCR; failures skip to the next image.
                    try:
                        # Check if image URL is accessible and is an image
                        img_response = requests.head(img_url, headers=headers, timeout=10)
                        content_type = img_response.headers.get('content-type', '')

                        if is_supported_file_type(content_type):
                            ocr_result = await process_ocr_from_url(img_url)
                            if ocr_result['content'].strip():  # Only add if there's actual text
                                ocr_results.append({
                                    "image_url": img_url,
                                    "ocr_content": ocr_result['content'],
                                    "pages": ocr_result['pages']
                                })
                    except Exception as e:
                        logger.warning(f"Failed to process image {img_url}: {e}")
                        continue

        return WebScrapingResult(
            text_content=text_content,
            images_found=images_found,
            ocr_results=ocr_results
        )

    except requests.RequestException as e:
        logger.error(f"Failed to scrape URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to scrape URL: {e}")
    except Exception as e:
        logger.error(f"Unexpected error scraping URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error during web scraping: {e}")
|
| 298 |
+
|
| 299 |
+
def check_url_is_direct_file(url: str) -> tuple[bool, str]:
    """Check whether a URL points directly at a downloadable file.

    Issues a HEAD request and inspects the Content-Type and
    Content-Disposition headers; falls back to the last URL path segment
    for a filename.

    Returns:
        (is_file, content_type) — (False, "") when the check fails for any reason.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        content_type = response.headers.get('content-type', '').lower()

        # Prefer the filename advertised in Content-Disposition.
        # Fix: the header may carry trailing parameters, e.g.
        # 'attachment; filename="a.pdf"; size=1' — cut at the first ';'
        # and strip surrounding quotes instead of taking the whole tail.
        content_disposition = response.headers.get('content-disposition', '')
        filename = ""
        if 'filename=' in content_disposition:
            filename = content_disposition.split('filename=')[1].split(';')[0].strip().strip('"\'')

        # Fall back to the last path segment of the URL.
        if not filename:
            parsed_url = urlparse(url)
            filename = Path(parsed_url.path).name

        is_file = is_supported_file_type(content_type, filename)
        return is_file, content_type

    except Exception as e:
        # Network/DNS/timeout problems are non-fatal: report "not a file".
        logger.warning(f"Failed to check URL {url}: {e}")
        return False, ""
|
| 326 |
+
|
| 327 |
+
# API Endpoints
|
| 328 |
+
@app.get("/")
async def root():
    """Landing endpoint: reports service status, features, and limits."""
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    # Credentials count as available only when both are set and neither is
    # still a placeholder value.
    azure_di_available = all((
        endpoint,
        key,
        endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT",
        key != "YOUR_FORM_RECOGNIZER_KEY",
    ))

    return {
        "message": "OCR Backend API",
        "version": "2.0.0",
        "status": "operational",
        "features": {
            "file_upload": True,
            "url_processing": True,
            "web_scraping": True,
            "azure_document_intelligence": azure_di_available,
            "supported_formats": ["PDF", "JPEG", "PNG", "TIFF", "BMP", "GIF"]
        },
        "limits": {
            "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
            "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
            "request_timeout_seconds": config.REQUEST_TIMEOUT
        }
    }
|
| 354 |
+
|
| 355 |
+
@app.get("/health")
async def health_check():
    """Health probe: reports service status plus Azure DI configuration state."""
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    azure_di_available = all((
        endpoint,
        key,
        endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT",
        key != "YOUR_FORM_RECOGNIZER_KEY",
    ))

    # Probe Azure DI client construction only when credentials look usable.
    azure_di_status = "not_configured"
    if azure_di_available:
        try:
            # Quick test of Azure DI client initialization
            get_document_intelligence_client()
        except Exception as e:
            azure_di_status = f"error: {str(e)[:100]}"
        else:
            azure_di_status = "configured"

    return {
        "status": "healthy",
        "service": "OCR Backend API",
        "version": "2.0.0",
        "azure_document_intelligence": azure_di_status,
        "configuration": {
            "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
            "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
            "request_timeout": config.REQUEST_TIMEOUT,
            "endpoint_configured": bool(endpoint),
            "key_configured": bool(key)
        }
    }
|
| 387 |
+
|
| 388 |
+
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload_file(file: UploadFile = File(...)):
    """Upload a file for OCR processing"""

    # Reject unsupported content types before reading the payload.
    if not is_supported_file_type(file.content_type, file.filename):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file.content_type}. Supported types: PDF, JPEG, PNG, TIFF, BMP, GIF"
        )

    try:
        payload = await file.read()

        # Enforce the configured size ceiling.
        if len(payload) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        # Delegate the actual OCR work to the shared processing helper.
        ocr_result = await process_ocr_from_bytes(payload, file.filename)
        return OCRResponse(**ocr_result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error processing uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
|
| 420 |
+
|
| 421 |
+
@app.post("/ocr/url", response_model=OCRResponse)
async def ocr_from_url(request: URLRequest):
    """Process OCR from URL - either direct file or web scraping.

    Strategy: if the URL looks like a direct file (HEAD check), try the
    file OCR path first; on non-HTTP failure, silently fall through to
    web scraping. Scraping errors are returned as a failed OCRResponse
    rather than raised, so the endpoint degrades gracefully.
    """

    url_str = str(request.url)

    # Check if URL points to a direct file
    is_direct_file, content_type = check_url_is_direct_file(url_str)

    if is_direct_file:
        # Process as direct file URL
        try:
            result = await process_ocr_from_url(url_str)
            return OCRResponse(**result)
        except HTTPException:
            # Deliberate HTTP errors (e.g. bad request) propagate unchanged.
            raise
        except Exception as e:
            logger.error(f"Failed to process direct file URL: {e}")
            # Fall back to web scraping
            pass

    # Web scraping approach
    try:
        scraping_result = await scrape_web_content(url_str, request.extract_images)

        # Combine text content and OCR results
        combined_content = scraping_result.text_content

        # Append per-image OCR text (if any) under a labelled separator.
        if scraping_result.ocr_results:
            ocr_content = "\n\n--- OCR from Images ---\n"
            for ocr_result in scraping_result.ocr_results:
                ocr_content += f"\nImage: {ocr_result['image_url']}\n"
                ocr_content += ocr_result['ocr_content'] + "\n"
            combined_content += ocr_content

        # Format response: scraped pages are reported as a single logical page.
        pages_data = [{
            "page_number": 1,
            "content_type": "web_scraped",
            "text_content": scraping_result.text_content,
            "images_found": len(scraping_result.images_found),
            "ocr_results": len(scraping_result.ocr_results)
        }]

        return OCRResponse(
            success=True,
            content=combined_content,
            pages=pages_data,
            source_type="web_scraped",
            source_url=url_str,
            error=None
        )

    except HTTPException:
        raise
    except Exception as e:
        # Scraping failures become a structured error response, not a 500.
        logger.error(f"Failed to process URL {url_str}: {e}")
        return OCRResponse(
            success=False,
            content="",
            pages=[],
            source_type="web_scraped",
            source_url=url_str,
            error=str(e)
        )
|
| 486 |
+
|
| 487 |
+
@app.post("/ocr/analyze")
async def analyze_document(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    extract_images: bool = Form(True)
):
    """Unified endpoint for document analysis - accepts either file upload or URL"""

    # Exactly one of {file, url} must be supplied.
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")
    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    try:
        if not file:
            # URL path: delegate to the dedicated URL endpoint logic.
            url_response = await ocr_from_url(URLRequest(url=url, extract_images=extract_images))
            return url_response.dict()

        # File path: validate type and size, then run OCR.
        if not is_supported_file_type(file.content_type, file.filename):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file.content_type}"
            )

        payload = await file.read()
        if len(payload) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        return await process_ocr_from_bytes(payload, file.filename)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error in analyze_document: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
|
| 533 |
+
|
| 534 |
+
# Additional utility endpoints
|
| 535 |
+
@app.get("/supported-formats")
async def get_supported_formats():
    """Get list of supported file formats"""
    # Human-readable groupings and the matching MIME types.
    format_groups = {
        "documents": ["PDF"],
        "images": ["JPEG", "JPG", "PNG", "TIFF", "TIF", "BMP", "GIF"]
    }
    mime_types = [
        "application/pdf",
        "image/jpeg",
        "image/jpg",
        "image/png",
        "image/tiff",
        "image/bmp",
        "image/gif"
    ]
    return {
        "supported_formats": format_groups,
        "content_types": mime_types,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE
    }
|
| 555 |
+
|
| 556 |
+
@app.get("/config")
async def get_configuration():
    """Get current service configuration (for debugging)"""
    # True when both Azure DI credential values are non-empty.
    azure_configured = bool(
        config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
        config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    )
    return {
        "service": "OCR Backend API",
        "version": "2.0.0",
        "configuration": {
            "host": config.HOST,
            "port": config.PORT,
            "debug": config.DEBUG,
            "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
            "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
            "request_timeout": config.REQUEST_TIMEOUT,
            "azure_di_configured": azure_configured
        }
    }
|
| 575 |
+
|
| 576 |
+
if __name__ == "__main__":
    # Startup banner: surface the effective configuration before serving.
    # (Prefix glyphs are mojibake of the original emoji — preserved as-is.)
    print("π§ Loading OCR service configuration...")
    print(f"π Will start server on {config.HOST}:{config.PORT}")
    print(f"π Azure Document Intelligence: {'β Configured' if config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT else 'β Not configured'}")
    print(f"π Max file size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB")

    # Autoreload only when DEBUG is set; module path must match the filename.
    uvicorn.run(
        "ocr_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )
|
service/rag_service.py
ADDED
|
@@ -0,0 +1,1367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
RAG (Retrieval-Augmented Generation) Backend API - Cleaned and Optimized
|
| 4 |
+
Integrates OCR, Azure OpenAI embeddings, and PostgreSQL vector storage
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import uuid
|
| 9 |
+
import asyncio
|
| 10 |
+
import requests
|
| 11 |
+
import json
|
| 12 |
+
import tempfile
|
| 13 |
+
import traceback
|
| 14 |
+
import logging
|
| 15 |
+
from typing import Optional, List, Dict, Any, Union
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
|
| 18 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Query, Depends
|
| 19 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 20 |
+
from pydantic import BaseModel, HttpUrl
|
| 21 |
+
import uvicorn
|
| 22 |
+
|
| 23 |
+
# Import unified configuration
|
| 24 |
+
try:
    # Preferred path: project-wide unified configuration module.
    from configs import get_config
    config = get_config().rag
    unified_config = get_config()
    print("β Using unified configuration")
except ImportError:
    # Fallback path: read settings straight from environment variables (.env).
    print("β οΈ Unified config not available, using fallback configuration")
    from dotenv import load_dotenv
    load_dotenv()

    class FallbackConfig:
        # Server binding.
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("RAG_PORT", "8401"))
        DEBUG = os.getenv("DEBUG", "True").lower() == "true"

        # OCR Service Configuration
        OCR_SERVICE_URL = os.getenv("OCR_SERVICE_URL", "http://localhost:8400")

        # PostgreSQL Configuration
        PG_HOST = os.getenv("POSTGRES_HOST", "")
        PG_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
        PG_DATABASE = os.getenv("PG_DATABASE", "vectorsearch")
        PG_USER = os.getenv("POSTGRES_USER", "")
        PG_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
        PG_SSL_MODE = os.getenv("PG_SSL_MODE", "require")

        # Azure OpenAI Configuration
        AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "")
        AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "text-embedding-3-small")
        AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

        # Chunking Configuration
        CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
        CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
        MIN_CHUNK_SIZE = int(os.getenv("MIN_CHUNK_SIZE", "50"))

        # Processing limits
        MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
        REQUEST_TIMEOUT = 300

    config = FallbackConfig()
|
| 66 |
+
|
| 67 |
+
import asyncpg
|
| 68 |
+
import numpy as np
|
| 69 |
+
from openai import AzureOpenAI
|
| 70 |
+
import re
|
| 71 |
+
from pathlib import Path
|
| 72 |
+
from urllib.parse import urlparse
|
| 73 |
+
|
| 74 |
+
# Configure logging
|
| 75 |
+
logging.basicConfig(
|
| 76 |
+
level=logging.INFO,
|
| 77 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 78 |
+
)
|
| 79 |
+
logger = logging.getLogger(__name__)
|
| 80 |
+
|
| 81 |
+
# Initialize FastAPI app
app = FastAPI(
    title="RAG Backend API",
    description="Retrieval-Augmented Generation service with OCR, embeddings, and vector search",
    version="2.0.0",
    debug=config.DEBUG
)

# CORS configuration
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive (and browsers reject "*" when credentials are sent) — confirm
# the intended cross-origin policy before deploying publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 97 |
+
|
| 98 |
+
# Pydantic Models
|
| 99 |
+
class DocumentUploadRequest(BaseModel):
    """Optional metadata accompanying a document file upload."""
    title: Optional[str] = None  # optional display title
    keywords: Optional[List[str]] = None  # optional tags
    metadata: Optional[Dict[str, Any]] = None  # free-form extra fields
    chunk_size: Optional[int] = None  # per-request override of the configured chunk size
    chunk_overlap: Optional[int] = None  # per-request override of the configured overlap
|
| 105 |
+
|
| 106 |
+
class URLProcessRequest(BaseModel):
    """Request to ingest a document fetched from a URL."""
    url: HttpUrl  # validated source URL
    title: Optional[str] = None  # optional display title
    keywords: Optional[List[str]] = None  # optional tags
    metadata: Optional[Dict[str, Any]] = None  # free-form extra fields
    extract_images: bool = True  # whether to also OCR images found at the URL
    chunk_size: Optional[int] = None  # per-request override of the configured chunk size
    chunk_overlap: Optional[int] = None  # per-request override of the configured overlap
|
| 114 |
+
|
| 115 |
+
class SearchRequest(BaseModel):
    """Vector-search query parameters."""
    query: str  # free-text query to embed and match
    limit: int = 10  # max results to return
    similarity_threshold: float = 0.2  # minimum similarity score to include
    filter_metadata: Optional[Dict[str, Any]] = None  # optional metadata filter
|
| 120 |
+
|
| 121 |
+
class DocumentChunk(BaseModel):
    """One chunk of a processed document, with its optional embedding."""
    id: str  # chunk identifier
    document_id: str  # owning document identifier
    content: str  # chunk text
    chunk_index: int  # position of this chunk within the document
    embedding: Optional[List[float]] = None  # embedding vector, if computed
    metadata: Dict[str, Any]  # per-chunk metadata
    created_at: datetime  # creation timestamp
|
| 129 |
+
|
| 130 |
+
class DocumentResponse(BaseModel):
    """Summary of an ingested document returned by the API."""
    id: str  # document identifier
    title: str  # display title
    source_type: str  # how the document was obtained (e.g. upload vs URL)
    source_url: Optional[str]  # origin URL when applicable
    total_chunks: int  # number of chunks stored
    keywords: List[str]  # tags attached to the document
    metadata: Dict[str, Any]  # free-form document metadata
    created_at: datetime  # ingestion timestamp
    processing_status: str  # ingestion pipeline status
|
| 140 |
+
|
| 141 |
+
class SearchResult(BaseModel):
    """A single matching chunk with its score and parent-document info."""
    chunk: DocumentChunk  # the matched chunk
    similarity_score: float  # similarity between query and chunk embedding
    document_info: Dict[str, Any]  # summary of the owning document
|
| 145 |
+
|
| 146 |
+
class SearchResponse(BaseModel):
    """Envelope for a vector-search call."""
    query: str  # the original query text
    results: List[SearchResult]  # matches, ordered by the search backend
    total_results: int  # number of results returned
    processing_time: float  # server-side processing time (seconds)
|
| 151 |
+
|
| 152 |
+
# Database connection pool
|
| 153 |
+
db_pool = None
|
| 154 |
+
|
| 155 |
+
# UUID generation method cache
|
| 156 |
+
_uuid_method = None
|
| 157 |
+
|
| 158 |
+
async def detect_uuid_method(conn) -> str:
    """Detect and cache the best available UUID generation method.

    Probes, in order: built-in gen_random_uuid() (PostgreSQL 13+), the
    uuid-ossp extension, then falls back to Python-side UUIDs. The result
    is cached process-wide in _uuid_method since the database's capability
    does not change between calls.
    """
    global _uuid_method

    if _uuid_method is not None:
        return _uuid_method

    # Test built-in gen_random_uuid() first (PostgreSQL 13+)
    try:
        await conn.fetchval("SELECT gen_random_uuid()")
        _uuid_method = "built-in"
        logger.info("Using built-in gen_random_uuid() for UUID generation")
        return _uuid_method
    except Exception:
        pass

    # Test uuid-ossp extension
    try:
        await conn.execute("CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"")
        await conn.fetchval("SELECT uuid_generate_v4()")
        _uuid_method = "uuid-ossp"
        logger.info("Using uuid-ossp extension for UUID generation")
        return _uuid_method
    except Exception as e:
        # Managed services commonly block extension creation; treat the
        # allow-list refusal as an expected, quiet condition.
        if "not allow-listed" in str(e) or "not allowlisted" in str(e).lower():
            logger.info("uuid-ossp extension not allowlisted (normal for Azure PostgreSQL)")
        else:
            logger.warning(f"uuid-ossp extension not available: {e}")

    # Fall back to Python UUID generation
    _uuid_method = "python"
    logger.info("Using Python-generated UUIDs")
    return _uuid_method
|
| 191 |
+
|
| 192 |
+
async def get_db_pool():
    """Get database connection pool.

    Lazily creates a module-level asyncpg pool on first use and reuses it
    afterwards. Raises on connection failure.

    NOTE(review): concurrent first calls may race to create the pool (no
    lock around the lazy init) — confirm this is acceptable at startup.
    """
    global db_pool
    if db_pool is None:
        try:
            logger.info(f"Creating database pool with host: {config.PG_HOST}:{config.PG_PORT}")
            db_pool = await asyncpg.create_pool(
                host=config.PG_HOST,
                port=config.PG_PORT,
                database=config.PG_DATABASE,
                user=config.PG_USER,
                password=config.PG_PASSWORD,
                ssl=config.PG_SSL_MODE,
                min_size=1,
                max_size=10,
                command_timeout=60
            )
        except Exception as e:
            logger.error(f"Failed to create database pool: {e}")
            raise
    return db_pool
|
| 213 |
+
|
| 214 |
+
async def get_db_connection():
    """Acquire a connection from the shared pool.

    The caller is responsible for returning it with release_db_connection().
    """
    return await (await get_db_pool()).acquire()
|
| 218 |
+
|
| 219 |
+
async def release_db_connection(connection):
    """Return a previously acquired connection to the shared pool."""
    shared_pool = await get_db_pool()
    await shared_pool.release(connection)
|
| 223 |
+
|
| 224 |
+
# Azure OpenAI Client
|
| 225 |
+
def get_openai_client():
    """Build an Azure OpenAI client, failing fast when credentials are
    missing or still set to placeholder values."""
    endpoint = config.AZURE_OPENAI_ENDPOINT
    api_key = config.AZURE_OPENAI_API_KEY

    unconfigured = (
        endpoint in ("", "YOUR_AZURE_OPENAI_ENDPOINT")
        or api_key in ("", "YOUR_AZURE_OPENAI_KEY")
    )
    if unconfigured:
        raise HTTPException(
            status_code=500,
            detail="Azure OpenAI credentials not configured"
        )

    return AzureOpenAI(
        api_version=config.AZURE_OPENAI_API_VERSION,
        azure_endpoint=endpoint,
        api_key=api_key
    )
|
| 241 |
+
|
| 242 |
+
# Text Processing Functions
|
| 243 |
+
def clean_text(text: str) -> str:
    """Normalize raw text for chunking/embedding.

    Collapses whitespace runs to single spaces, drops characters outside
    word characters and basic punctuation (. , ! ? ; : - ( )), and strips
    the ends.
    """
    collapsed = ' '.join(text.split())
    filtered = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', '', collapsed)
    return filtered.strip()
|
| 250 |
+
|
| 251 |
+
def embedding_to_vector_string(embedding: List[float]) -> str:
    """Render an embedding as a PostgreSQL vector literal, e.g. '[1.0,2.0]'.

    Raises:
        ValueError: if the embedding is empty or None.
    """
    if not embedding:
        raise ValueError("Embedding cannot be empty")

    components = ','.join(str(float(value)) for value in embedding)
    return '[{}]'.format(components)
|
| 259 |
+
|
| 260 |
+
def create_text_chunks(text: str, chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: Input text to split.
        chunk_size: Target chunk length; defaults to config.CHUNK_SIZE.
        chunk_overlap: Characters of overlap between consecutive chunks;
            defaults to config.CHUNK_OVERLAP.

    Returns:
        List of chunks; chunks shorter than config.MIN_CHUNK_SIZE are dropped.
    """
    if chunk_size is None:
        chunk_size = config.CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = config.CHUNK_OVERLAP

    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # Try to break at sentence boundary
        if end < len(text):
            sentence_endings = ['. ', '! ', '? ', '\n\n']
            for ending in sentence_endings:
                last_ending = text.rfind(ending, start, end)
                if last_ending != -1:
                    end = last_ending + len(ending)
                    break

        chunk = text[start:end].strip()
        if len(chunk) >= config.MIN_CHUNK_SIZE:
            chunks.append(chunk)

        # Advance with overlap. Fix: if the sentence-boundary adjustment (or an
        # overlap >= the consumed length) makes `end - chunk_overlap <= start`,
        # the original code never advanced and looped forever; guarantee
        # forward progress by jumping to `end` in that case.
        next_start = end - chunk_overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks
|
| 296 |
+
|
| 297 |
+
async def generate_embedding(text: str) -> List[float]:
    """Generate embedding using Azure OpenAI.

    Validates the input, truncates overly long text, and returns the
    embedding vector. Any failure — validation or API — is surfaced as an
    HTTP 500 to the caller.
    """
    try:
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        # Truncate text if it's too long.
        # (8000 chars is a heuristic character cap, not the model's token
        # limit — TODO confirm against the deployed embedding model.)
        if len(text) > 8000:
            text = text[:8000]
            logger.warning("Truncated text for embedding generation")

        client = get_openai_client()

        response = client.embeddings.create(
            input=[text.strip()],
            model=config.AZURE_OPENAI_DEPLOYMENT
        )

        # Defensive checks: the API should always return one embedding here.
        if not response.data or len(response.data) == 0:
            raise ValueError("No embedding data returned from Azure OpenAI")

        embedding = response.data[0].embedding

        if not embedding or len(embedding) == 0:
            raise ValueError("Empty embedding returned from Azure OpenAI")

        logger.debug(f"Generated embedding with {len(embedding)} dimensions")
        return embedding

    except Exception as e:
        logger.error(f"Failed to generate embedding: {e}")
        logger.error(f"Text length: {len(text) if text else 0}")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {e}")
|
| 330 |
+
|
| 331 |
+
# OCR Integration
|
| 332 |
+
async def process_with_ocr(file_bytes: bytes = None, url: str = None, extract_images: bool = True, filename: str = None) -> Dict[str, Any]:
    """Process a document through the external OCR service.

    Exactly one of ``file_bytes`` or ``url`` must be provided.  Uploads with
    a plain-text extension (.txt/.md/.rst/.log) that decode as UTF-8 bypass
    the OCR service entirely and are returned in the same result shape.

    Args:
        file_bytes: Raw uploaded file content.
        url: Remote document URL for the OCR service to fetch itself.
        extract_images: Forwarded to the OCR service as a form flag.
        filename: Original filename; used for text-file detection and as the
            upload name sent to the OCR service.

    Returns:
        The OCR service's JSON result dict ('success', 'content', 'pages',
        'source_type', 'source_url', 'error').

    Raises:
        HTTPException: 500 on OCR-service errors or connection failures.
    """
    try:
        logger.info(f"Processing with OCR service at {config.OCR_SERVICE_URL}")

        if file_bytes:
            # Check if it's a plain text file by extension.
            is_text_file = False
            if filename:
                text_extensions = ['.txt', '.md', '.rst', '.log']
                if any(filename.lower().endswith(ext) for ext in text_extensions):
                    is_text_file = True

            # For plain text files, bypass OCR entirely.
            if is_text_file:
                try:
                    content = file_bytes.decode('utf-8')
                    logger.info(f"Processing plain text file directly: {filename}")

                    if len(content.strip()) < config.MIN_CHUNK_SIZE:
                        logger.info(f"Text file {filename} is short ({len(content)} chars) but will process anyway")

                    # Mirror the OCR service's result shape so callers need
                    # no special case for the bypass path.
                    return {
                        'success': True,
                        'content': content,
                        'pages': [{
                            'page_number': 1,
                            'content_type': 'text',
                            'text_content': content,
                            'source': 'direct_text',
                            'character_count': len(content)
                        }],
                        'source_type': 'text_file',
                        'source_url': None,
                        'error': None
                    }
                except UnicodeDecodeError:
                    # Not valid UTF-8 after all -- fall through to OCR.
                    logger.warning(f"Failed to decode {filename} as UTF-8, sending to OCR service")

            # Use OCR service.  Post the bytes directly: requests accepts
            # raw bytes in a files tuple, so no temp-file round trip (and no
            # bare-except unlink cleanup) is needed.
            logger.info(f"Uploading file for OCR processing ({len(file_bytes)} bytes)")

            files = {
                'file': (filename or 'document.pdf', file_bytes, 'application/octet-stream')
            }
            data = {
                'extract_images': str(extract_images).lower()
            }

            response = requests.post(
                f"{config.OCR_SERVICE_URL}/ocr/analyze",
                files=files,
                data=data,
                timeout=config.REQUEST_TIMEOUT
            )

        elif url:
            # Process URL with OCR service (the service fetches it itself).
            logger.info(f"Processing URL for OCR: {url}")

            data = {
                'url': url,
                'extract_images': str(extract_images).lower()
            }

            response = requests.post(
                f"{config.OCR_SERVICE_URL}/ocr/analyze",
                data=data,
                timeout=config.REQUEST_TIMEOUT
            )
        else:
            raise ValueError("Either file_bytes or url must be provided")

        # Check response
        logger.info(f"OCR service response status: {response.status_code}")

        if response.status_code != 200:
            logger.error(f"OCR service error: {response.status_code} - {response.text}")
            raise HTTPException(
                status_code=500,
                detail=f"OCR processing failed: {response.status_code} {response.reason}"
            )

        result = response.json()
        logger.info(f"OCR processing completed successfully. Success: {result.get('success', False)}")

        return result

    except HTTPException:
        # Pass HTTP errors raised above through unchanged instead of letting
        # the generic handler below re-wrap them and lose their detail text
        # (same pattern as search_similar_chunks).
        raise
    except requests.RequestException as e:
        logger.error(f"OCR service request error: {e}")
        raise HTTPException(status_code=500, detail=f"OCR service connection failed: {e}")
    except Exception as e:
        logger.error(f"OCR processing error: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"OCR processing failed: {e}")
|
| 438 |
+
|
| 439 |
+
# UUID Generation Helper
|
| 440 |
+
async def generate_uuid(conn) -> str:
    """Generate UUID using the best available method.

    Dispatches on the detected database capability; any failure falls
    back to a Python-generated UUID4.
    """
    # Map the detected capability to the SQL that exercises it.
    sql_by_method = {
        "built-in": "SELECT gen_random_uuid()",
        "uuid-ossp": "SELECT uuid_generate_v4()",
    }
    try:
        method = await detect_uuid_method(conn)
        query = sql_by_method.get(method)
        if query is None:
            # Neither pgcrypto/built-in nor uuid-ossp available.
            return str(uuid.uuid4())
        return str(await conn.fetchval(query))
    except Exception as e:
        logger.warning(f"Database UUID generation failed, using Python fallback: {e}")
        return str(uuid.uuid4())
|
| 457 |
+
|
| 458 |
+
# Database Operations
|
| 459 |
+
async def create_document_record(
    title: str,
    source_type: str,
    source_url: str = None,
    keywords: List[str] = None,
    metadata: Dict[str, Any] = None
) -> str:
    """Create document record in database.

    Inserts a `documents` row in 'processing' state and returns the new id.
    """
    conn = await get_db_connection()
    try:
        new_id = await generate_uuid(conn)

        insert_sql = """
            INSERT INTO documents (id, title, source_type, source_url, keywords, metadata, created_at, processing_status)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
        """
        await conn.execute(
            insert_sql,
            new_id,
            title,
            source_type,
            source_url,
            keywords or [],
            json.dumps(metadata or {}),
            datetime.utcnow(),
            "processing",
        )

        return new_id
    finally:
        await release_db_connection(conn)
|
| 481 |
+
|
| 482 |
+
async def store_document_chunk(
    document_id: str,
    content: str,
    chunk_index: int,
    embedding: List[float],
    metadata: Dict[str, Any] = None
) -> str:
    """Store document chunk with embedding.

    Serializes the embedding into pgvector's literal format, inserts a
    `document_chunks` row, and returns the new chunk id.
    """
    # pgvector takes a string literal cast back with ::vector in the SQL.
    vector_literal = embedding_to_vector_string(embedding)
    meta_json = json.dumps(metadata or {})

    conn = await get_db_connection()
    try:
        new_id = await generate_uuid(conn)

        insert_sql = """
            INSERT INTO document_chunks (id, document_id, content, chunk_index, embedding, metadata, created_at)
            VALUES ($1, $2, $3, $4, $5::vector, $6, $7)
        """
        await conn.execute(
            insert_sql,
            new_id,
            document_id,
            content,
            chunk_index,
            vector_literal,
            meta_json,
            datetime.utcnow(),
        )

        return new_id
    finally:
        await release_db_connection(conn)
|
| 507 |
+
|
| 508 |
+
async def update_document_status(document_id: str, status: str, total_chunks: int = None):
    """Update document processing status.

    Also updates `total_chunks` when a count is supplied.
    """
    # Pick the statement and arguments up front so there is one execute path.
    if total_chunks is not None:
        query = """
            UPDATE documents SET processing_status = $1, total_chunks = $2 WHERE id = $3
        """
        args = (status, total_chunks, document_id)
    else:
        query = """
            UPDATE documents SET processing_status = $1 WHERE id = $2
        """
        args = (status, document_id)

    conn = await get_db_connection()
    try:
        await conn.execute(query, *args)
    finally:
        await release_db_connection(conn)
|
| 523 |
+
|
| 524 |
+
async def search_similar_chunks(
    query_embedding: List[float],
    limit: int = 10,
    similarity_threshold: float = 0.2,
    filter_metadata: Dict[str, Any] = None
) -> List[Dict[str, Any]]:
    """Search for similar document chunks using vector similarity.

    Args:
        query_embedding: Embedding vector of the query text.
        limit: Maximum number of chunks to return.
        similarity_threshold: Minimum cosine similarity; 0 disables the cutoff.
        filter_metadata: Optional document-metadata equality filter.  NOTE:
            only the FIRST key/value pair is applied (see the `break` below).

    Returns:
        List of dicts combining chunk fields, parent-document fields, and
        `similarity_score`, ordered by descending similarity.

    Raises:
        HTTPException: 500 on any database or search failure.
    """
    conn = await get_db_connection()
    try:
        logger.info(f"Searching for similar chunks with threshold {similarity_threshold}, limit {limit}")

        # Validate inputs
        if not query_embedding or len(query_embedding) == 0:
            raise ValueError("Query embedding cannot be empty")

        logger.info(f"Query embedding dimensions: {len(query_embedding)}")

        # Convert query embedding to PostgreSQL vector format
        query_vector = embedding_to_vector_string(query_embedding)

        # Check if we have any searchable chunks at all (completed docs only).
        total_chunks = await conn.fetchval("""
            SELECT COUNT(*) FROM document_chunks dc
            JOIN documents d ON dc.document_id = d.id
            WHERE d.processing_status = 'completed' AND dc.embedding IS NOT NULL
        """)

        logger.info(f"Total available chunks for search: {total_chunks}")

        if total_chunks == 0:
            logger.warning("No chunks available for search")
            return []

        # Build the query.  `<=>` is pgvector's cosine-distance operator,
        # so `1 - (a <=> b)` is cosine similarity.
        base_query = """
            SELECT
                dc.id, dc.document_id, dc.content, dc.chunk_index, dc.embedding,
                dc.metadata as chunk_metadata, dc.created_at,
                d.title, d.source_type, d.source_url, d.keywords, d.metadata as doc_metadata,
                1 - (dc.embedding <=> $1::vector) as similarity_score
            FROM document_chunks dc
            JOIN documents d ON dc.document_id = d.id
            WHERE d.processing_status = 'completed'
            AND dc.embedding IS NOT NULL
        """

        # Positional parameters are tracked by hand; $1 is always the vector.
        params = [query_vector]
        param_count = 1

        # Add similarity threshold
        if similarity_threshold > 0:
            base_query += " AND 1 - (dc.embedding <=> $1::vector) >= $2"
            params.append(similarity_threshold)
            param_count += 1

        # Add metadata filtering.  Both the JSON key and the value are passed
        # as parameters ($N placeholders), not interpolated into the SQL.
        if filter_metadata:
            for key, value in filter_metadata.items():
                base_query += f" AND d.metadata->>$" + str(param_count + 1) + " = $" + str(param_count + 2)
                params.extend([key, str(value)])
                param_count += 2
                break  # Handle only one filter for now

        base_query += " ORDER BY similarity_score DESC LIMIT $" + str(param_count + 1)
        params.append(limit)

        logger.info(f"Executing vector search query with {len(params)} parameters")

        try:
            rows = await conn.fetch(base_query, *params)
            logger.info(f"Vector search query returned {len(rows)} rows")
        except Exception as db_error:
            logger.error(f"Database query error: {db_error}")
            raise HTTPException(status_code=500, detail=f"Vector search query failed: {db_error}")

        # Debug aid: when the threshold filtered everything out, log the top
        # raw similarity scores so the operator can tune the threshold.
        if len(rows) == 0 and similarity_threshold > 0:
            logger.warning(f"No results found with threshold {similarity_threshold}, trying without threshold")
            debug_query = """
                SELECT
                    dc.id, dc.content,
                    1 - (dc.embedding <=> $1::vector) as similarity_score
                FROM document_chunks dc
                JOIN documents d ON dc.document_id = d.id
                WHERE d.processing_status = 'completed'
                AND dc.embedding IS NOT NULL
                ORDER BY similarity_score DESC
                LIMIT 3
            """
            debug_rows = await conn.fetch(debug_query, query_vector)
            logger.info(f"Debug: Top 3 similarity scores: {[(r['similarity_score'], r['content'][:50]) for r in debug_rows]}")

        results = []
        for row in rows:
            try:
                # Safely parse JSON metadata; malformed JSON degrades to {}.
                chunk_metadata = {}
                doc_metadata = {}

                if row['chunk_metadata']:
                    try:
                        chunk_metadata = json.loads(row['chunk_metadata'])
                    except json.JSONDecodeError:
                        logger.warning(f"Invalid chunk metadata JSON for chunk {row['id']}")

                if row['doc_metadata']:
                    try:
                        doc_metadata = json.loads(row['doc_metadata'])
                    except json.JSONDecodeError:
                        logger.warning(f"Invalid document metadata JSON for document {row['document_id']}")

                # Convert UUID objects to strings for JSON serialization.
                chunk_id = str(row['id']) if row['id'] else None
                document_id = str(row['document_id']) if row['document_id'] else None

                results.append({
                    'chunk_id': chunk_id,
                    'document_id': document_id,
                    'content': row['content'],
                    'chunk_index': row['chunk_index'],
                    'chunk_metadata': chunk_metadata,
                    'created_at': row['created_at'],
                    'document_title': row['title'],
                    'source_type': row['source_type'],
                    'source_url': row['source_url'],
                    'keywords': row['keywords'] or [],
                    'document_metadata': doc_metadata,
                    'similarity_score': float(row['similarity_score'])
                })
            except Exception as row_error:
                # A single bad row should not kill the whole search.
                logger.error(f"Error processing search result row: {row_error}")
                continue

        logger.info(f"Vector search returned {len(results)} results")
        if results:
            logger.info(f"Top result similarity: {results[0]['similarity_score']:.4f}")

        return results

    except HTTPException:
        # Re-raise HTTP errors as-is instead of re-wrapping them below.
        raise
    except Exception as e:
        logger.error(f"Vector search failed: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")
    finally:
        await release_db_connection(conn)
|
| 671 |
+
|
| 672 |
+
# Database initialization
|
| 673 |
+
async def init_database():
    """Initialize database tables.

    Creates the `documents` and `document_chunks` tables and their
    indexes if they do not already exist.  Idempotent.
    """
    conn = await get_db_connection()
    try:
        logger.info("π Initializing database tables...")

        # Create documents table -- one row per ingested source.
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                title VARCHAR(500) NOT NULL,
                source_type VARCHAR(50) NOT NULL,
                source_url TEXT,
                keywords TEXT[] DEFAULT '{}',
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                processing_status VARCHAR(20) DEFAULT 'processing',
                total_chunks INTEGER DEFAULT 0
            );
        """)

        # Create document_chunks table -- embedding dim 1536 matches the
        # Azure OpenAI embedding model used by generate_embedding.
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS document_chunks (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
                content TEXT NOT NULL,
                chunk_index INTEGER NOT NULL,
                embedding vector(1536),
                metadata JSONB DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        """)

        # Create indexes.  The ivfflat index needs the pgvector extension;
        # failure here is tolerated (search still works, just slower).
        try:
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(processing_status);
                CREATE INDEX IF NOT EXISTS idx_chunks_document ON document_chunks(document_id);
                CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops);
            """)
        except Exception as e:
            logger.warning(f"Could not create some indexes (vector extension may not be available): {e}")

        logger.info("β Database tables initialized")

    finally:
        await release_db_connection(conn)
|
| 721 |
+
|
| 722 |
+
# App Lifecycle
|
| 723 |
+
@app.on_event("startup")
async def startup_event():
    """Application startup.

    Verifies the database connection, creates tables, and probes the
    Azure OpenAI client.  A database failure aborts startup; an OpenAI
    configuration problem is only logged as a warning.
    """
    logger.info("π Starting RAG Backend API...")

    try:
        # Test database connection (creates the pool as a side effect).
        await get_db_pool()
        logger.info("β Database connection established")

        # Initialize database (idempotent table/index creation).
        await init_database()

        # Test Azure OpenAI -- non-fatal: the API can still serve
        # endpoints that do not need embeddings.
        try:
            get_openai_client()
            logger.info("β Azure OpenAI client configured")
        except Exception as e:
            logger.warning(f"β οΈ Azure OpenAI client configuration issue: {e}")

        logger.info("π RAG Backend API is ready!")

    except Exception as e:
        # Re-raise so the server refuses to start in a broken state.
        logger.error(f"β Startup failed: {e}")
        raise
|
| 748 |
+
|
| 749 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Application shutdown: close the database connection pool if one exists."""
    logger.info("π Shutting down RAG Backend API...")

    pool = db_pool
    if pool:
        await pool.close()
        logger.info("β Database connections closed")
|
| 757 |
+
|
| 758 |
+
# API Endpoints
|
| 759 |
+
@app.get("/")
async def root():
    """Service banner: version, feature flags, active configuration, and endpoint map."""
    features = {
        "document_upload": True,
        "url_processing": True,
        "vector_search": True,
        "ocr_integration": True,
        "azure_openai_embeddings": True,
        "postgresql_vector_storage": True
    }
    configuration = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "min_chunk_size": config.MIN_CHUNK_SIZE,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024)
    }
    endpoints = {
        "health": "/health",
        "docs": "/docs",
        "upload": "/documents/upload",
        "url_process": "/documents/url",
        "search": "/search",
        "list_documents": "/documents"
    }
    return {
        "message": "RAG Backend API",
        "version": "2.0.0",
        "status": "running",
        "features": features,
        "configuration": configuration,
        "endpoints": endpoints
    }
|
| 788 |
+
|
| 789 |
+
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Probes each dependency independently -- PostgreSQL, Azure OpenAI,
    OCR service -- and aggregates per-dependency statuses plus collected
    errors.  Never raises: callers always receive a status document.

    Returns:
        Dict with overall "status" (healthy/degraded/unhealthy), one
        field per dependency, the active configuration, and "errors".
    """
    health_status = {
        "status": "unknown",
        "service": "RAG Backend API",
        "version": "2.0.0",
        "timestamp": datetime.utcnow().isoformat(),
        "database": "unknown",
        "openai": "unknown",
        "uuid_method": "unknown",
        "ocr_service": "unknown",
        "configuration": {
            "pg_host": config.PG_HOST,
            "pg_port": config.PG_PORT,
            "pg_database": config.PG_DATABASE,
            "ocr_service_url": config.OCR_SERVICE_URL,
            "chunk_size": config.CHUNK_SIZE
        },
        "errors": []
    }

    # Test database connection with a fresh connection (not the pool),
    # so a broken pool cannot mask raw connectivity.
    try:
        test_conn = await asyncpg.connect(
            host=config.PG_HOST,
            port=config.PG_PORT,
            database=config.PG_DATABASE,
            user=config.PG_USER,
            password=config.PG_PASSWORD,
            ssl=config.PG_SSL_MODE,
            timeout=10
        )

        db_version = await test_conn.fetchval("SELECT version()")
        health_status["database"] = "connected"
        health_status["database_version"] = db_version

        # Check which UUID generation method the server supports.
        uuid_method = await detect_uuid_method(test_conn)
        health_status["uuid_method"] = uuid_method

        await test_conn.close()

    except Exception as db_error:
        health_status["database"] = "failed"
        health_status["errors"].append(f"Database connection failed: {db_error}")

    # Test OpenAI with a real (tiny) embedding request.
    try:
        if (config.AZURE_OPENAI_ENDPOINT == "" or
                config.AZURE_OPENAI_API_KEY == ""):
            health_status["openai"] = "not_configured"
        else:
            client = get_openai_client()
            # Test with a simple embedding request
            test_response = client.embeddings.create(
                input=["Health check test"],
                model=config.AZURE_OPENAI_DEPLOYMENT
            )
            if test_response.data:
                health_status["openai"] = "configured"
                health_status["embedding_dimensions"] = len(test_response.data[0].embedding)
            else:
                health_status["openai"] = "failed"
                health_status["errors"].append("OpenAI embedding test failed")
    except Exception as openai_error:
        health_status["openai"] = "failed"
        health_status["errors"].append(f"OpenAI configuration failed: {openai_error}")

    # Test OCR service.  Was a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit; narrowed to Exception.
    try:
        ocr_response = requests.get(f"{config.OCR_SERVICE_URL}/health", timeout=5)
        if ocr_response.status_code == 200:
            health_status["ocr_service"] = "available"
        else:
            health_status["ocr_service"] = "unavailable"
    except Exception:
        health_status["ocr_service"] = "unavailable"

    # Determine overall status: the database is mandatory; OpenAI being
    # merely unconfigured still counts as healthy.
    if health_status["database"] == "connected" and health_status["openai"] in ["configured", "not_configured"]:
        health_status["status"] = "healthy"
    elif health_status["database"] == "connected":
        health_status["status"] = "degraded"
    else:
        health_status["status"] = "unhealthy"

    return health_status
|
| 878 |
+
|
| 879 |
+
@app.post("/documents/upload")
async def upload_document(
    file: UploadFile = File(...),
    title: str = Form(None),
    keywords: str = Form(None),  # JSON string of list
    metadata: str = Form(None),  # JSON string
    chunk_size: int = Form(None),
    chunk_overlap: int = Form(None)
):
    """Upload and process a document.

    Pipeline: read bytes -> OCR (or plain-text bypass) -> clean -> create
    document record -> chunk -> embed + store each chunk -> mark completed.
    On any failure after the record exists, the document is marked "failed".

    Returns:
        Dict with success flag, document_id, title, total_chunks, message.

    Raises:
        HTTPException: 400 on bad input / extraction failure, 500 otherwise.
    """
    document_id = None
    try:
        # Parse form data (keywords/metadata arrive as JSON strings).
        keywords_list = json.loads(keywords) if keywords else []
        metadata_dict = json.loads(metadata) if metadata else {}

        # Set default title
        if not title:
            title = file.filename or "Untitled Document"

        # Read file content
        logger.info(f"Processing uploaded file: {file.filename} ({file.content_type})")
        file_bytes = await file.read()

        if not file_bytes or len(file_bytes) == 0:
            raise HTTPException(status_code=400, detail="Empty file uploaded")

        if len(file_bytes) > config.MAX_FILE_SIZE:
            raise HTTPException(status_code=400, detail="File too large")

        # Process with OCR (plain-text files are handled inline there).
        logger.info(f"Processing document with OCR: {title}")
        ocr_result = await process_with_ocr(file_bytes=file_bytes, filename=file.filename)

        if not ocr_result.get('success', False):
            error_msg = ocr_result.get('error', 'Unknown OCR error')
            logger.error(f"OCR processing failed: {error_msg}")
            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")

        # Extract text content
        content = ocr_result.get('content', '')
        if not content or not content.strip():
            raise HTTPException(status_code=400, detail="No text content extracted from document")

        # Clean the text
        cleaned_content = clean_text(content)

        if not cleaned_content or len(cleaned_content.strip()) == 0:
            raise HTTPException(status_code=400, detail="No text content after cleaning")

        # Allow shorter content for testing (warn only, do not reject).
        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
            logger.warning(f"Content is short ({len(cleaned_content)} chars) but processing anyway")

        # Create document record (status starts as "processing").
        document_id = await create_document_record(
            title=title,
            source_type='file_upload',
            keywords=keywords_list,
            metadata={
                **metadata_dict,
                'filename': file.filename,
                'content_type': file.content_type,
                'file_size': len(file_bytes),
                'ocr_pages': len(ocr_result.get('pages', []))
            }
        )

        # Create text chunks (None sizes fall back to config defaults
        # inside create_text_chunks -- presumably; confirm in that helper).
        chunks = create_text_chunks(
            cleaned_content,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        if not chunks:
            raise HTTPException(status_code=400, detail="No valid chunks created from document")

        # Process chunks and generate embeddings.  Individual chunk
        # failures are logged and skipped; only total failure aborts.
        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")

        successful_chunks = 0
        for i, chunk_content in enumerate(chunks):
            try:
                if not chunk_content or len(chunk_content.strip()) < 10:
                    logger.warning(f"Skipping chunk {i} - too small")
                    continue

                # Generate embedding
                embedding = await generate_embedding(chunk_content)

                # Store chunk
                await store_document_chunk(
                    document_id=document_id,
                    content=chunk_content,
                    chunk_index=i,
                    embedding=embedding,
                    metadata={
                        'chunk_size': len(chunk_content),
                        'position': i
                    }
                )

                successful_chunks += 1

            except Exception as e:
                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
                continue

        if successful_chunks == 0:
            await update_document_status(document_id, "failed")
            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")

        # Update document status
        await update_document_status(document_id, "completed", successful_chunks)

        logger.info(f"Document {document_id} processed successfully with {successful_chunks} chunks")

        return {
            "success": True,
            "document_id": document_id,
            "title": title,
            "total_chunks": successful_chunks,
            "message": "Document processed successfully"
        }

    except HTTPException:
        # Best-effort: mark the partially created document as failed.
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except:
                pass
        raise
    except Exception as e:
        # Best-effort: mark the partially created document as failed.
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except:
                pass

        logger.error(f"Unexpected error processing document: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Document processing failed: {e}")
|
| 1022 |
+
|
| 1023 |
+
@app.post("/documents/url")
async def process_url(request: URLProcessRequest):
    """Process document from URL.

    Same pipeline as /documents/upload, but the OCR service fetches the
    URL itself: OCR -> clean -> create record -> chunk -> embed + store ->
    mark completed.  On failure after record creation, the document is
    marked "failed".

    Returns:
        Dict with success flag, document_id, title, total_chunks,
        source_url, message.

    Raises:
        HTTPException: 400 on extraction/validation failure, 500 otherwise.
    """
    document_id = None
    try:
        url_str = str(request.url)

        # Set default title from the URL's host.
        title = request.title or f"Document from {urlparse(url_str).netloc}"

        # Process with OCR (service downloads the URL itself).
        logger.info(f"Processing URL with OCR: {url_str}")
        ocr_result = await process_with_ocr(url=url_str, extract_images=request.extract_images)

        if not ocr_result.get('success', False):
            error_msg = ocr_result.get('error', 'Unknown OCR error')
            logger.error(f"OCR processing failed for URL: {error_msg}")
            raise HTTPException(status_code=400, detail=f"OCR processing failed: {error_msg}")

        # Extract text content
        content = ocr_result.get('content', '')
        if not content or not content.strip():
            raise HTTPException(status_code=400, detail="No text content extracted from URL")

        # Clean the text
        cleaned_content = clean_text(content)

        if not cleaned_content or len(cleaned_content.strip()) == 0:
            raise HTTPException(status_code=400, detail="No text content after cleaning")

        # Allow shorter content for testing (warn only, do not reject).
        if len(cleaned_content.strip()) < config.MIN_CHUNK_SIZE:
            logger.warning(f"URL content is short ({len(cleaned_content)} chars) but processing anyway")

        # Create document record (status starts as "processing").
        document_id = await create_document_record(
            title=title,
            source_type=ocr_result.get('source_type', 'url'),
            source_url=url_str,
            keywords=request.keywords or [],
            metadata={
                **(request.metadata or {}),
                'url': url_str,
                'extract_images': request.extract_images,
                'ocr_pages': len(ocr_result.get('pages', []))
            }
        )

        # Create text chunks
        chunks = create_text_chunks(
            cleaned_content,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap
        )

        if not chunks:
            raise HTTPException(status_code=400, detail="No valid chunks created from URL content")

        # Process chunks and generate embeddings.  Individual chunk
        # failures are logged and skipped; only total failure aborts.
        logger.info(f"Processing {len(chunks)} chunks for document {document_id}")

        successful_chunks = 0
        for i, chunk_content in enumerate(chunks):
            try:
                if not chunk_content or len(chunk_content.strip()) < 10:
                    logger.warning(f"Skipping chunk {i} - too small")
                    continue

                # Generate embedding
                embedding = await generate_embedding(chunk_content)

                # Store chunk
                await store_document_chunk(
                    document_id=document_id,
                    content=chunk_content,
                    chunk_index=i,
                    embedding=embedding,
                    metadata={
                        'chunk_size': len(chunk_content),
                        'position': i
                    }
                )

                successful_chunks += 1

            except Exception as e:
                logger.error(f"Failed to process chunk {i} for document {document_id}: {e}")
                continue

        if successful_chunks == 0:
            await update_document_status(document_id, "failed")
            raise HTTPException(status_code=500, detail="No chunks could be processed successfully")

        # Update document status
        await update_document_status(document_id, "completed", successful_chunks)

        logger.info(f"URL document {document_id} processed successfully with {successful_chunks} chunks")

        return {
            "success": True,
            "document_id": document_id,
            "title": title,
            "total_chunks": successful_chunks,
            "source_url": url_str,
            "message": "URL processed successfully"
        }

    except HTTPException:
        # Best-effort: mark the partially created document as failed.
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except:
                pass
        raise
    except Exception as e:
        # Best-effort: mark the partially created document as failed.
        if document_id:
            try:
                await update_document_status(document_id, "failed")
            except:
                pass

        logger.error(f"Unexpected error processing URL: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"URL processing failed: {e}")
|
| 1147 |
+
|
| 1148 |
+
@app.post("/search", response_model=SearchResponse)
|
| 1149 |
+
async def search_documents(request: SearchRequest):
|
| 1150 |
+
"""Search documents using vector similarity"""
|
| 1151 |
+
try:
|
| 1152 |
+
import time
|
| 1153 |
+
start_time = time.time()
|
| 1154 |
+
|
| 1155 |
+
# Validate input
|
| 1156 |
+
if not request.query or not request.query.strip():
|
| 1157 |
+
raise HTTPException(status_code=400, detail="Query cannot be empty")
|
| 1158 |
+
|
| 1159 |
+
query_text = request.query.strip()
|
| 1160 |
+
logger.info(f"Performing vector search for query: '{query_text}'")
|
| 1161 |
+
|
| 1162 |
+
# Generate embedding for query
|
| 1163 |
+
try:
|
| 1164 |
+
query_embedding = await generate_embedding(query_text)
|
| 1165 |
+
except Exception as e:
|
| 1166 |
+
logger.error(f"Failed to generate query embedding: {e}")
|
| 1167 |
+
raise HTTPException(status_code=500, detail=f"Query embedding generation failed: {e}")
|
| 1168 |
+
|
| 1169 |
+
# Search for similar chunks
|
| 1170 |
+
try:
|
| 1171 |
+
results = await search_similar_chunks(
|
| 1172 |
+
query_embedding=query_embedding,
|
| 1173 |
+
limit=request.limit,
|
| 1174 |
+
similarity_threshold=request.similarity_threshold,
|
| 1175 |
+
filter_metadata=request.filter_metadata
|
| 1176 |
+
)
|
| 1177 |
+
except Exception as e:
|
| 1178 |
+
logger.error(f"Vector search failed: {e}")
|
| 1179 |
+
raise HTTPException(status_code=500, detail=f"Vector search failed: {e}")
|
| 1180 |
+
|
| 1181 |
+
# Format results
|
| 1182 |
+
search_results = []
|
| 1183 |
+
for result in results:
|
| 1184 |
+
try:
|
| 1185 |
+
chunk = DocumentChunk(
|
| 1186 |
+
id=result['chunk_id'],
|
| 1187 |
+
document_id=result['document_id'],
|
| 1188 |
+
content=result['content'],
|
| 1189 |
+
chunk_index=result['chunk_index'],
|
| 1190 |
+
metadata=result['chunk_metadata'],
|
| 1191 |
+
created_at=result['created_at']
|
| 1192 |
+
)
|
| 1193 |
+
|
| 1194 |
+
search_results.append(SearchResult(
|
| 1195 |
+
chunk=chunk,
|
| 1196 |
+
similarity_score=result['similarity_score'],
|
| 1197 |
+
document_info={
|
| 1198 |
+
'title': result['document_title'],
|
| 1199 |
+
'source_type': result['source_type'],
|
| 1200 |
+
'source_url': result['source_url'],
|
| 1201 |
+
'keywords': result['keywords'],
|
| 1202 |
+
'metadata': result['document_metadata']
|
| 1203 |
+
}
|
| 1204 |
+
))
|
| 1205 |
+
except Exception as result_error:
|
| 1206 |
+
logger.error(f"Error formatting search result: {result_error}")
|
| 1207 |
+
continue
|
| 1208 |
+
|
| 1209 |
+
processing_time = time.time() - start_time
|
| 1210 |
+
|
| 1211 |
+
logger.info(f"Search completed: {len(search_results)} results in {processing_time:.3f}s")
|
| 1212 |
+
|
| 1213 |
+
return SearchResponse(
|
| 1214 |
+
query=request.query,
|
| 1215 |
+
results=search_results,
|
| 1216 |
+
total_results=len(search_results),
|
| 1217 |
+
processing_time=processing_time
|
| 1218 |
+
)
|
| 1219 |
+
|
| 1220 |
+
except HTTPException:
|
| 1221 |
+
raise
|
| 1222 |
+
except Exception as e:
|
| 1223 |
+
logger.error(f"Search failed with unexpected error: {e}")
|
| 1224 |
+
logger.error(traceback.format_exc())
|
| 1225 |
+
raise HTTPException(status_code=500, detail=f"Search failed: {e}")
|
| 1226 |
+
|
| 1227 |
+
@app.get("/documents")
|
| 1228 |
+
async def list_documents(
|
| 1229 |
+
limit: int = Query(10, ge=1, le=100),
|
| 1230 |
+
offset: int = Query(0, ge=0),
|
| 1231 |
+
status: str = Query(None)
|
| 1232 |
+
):
|
| 1233 |
+
"""List documents with pagination"""
|
| 1234 |
+
conn = await get_db_connection()
|
| 1235 |
+
try:
|
| 1236 |
+
# Build query
|
| 1237 |
+
base_query = """
|
| 1238 |
+
SELECT id, title, source_type, source_url, keywords, metadata,
|
| 1239 |
+
created_at, processing_status, total_chunks
|
| 1240 |
+
FROM documents
|
| 1241 |
+
"""
|
| 1242 |
+
|
| 1243 |
+
params = []
|
| 1244 |
+
if status:
|
| 1245 |
+
base_query += " WHERE processing_status = $1"
|
| 1246 |
+
params.append(status)
|
| 1247 |
+
|
| 1248 |
+
base_query += " ORDER BY created_at DESC LIMIT $" + str(len(params) + 1) + " OFFSET $" + str(len(params) + 2)
|
| 1249 |
+
params.extend([limit, offset])
|
| 1250 |
+
|
| 1251 |
+
rows = await conn.fetch(base_query, *params)
|
| 1252 |
+
|
| 1253 |
+
documents = []
|
| 1254 |
+
for row in rows:
|
| 1255 |
+
documents.append({
|
| 1256 |
+
'id': str(row['id']),
|
| 1257 |
+
'title': row['title'],
|
| 1258 |
+
'source_type': row['source_type'],
|
| 1259 |
+
'source_url': row['source_url'],
|
| 1260 |
+
'keywords': row['keywords'],
|
| 1261 |
+
'metadata': json.loads(row['metadata']) if row['metadata'] else {},
|
| 1262 |
+
'created_at': row['created_at'].isoformat(),
|
| 1263 |
+
'processing_status': row['processing_status'],
|
| 1264 |
+
'total_chunks': row['total_chunks']
|
| 1265 |
+
})
|
| 1266 |
+
|
| 1267 |
+
# Get total count
|
| 1268 |
+
count_query = "SELECT COUNT(*) FROM documents"
|
| 1269 |
+
if status:
|
| 1270 |
+
count_query += " WHERE processing_status = $1"
|
| 1271 |
+
total_count = await conn.fetchval(count_query, status)
|
| 1272 |
+
else:
|
| 1273 |
+
total_count = await conn.fetchval(count_query)
|
| 1274 |
+
|
| 1275 |
+
return {
|
| 1276 |
+
"documents": documents,
|
| 1277 |
+
"total": total_count,
|
| 1278 |
+
"limit": limit,
|
| 1279 |
+
"offset": offset
|
| 1280 |
+
}
|
| 1281 |
+
|
| 1282 |
+
finally:
|
| 1283 |
+
await release_db_connection(conn)
|
| 1284 |
+
|
| 1285 |
+
@app.get("/documents/{document_id}")
|
| 1286 |
+
async def get_document(document_id: str):
|
| 1287 |
+
"""Get document details"""
|
| 1288 |
+
conn = await get_db_connection()
|
| 1289 |
+
try:
|
| 1290 |
+
# Get document
|
| 1291 |
+
doc_row = await conn.fetchrow("""
|
| 1292 |
+
SELECT id, title, source_type, source_url, keywords, metadata,
|
| 1293 |
+
created_at, processing_status, total_chunks
|
| 1294 |
+
FROM documents WHERE id = $1
|
| 1295 |
+
""", document_id)
|
| 1296 |
+
|
| 1297 |
+
if not doc_row:
|
| 1298 |
+
raise HTTPException(status_code=404, detail="Document not found")
|
| 1299 |
+
|
| 1300 |
+
# Get chunks
|
| 1301 |
+
chunk_rows = await conn.fetch("""
|
| 1302 |
+
SELECT id, content, chunk_index, metadata, created_at
|
| 1303 |
+
FROM document_chunks
|
| 1304 |
+
WHERE document_id = $1
|
| 1305 |
+
ORDER BY chunk_index
|
| 1306 |
+
""", document_id)
|
| 1307 |
+
|
| 1308 |
+
return {
|
| 1309 |
+
'id': str(doc_row['id']),
|
| 1310 |
+
'title': doc_row['title'],
|
| 1311 |
+
'source_type': doc_row['source_type'],
|
| 1312 |
+
'source_url': doc_row['source_url'],
|
| 1313 |
+
'keywords': doc_row['keywords'],
|
| 1314 |
+
'metadata': json.loads(doc_row['metadata']) if doc_row['metadata'] else {},
|
| 1315 |
+
'created_at': doc_row['created_at'].isoformat(),
|
| 1316 |
+
'processing_status': doc_row['processing_status'],
|
| 1317 |
+
'total_chunks': doc_row['total_chunks'],
|
| 1318 |
+
'chunks': [
|
| 1319 |
+
{
|
| 1320 |
+
'id': str(chunk['id']),
|
| 1321 |
+
'content': chunk['content'],
|
| 1322 |
+
'chunk_index': chunk['chunk_index'],
|
| 1323 |
+
'metadata': json.loads(chunk['metadata']) if chunk['metadata'] else {},
|
| 1324 |
+
'created_at': chunk['created_at'].isoformat()
|
| 1325 |
+
}
|
| 1326 |
+
for chunk in chunk_rows
|
| 1327 |
+
]
|
| 1328 |
+
}
|
| 1329 |
+
|
| 1330 |
+
finally:
|
| 1331 |
+
await release_db_connection(conn)
|
| 1332 |
+
|
| 1333 |
+
@app.delete("/documents/{document_id}")
|
| 1334 |
+
async def delete_document(document_id: str):
|
| 1335 |
+
"""Delete document and its chunks"""
|
| 1336 |
+
conn = await get_db_connection()
|
| 1337 |
+
try:
|
| 1338 |
+
# Check if document exists
|
| 1339 |
+
exists = await conn.fetchval("SELECT EXISTS(SELECT 1 FROM documents WHERE id = $1)", document_id)
|
| 1340 |
+
if not exists:
|
| 1341 |
+
raise HTTPException(status_code=404, detail="Document not found")
|
| 1342 |
+
|
| 1343 |
+
# Delete chunks first (foreign key constraint)
|
| 1344 |
+
await conn.execute("DELETE FROM document_chunks WHERE document_id = $1", document_id)
|
| 1345 |
+
|
| 1346 |
+
# Delete document
|
| 1347 |
+
await conn.execute("DELETE FROM documents WHERE id = $1", document_id)
|
| 1348 |
+
|
| 1349 |
+
return {"message": "Document deleted successfully"}
|
| 1350 |
+
|
| 1351 |
+
finally:
|
| 1352 |
+
await release_db_connection(conn)
|
| 1353 |
+
|
| 1354 |
+
if __name__ == "__main__":
|
| 1355 |
+
print("π§ Loading RAG service configuration...")
|
| 1356 |
+
print(f"π Will start server on {config.HOST}:{config.PORT}")
|
| 1357 |
+
print(f"ποΈ Database: {config.PG_HOST}:{config.PG_PORT}/{config.PG_DATABASE}")
|
| 1358 |
+
print(f"π€ Azure OpenAI: {'β
Configured' if config.AZURE_OPENAI_ENDPOINT else 'β Not configured'}")
|
| 1359 |
+
print(f"π OCR Service: {config.OCR_SERVICE_URL}")
|
| 1360 |
+
|
| 1361 |
+
uvicorn.run(
|
| 1362 |
+
"rag_service:app",
|
| 1363 |
+
host=config.HOST,
|
| 1364 |
+
port=config.PORT,
|
| 1365 |
+
reload=config.DEBUG,
|
| 1366 |
+
log_level="info"
|
| 1367 |
+
)
|