Spaces:
Sleeping
Sleeping
Commit
·
8c0cf48
0
Parent(s):
Initial clean commit: purge history (remove .env and venv from history)
Browse files- .DS_Store +0 -0
- .gitignore +189 -0
- AGENTS.md +534 -0
- Dockerfile +43 -0
- INSTRUCTION.md +593 -0
- README.md +261 -0
- app.py +1244 -0
- core/modules.py +416 -0
- examples/usage_examples.py +1159 -0
- llm/gemini_connector.py +721 -0
- requirements.txt +43 -0
- spaces_metadata.yaml +77 -0
- utils/deployment.py +609 -0
- visualization/analytics_engine.py +1393 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
*.egg-info/
|
| 24 |
+
.installed.cfg
|
| 25 |
+
*.egg
|
| 26 |
+
|
| 27 |
+
# PyInstaller
|
| 28 |
+
*.manifest
|
| 29 |
+
*.spec
|
| 30 |
+
|
| 31 |
+
# Installer logs
|
| 32 |
+
pip-log.txt
|
| 33 |
+
pip-delete-this-directory.txt
|
| 34 |
+
|
| 35 |
+
# Unit test / coverage reports
|
| 36 |
+
htmlcov/
|
| 37 |
+
.tox/
|
| 38 |
+
.coverage
|
| 39 |
+
.coverage.*
|
| 40 |
+
.cache
|
| 41 |
+
nosetests.xml
|
| 42 |
+
coverage.xml
|
| 43 |
+
*.cover
|
| 44 |
+
.hypothesis/
|
| 45 |
+
.pytest_cache/
|
| 46 |
+
|
| 47 |
+
# Translations
|
| 48 |
+
*.mo
|
| 49 |
+
*.pot
|
| 50 |
+
|
| 51 |
+
# Django stuff:
|
| 52 |
+
*.log
|
| 53 |
+
local_settings.py
|
| 54 |
+
db.sqlite3
|
| 55 |
+
|
| 56 |
+
# Flask stuff:
|
| 57 |
+
instance/
|
| 58 |
+
.webassets-cache
|
| 59 |
+
|
| 60 |
+
# Scrapy stuff:
|
| 61 |
+
.scrapy
|
| 62 |
+
|
| 63 |
+
# Sphinx documentation
|
| 64 |
+
docs/_build/
|
| 65 |
+
|
| 66 |
+
# PyBuilder
|
| 67 |
+
target/
|
| 68 |
+
|
| 69 |
+
# Jupyter Notebook
|
| 70 |
+
.ipynb_checkpoints
|
| 71 |
+
|
| 72 |
+
# pyenv
|
| 73 |
+
.python-version
|
| 74 |
+
|
| 75 |
+
# celery beat schedule file
|
| 76 |
+
celerybeat-schedule
|
| 77 |
+
|
| 78 |
+
# SageMath parsed files
|
| 79 |
+
*.sage.py
|
| 80 |
+
|
| 81 |
+
# Environments
|
| 82 |
+
.env
|
| 83 |
+
.venv
|
| 84 |
+
env/
|
| 85 |
+
venv/
|
| 86 |
+
ENV/
|
| 87 |
+
env.bak/
|
| 88 |
+
venv.bak/
|
| 89 |
+
|
| 90 |
+
# Spyder project settings
|
| 91 |
+
.spyderproject
|
| 92 |
+
.spyproject
|
| 93 |
+
|
| 94 |
+
# Rope project settings
|
| 95 |
+
.ropeproject
|
| 96 |
+
|
| 97 |
+
# mkdocs documentation
|
| 98 |
+
/site
|
| 99 |
+
|
| 100 |
+
# mypy
|
| 101 |
+
.mypy_cache/
|
| 102 |
+
.dmypy.json
|
| 103 |
+
dmypy.json
|
| 104 |
+
|
| 105 |
+
# Pyre type checker
|
| 106 |
+
.pyre/
|
| 107 |
+
|
| 108 |
+
# VS Code
|
| 109 |
+
.vscode/*
|
| 110 |
+
!.vscode/settings.json
|
| 111 |
+
!.vscode/tasks.json
|
| 112 |
+
!.vscode/launch.json
|
| 113 |
+
!.vscode/extensions.json
|
| 114 |
+
|
| 115 |
+
# PyCharm
|
| 116 |
+
.idea/
|
| 117 |
+
*.iml
|
| 118 |
+
*.ipr
|
| 119 |
+
*.iws
|
| 120 |
+
|
| 121 |
+
# macOS
|
| 122 |
+
.DS_Store
|
| 123 |
+
.AppleDouble
|
| 124 |
+
.LSOverride
|
| 125 |
+
|
| 126 |
+
# Windows
|
| 127 |
+
Thumbs.db
|
| 128 |
+
ehthumbs.db
|
| 129 |
+
Desktop.ini
|
| 130 |
+
$RECYCLE.BIN/
|
| 131 |
+
|
| 132 |
+
# Logs and databases
|
| 133 |
+
*.log
|
| 134 |
+
*.sql
|
| 135 |
+
*.sqlite
|
| 136 |
+
|
| 137 |
+
# Local development settings
|
| 138 |
+
.env.local
|
| 139 |
+
.env.development.local
|
| 140 |
+
.env.test.local
|
| 141 |
+
.env.production.local
|
| 142 |
+
|
| 143 |
+
# Local configuration
|
| 144 |
+
config/local/
|
| 145 |
+
|
| 146 |
+
# Temporary files
|
| 147 |
+
*.swp
|
| 148 |
+
*.swo
|
| 149 |
+
*~
|
| 150 |
+
|
| 151 |
+
# Project specific
|
| 152 |
+
instance/
|
| 153 |
+
.webassets-cache
|
| 154 |
+
.pytest_cache/
|
| 155 |
+
.coverage
|
| 156 |
+
htmlcov/
|
| 157 |
+
|
| 158 |
+
# Project dependencies
|
| 159 |
+
node_modules/
|
| 160 |
+
|
| 161 |
+
# Build files
|
| 162 |
+
build/
|
| 163 |
+
dist/
|
| 164 |
+
*.egg-info/
|
| 165 |
+
|
| 166 |
+
# Virtual Environment
|
| 167 |
+
venv/
|
| 168 |
+
env/
|
| 169 |
+
|
| 170 |
+
# Jupyter Notebook
|
| 171 |
+
.ipynb_checkpoints
|
| 172 |
+
*/.ipynb_checkpoints/*
|
| 173 |
+
|
| 174 |
+
# VS Code
|
| 175 |
+
.vscode/
|
| 176 |
+
!.vscode/settings.json
|
| 177 |
+
!.vscode/tasks.json
|
| 178 |
+
!.vscode/launch.json
|
| 179 |
+
!.vscode/extensions.json
|
| 180 |
+
|
| 181 |
+
# IDE specific files
|
| 182 |
+
.idea/
|
| 183 |
+
*.iml
|
| 184 |
+
*.ipr
|
| 185 |
+
*.iws
|
| 186 |
+
|
| 187 |
+
# System Files
|
| 188 |
+
.DS_Store
|
| 189 |
+
Thumbs.db
|
AGENTS.md
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Strategic Architectural Revision: Hugging Face Optimized MarkItDown Platform
|
| 2 |
+
|
| 3 |
+
## Core Design Philosophy Adaptation
|
| 4 |
+
|
| 5 |
+
**"Simplicity scales better than sophistication on shared infrastructure"**
|
| 6 |
+
|
| 7 |
+
### Revised Architectural Principles for HF Deployment:
|
| 8 |
+
- **Stateless by Design**: Zero persistence complexity for shared hosting
|
| 9 |
+
- **Memory-Efficient Processing**: Optimized for HF Spaces resource constraints
|
| 10 |
+
- **Cloud-Native Integration**: Seamless Gemini API integration patterns
|
| 11 |
+
- **Progressive Feature Disclosure**: Core functionality first, advanced features as additive layers
|
| 12 |
+
|
| 13 |
+
## Phase 1: Simplified System Architecture
|
| 14 |
+
|
| 15 |
+
### 🏗️ **HF-Optimized Architecture Overview**
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 19 |
+
│ GRADIO INTERFACE LAYER │
|
| 20 |
+
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
|
| 21 |
+
│ │ Upload │ │ Process │ │ Analyze │ │ Compare │ │ Export │ │
|
| 22 |
+
│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │
|
| 23 |
+
├─────────────────────────────────────────────────────────────┤
|
| 24 |
+
│ STATELESS PROCESSING LAYER │
|
| 25 |
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
| 26 |
+
│ │ File Handler│ │ Conversion │ │ LLM Gateway │ │
|
| 27 |
+
│ │ Module │ │ Engine │ │ (Gemini) │ │
|
| 28 |
+
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
| 29 |
+
├─────────────────────────────────────────────────────────────┤
|
| 30 |
+
│ IN-MEMORY STATE MANAGEMENT │
|
| 31 |
+
│ Session Variables + Gradio State + Temp Storage │
|
| 32 |
+
└─────────────────────────────────────────────────────────────┘
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### 🔧 **Simplified Core Modules**
|
| 36 |
+
|
| 37 |
+
#### **1. Stateless File Handler**
|
| 38 |
+
```python
|
| 39 |
+
class StreamlineFileHandler:
|
| 40 |
+
"""Memory-efficient, HF-optimized file processing"""
|
| 41 |
+
|
| 42 |
+
@staticmethod
|
| 43 |
+
def process_upload(file_obj):
|
| 44 |
+
"""Direct stream processing without disk persistence"""
|
| 45 |
+
return {
|
| 46 |
+
'content': file_obj.read(),
|
| 47 |
+
'metadata': extract_minimal_metadata(file_obj),
|
| 48 |
+
'format': detect_format(file_obj.name)
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
@staticmethod
|
| 52 |
+
def validate_constraints(file_obj):
|
| 53 |
+
"""HF Spaces resource-aware validation"""
|
| 54 |
+
# Max file size: 50MB for free tier
|
| 55 |
+
# Supported formats: PDF, DOCX, PPTX, TXT, HTML
|
| 56 |
+
pass
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
#### **2. Conversion Engine Adapter**
|
| 60 |
+
```python
|
| 61 |
+
class HFConversionEngine:
|
| 62 |
+
"""MarkItDown wrapper optimized for stateless execution"""
|
| 63 |
+
|
| 64 |
+
def __init__(self):
|
| 65 |
+
self.md = MarkItDown()
|
| 66 |
+
self.temp_cleanup_queue = []
|
| 67 |
+
|
| 68 |
+
async def convert_stream(self, file_data, config=None):
|
| 69 |
+
"""Stream-based conversion with automatic cleanup"""
|
| 70 |
+
try:
|
| 71 |
+
# Process in memory where possible
|
| 72 |
+
result = await self._process_with_cleanup(file_data)
|
| 73 |
+
return self._format_response(result)
|
| 74 |
+
finally:
|
| 75 |
+
self._cleanup_temp_files()
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
#### **3. Gemini LLM Gateway**
|
| 79 |
+
```python
|
| 80 |
+
class GeminiConnector:
|
| 81 |
+
"""Streamlined Gemini API integration"""
|
| 82 |
+
|
| 83 |
+
def __init__(self, api_key=None):
|
| 84 |
+
self.client = self._init_gemini_client(api_key)
|
| 85 |
+
self.models = {
|
| 86 |
+
'analysis': 'gemini-1.5-pro',
|
| 87 |
+
'summary': 'gemini-1.5-flash',
|
| 88 |
+
'vision': 'gemini-1.5-pro-vision'
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
async def analyze_content(self, markdown_content, task_type='analysis'):
|
| 92 |
+
"""Unified Gemini analysis interface"""
|
| 93 |
+
prompt = self._build_analysis_prompt(markdown_content, task_type)
|
| 94 |
+
response = await self.client.generate_content(
|
| 95 |
+
model=self.models[task_type],
|
| 96 |
+
contents=prompt
|
| 97 |
+
)
|
| 98 |
+
return self._parse_gemini_response(response)
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Phase 2: Gradio Interface Strategy
|
| 102 |
+
|
| 103 |
+
### 📱 **HF Spaces Optimized UI Design**
|
| 104 |
+
|
| 105 |
+
#### **Single-Page Progressive Enhancement:**
|
| 106 |
+
|
| 107 |
+
```python
|
| 108 |
+
def create_markitdown_interface():
|
| 109 |
+
"""Main interface factory with progressive complexity"""
|
| 110 |
+
|
| 111 |
+
with gr.Blocks(
|
| 112 |
+
title="MarkItDown Testing Platform",
|
| 113 |
+
theme=gr.themes.Soft(),
|
| 114 |
+
css=custom_hf_styles
|
| 115 |
+
) as interface:
|
| 116 |
+
|
| 117 |
+
# State management for stateless environment
|
| 118 |
+
session_state = gr.State({})
|
| 119 |
+
conversion_results = gr.State({})
|
| 120 |
+
|
| 121 |
+
with gr.Row():
|
| 122 |
+
with gr.Column(scale=1):
|
| 123 |
+
# LEFT: Input & Configuration
|
| 124 |
+
file_upload = gr.File(
|
| 125 |
+
label="Upload Document",
|
| 126 |
+
file_types=['.pdf', '.docx', '.pptx', '.txt', '.html'],
|
| 127 |
+
type="binary"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# Gemini Configuration
|
| 131 |
+
with gr.Accordion("🔧 LLM Configuration", open=False):
|
| 132 |
+
gemini_key = gr.Textbox(
|
| 133 |
+
label="Gemini API Key",
|
| 134 |
+
type="password",
|
| 135 |
+
placeholder="Enter your Gemini API key..."
|
| 136 |
+
)
|
| 137 |
+
analysis_type = gr.Dropdown(
|
| 138 |
+
choices=['Quality Analysis', 'Structure Review', 'Content Summary'],
|
| 139 |
+
value='Quality Analysis',
|
| 140 |
+
label="Analysis Type"
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
process_btn = gr.Button(
|
| 144 |
+
"🚀 Process Document",
|
| 145 |
+
variant="primary",
|
| 146 |
+
size="lg"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
with gr.Column(scale=2):
|
| 150 |
+
# RIGHT: Results & Analysis
|
| 151 |
+
with gr.Tabs() as results_tabs:
|
| 152 |
+
|
| 153 |
+
with gr.TabItem("📄 Conversion Results"):
|
| 154 |
+
conversion_status = gr.HTML()
|
| 155 |
+
|
| 156 |
+
with gr.Row():
|
| 157 |
+
with gr.Column():
|
| 158 |
+
gr.Markdown("### Original Preview")
|
| 159 |
+
original_preview = gr.HTML()
|
| 160 |
+
|
| 161 |
+
with gr.Column():
|
| 162 |
+
gr.Markdown("### Markdown Output")
|
| 163 |
+
markdown_output = gr.Code(
|
| 164 |
+
language="markdown",
|
| 165 |
+
show_label=False
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
with gr.TabItem("🤖 LLM Analysis"):
|
| 169 |
+
analysis_status = gr.HTML()
|
| 170 |
+
llm_analysis = gr.Markdown()
|
| 171 |
+
|
| 172 |
+
# Analysis metrics visualization
|
| 173 |
+
metrics_plot = gr.Plot()
|
| 174 |
+
|
| 175 |
+
with gr.TabItem("📊 Comparison Dashboard"):
|
| 176 |
+
quality_metrics = gr.JSON(label="Quality Metrics")
|
| 177 |
+
|
| 178 |
+
# Interactive comparison
|
| 179 |
+
comparison_viz = gr.HTML()
|
| 180 |
+
|
| 181 |
+
with gr.TabItem("💾 Export Options"):
|
| 182 |
+
export_format = gr.Dropdown(
|
| 183 |
+
choices=['Markdown (.md)', 'HTML (.html)', 'JSON Report (.json)'],
|
| 184 |
+
value='Markdown (.md)',
|
| 185 |
+
label="Export Format"
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
export_btn = gr.Button("📥 Download Results")
|
| 189 |
+
download_file = gr.File(visible=False)
|
| 190 |
+
|
| 191 |
+
# Event handlers with HF optimization
|
| 192 |
+
process_btn.click(
|
| 193 |
+
fn=process_document_pipeline,
|
| 194 |
+
inputs=[file_upload, gemini_key, analysis_type, session_state],
|
| 195 |
+
outputs=[conversion_status, markdown_output, original_preview, conversion_results],
|
| 196 |
+
show_progress=True
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
return interface
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
### 🔄 **Stateless Processing Pipeline**
|
| 203 |
+
|
| 204 |
+
```python
|
| 205 |
+
async def process_document_pipeline(file_obj, gemini_key, analysis_type, session_state):
|
| 206 |
+
"""Main processing pipeline optimized for HF Spaces"""
|
| 207 |
+
|
| 208 |
+
pipeline_state = {
|
| 209 |
+
'timestamp': datetime.now().isoformat(),
|
| 210 |
+
'file_info': {},
|
| 211 |
+
'conversion_result': {},
|
| 212 |
+
'analysis_result': {},
|
| 213 |
+
'metrics': {}
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
try:
|
| 217 |
+
# Stage 1: File Processing
|
| 218 |
+
yield gr.HTML("🔄 Processing uploaded file..."), "", "", pipeline_state
|
| 219 |
+
|
| 220 |
+
file_handler = StreamlineFileHandler()
|
| 221 |
+
file_data = file_handler.process_upload(file_obj)
|
| 222 |
+
pipeline_state['file_info'] = file_data['metadata']
|
| 223 |
+
|
| 224 |
+
# Stage 2: MarkItDown Conversion
|
| 225 |
+
yield gr.HTML("🔄 Converting to Markdown..."), "", "", pipeline_state
|
| 226 |
+
|
| 227 |
+
converter = HFConversionEngine()
|
| 228 |
+
conversion_result = await converter.convert_stream(file_data)
|
| 229 |
+
pipeline_state['conversion_result'] = conversion_result
|
| 230 |
+
|
| 231 |
+
# Stage 3: Gemini Analysis (if API key provided)
|
| 232 |
+
if gemini_key and gemini_key.strip():
|
| 233 |
+
yield gr.HTML("🤖 Analyzing with Gemini..."), conversion_result['markdown'], "", pipeline_state
|
| 234 |
+
|
| 235 |
+
gemini = GeminiConnector(gemini_key)
|
| 236 |
+
analysis = await gemini.analyze_content(
|
| 237 |
+
conversion_result['markdown'],
|
| 238 |
+
analysis_type.lower().replace(' ', '_')
|
| 239 |
+
)
|
| 240 |
+
pipeline_state['analysis_result'] = analysis
|
| 241 |
+
|
| 242 |
+
# Stage 4: Generate Visualization Metrics
|
| 243 |
+
metrics = generate_quality_metrics(pipeline_state)
|
| 244 |
+
pipeline_state['metrics'] = metrics
|
| 245 |
+
|
| 246 |
+
# Final Results
|
| 247 |
+
yield (
|
| 248 |
+
gr.HTML("✅ Processing complete!"),
|
| 249 |
+
conversion_result['markdown'],
|
| 250 |
+
generate_original_preview(file_data),
|
| 251 |
+
pipeline_state
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
except Exception as e:
|
| 255 |
+
yield (
|
| 256 |
+
gr.HTML(f"❌ Error: {str(e)}"),
|
| 257 |
+
"",
|
| 258 |
+
"",
|
| 259 |
+
pipeline_state
|
| 260 |
+
)
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
## Phase 3: Gemini Integration Strategy
|
| 264 |
+
|
| 265 |
+
### 🧠 **Multi-Model Gemini Architecture**
|
| 266 |
+
|
| 267 |
+
```python
|
| 268 |
+
class GeminiAnalysisEngine:
|
| 269 |
+
"""Sophisticated Gemini-powered analysis system"""
|
| 270 |
+
|
| 271 |
+
ANALYSIS_PROMPTS = {
|
| 272 |
+
'quality_analysis': """
|
| 273 |
+
Analyze the quality of this Markdown conversion from a document.
|
| 274 |
+
|
| 275 |
+
Focus on:
|
| 276 |
+
1. Structure preservation (headers, lists, tables)
|
| 277 |
+
2. Content completeness
|
| 278 |
+
3. Formatting accuracy
|
| 279 |
+
4. Information hierarchy
|
| 280 |
+
|
| 281 |
+
Provide a structured analysis with scores (1-10) and recommendations.
|
| 282 |
+
""",
|
| 283 |
+
|
| 284 |
+
'structure_review': """
|
| 285 |
+
Review the structural elements of this converted Markdown document.
|
| 286 |
+
|
| 287 |
+
Identify:
|
| 288 |
+
1. Document hierarchy (H1, H2, H3, etc.)
|
| 289 |
+
2. Lists and their nesting
|
| 290 |
+
3. Tables and their formatting
|
| 291 |
+
4. Code blocks and special formatting
|
| 292 |
+
|
| 293 |
+
Create a structural map and quality assessment.
|
| 294 |
+
""",
|
| 295 |
+
|
| 296 |
+
'content_summary': """
|
| 297 |
+
Create a comprehensive summary of this document's content.
|
| 298 |
+
|
| 299 |
+
Include:
|
| 300 |
+
1. Main topics and themes
|
| 301 |
+
2. Key information points
|
| 302 |
+
3. Document purpose and audience
|
| 303 |
+
4. Content organization assessment
|
| 304 |
+
|
| 305 |
+
Provide both a brief summary and detailed breakdown.
|
| 306 |
+
"""
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
async def comprehensive_analysis(self, markdown_content, analysis_types=['quality_analysis']):
|
| 310 |
+
"""Execute multiple analysis types concurrently"""
|
| 311 |
+
|
| 312 |
+
tasks = []
|
| 313 |
+
for analysis_type in analysis_types:
|
| 314 |
+
task = self._single_analysis(markdown_content, analysis_type)
|
| 315 |
+
tasks.append(task)
|
| 316 |
+
|
| 317 |
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 318 |
+
|
| 319 |
+
return {
|
| 320 |
+
'analyses': dict(zip(analysis_types, results)),
|
| 321 |
+
'combined_score': self._calculate_combined_score(results),
|
| 322 |
+
'recommendations': self._generate_recommendations(results)
|
| 323 |
+
}
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
### 📊 **HF-Optimized Visualization Components**
|
| 327 |
+
|
| 328 |
+
```python
|
| 329 |
+
def create_analysis_visualization(analysis_results):
|
| 330 |
+
"""Generate interactive visualizations for HF Spaces"""
|
| 331 |
+
|
| 332 |
+
import plotly.graph_objects as go
|
| 333 |
+
import plotly.express as px
|
| 334 |
+
|
| 335 |
+
# Quality Score Radar Chart
|
| 336 |
+
def quality_radar_chart(scores):
|
| 337 |
+
categories = ['Structure', 'Completeness', 'Accuracy', 'Readability']
|
| 338 |
+
|
| 339 |
+
fig = go.Figure()
|
| 340 |
+
fig.add_trace(go.Scatterpolar(
|
| 341 |
+
r=list(scores.values()),
|
| 342 |
+
theta=categories,
|
| 343 |
+
fill='toself',
|
| 344 |
+
name='Quality Metrics'
|
| 345 |
+
))
|
| 346 |
+
|
| 347 |
+
fig.update_layout(
|
| 348 |
+
polar=dict(
|
| 349 |
+
radialaxis=dict(
|
| 350 |
+
visible=True,
|
| 351 |
+
range=[0, 10]
|
| 352 |
+
)),
|
| 353 |
+
showlegend=False,
|
| 354 |
+
title="Document Conversion Quality"
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
return fig
|
| 358 |
+
|
| 359 |
+
# Content Structure Tree
|
| 360 |
+
def structure_tree_viz(structure_data):
|
| 361 |
+
"""Hierarchical document structure visualization"""
|
| 362 |
+
# Implementation for interactive document structure
|
| 363 |
+
pass
|
| 364 |
+
|
| 365 |
+
return {
|
| 366 |
+
'quality_chart': quality_radar_chart(analysis_results.get('scores', {})),
|
| 367 |
+
'structure_viz': structure_tree_viz(analysis_results.get('structure', {}))
|
| 368 |
+
}
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
## Phase 4: HF Deployment Optimization
|
| 372 |
+
|
| 373 |
+
### 🚀 **Hugging Face Spaces Configuration**
|
| 374 |
+
|
| 375 |
+
#### **requirements.txt (Optimized)**
|
| 376 |
+
```txt
|
| 377 |
+
gradio>=4.0.0
|
| 378 |
+
markitdown[all]>=0.1.0
|
| 379 |
+
google-generativeai>=0.3.0
|
| 380 |
+
plotly>=5.0.0
|
| 381 |
+
python-multipart>=0.0.6
|
| 382 |
+
aiofiles>=22.0.0
|
| 383 |
+
Pillow>=9.0.0
|
| 384 |
+
|
| 385 |
+
# Lightweight alternatives for HF
|
| 386 |
+
pandas>=1.3.0
|
| 387 |
+
numpy>=1.21.0
|
| 388 |
+
```
|
| 389 |
+
|
| 390 |
+
#### **app.py (Entry Point)**
|
| 391 |
+
```python
|
| 392 |
+
import gradio as gr
|
| 393 |
+
import asyncio
|
| 394 |
+
import os
|
| 395 |
+
from markitdown_platform import create_markitdown_interface
|
| 396 |
+
|
| 397 |
+
# HF Spaces environment configuration
|
| 398 |
+
def setup_hf_environment():
|
| 399 |
+
"""Configure environment for HF Spaces deployment"""
|
| 400 |
+
|
| 401 |
+
# Set memory limits
|
| 402 |
+
os.environ['GRADIO_TEMP_DIR'] = '/tmp'
|
| 403 |
+
os.environ['MAX_FILE_SIZE'] = '50MB' # HF free tier limit
|
| 404 |
+
|
| 405 |
+
# Optimize for HF infrastructure
|
| 406 |
+
gr.set_static_paths(paths=["./assets/"])
|
| 407 |
+
|
| 408 |
+
def main():
|
| 409 |
+
"""Main application entry point"""
|
| 410 |
+
|
| 411 |
+
setup_hf_environment()
|
| 412 |
+
|
| 413 |
+
# Create optimized interface
|
| 414 |
+
interface = create_markitdown_interface()
|
| 415 |
+
|
| 416 |
+
# HF Spaces optimized launch
|
| 417 |
+
interface.launch(
|
| 418 |
+
server_name="0.0.0.0",
|
| 419 |
+
server_port=7860,
|
| 420 |
+
share=False, # HF handles sharing
|
| 421 |
+
show_error=True,
|
| 422 |
+
max_file_size="50mb",
|
| 423 |
+
allowed_paths=["./temp/"],
|
| 424 |
+
show_tips=True,
|
| 425 |
+
enable_queue=True,
|
| 426 |
+
max_size=20 # Queue limit for free tier
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
if __name__ == "__main__":
|
| 430 |
+
main()
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
### 🔧 **Resource Management Strategy**
|
| 434 |
+
|
| 435 |
+
#### **Memory-Efficient Processing**
|
| 436 |
+
```python
|
| 437 |
+
class HFResourceManager:
|
| 438 |
+
"""Resource management for HF Spaces constraints"""
|
| 439 |
+
|
| 440 |
+
MAX_MEMORY_MB = 16 * 1024 # 16GB limit for HF Spaces
|
| 441 |
+
MAX_FILE_SIZE_MB = 50
|
| 442 |
+
MAX_CONCURRENT_PROCESSES = 3
|
| 443 |
+
|
| 444 |
+
@classmethod
|
| 445 |
+
def check_resource_constraints(cls, file_size_mb, current_memory_usage):
|
| 446 |
+
"""Validate resource availability before processing"""
|
| 447 |
+
|
| 448 |
+
if file_size_mb > cls.MAX_FILE_SIZE_MB:
|
| 449 |
+
raise ResourceError(f"File size {file_size_mb}MB exceeds limit {cls.MAX_FILE_SIZE_MB}MB")
|
| 450 |
+
|
| 451 |
+
if current_memory_usage > cls.MAX_MEMORY_MB * 0.8: # 80% threshold
|
| 452 |
+
raise ResourceError("Insufficient memory available")
|
| 453 |
+
|
| 454 |
+
return True
|
| 455 |
+
|
| 456 |
+
@staticmethod
|
| 457 |
+
def cleanup_temp_resources():
|
| 458 |
+
"""Aggressive cleanup for memory management"""
|
| 459 |
+
import gc
|
| 460 |
+
import tempfile
|
| 461 |
+
import shutil
|
| 462 |
+
|
| 463 |
+
# Force garbage collection
|
| 464 |
+
gc.collect()
|
| 465 |
+
|
| 466 |
+
# Clean temporary directories
|
| 467 |
+
temp_dir = tempfile.gettempdir()
|
| 468 |
+
for item in os.listdir(temp_dir):
|
| 469 |
+
if item.startswith('gradio_'):
|
| 470 |
+
shutil.rmtree(os.path.join(temp_dir, item), ignore_errors=True)
|
| 471 |
+
```
|
| 472 |
+
|
| 473 |
+
## Phase 5: Development Roadmap (HF-Optimized)
|
| 474 |
+
|
| 475 |
+
### **Sprint 1: HF Foundation** (1 неділя)
|
| 476 |
+
- Stateless architecture implementation
|
| 477 |
+
- Basic Gradio interface with Gemini integration
|
| 478 |
+
- File upload with HF constraints validation
|
| 479 |
+
- Simple MarkItDown pipeline
|
| 480 |
+
|
| 481 |
+
### **Sprint 2: Core Features** (1 неділя)
|
| 482 |
+
- Multi-model Gemini analysis integration
|
| 483 |
+
- Real-time processing with progress indicators
|
| 484 |
+
- Basic visualization dashboard
|
| 485 |
+
- Export functionality
|
| 486 |
+
|
| 487 |
+
### **Sprint 3: Advanced Analysis** (1 неділя)
|
| 488 |
+
- Comprehensive quality metrics
|
| 489 |
+
- Interactive comparison tools
|
| 490 |
+
- Advanced visualization components
|
| 491 |
+
- Error handling and recovery
|
| 492 |
+
|
| 493 |
+
### **Sprint 4: Polish & Optimization** (1 неділя)
|
| 494 |
+
- HF Spaces performance optimization
|
| 495 |
+
- UI/UX refinements
|
| 496 |
+
- Resource management improvements
|
| 497 |
+
- Documentation and examples
|
| 498 |
+
|
| 499 |
+
## Success Metrics for HF Deployment
|
| 500 |
+
|
| 501 |
+
### **Technical Performance:**
|
| 502 |
+
- Cold start time < 30 seconds
|
| 503 |
+
- Processing time < 2 minutes for 50MB files
|
| 504 |
+
- Memory usage < 12GB peak
|
| 505 |
+
- 99% uptime on HF infrastructure
|
| 506 |
+
|
| 507 |
+
### **User Experience:**
|
| 508 |
+
- Intuitive single-page workflow
|
| 509 |
+
- Clear progress indication
|
| 510 |
+
- Responsive design for mobile
|
| 511 |
+
- Comprehensive error messaging
|
| 512 |
+
|
| 513 |
+
### **Feature Adoption:**
|
| 514 |
+
- Gemini analysis utilization rate
|
| 515 |
+
- Export format preferences
|
| 516 |
+
- Average session duration
|
| 517 |
+
- User return rate
|
| 518 |
+
|
| 519 |
+
---
|
| 520 |
+
|
| 521 |
+
**Immediate Next Steps:**
|
| 522 |
+
|
| 523 |
+
1. **Environment Setup**: Create HF Space and test basic deployment
|
| 524 |
+
2. **Gemini Integration**: Implement and test API connectivity
|
| 525 |
+
3. **Core Pipeline**: Build stateless processing architecture
|
| 526 |
+
4. **UI Prototype**: Create basic Gradio interface with progressive enhancement
|
| 527 |
+
|
| 528 |
+
**Key Architectural Decisions:**
|
| 529 |
+
- ✅ **Stateless Design**: Eliminates persistence complexity
|
| 530 |
+
- ✅ **Gemini Focus**: Single LLM provider for simplicity
|
| 531 |
+
- ✅ **HF Optimization**: Resource-aware processing
|
| 532 |
+
- ✅ **Progressive Enhancement**: Core features first, advanced features additive
|
| 533 |
+
|
| 534 |
+
This revised architecture prioritizes **deployment simplicity** while maintaining **functional richness** - perfect for HF Spaces environment with Gemini integration.
|
Dockerfile
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim-bookworm
|
| 2 |
+
|
| 3 |
+
ARG DEBIAN_FRONTEND=noninteractive
|
| 4 |
+
|
| 5 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 7 |
+
PIP_NO_CACHE_DIR=1 \
|
| 8 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 9 |
+
GRADIO_SERVER_NAME="0.0.0.0" \
|
| 10 |
+
GRADIO_SERVER_PORT=7860 \
|
| 11 |
+
HF_HOME="/tmp" \
|
| 12 |
+
GRADIO_TEMP_DIR="/tmp"
|
| 13 |
+
|
| 14 |
+
WORKDIR /app
|
| 15 |
+
|
| 16 |
+
# Копіюємо requirements окремо для кешування
|
| 17 |
+
COPY requirements.txt .
|
| 18 |
+
|
| 19 |
+
# Оновлення безпеки + тимчасові build-інструменти (видалимо після збірки)
|
| 20 |
+
# Залишаємо тільки runtime-пакети: libmagic1, curl
|
| 21 |
+
RUN apt-get update \
|
| 22 |
+
&& apt-get upgrade -y \
|
| 23 |
+
&& apt-get install -y --no-install-recommends \
|
| 24 |
+
gcc g++ make \
|
| 25 |
+
libmagic1 curl \
|
| 26 |
+
&& pip install --no-cache-dir -r requirements.txt \
|
| 27 |
+
&& apt-get purge -y gcc g++ make \
|
| 28 |
+
&& apt-get autoremove -y \
|
| 29 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
+
|
| 31 |
+
# Копіюємо застосунок
|
| 32 |
+
COPY . .
|
| 33 |
+
|
| 34 |
+
RUN mkdir -p /tmp && chmod 777 /tmp
|
| 35 |
+
|
| 36 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 37 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
| 38 |
+
|
| 39 |
+
RUN chmod +x /app/app.py
|
| 40 |
+
|
| 41 |
+
EXPOSE 7860
|
| 42 |
+
|
| 43 |
+
CMD ["python", "app.py"]
|
INSTRUCTION.md
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Керівництво Користувача: MarkItDown Testing Platform
|
| 2 |
+
|
| 3 |
+
## Стратегічне Керівництво з Експлуатації Enterprise-системи
|
| 4 |
+
|
| 5 |
+
**"Перетворюйте документи у структуровані дані з впевненістю підприємства"**
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Основна Філософія Платформи
|
| 10 |
+
|
| 11 |
+
### Ключові Принципи Проектування
|
| 12 |
+
- **Людиноорієнтований Інтерфейс**: Мінімізація когнітивного навантаження користувача
|
| 13 |
+
- **Адаптивна Архітектура**: Система еволюціонує разом з вашими потребами
|
| 14 |
+
- **Прозорість Процесу**: Кожен крок конвертації зрозумілий і контрольований
|
| 15 |
+
- **Надійність Підприємства**: Промислова стабільність з елегантним дизайном
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
## Розділ 1: Стратегічний Огляд Можливостей
|
| 20 |
+
|
| 21 |
+
### 🎯 **Основні Сценарії Використання**
|
| 22 |
+
|
| 23 |
+
#### Корпоративна Міграція Документів
|
| 24 |
+
- **Завдання**: Перетворення застарілих форматів у сучасні стандарти
|
| 25 |
+
- **Підхід**: Автоматизована обробка з контролем якості
|
| 26 |
+
- **Результат**: Стандартизований документообіг з AI-аналітикою
|
| 27 |
+
|
| 28 |
+
#### Підготовка Даних для AI-систем
|
| 29 |
+
- **Завдання**: Оптимізація документів для RAG (Retrieval-Augmented Generation)
|
| 30 |
+
- **Підхід**: Структурований аналіз з оцінкою якості
|
| 31 |
+
- **Результат**: AI-ready контент з метриками ефективності
|
| 32 |
+
|
| 33 |
+
#### Контроль Якості Конвертації
|
| 34 |
+
- **Завдання**: Валідація точності автоматичного перетворення
|
| 35 |
+
- **Підхід**: Комплексна аналітика з детальними метриками
|
| 36 |
+
- **Результат**: Довіра до процесу з аудиторським слідом
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## Розділ 2: Покрокова Інструкція з Експлуатації
|
| 41 |
+
|
| 42 |
+
### 🚀 **Етап 1: Початкова Конфігурація**
|
| 43 |
+
|
| 44 |
+
#### Доступ до Платформи
|
| 45 |
+
1. **Перейдіть на Hugging Face Space**: [MarkItDown Testing Platform](https://huggingface.co/spaces/your-username/markitdown-testing-platform)
|
| 46 |
+
2. **Перевірте Системні Вимоги**:
|
| 47 |
+
- Сучасний браузер (Chrome, Firefox, Safari, Edge)
|
| 48 |
+
- Стабільне інтернет-з'єднання
|
| 49 |
+
- JavaScript увімкнений
|
| 50 |
+
|
| 51 |
+
#### Отримання API-ключа Gemini (Опціонально)
|
| 52 |
+
```
|
| 53 |
+
Стратегічна Рекомендація:
|
| 54 |
+
API-ключ Gemini розблоковує потужні AI-можливості аналізу,
|
| 55 |
+
але базова конвертація працює без додаткових налаштувань
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
**Крок-за-кроком налаштування Gemini:**
|
| 59 |
+
1. Відвідайте [Google AI Studio](https://makersuite.google.com/app/apikey)
|
| 60 |
+
2. Створіть новий проект або оберіть існуючий
|
| 61 |
+
3. Згенеруйте API-ключ з відповідними дозволами
|
| 62 |
+
4. Скопіюйте ключ (зберігається локально, не передається на сервер)
|
| 63 |
+
|
| 64 |
+
### 🔧 **Етап 2: Завантаження та Конфігурація Документа**
|
| 65 |
+
|
| 66 |
+
#### Підтримувані Формати Файлів
|
| 67 |
+
| Категорія | Формати | Особливості Обробки |
|
| 68 |
+
|-----------|---------|-------------------|
|
| 69 |
+
| **Офісні документи** | PDF, DOCX, PPTX, XLSX | Збереження структури та форматування |
|
| 70 |
+
| **Веб-контент** | HTML, HTM | Повна підтримка CSS-стилів |
|
| 71 |
+
| **Структуровані дані** | CSV, JSON, XML | Інтелектуальне парсингування |
|
| 72 |
+
| **Текстові файли** | TXT, RTF | Розширена обробка кодувань |
|
| 73 |
+
|
| 74 |
+
#### Процес Завантаження
|
| 75 |
+
1. **Виберіть Вкладку "📁 Document Processing"**
|
| 76 |
+
2. **Завантажте Файл**:
|
| 77 |
+
- Drag & Drop у область завантаження
|
| 78 |
+
- Або натисніть "Select Document" для вибору файлу
|
| 79 |
+
- **Ліміт**: 50MB для Hugging Face Spaces
|
| 80 |
+
|
| 81 |
+
3. **Налаштуйте Параметри Обробки**:
|
| 82 |
+
```
|
| 83 |
+
🔧 Стратегічні Рекомендації:
|
| 84 |
+
- Quality Analysis: Комплексна оцінка якості конвертації
|
| 85 |
+
- Structure Review: Фокус на збереження ієрархії документа
|
| 86 |
+
- Content Summary: Тематичний аналіз та ключові інсайти
|
| 87 |
+
- Extraction Quality: Оцінка збереження даних
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
4. **Виберіть AI-модель**:
|
| 91 |
+
- **Gemini 1.5 Pro**: Максимальна якість аналізу (рекомендовано)
|
| 92 |
+
- **Gemini 1.5 Flash**: Швидша обробка для великих обсягів
|
| 93 |
+
|
| 94 |
+
### ⚡ **Етап 3: Виконання Обробки**
|
| 95 |
+
|
| 96 |
+
#### Процес Конвертації
|
| 97 |
+
1. **Натисніть "🚀 Process Document"**
|
| 98 |
+
2. **Моніторинг Прогресу**:
|
| 99 |
+
- Реальний час відслідковування етапів
|
| 100 |
+
- Індикатори завантаження для кожної фази
|
| 101 |
+
- Автоматичні повідомлення про стан
|
| 102 |
+
|
| 103 |
+
#### Етапи Обробки
|
| 104 |
+
```
|
| 105 |
+
Архітектурний Підхід до Прозорості:
|
| 106 |
+
Кожен етап має чіткі межі відповідальності та точки контролю
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
**Фаза 1: Валідація Файлу**
|
| 110 |
+
- Перевірка формату та цілісності
|
| 111 |
+
- Аналіз безпеки та розміру
|
| 112 |
+
- Метадані екстракція
|
| 113 |
+
|
| 114 |
+
**Фаза 2: Конвертація в Markdown**
|
| 115 |
+
- MarkItDown обробка з оптимізацією
|
| 116 |
+
- Збереження структури та форматування
|
| 117 |
+
- Генерація якісних метрик
|
| 118 |
+
|
| 119 |
+
**Фаза 3: AI-аналіз (за наявності ключа)**
|
| 120 |
+
- Gemini-powered інтелектуальний аналіз
|
| 121 |
+
- Оцінка якості та рекомендації
|
| 122 |
+
- Структурні та змістовні інсайти
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## Розділ 3: Інтерпретація Результатів
|
| 127 |
+
|
| 128 |
+
### 📊 **Розуміння Метрик Якості**
|
| 129 |
+
|
| 130 |
+
#### Композитна Оцінка (0-10 балів)
|
| 131 |
+
```
|
| 132 |
+
Стратегічна Інтерпретація Оцінок:
|
| 133 |
+
- 8.0-10.0: Відмінна якість, готово для продакшену
|
| 134 |
+
- 6.0-7.9: Хороша якість, мінорні оптимізації
|
| 135 |
+
- 4.0-5.9: Прийнятна якість, потребує покращень
|
| 136 |
+
- 0.0-3.9: Потребує уваги, перевірте налаштування
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
#### Детальні Компоненти Оцінки
|
| 140 |
+
|
| 141 |
+
**Структурна Оцінка (Structure Score)**
|
| 142 |
+
- **Що вимірює**: Збереження заголовків, списків, таблиць
|
| 143 |
+
- **Високі значення**: Документ зберіг логічну ієрархію
|
| 144 |
+
- **Низькі значення**: Втрачено структурну організацію
|
| 145 |
+
- **Дія**: Перевірте вхідний документ на чітку структуру
|
| 146 |
+
|
| 147 |
+
**Повнота Контенту (Completeness Score)**
|
| 148 |
+
- **Що вимірює**: Збереження інформації з оригіналу
|
| 149 |
+
- **Високі значення**: Мінімальна втрата даних
|
| 150 |
+
- **Низькі значення**: Значна втрата контенту
|
| 151 |
+
- **Дія**: Розгляньте альтернативні налаштування конвертації
|
| 152 |
+
|
| 153 |
+
**Точність Форматування (Accuracy Score)**
|
| 154 |
+
- **Що вимірює**: Правильність передачі форматних елементів
|
| 155 |
+
- **Високі значення**: Форматування відповідає оригіналу
|
| 156 |
+
- **Низькі значення**: Спотворення або втрата форматування
|
| 157 |
+
- **Дія**: Валідуйте критичні форматні елементи
|
| 158 |
+
|
| 159 |
+
**Читабельність для AI (Readability Score)**
|
| 160 |
+
- **Що вимірює**: Оптимізація для AI-споживання
|
| 161 |
+
- **Високі значення**: Ідеальний для LLM обробки
|
| 162 |
+
- **Низькі значення**: Потребує додаткової обробки
|
| 163 |
+
- **Дія**: Розгляньте пост-процесинг оптимізації
|
| 164 |
+
|
| 165 |
+
### 🤖 **AI-аналіз Результатів**
|
| 166 |
+
|
| 167 |
+
#### Типи Аналізу та Їх Застосування
|
| 168 |
+
|
| 169 |
+
**Quality Analysis (Аналіз Якості)**
|
| 170 |
+
```markdown
|
| 171 |
+
Практичне Застосування:
|
| 172 |
+
- Валідація автоматичних процесів конвертації
|
| 173 |
+
- Контроль якості для корпоративних пайплайнів
|
| 174 |
+
- Оцінка готовності для downstream обробки
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
**Structure Review (Структурний Огляд)**
|
| 178 |
+
```markdown
|
| 179 |
+
Бізнес-цінність:
|
| 180 |
+
- Забезпечення збереження документної ієрархії
|
| 181 |
+
- Валідація навігаційної структури
|
| 182 |
+
- Оптимізація для пошукових систем
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
**Content Summary (Змістовий Аналіз)**
|
| 186 |
+
```markdown
|
| 187 |
+
Стратегічні Інсайти:
|
| 188 |
+
- Розуміння тематичного навантаження документа
|
| 189 |
+
- Ідентифікація ключових концепцій
|
| 190 |
+
- Підготовка для content management систем
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
---
|
| 194 |
+
|
| 195 |
+
## Розділ 4: Візуалізація та Аналітика
|
| 196 |
+
|
| 197 |
+
### 📈 **Навігація Dashboard'ом**
|
| 198 |
+
|
| 199 |
+
#### Вкладка "📊 Analysis Dashboard"
|
| 200 |
+
|
| 201 |
+
**Quality Overview (Загальний Огляд Якості)**
|
| 202 |
+
- **Gauge Chart**: Композитна оцінка з візуальними індикаторами
|
| 203 |
+
- **Інтерпретація**: Швидка оцінка успішності конвертації
|
| 204 |
+
- **Використання**: Executive summary для стейкхолдерів
|
| 205 |
+
|
| 206 |
+
**Detailed Breakdown (Детальна Аналітика)**
|
| 207 |
+
- **Radar Chart**: Багатомірний аналіз якісних показників
|
| 208 |
+
- **Застосування**: Ідентифікація сильних та слабких сторін
|
| 209 |
+
- **Оптимізація**: Фокус на найнижчих показниках
|
| 210 |
+
|
| 211 |
+
**Document Structure (Структура Документа)**
|
| 212 |
+
- **Treemap**: Ієрархічна візуалізація елементів
|
| 213 |
+
- **Bar Charts**: Розподіл структурних компонентів
|
| 214 |
+
- **Insights**: Розуміння організаційної логіки
|
| 215 |
+
|
| 216 |
+
#### Інтерактивні Можливості
|
| 217 |
+
```
|
| 218 |
+
Архітектурний Підхід до UX:
|
| 219 |
+
Кожен візуальний елемент забезпечує actionable insights
|
| 220 |
+
з можливістю drill-down до деталей
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
- **Hover Effects**: Детальна інформація при наведенні
|
| 224 |
+
- **Zoom Functionality**: Масштабування для деталізації
|
| 225 |
+
- **Export Options**: Збереження візуалізацій у різних форматах
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
## Розділ 5: Експорт та Інтеграція
|
| 230 |
+
|
| 231 |
+
### 💾 **Стратегії Збереження Результатів**
|
| 232 |
+
|
| 233 |
+
#### Формати Експорту та Їх Застосування
|
| 234 |
+
|
| 235 |
+
**Markdown (.md)**
|
| 236 |
+
```markdown
|
| 237 |
+
Стратегічне Застосування:
|
| 238 |
+
- Інтеграція з Git-based workflows
|
| 239 |
+
- Подача в LLM для подальшої обробки
|
| 240 |
+
- Documentation-as-Code процеси
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
**HTML Report (.html)**
|
| 244 |
+
```html
|
| 245 |
+
Бізнес-цінність:
|
| 246 |
+
- Презентація для non-technical стейкхолдерів
|
| 247 |
+
- Архівування з візуальним контекстом
|
| 248 |
+
- Web-based sharing та collaboration
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
**JSON Data (.json)**
|
| 252 |
+
```json
|
| 253 |
+
Технічна Інтеграція:
|
| 254 |
+
- API-based інтеграція з downstream системами
|
| 255 |
+
- Метадані для автоматизованих пайплайнів
|
| 256 |
+
- Structured data для аналітичних платформ
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
**Complete Package (.zip)**
|
| 260 |
+
```
|
| 261 |
+
Enterprise Approach:
|
| 262 |
+
- Comprehensive backup з усіма артефактами
|
| 263 |
+
- Audit trail для compliance процесів
|
| 264 |
+
- Self-contained delivery package
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
#### Процес Експорту
|
| 268 |
+
1. **Перейдіть на "💾 Export & History"**
|
| 269 |
+
2. **Оберіть Формат**: Базуючись на downstream requirements
|
| 270 |
+
3. **Налаштуйте Опції**:
|
| 271 |
+
- Original Document Preview
|
| 272 |
+
- AI Analysis Results
|
| 273 |
+
- Quality Metrics
|
| 274 |
+
- Visualizations
|
| 275 |
+
- Processing Logs
|
| 276 |
+
|
| 277 |
+
4. **Генерація та Завантаження**:
|
| 278 |
+
- Натисніть "📥 Generate Export"
|
| 279 |
+
- Дочекайтесь completion notification
|
| 280 |
+
- Завантажте через browser download
|
| 281 |
+
|
| 282 |
+
---
|
| 283 |
+
|
| 284 |
+
## Розділ 6: Розширене Використання
|
| 285 |
+
|
| 286 |
+
### 🔍 **Advanced Analytics (Розширена Аналітика)**
|
| 287 |
+
|
| 288 |
+
#### Порівняльний Аналіз
|
| 289 |
+
```
|
| 290 |
+
Стратегічний Підхід до Batch Processing:
|
| 291 |
+
Можливість порівняння ефективності конвертації
|
| 292 |
+
для різних типів документів та налаштувань
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
**Workflow для Comparative Analysis**:
|
| 296 |
+
1. Завантажте кілька документів через "🔍 Advanced Analytics"
|
| 297 |
+
2. Оберіть аналітичні опції:
|
| 298 |
+
- Performance Timeline
|
| 299 |
+
- Quality Trends
|
| 300 |
+
- Batch Statistics
|
| 301 |
+
- Resource Usage
|
| 302 |
+
|
| 303 |
+
3. Генеруйте порівняльні звіти з actionable insights
|
| 304 |
+
|
| 305 |
+
#### Performance Monitoring
|
| 306 |
+
- **Processing Speed Trends**: Моніторинг швидкості обробки
|
| 307 |
+
- **Quality Consistency**: Стабільність якісних показників
|
| 308 |
+
- **Resource Utilization**: Ефективність використання ресурсів
|
| 309 |
+
- **Error Pattern Analysis**: Ідентифікація проблемних сценаріїв
|
| 310 |
+
|
| 311 |
+
### ⚙️ **System Status та Моніторинг**
|
| 312 |
+
|
| 313 |
+
#### Health Check Dashboard
|
| 314 |
+
```json
|
| 315 |
+
Operational Excellence Metrics:
|
| 316 |
+
{
|
| 317 |
+
"system_health": "Healthy/Degraded/Unhealthy",
|
| 318 |
+
"processing_capacity": "Available/Limited/Exhausted",
|
| 319 |
+
"api_connectivity": "Connected/Intermittent/Offline",
|
| 320 |
+
"cache_efficiency": "Percentage hit rate"
|
| 321 |
+
}
|
| 322 |
+
```
|
| 323 |
+
|
| 324 |
+
**Інтерпретація Статусів**:
|
| 325 |
+
- **Healthy**: Система функціонує оптимально
|
| 326 |
+
- **Degraded**: Зниження продуктивності, але функціональна
|
| 327 |
+
- **Unhealthy**: Потребує втручання або troubleshooting
|
| 328 |
+
|
| 329 |
+
---
|
| 330 |
+
|
| 331 |
+
## Розділ 7: Troubleshooting та Оптимізація
|
| 332 |
+
|
| 333 |
+
### 🔧 **Поширені Сценарії та Рішення**
|
| 334 |
+
|
| 335 |
+
#### Проблеми з Конвертацією
|
| 336 |
+
|
| 337 |
+
**Симптом**: Низька якість конвертації PDF
|
| 338 |
+
```
|
| 339 |
+
Діагностичний Підхід:
|
| 340 |
+
1. Перевірте, чи PDF містить текстовий шар (не тільки зображення)
|
| 341 |
+
2. Розгляньте Azure Document Intelligence інтеграцію
|
| 342 |
+
3. Тестуйте з різними density налаштуваннями
|
| 343 |
+
```
|
| 344 |
+
|
| 345 |
+
**Рішення**:
|
| 346 |
+
- Використайте OCR preprocessing для scan-based PDF
|
| 347 |
+
- Налаштуйте Azure endpoint для складних документів
|
| 348 |
+
- Розбийте великі PDF на секції
|
| 349 |
+
|
| 350 |
+
**Симптом**: Тайм-аут обробки
|
| 351 |
+
```
|
| 352 |
+
Resource Management Strategy:
|
| 353 |
+
- HF Spaces має 5-хвилинний ліміт обробки
|
| 354 |
+
- Файли >20MB потребують особливої уваги
|
| 355 |
+
- Concurrent processing може створювати bottlenecks
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
**Рішення**:
|
| 359 |
+
- Розбийте великі документи на менші частини
|
| 360 |
+
- Оптимізуйте час обробки, відключивши AI-аналіз для тестування
|
| 361 |
+
- Використайте локальне розгортання для великих workloads
|
| 362 |
+
|
| 363 |
+
#### API та Конфігурація
|
| 364 |
+
|
| 365 |
+
**Симптом**: Gemini API помилки
|
| 366 |
+
```
|
| 367 |
+
Authentication та Rate Limiting:
|
| 368 |
+
- Перевірте валідність API ключа
|
| 369 |
+
- Моніторьте usage limits у Google Console
|
| 370 |
+
- Налаштуйте retry logic для intermittent failures
|
| 371 |
+
```
|
| 372 |
+
|
| 373 |
+
**Рішення**:
|
| 374 |
+
- Регенерація API ключа в Google AI Studio
|
| 375 |
+
- Перевірка квот та billing status
|
| 376 |
+
- Використання різних моделей для балансування навантаження
|
| 377 |
+
|
| 378 |
+
### 📈 **Оптимізація Продуктивності**
|
| 379 |
+
|
| 380 |
+
#### Стратегії для Великих Обсягів
|
| 381 |
+
|
| 382 |
+
**Batch Processing Approach**:
|
| 383 |
+
```python
|
| 384 |
+
# Псевдо-код для оптимальної batch стратегії
|
| 385 |
+
documents = preprocess_and_prioritize(document_list)
|
| 386 |
+
for batch in chunk_documents(documents, optimal_size=5):
|
| 387 |
+
results = process_batch_with_monitoring(batch)
|
| 388 |
+
validate_and_store_results(results)
|
| 389 |
+
```
|
| 390 |
+
|
| 391 |
+
**Resource Optimization**:
|
| 392 |
+
- Використовуйте Gemini Flash для швидкої обробки
|
| 393 |
+
- Кешуйте результати для repeated processing
|
| 394 |
+
- Моніторьте system health між batch операціями
|
| 395 |
+
|
| 396 |
+
---
|
| 397 |
+
|
| 398 |
+
## Розділ 8: Інтеграція та Автоматизація
|
| 399 |
+
|
| 400 |
+
### 🔗 **Enterprise Integration Patterns**
|
| 401 |
+
|
| 402 |
+
#### API-based Integration
|
| 403 |
+
```python
|
| 404 |
+
# Приклад інтеграції через programmatic access
|
| 405 |
+
def integrate_with_existing_pipeline(document_path):
|
| 406 |
+
# Використання core components напряму
|
| 407 |
+
from markitdown_platform import DocumentProcessingOrchestrator
|
| 408 |
+
|
| 409 |
+
orchestrator = DocumentProcessingOrchestrator(...)
|
| 410 |
+
request = ProcessingRequest.from_file(document_path)
|
| 411 |
+
result = await orchestrator.process_document(request)
|
| 412 |
+
|
| 413 |
+
return standardize_output_format(result)
|
| 414 |
+
```
|
| 415 |
+
|
| 416 |
+
#### Workflow Automation
|
| 417 |
+
```
|
| 418 |
+
Strategic Automation Framework:
|
| 419 |
+
1. Document Ingestion (Watch folders, S3 triggers, API endpoints)
|
| 420 |
+
2. Quality Gates (Automated validation based on metrics)
|
| 421 |
+
3. Routing Logic (Different pipelines based on document type)
|
| 422 |
+
4. Notification Systems (Slack, email, webhooks for completion)
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
#### CI/CD Integration
|
| 426 |
+
- **Quality Checks**: Automated validation у deployment pipelines
|
| 427 |
+
- **Regression Testing**: Consistency перевірка across versions
|
| 428 |
+
- **Performance Benchmarks**: SLA enforcement через automated tests
|
| 429 |
+
|
| 430 |
+
---
|
| 431 |
+
|
| 432 |
+
## Розділ 9: Безпека та Compliance
|
| 433 |
+
|
| 434 |
+
### 🔒 **Data Security Framework**
|
| 435 |
+
|
| 436 |
+
#### Privacy Protection Strategy
|
| 437 |
+
```
|
| 438 |
+
GDPR-Compliant Architecture:
|
| 439 |
+
- No persistent storage of user documents
|
| 440 |
+
- API keys stored locally, never transmitted
|
| 441 |
+
- Automatic cleanup of temporary processing files
|
| 442 |
+
- Audit trails without sensitive data exposure
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
#### Security Best Practices
|
| 446 |
+
1. **API Key Management**:
|
| 447 |
+
- Rotate ключі регулярно
|
| 448 |
+
- Не зберігайте ключі у коді
|
| 449 |
+
- Використовуйте environment variables
|
| 450 |
+
|
| 451 |
+
2. **Document Handling**:
|
| 452 |
+
- Валідація file signatures
|
| 453 |
+
- Size та format restrictions
|
| 454 |
+
- Automatic sanitization of suspicious content
|
| 455 |
+
|
| 456 |
+
3. **Network Security**:
|
| 457 |
+
- HTTPS-only communications
|
| 458 |
+
- Certificate pinning where applicable
|
| 459 |
+
- Rate limiting та DDoS protection
|
| 460 |
+
|
| 461 |
+
### 📋 **Compliance Considerations**
|
| 462 |
+
|
| 463 |
+
#### Audit Trail Management
|
| 464 |
+
- **Processing Logs**: Comprehensive logging без sensitive data
|
| 465 |
+
- **Quality Metrics**: Historical tracking for compliance reporting
|
| 466 |
+
- **System Health**: Operational metrics для SLA validation
|
| 467 |
+
- **User Actions**: Anonymized usage analytics
|
| 468 |
+
|
| 469 |
+
---
|
| 470 |
+
|
| 471 |
+
## Розділ 10: Майбутній Розвиток та Roadmap
|
| 472 |
+
|
| 473 |
+
### 🔮 **Стратегічні Напрямки Розвитку**
|
| 474 |
+
|
| 475 |
+
#### Короткострокові Покращення (3-6 місяців)
|
| 476 |
+
- **Enhanced Batch Processing**: Більш ефективна multi-document обробка
|
| 477 |
+
- **Advanced Comparison Tools**: Side-by-side analysis capabilities
|
| 478 |
+
- **Custom Template Support**: User-defined output formatting
|
| 479 |
+
- **Performance Dashboards**: Real-time operational metrics
|
| 480 |
+
|
| 481 |
+
#### Довгострокова Візія (6-18 місяців)
|
| 482 |
+
```
|
| 483 |
+
Architectural Evolution Path:
|
| 484 |
+
- Multi-LLM Support: Claude, OpenAI, local models
|
| 485 |
+
- Plugin Ecosystem: Third-party extensions framework
|
| 486 |
+
- Advanced Analytics: ML-powered quality prediction
|
| 487 |
+
- Enterprise SSO: Active Directory, OAuth integration
|
| 488 |
+
```
|
| 489 |
+
|
| 490 |
+
#### Community та Ecosystem
|
| 491 |
+
- **Open Source Contributions**: Community-driven improvements
|
| 492 |
+
- **Integration Partners**: Partnerships з document management vendors
|
| 493 |
+
- **Training Programs**: Certification для enterprise users
|
| 494 |
+
- **Support Tiers**: SLA-backed support для enterprise deployments
|
| 495 |
+
|
| 496 |
+
---
|
| 497 |
+
|
| 498 |
+
## Додаток A: Технічні Специфікації
|
| 499 |
+
|
| 500 |
+
### 📋 **Системні Вимоги**
|
| 501 |
+
|
| 502 |
+
#### Browser Compatibility
|
| 503 |
+
| Browser | Minimum Version | Recommended |
|
| 504 |
+
|---------|----------------|-------------|
|
| 505 |
+
| Chrome | 90+ | Latest |
|
| 506 |
+
| Firefox | 88+ | Latest |
|
| 507 |
+
| Safari | 14+ | Latest |
|
| 508 |
+
| Edge | 90+ | Latest |
|
| 509 |
+
|
| 510 |
+
#### File Format Support Matrix
|
| 511 |
+
| Format | Max Size | Special Notes |
|
| 512 |
+
|--------|----------|---------------|
|
| 513 |
+
| PDF | 50MB | Text-based preferred, OCR available |
|
| 514 |
+
| DOCX | 50MB | Full formatting preservation |
|
| 515 |
+
| PPTX | 50MB | Slide structure maintained |
|
| 516 |
+
| XLSX | 50MB | Table structure optimized |
|
| 517 |
+
| HTML | 20MB | CSS styling preserved |
|
| 518 |
+
| TXT | 10MB | Encoding auto-detection |
|
| 519 |
+
|
| 520 |
+
### 🔧 **Advanced Configuration Options**
|
| 521 |
+
|
| 522 |
+
#### Environment Variables (for Local Deployment)
|
| 523 |
+
```bash
|
| 524 |
+
# Core Configuration
|
| 525 |
+
MAX_FILE_SIZE_MB=50
|
| 526 |
+
PROCESSING_TIMEOUT_SECONDS=300
|
| 527 |
+
ENABLE_DEBUG_LOGGING=false
|
| 528 |
+
|
| 529 |
+
# AI Integration
|
| 530 |
+
GEMINI_DEFAULT_MODEL=gemini-1.5-pro
|
| 531 |
+
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=your-endpoint
|
| 532 |
+
|
| 533 |
+
# Performance Tuning
|
| 534 |
+
CACHE_TTL_HOURS=24
|
| 535 |
+
MAX_CONCURRENT_PROCESSES=3
|
| 536 |
+
MEMORY_LIMIT_GB=12
|
| 537 |
+
```
|
| 538 |
+
|
| 539 |
+
---
|
| 540 |
+
|
| 541 |
+
## Додаток B: Часті Питання (FAQ)
|
| 542 |
+
|
| 543 |
+
### ❓ **Загальні Питання**
|
| 544 |
+
|
| 545 |
+
**Q: Чи потрібен Gemini API ключ для роботи?**
|
| 546 |
+
A: Ні, базова конвертація документів працює без API ключа. Gemini потрібен тільки для AI-powered аналізу та рекомендацій.
|
| 547 |
+
|
| 548 |
+
**Q: Які обмеження розміру файлів?**
|
| 549 |
+
A: HF Spaces free tier обмежує файли до 50MB. Для більших файлів використовуйте локальне розгортання або розбийте документ на частини.
|
| 550 |
+
|
| 551 |
+
**Q: Чи зберігаються мої документи на сервері?**
|
| 552 |
+
A: Ні, усі документи обробляються в пам'яті і автоматично видаляються після завершення. Платформа designed для privacy-first обробки.
|
| 553 |
+
|
| 554 |
+
**Q: Як інтерпретувати оцінки якості?**
|
| 555 |
+
A: Оцінки 0-10: 8+ відмінно, 6-8 добре, 4-6 прийнятно, <4 потребує уваги. Фокусуйтеся на найнижчих компонентах для покращення.
|
| 556 |
+
|
| 557 |
+
### 🔧 **Технічні Питання**
|
| 558 |
+
|
| 559 |
+
**Q: Чи можна інтегрувати з існуючими системами?**
|
| 560 |
+
A: Так, платформа побудована з modular architecture що дозволяє integration через API або direct component usage.
|
| 561 |
+
|
| 562 |
+
**Q: Які формати експорту доступні?**
|
| 563 |
+
A: Markdown, HTML, JSON, PDF звіти, та ZIP packages з усіма артефактами.
|
| 564 |
+
|
| 565 |
+
**Q: Чи підтримується batch processing?**
|
| 566 |
+
A: Так, через Advanced Analytics tab можна обробляти кілька документів одночасно з порівняльним аналізом.
|
| 567 |
+
|
| 568 |
+
---
|
| 569 |
+
|
| 570 |
+
## Контакти та Підтримка
|
| 571 |
+
|
| 572 |
+
### 📞 **Канали Підтримки**
|
| 573 |
+
|
| 574 |
+
**Документація та Ресурси:**
|
| 575 |
+
- [GitHub Repository](https://github.com/your-username/markitdown-testing-platform)
|
| 576 |
+
- [Technical Documentation](https://docs.your-domain.com)
|
| 577 |
+
- [Community Forum](https://github.com/your-username/markitdown-testing-platform/discussions)
|
| 578 |
+
|
| 579 |
+
**Зворотний Зв'язок:**
|
| 580 |
+
- [Issue Tracker](https://github.com/your-username/markitdown-testing-platform/issues) для bug reports
|
| 581 |
+
- [Feature Requests](https://github.com/your-username/markitdown-testing-platform/discussions) для нових можливостей
|
| 582 |
+
- Email: [email protected] для enterprise inquiries
|
| 583 |
+
|
| 584 |
+
**Community:**
|
| 585 |
+
- [Discord Channel](https://discord.gg/your-channel) для real-time discussion
|
| 586 |
+
- [LinkedIn Group](https://linkedin.com/groups/your-group) для professional networking
|
| 587 |
+
- [YouTube Channel](https://youtube.com/your-channel) для video tutorials
|
| 588 |
+
|
| 589 |
+
---
|
| 590 |
+
|
| 591 |
+
**Версія документа**: 2.0.0 | **Остання редакція**: Вересень 2025
|
| 592 |
+
|
| 593 |
+
*Це керівництво відображає current state платформи та буде оновлюватися з новими features та improvements.*
|
README.md
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 MarkItDown Testing Platform
|
| 2 |
+
|
| 3 |
+
**Enterprise-Grade Document Conversion Testing with AI-Powered Analysis**
|
| 4 |
+
|
| 5 |
+
[](https://huggingface.co/spaces/your-username/markitdown-testing-platform)
|
| 6 |
+
[](https://www.python.org/downloads/)
|
| 7 |
+
[](https://opensource.org/licenses/MIT)
|
| 8 |
+
|
| 9 |
+
## 🎯 Overview
|
| 10 |
+
|
| 11 |
+
A comprehensive testing platform for Microsoft's MarkItDown document conversion tool, enhanced with Google Gemini AI analysis capabilities. Designed for enterprise-scale document processing workflows with focus on quality assessment and performance optimization.
|
| 12 |
+
|
| 13 |
+
### ✨ Key Features
|
| 14 |
+
|
| 15 |
+
- **🔄 Multi-Format Support**: PDF, DOCX, PPTX, XLSX, HTML, TXT, CSV, JSON, XML
|
| 16 |
+
- **🤖 AI-Powered Analysis**: Google Gemini integration for quality assessment
|
| 17 |
+
- **📊 Interactive Dashboards**: Real-time visualization of conversion metrics
|
| 18 |
+
- **🏢 Enterprise-Ready**: Scalable architecture with comprehensive error handling
|
| 19 |
+
- **💾 Export Capabilities**: Multiple output formats for integration workflows
|
| 20 |
+
- **📈 Performance Monitoring**: Detailed analytics and optimization insights
|
| 21 |
+
|
| 22 |
+
## 🚀 Quick Start
|
| 23 |
+
|
| 24 |
+
### Using the Hugging Face Space
|
| 25 |
+
|
| 26 |
+
1. **Visit the Space**: [MarkItDown Testing Platform](https://huggingface.co/spaces/your-username/markitdown-testing-platform)
|
| 27 |
+
2. **Upload Document**: Drag & drop or select your document
|
| 28 |
+
3. **Configure Analysis**: Enter Gemini API key for AI analysis (optional)
|
| 29 |
+
4. **Process**: Click "Process Document" and review results
|
| 30 |
+
5. **Export**: Download results in your preferred format
|
| 31 |
+
|
| 32 |
+
### Getting Gemini API Key
|
| 33 |
+
|
| 34 |
+
1. Visit [Google AI Studio](https://makersuite.google.com/app/apikey)
|
| 35 |
+
2. Create a new API key
|
| 36 |
+
3. Copy and paste into the application
|
| 37 |
+
4. Enjoy AI-powered document analysis!
|
| 38 |
+
|
| 39 |
+
## 📋 Supported File Formats
|
| 40 |
+
|
| 41 |
+
| Category | Formats | Notes |
|
| 42 |
+
|----------|---------|-------|
|
| 43 |
+
| **Documents** | PDF, DOCX, PPTX, XLSX | Full structure preservation |
|
| 44 |
+
| **Web Content** | HTML, HTM | Complete formatting retention |
|
| 45 |
+
| **Text Files** | TXT, CSV, JSON, XML | Enhanced parsing capabilities |
|
| 46 |
+
| **Rich Text** | RTF | Advanced formatting support |
|
| 47 |
+
|
| 48 |
+
## 🏗️ Architecture Overview
|
| 49 |
+
|
| 50 |
+
```
|
| 51 |
+
┌─────────────────────────────────────────┐
|
| 52 |
+
│ Gradio Interface │
|
| 53 |
+
├─────────────────────────────────────────┤
|
| 54 |
+
│ File Upload │ Config │ Analysis │ Export│
|
| 55 |
+
├─────────────────────────────────────────┤
|
| 56 |
+
│ Processing Pipeline │
|
| 57 |
+
├─────────────────────────────────────────┤
|
| 58 |
+
│MarkItDown │ Gemini AI │ Visualization │
|
| 59 |
+
├─────────────────────────────────────────┤
|
| 60 |
+
│ Analytics & Reporting │
|
| 61 |
+
└─────────────────────────────────────────┘
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
### Core Components
|
| 65 |
+
|
| 66 |
+
- **`core/modules.py`**: Stateless processing engine optimized for HF Spaces
|
| 67 |
+
- **`llm/gemini_connector.py`**: Enterprise Gemini API integration
|
| 68 |
+
- **`visualization/analytics_engine.py`**: Interactive dashboard generation
|
| 69 |
+
- **`app.py`**: Main Gradio application orchestration
|
| 70 |
+
|
| 71 |
+
## 🔧 Technical Specifications
|
| 72 |
+
|
| 73 |
+
### System Requirements
|
| 74 |
+
- **Python**: 3.10+
|
| 75 |
+
- **Memory**: Optimized for HF Spaces (16GB limit)
|
| 76 |
+
- **Storage**: Stateless design with temporary file handling
|
| 77 |
+
- **Processing**: Async pipeline with resource management
|
| 78 |
+
|
| 79 |
+
### Key Dependencies
|
| 80 |
+
```python
|
| 81 |
+
gradio>=4.0.0 # UI framework
|
| 82 |
+
markitdown[all]>=0.1.0 # Document conversion
|
| 83 |
+
google-generativeai>=0.3.0 # Gemini integration
|
| 84 |
+
plotly>=5.17.0 # Interactive visualizations
|
| 85 |
+
pandas>=1.5.0 # Data processing
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## 📊 Analysis Capabilities
|
| 89 |
+
|
| 90 |
+
### Quality Metrics
|
| 91 |
+
- **Structure Score**: Heading, list, table preservation (0-10)
|
| 92 |
+
- **Completeness Score**: Information retention assessment (0-10)
|
| 93 |
+
- **Accuracy Score**: Formatting correctness evaluation (0-10)
|
| 94 |
+
- **Readability Score**: AI-friendly output optimization (0-10)
|
| 95 |
+
|
| 96 |
+
### AI Analysis Types
|
| 97 |
+
- **Quality Analysis**: Comprehensive conversion assessment
|
| 98 |
+
- **Structure Review**: Document hierarchy and organization
|
| 99 |
+
- **Content Summary**: Thematic analysis and key insights
|
| 100 |
+
- **Extraction Quality**: Data preservation evaluation
|
| 101 |
+
|
| 102 |
+
### Visualization Features
|
| 103 |
+
- **Quality Dashboard**: Multi-metric radar and performance charts
|
| 104 |
+
- **Structure Analysis**: Hierarchical document mapping
|
| 105 |
+
- **Comparison Tools**: Multi-document analysis capabilities
|
| 106 |
+
- **Performance Timeline**: Processing optimization insights
|
| 107 |
+
|
| 108 |
+
## 🎯 Use Cases
|
| 109 |
+
|
| 110 |
+
### Enterprise Document Migration
|
| 111 |
+
- **Legacy System Modernization**: Convert historical documents to modern formats
|
| 112 |
+
- **Content Management**: Standardize document formats across organizations
|
| 113 |
+
- **Compliance Documentation**: Ensure consistent formatting for regulatory requirements
|
| 114 |
+
|
| 115 |
+
### AI/ML Pipeline Integration
|
| 116 |
+
- **RAG System Preparation**: Optimize documents for retrieval systems
|
| 117 |
+
- **Training Data Processing**: Convert diverse formats for model training
|
| 118 |
+
- **Content Analysis**: Extract structured data from unstructured documents
|
| 119 |
+
|
| 120 |
+
### Quality Assurance
|
| 121 |
+
- **Conversion Validation**: Verify accuracy of automated processing
|
| 122 |
+
- **Performance Benchmarking**: Compare different conversion approaches
|
| 123 |
+
- **Error Detection**: Identify and resolve processing issues
|
| 124 |
+
|
| 125 |
+
## 📈 Performance Optimization
|
| 126 |
+
|
| 127 |
+
### HF Spaces Optimizations
|
| 128 |
+
- **Memory Management**: Automatic cleanup and resource monitoring
|
| 129 |
+
- **Processing Limits**: Smart file size and timeout management
|
| 130 |
+
- **Async Processing**: Non-blocking operations for better UX
|
| 131 |
+
- **Error Recovery**: Graceful degradation and retry mechanisms
|
| 132 |
+
|
| 133 |
+
### Best Practices
|
| 134 |
+
- **File Preparation**: Use high-quality source documents
|
| 135 |
+
- **API Management**: Monitor Gemini API usage and limits
|
| 136 |
+
- **Result Analysis**: Review quality metrics for optimization opportunities
|
| 137 |
+
- **Export Strategy**: Choose appropriate formats for downstream processing
|
| 138 |
+
|
| 139 |
+
## 🛠️ Development Setup
|
| 140 |
+
|
| 141 |
+
### Local Development
|
| 142 |
+
```bash
|
| 143 |
+
# Clone repository
|
| 144 |
+
git clone https://github.com/your-username/markitdown-testing-platform
|
| 145 |
+
cd markitdown-testing-platform
|
| 146 |
+
|
| 147 |
+
# Create virtual environment
|
| 148 |
+
python -m venv venv
|
| 149 |
+
source venv/bin/activate # Linux/Mac
|
| 150 |
+
# venv\Scripts\activate # Windows
|
| 151 |
+
|
| 152 |
+
# Install dependencies
|
| 153 |
+
pip install -r requirements.txt
|
| 154 |
+
|
| 155 |
+
# Run application
|
| 156 |
+
python app.py
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
### Environment Variables
|
| 160 |
+
```bash
|
| 161 |
+
# Optional: Set custom configurations
|
| 162 |
+
export GRADIO_TEMP_DIR="/tmp"
|
| 163 |
+
export MAX_FILE_SIZE="52428800" # 50MB in bytes
|
| 164 |
+
export PROCESSING_TIMEOUT="300" # 5 minutes
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
## 📚 API Reference
|
| 168 |
+
|
| 169 |
+
### Core Processing Pipeline
|
| 170 |
+
```python
|
| 171 |
+
from core.modules import StreamlineFileHandler, HFConversionEngine
|
| 172 |
+
from llm.gemini_connector import GeminiAnalysisEngine
|
| 173 |
+
|
| 174 |
+
# Initialize components
|
| 175 |
+
handler = StreamlineFileHandler(resource_manager)
|
| 176 |
+
engine = HFConversionEngine(resource_manager, config)
|
| 177 |
+
gemini = GeminiAnalysisEngine(gemini_config)
|
| 178 |
+
|
| 179 |
+
# Process document
|
| 180 |
+
file_result = await handler.process_upload(file_obj)
|
| 181 |
+
conversion_result = await engine.convert_stream(file_content, metadata)
|
| 182 |
+
analysis_result = await gemini.analyze_content(analysis_request)
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
### Visualization Generation
|
| 186 |
+
```python
|
| 187 |
+
from visualization.analytics_engine import InteractiveVisualizationEngine
|
| 188 |
+
|
| 189 |
+
viz_engine = InteractiveVisualizationEngine()
|
| 190 |
+
dashboard = viz_engine.create_quality_dashboard(conversion_result, analysis_result)
|
| 191 |
+
structure_viz = viz_engine.create_structural_analysis_viz(conversion_result)
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
## 🔐 Security & Privacy
|
| 195 |
+
|
| 196 |
+
### Data Handling
|
| 197 |
+
- **No Persistent Storage**: All processing in memory with automatic cleanup
|
| 198 |
+
- **API Key Security**: Keys stored locally, never transmitted to servers
|
| 199 |
+
- **File Privacy**: Temporary files automatically deleted after processing
|
| 200 |
+
- **Error Logging**: Sanitized logs without sensitive information
|
| 201 |
+
|
| 202 |
+
### Compliance Features
|
| 203 |
+
- **GDPR Ready**: No personal data retention
|
| 204 |
+
- **Enterprise Security**: Secure API integrations
|
| 205 |
+
- **Audit Trail**: Comprehensive processing logs
|
| 206 |
+
- **Access Control**: Environment-based configuration
|
| 207 |
+
|
| 208 |
+
## 🤝 Contributing
|
| 209 |
+
|
| 210 |
+
### Development Guidelines
|
| 211 |
+
1. **Code Style**: Follow PEP 8 with Black formatting
|
| 212 |
+
2. **Testing**: Comprehensive unit and integration tests
|
| 213 |
+
3. **Documentation**: Detailed docstrings and README updates
|
| 214 |
+
4. **Performance**: Memory-efficient and HF Spaces optimized
|
| 215 |
+
|
| 216 |
+
### Pull Request Process
|
| 217 |
+
1. Fork the repository
|
| 218 |
+
2. Create feature branch (`git checkout -b feature/amazing-feature`)
|
| 219 |
+
3. Commit changes (`git commit -m 'Add amazing feature'`)
|
| 220 |
+
4. Push to branch (`git push origin feature/amazing-feature`)
|
| 221 |
+
5. Open Pull Request
|
| 222 |
+
|
| 223 |
+
## 📄 License
|
| 224 |
+
|
| 225 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 226 |
+
|
| 227 |
+
## 🙏 Acknowledgments
|
| 228 |
+
|
| 229 |
+
- **Microsoft MarkItDown**: Core document conversion capabilities
|
| 230 |
+
- **Google Gemini**: Advanced AI analysis features
|
| 231 |
+
- **Hugging Face**: Platform hosting and community support
|
| 232 |
+
- **Plotly**: Interactive visualization framework
|
| 233 |
+
- **Gradio**: User interface framework
|
| 234 |
+
|
| 235 |
+
## 📞 Support
|
| 236 |
+
|
| 237 |
+
### Getting Help
|
| 238 |
+
- **Documentation**: Comprehensive guides and examples
|
| 239 |
+
- **Issues**: [GitHub Issues](https://github.com/your-username/markitdown-testing-platform/issues)
|
| 240 |
+
- **Discussions**: [Community Forum](https://github.com/your-username/markitdown-testing-platform/discussions)
|
| 241 |
+
- **Email**: [email protected]
|
| 242 |
+
|
| 243 |
+
### Frequently Asked Questions
|
| 244 |
+
|
| 245 |
+
**Q: What's the maximum file size?**
|
| 246 |
+
A: 50MB for HF Spaces free tier. Larger files can be processed in local deployments.
|
| 247 |
+
|
| 248 |
+
**Q: Do I need a Gemini API key?**
|
| 249 |
+
A: No — basic conversion works without an API key. A Gemini key enables the AI analysis features.
|
| 250 |
+
|
| 251 |
+
**Q: Can I process multiple files at once?**
|
| 252 |
+
A: The current version supports single-file processing. Batch processing is available in the advanced analytics module.
|
| 253 |
+
|
| 254 |
+
**Q: How accurate are the quality scores?**
|
| 255 |
+
A: Scores are based on structural analysis and AI evaluation. Use as guidelines for optimization.
|
| 256 |
+
|
| 257 |
+
---
|
| 258 |
+
|
| 259 |
+
**Built with ❤️ for enterprise document processing**
|
| 260 |
+
|
| 261 |
+
*Last updated: September 2025*
|
app.py
ADDED
|
@@ -0,0 +1,1244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MarkItDown Testing Platform - Enterprise Architecture Implementation
|
| 3 |
+
|
| 4 |
+
Strategic Design Philosophy:
|
| 5 |
+
"Complexity is the enemy of reliable software"
|
| 6 |
+
|
| 7 |
+
Core Architectural Principles:
|
| 8 |
+
- Minimize cognitive load for developers
|
| 9 |
+
- Create self-documenting, modular interfaces
|
| 10 |
+
- Design for future adaptability
|
| 11 |
+
- Prioritize human understanding over technical complexity
|
| 12 |
+
|
| 13 |
+
This implementation demonstrates enterprise-grade architectural patterns
|
| 14 |
+
optimized for long-term maintainability and team collaboration.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import asyncio
|
| 19 |
+
import json
|
| 20 |
+
import logging
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from typing import Dict, Optional, List, Tuple, Protocol, Any
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
from abc import ABC, abstractmethod
|
| 25 |
+
import gradio as gr
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
from pydantic import JsonValue
|
| 28 |
+
|
| 29 |
+
# Strategic import organization - dependency layers clearly defined
|
| 30 |
+
from core.modules import (
|
| 31 |
+
StreamlineFileHandler, HFConversionEngine, ResourceManager,
|
| 32 |
+
ProcessingConfig, ProcessingResult
|
| 33 |
+
)
|
| 34 |
+
from llm.gemini_connector import (
|
| 35 |
+
GeminiAnalysisEngine, GeminiConnectionManager, GeminiConfig,
|
| 36 |
+
AnalysisRequest, AnalysisType, GeminiModel
|
| 37 |
+
)
|
| 38 |
+
from visualization.analytics_engine import (
|
| 39 |
+
InteractiveVisualizationEngine, QualityMetricsCalculator,
|
| 40 |
+
VisualizationConfig, ReportGenerator
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Configure enterprise-grade logging.
# logging.basicConfig is process-wide and one-shot: it is a no-op if the root
# logger was already configured by an earlier import, so module import order
# determines whether this format actually takes effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - [%(funcName)s:%(lineno)d] - %(message)s'
)
logger = logging.getLogger(__name__)  # module-level logger used throughout this file
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ==================== SERIALIZABLE TYPE DEFINITIONS ====================

# Alias for JSON-serializable string-keyed mappings; pydantic's JsonValue
# covers JSON primitives plus nested lists/dicts. Used for metadata, metrics
# and context payloads passed between pipeline layers.
JSONDict = Dict[str, JsonValue]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
if hasattr(gr.Blocks, "get_api_info"):
    def _suppress_api_info(self):
        # Report an empty API surface instead of letting Gradio introspect
        # and serialize every endpoint's schema.
        return {"named_endpoints": {}, "unnamed_endpoints": []}

    # Monkey-patch on the class, so this affects every gr.Blocks instance in
    # the process. NOTE(review): presumably a workaround for Gradio's API-schema
    # generation failing on this app's complex dataclass types - confirm the
    # underlying issue before removing this patch.
    gr.Blocks.get_api_info = _suppress_api_info
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ==================== STRATEGIC DATA MODELS ====================
|
| 64 |
+
|
| 65 |
+
@dataclass(frozen=True)
class ProcessingRequest:
    """Immutable request container - eliminates parameter coupling.

    Bundles every input the processing pipeline needs so orchestration
    methods take a single argument instead of a long parameter list.
    Frozen: each instance is a read-only snapshot of one user request.
    """

    # Raw bytes of the uploaded document.
    file_content: bytes
    # Upload metadata; 'filename' is read by the conversion pipeline.
    # Exact schema beyond that is caller-defined.
    file_metadata: JSONDict
    # Optional Google Gemini API key; when None, AI analysis is skipped.
    gemini_api_key: Optional[str] = None
    # Analysis mode; converted downstream via AnalysisType(analysis_type).
    analysis_type: str = "quality_analysis"
    # Gemini model identifier; converted downstream via GeminiModel(...).
    model_preference: str = "gemini-1.5-pro"
    # NOTE(review): not referenced by the orchestrator in this module -
    # presumably consumed elsewhere; confirm before removing.
    enable_plugins: bool = False
    # NOTE(review): likewise unused in the visible pipeline.
    azure_endpoint: Optional[str] = None
    # Per-session context; 'session_id' is read for logging.
    session_context: JSONDict = field(default_factory=dict)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@dataclass(frozen=True)
class ProcessingResponse:
    """Standardized response container - predictable interface.

    Every pipeline run ends in exactly one of these, built through the
    two factory classmethods below; callers branch on ``success``.
    """

    success: bool
    conversion_result: Optional[ProcessingResult]
    analysis_result: Optional[Any]
    quality_metrics: JSONDict
    error_details: Optional[str]
    processing_metadata: JSONDict

    @classmethod
    def success_response(
        cls,
        conversion_result: ProcessingResult,
        analysis_result: Any = None,
        quality_metrics: Optional[JSONDict] = None
    ) -> 'ProcessingResponse':
        """Factory method for successful processing."""
        # Normalize a missing/empty metrics mapping to a fresh dict so
        # downstream consumers never see None here.
        metrics = quality_metrics if quality_metrics else {}
        stamp = {'completed_at': datetime.now().isoformat()}
        return cls(
            success=True,
            conversion_result=conversion_result,
            analysis_result=analysis_result,
            quality_metrics=metrics,
            error_details=None,
            processing_metadata=stamp,
        )

    @classmethod
    def error_response(cls, error_message: str, error_context: Optional[JSONDict] = None) -> 'ProcessingResponse':
        """Factory method for error scenarios."""
        # When no context is supplied (or an empty dict is passed), record
        # the failure timestamp instead.
        context = error_context or {'failed_at': datetime.now().isoformat()}
        return cls(
            success=False,
            conversion_result=None,
            analysis_result=None,
            quality_metrics={},
            error_details=error_message,
            processing_metadata=context,
        )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@dataclass
class ApplicationState:
    """Centralized state management - eliminates state scatter.

    One instance per user session; updates produce new instances rather
    than mutating the existing one.
    """

    session_id: str
    processing_history: List[ProcessingResponse] = field(default_factory=list)
    current_gemini_engine_id: Optional[str] = None
    user_preferences: JSONDict = field(default_factory=dict)
    system_metrics: JSONDict = field(default_factory=dict)

    def add_processing_result(self, response: ProcessingResponse) -> 'ApplicationState':
        """Return a copy of this state with *response* appended to history."""
        # Build a fresh list instead of mutating in place - callers treat
        # state objects as immutable snapshots.
        extended_history = [*self.processing_history, response]
        return ApplicationState(
            session_id=self.session_id,
            processing_history=extended_history,
            current_gemini_engine_id=self.current_gemini_engine_id,
            user_preferences=self.user_preferences,
            system_metrics=self.system_metrics,
        )
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ==================== STRATEGIC ABSTRACTION LAYER ====================
|
| 143 |
+
|
| 144 |
+
class ProcessingOrchestrator(Protocol):
    """Interface abstraction - enables component replacement.

    Structural (duck-typed) contract for any object that can run the
    document-processing pipeline; DocumentProcessingOrchestrator is the
    concrete implementation in this module.
    """

    async def process_document(self, request: ProcessingRequest) -> ProcessingResponse:
        """Core processing contract: convert (and optionally analyze) one document."""
        ...

    def get_processing_status(self) -> JSONDict:
        """System health interface: return operational counters and status."""
        ...
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
class UIResponseFactory(Protocol):
    """UI generation abstraction - separates presentation from logic.

    Contract for factories that map pipeline outcomes onto the
    (str, str, str, dict) tuple consumed by the UI layer.
    """

    def create_success_response(self, response: ProcessingResponse) -> Tuple[str, str, str, JSONDict]:
        """Generate UI components for successful processing."""
        ...

    def create_error_response(self, error_message: str) -> Tuple[str, str, str, JSONDict]:
        """Generate UI components for error scenarios."""
        ...
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ==================== CORE ORCHESTRATION IMPLEMENTATION ====================
|
| 169 |
+
|
| 170 |
+
class DocumentProcessingOrchestrator:
|
| 171 |
+
"""
|
| 172 |
+
Strategic orchestration layer - coordinates component interactions
|
| 173 |
+
|
| 174 |
+
Design Principles:
|
| 175 |
+
- Single Responsibility: Document processing coordination only
|
| 176 |
+
- Dependency Injection: All components provided at construction
|
| 177 |
+
- Error Boundary: Comprehensive error handling and recovery
|
| 178 |
+
- Observable: Rich logging and metrics for operational visibility
|
| 179 |
+
"""
|
| 180 |
+
|
| 181 |
+
def __init__(
|
| 182 |
+
self,
|
| 183 |
+
file_handler: StreamlineFileHandler,
|
| 184 |
+
conversion_engine: HFConversionEngine,
|
| 185 |
+
gemini_manager: GeminiConnectionManager,
|
| 186 |
+
viz_engine: InteractiveVisualizationEngine,
|
| 187 |
+
quality_calculator: QualityMetricsCalculator
|
| 188 |
+
):
|
| 189 |
+
self.file_handler = file_handler
|
| 190 |
+
self.conversion_engine = conversion_engine
|
| 191 |
+
self.gemini_manager = gemini_manager
|
| 192 |
+
self.viz_engine = viz_engine
|
| 193 |
+
self.quality_calculator = quality_calculator
|
| 194 |
+
|
| 195 |
+
# Operational metrics
|
| 196 |
+
self.processing_count = 0
|
| 197 |
+
self.error_count = 0
|
| 198 |
+
self.total_processing_time = 0.0
|
| 199 |
+
|
| 200 |
+
async def process_document(self, request: ProcessingRequest) -> ProcessingResponse:
|
| 201 |
+
"""
|
| 202 |
+
Primary processing coordination with comprehensive error handling
|
| 203 |
+
|
| 204 |
+
Strategic Approach:
|
| 205 |
+
1. Input validation and sanitization
|
| 206 |
+
2. Resource availability verification
|
| 207 |
+
3. Processing pipeline execution with checkpoints
|
| 208 |
+
4. Quality assessment and metrics generation
|
| 209 |
+
5. Response standardization and logging
|
| 210 |
+
"""
|
| 211 |
+
|
| 212 |
+
processing_start = datetime.now()
|
| 213 |
+
self.processing_count += 1
|
| 214 |
+
|
| 215 |
+
try:
|
| 216 |
+
logger.info(f"Starting document processing - Session: {request.session_context.get('session_id', 'unknown')}")
|
| 217 |
+
|
| 218 |
+
# Phase 1: Document Ingestion and Validation
|
| 219 |
+
conversion_result = await self._execute_conversion_pipeline(request)
|
| 220 |
+
if not conversion_result.success:
|
| 221 |
+
return ProcessingResponse.error_response(
|
| 222 |
+
f"Conversion failed: {conversion_result.error_message}",
|
| 223 |
+
{"phase": "conversion", "request_metadata": request.file_metadata}
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
# Phase 2: AI Analysis (Optional Enhancement)
|
| 227 |
+
analysis_result = None
|
| 228 |
+
if request.gemini_api_key:
|
| 229 |
+
analysis_result = await self._execute_analysis_pipeline(
|
| 230 |
+
request, conversion_result
|
| 231 |
+
)
|
| 232 |
+
# Note: Analysis failure is non-fatal - system continues with conversion results
|
| 233 |
+
|
| 234 |
+
# Phase 3: Quality Assessment and Metrics Generation
|
| 235 |
+
quality_metrics = self.quality_calculator.calculate_conversion_quality_metrics(
|
| 236 |
+
conversion_result, analysis_result
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
# Phase 4: Response Assembly and Logging
|
| 240 |
+
processing_duration = (datetime.now() - processing_start).total_seconds()
|
| 241 |
+
self.total_processing_time += processing_duration
|
| 242 |
+
|
| 243 |
+
logger.info(f"Processing completed successfully in {processing_duration:.2f}s")
|
| 244 |
+
|
| 245 |
+
return ProcessingResponse.success_response(
|
| 246 |
+
conversion_result=conversion_result,
|
| 247 |
+
analysis_result=analysis_result,
|
| 248 |
+
quality_metrics=quality_metrics
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
except Exception as e:
|
| 252 |
+
self.error_count += 1
|
| 253 |
+
error_duration = (datetime.now() - processing_start).total_seconds()
|
| 254 |
+
|
| 255 |
+
logger.error(f"Processing failed after {error_duration:.2f}s: {str(e)}")
|
| 256 |
+
|
| 257 |
+
return ProcessingResponse.error_response(
|
| 258 |
+
error_message=f"System processing error: {str(e)}",
|
| 259 |
+
error_context={
|
| 260 |
+
"processing_duration": error_duration,
|
| 261 |
+
"error_type": type(e).__name__,
|
| 262 |
+
"processing_phase": "unknown"
|
| 263 |
+
}
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
async def _execute_conversion_pipeline(self, request: ProcessingRequest) -> ProcessingResult:
|
| 267 |
+
"""Isolated conversion processing with resource management"""
|
| 268 |
+
|
| 269 |
+
# Create mock file object for processing
|
| 270 |
+
class ProcessingFile:
|
| 271 |
+
def __init__(self, content: bytes, metadata: JSONDict):
|
| 272 |
+
self.content = content
|
| 273 |
+
self.name = metadata.get('filename', 'uploaded_file')
|
| 274 |
+
self.size = len(content)
|
| 275 |
+
|
| 276 |
+
def read(self) -> bytes:
|
| 277 |
+
return self.content
|
| 278 |
+
|
| 279 |
+
processing_file = ProcessingFile(request.file_content, request.file_metadata)
|
| 280 |
+
|
| 281 |
+
# Execute file processing
|
| 282 |
+
file_result = await self.file_handler.process_upload(
|
| 283 |
+
processing_file,
|
| 284 |
+
metadata_override=request.file_metadata
|
| 285 |
+
)
|
| 286 |
+
if not file_result.success:
|
| 287 |
+
return file_result
|
| 288 |
+
|
| 289 |
+
# Execute document conversion
|
| 290 |
+
conversion_result = await self.conversion_engine.convert_stream(
|
| 291 |
+
request.file_content, request.file_metadata
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
return conversion_result
|
| 295 |
+
|
| 296 |
+
async def _execute_analysis_pipeline(
|
| 297 |
+
self,
|
| 298 |
+
request: ProcessingRequest,
|
| 299 |
+
conversion_result: ProcessingResult
|
| 300 |
+
) -> Optional[Any]:
|
| 301 |
+
"""Isolated AI analysis processing with graceful degradation"""
|
| 302 |
+
|
| 303 |
+
try:
|
| 304 |
+
# Initialize or retrieve Gemini engine
|
| 305 |
+
gemini_config = GeminiConfig(api_key=request.gemini_api_key)
|
| 306 |
+
engine_id = await self.gemini_manager.create_engine(
|
| 307 |
+
request.gemini_api_key, gemini_config
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
engine = self.gemini_manager.get_engine(engine_id)
|
| 311 |
+
if not engine:
|
| 312 |
+
logger.warning("Gemini engine creation failed - proceeding without analysis")
|
| 313 |
+
return None
|
| 314 |
+
|
| 315 |
+
# Execute analysis
|
| 316 |
+
analysis_request = AnalysisRequest(
|
| 317 |
+
content=conversion_result.content,
|
| 318 |
+
analysis_type=AnalysisType(request.analysis_type),
|
| 319 |
+
model=GeminiModel(request.model_preference)
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
analysis_result = await engine.analyze_content(analysis_request)
|
| 323 |
+
|
| 324 |
+
if analysis_result.success:
|
| 325 |
+
logger.info(f"AI analysis completed - Type: {request.analysis_type}")
|
| 326 |
+
return analysis_result
|
| 327 |
+
else:
|
| 328 |
+
logger.warning(f"AI analysis failed: {analysis_result.error_message}")
|
| 329 |
+
return None
|
| 330 |
+
|
| 331 |
+
except Exception as e:
|
| 332 |
+
logger.warning(f"AI analysis pipeline error (non-fatal): {str(e)}")
|
| 333 |
+
return None
|
| 334 |
+
|
| 335 |
+
def get_processing_status(self) -> JSONDict:
    """Report aggregate processing counters as an operational health snapshot.

    Returns:
        Dict with total/error counts, success-rate percentage, timing
        aggregates, and a coarse 'healthy'/'degraded'/'unhealthy' status
        derived from the success rate (>90 healthy, >70 degraded).
    """
    processed = self.processing_count

    if processed > 0:
        success_rate = (processed - self.error_count) / processed * 100
        mean_duration = self.total_processing_time / processed
    else:
        # Nothing processed yet: avoid division by zero, report zeros.
        success_rate = 0
        mean_duration = 0

    if success_rate > 90:
        health = 'healthy'
    elif success_rate > 70:
        health = 'degraded'
    else:
        health = 'unhealthy'

    return {
        'total_documents_processed': processed,
        'success_rate_percent': success_rate,
        'error_count': self.error_count,
        'average_processing_time_seconds': mean_duration,
        'total_processing_time_seconds': self.total_processing_time,
        'status': health
    }
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# ==================== UI PRESENTATION LAYER ====================
|
| 359 |
+
|
| 360 |
+
class GradioResponseFactory:
    """
    Strategic UI generation - separates presentation logic from business logic

    Design Principles:
    - Presentation Separation: UI generation isolated from business logic
    - Consistent Interface: Standardized response patterns
    - Error Communication: Clear, actionable user messaging
    - Progressive Enhancement: Graceful degradation for failed components

    Both public factory methods return the same 4-tuple shape
    (status_html, original_preview_html, markdown_content, metrics_dict),
    matching the four Gradio output components wired to the process button.
    """

    def __init__(self, viz_engine: InteractiveVisualizationEngine):
        # Held for dashboard rendering; accessed by callers via ui_factory.viz_engine.
        self.viz_engine = viz_engine

    def create_success_response(
        self,
        response: ProcessingResponse
    ) -> Tuple[str, str, str, JSONDict]:
        """Generate comprehensive success UI components.

        Args:
            response: Successful ProcessingResponse carrying the conversion
                result and computed quality metrics.

        Returns:
            (status_html, original_preview_html, markdown_content, quick_metrics)
        """

        # Status display with professional formatting.
        # processing_time may be None on the result; `or 0` coerces it for display.
        processing_time = response.conversion_result.processing_time or 0
        content_length = len(response.conversion_result.content)

        status_html = f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 10px 0;">
            <h3 style="margin: 0 0 10px 0;">✅ Processing Completed Successfully</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 15px;">
                <div>
                    <strong>Processing Time:</strong><br/>
                    <span style="font-size: 1.2em;">{processing_time:.2f} seconds</span>
                </div>
                <div>
                    <strong>Content Generated:</strong><br/>
                    <span style="font-size: 1.2em;">{content_length:,} characters</span>
                </div>
                <div>
                    <strong>Quality Score:</strong><br/>
                    <span style="font-size: 1.2em;">{response.quality_metrics.get('composite_score', 0):.1f}/10</span>
                </div>
            </div>
        </div>
        """

        # Document preview with metadata
        original_preview = self._generate_document_preview(response.conversion_result.metadata)

        # Markdown output (raw converted text, shown in the code viewer tab)
        markdown_content = response.conversion_result.content

        # Metrics summary for quick review (JSON tab)
        quick_metrics = self._extract_summary_metrics(response)

        return (
            status_html,
            original_preview,
            markdown_content,
            quick_metrics
        )

    def create_error_response(
        self,
        error_message: str,
        error_context: Optional[JSONDict] = None
    ) -> Tuple[str, str, str, JSONDict]:
        """Generate comprehensive error UI components with actionable guidance.

        Args:
            error_message: Human-readable failure description.
            error_context: Optional dict with 'error_type' / 'processing_phase'
                keys used to enrich the display.

        Returns:
            Same 4-tuple shape as create_success_response; preview and
            markdown slots are empty strings on error.
        """

        # Determine error severity and user guidance
        error_type = error_context.get('error_type', 'Unknown') if error_context else 'Unknown'
        processing_phase = error_context.get('processing_phase', 'unknown') if error_context else 'unknown'

        # Generate user-friendly error messaging via keyword matching on the
        # raw error text (heuristic; falls through to a generic message).
        if 'Gemini' in error_message or 'API' in error_message:
            user_guidance = "This appears to be an AI analysis issue. The document conversion may have succeeded. Check your API key and try again."
        elif 'conversion' in error_message.lower():
            user_guidance = "Document conversion failed. Please verify your file format is supported and try again."
        elif 'resource' in error_message.lower():
            user_guidance = "System resources are currently limited. Try with a smaller file or wait a moment before retrying."
        else:
            user_guidance = "An unexpected error occurred. Please try again or contact support if the problem persists."

        error_html = f"""
        <div style="background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: white; padding: 20px; border-radius: 10px; margin: 10px 0;">
            <h3 style="margin: 0 0 10px 0;">❌ Processing Failed</h3>
            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 8px; margin: 10px 0;">
                <strong>Error Details:</strong><br/>
                {error_message}
            </div>
            <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 8px; margin: 10px 0;">
                <strong>💡 Recommended Action:</strong><br/>
                {user_guidance}
            </div>
            {f'<p><strong>Error Type:</strong> {error_type} | <strong>Phase:</strong> {processing_phase}</p>' if error_context else ''}
        </div>
        """

        return (
            error_html,
            "",  # No preview for errors
            "",  # No markdown content for errors
            {"error": error_message, "timestamp": datetime.now().isoformat()}
        )

    def _generate_document_preview(self, metadata: JSONDict) -> str:
        """Generate professional document metadata preview.

        Reads the 'original_file' sub-dict (filename/size/extension); every
        lookup has a safe default so partial metadata still renders.
        """

        original_file = metadata.get('original_file', {})

        return f"""
        <div style="background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 8px; padding: 20px; margin: 10px 0;">
            <h4 style="color: #495057; margin-bottom: 15px;">📄 Document Information</h4>
            <table style="width: 100%; border-collapse: collapse;">
                <tr style="border-bottom: 1px solid #dee2e6;">
                    <td style="padding: 8px; font-weight: bold; color: #6c757d;">Filename:</td>
                    <td style="padding: 8px;">{original_file.get('filename', 'Unknown')}</td>
                </tr>
                <tr style="border-bottom: 1px solid #dee2e6;">
                    <td style="padding: 8px; font-weight: bold; color: #6c757d;">File Size:</td>
                    <td style="padding: 8px;">{original_file.get('size', 0) / 1024:.1f} KB</td>
                </tr>
                <tr style="border-bottom: 1px solid #dee2e6;">
                    <td style="padding: 8px; font-weight: bold; color: #6c757d;">Format:</td>
                    <td style="padding: 8px;">{original_file.get('extension', 'Unknown').upper()}</td>
                </tr>
                <tr style="border-bottom: 1px solid #dee2e6;">
                    <td style="padding: 8px; font-weight: bold; color: #6c757d;">Processing Date:</td>
                    <td style="padding: 8px;">{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</td>
                </tr>
            </table>
        </div>
        """

    def _extract_summary_metrics(self, response: ProcessingResponse) -> JSONDict:
        """Extract key metrics for UI display.

        Flattens the nested quality_metrics structure into the compact dict
        shown in the "Quick Metrics" JSON tab; all lookups default to 0/{}.
        """

        basic_metrics = response.quality_metrics.get('basic_metrics', {})
        structural_metrics = response.quality_metrics.get('structural_metrics', {})

        return {
            'overall_score': response.quality_metrics.get('composite_score', 0),
            'processing_time': response.conversion_result.processing_time,
            'content_statistics': {
                'total_words': basic_metrics.get('total_words', 0),
                'total_lines': basic_metrics.get('total_lines', 0),
                'total_characters': basic_metrics.get('total_characters', 0)
            },
            'structural_elements': {
                'headers': structural_metrics.get('header_count', 0),
                'lists': structural_metrics.get('list_items', 0),
                'tables': structural_metrics.get('table_rows', 0),
                'links': structural_metrics.get('links', 0)
            },
            # Conditional guards against analysis_result being None; the
            # `is not None` test inside the true-branch is then redundant
            # but harmless.
            'ai_analysis_available': response.analysis_result is not None and response.analysis_result.success if response.analysis_result else False
        }
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
# ==================== MAIN APPLICATION ASSEMBLY ====================
|
| 517 |
+
|
| 518 |
+
class MarkItDownTestingApp:
    """
    Strategic application orchestration - human-scale complexity management

    Core Design Philosophy:
    - Dependency Injection: All components provided at construction
    - Single Responsibility: UI orchestration only
    - Error Boundaries: Comprehensive error handling at interaction level
    - State Management: Immutable state patterns with clear update paths

    This class represents the composition root of the application - where all
    dependencies are wired together and the system boundary is established.
    """

    def __init__(
        self,
        orchestrator: DocumentProcessingOrchestrator,
        ui_factory: GradioResponseFactory,
        initial_state: Optional[ApplicationState] = None
    ):
        """
        Args:
            orchestrator: Business-logic pipeline (conversion + AI analysis).
            ui_factory: Translates processing responses into Gradio outputs.
            initial_state: Optional pre-seeded session state; a fresh one
                keyed by the current timestamp is created when omitted.
        """
        self.orchestrator = orchestrator
        self.ui_factory = ui_factory
        self.app_state = initial_state or ApplicationState(
            session_id=datetime.now().isoformat()
        )

        # Application configuration (branding + upload constraints)
        self.config = {
            'title': 'MarkItDown Testing Platform',
            'version': '2.0.0-enterprise',
            'max_file_size_mb': 50,
            'supported_formats': ['.pdf', '.docx', '.pptx', '.xlsx', '.txt', '.html', '.htm', '.csv', '.json', '.xml']
        }

    def create_interface(self) -> gr.Blocks:
        """
        Gradio interface assembly with modular component design

        Strategic Approach:
        - Component Isolation: Each UI section is self-contained
        - Event Handling: Clean separation between UI events and business logic
        - State Management: Immutable state updates with clear data flow
        - Error Handling: User-friendly error presentation with recovery guidance

        Returns:
            The assembled gr.Blocks application, ready to launch.
        """

        with gr.Blocks(
            title=self.config['title'],
            theme=gr.themes.Soft(),
            analytics_enabled=False
        ) as interface:

            # Per-session application state, threaded through event handlers
            gr_state = gr.State(self.app_state)

            # Main header
            self._create_application_header()

            # Primary interface tabs
            with gr.Tabs():

                # Document Processing Tab
                with gr.TabItem("📁 Document Processing"):
                    processing_components = self._create_processing_interface(gr_state)

                # Analytics Dashboard Tab
                with gr.TabItem("📊 Analysis Dashboard"):
                    analytics_components = self._create_analytics_interface(gr_state)

                # System Status Tab
                with gr.TabItem("⚙️ System Status"):
                    self._create_status_interface()

            # Wire event handlers with clean separation
            self._wire_event_handlers(processing_components, analytics_components, gr_state)

            # Application footer
            self._create_application_footer()

        return interface

    def _create_application_header(self) -> None:
        """Professional application header with branding (pure HTML banner)."""

        gr.HTML(f"""
        <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
                    color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem;">
            <h1 style="margin: 0; font-size: 2.5em;">🚀 {self.config['title']}</h1>
            <p style="margin: 10px 0; font-size: 1.2em;">Enterprise-Grade Document Conversion Testing with AI-Powered Analysis</p>
            <p style="margin: 0; opacity: 0.9;">
                <em>Version {self.config['version']} | Powered by Microsoft MarkItDown & Google Gemini</em>
            </p>
        </div>
        """)

    def _create_processing_interface(self, gr_state: gr.State) -> Dict[str, Any]:
        """Document processing interface with professional UX.

        Returns:
            Dict of the created components, keyed by role, consumed by
            _wire_event_handlers.
        """

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Document Upload & Configuration")

                # File upload — type="binary" delivers raw bytes to the handler
                file_upload = gr.File(
                    label="Select Document",
                    file_types=self.config['supported_formats'],
                    type="binary"
                )

                # Processing configuration
                with gr.Accordion("🔧 Processing Configuration", open=True):
                    gemini_api_key = gr.Textbox(
                        label="Gemini API Key (Optional)",
                        type="password",
                        placeholder="Enter your Google Gemini API key for AI analysis...",
                        info="Leave empty for basic conversion only"
                    )

                    analysis_type = gr.Dropdown(
                        choices=[
                            ("Quality Analysis", "quality_analysis"),
                            ("Structure Review", "structure_review"),
                            ("Content Summary", "content_summary"),
                            ("Extraction Quality", "extraction_quality")
                        ],
                        value="quality_analysis",
                        label="Analysis Type"
                    )

                    model_preference = gr.Dropdown(
                        choices=[
                            ("Gemini 1.5 Pro (Best Quality)", "gemini-1.5-pro"),
                            ("Gemini 1.5 Flash (Faster)", "gemini-1.5-flash")
                        ],
                        value="gemini-1.5-pro",
                        label="AI Model Preference"
                    )

                # Action buttons
                with gr.Row():
                    process_btn = gr.Button(
                        "🚀 Process Document",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "🔄 Clear Session",
                        variant="secondary"
                    )

            with gr.Column(scale=2):
                # Results display area
                gr.Markdown("### 📊 Processing Results")

                status_display = gr.HTML()

                with gr.Tabs():
                    with gr.TabItem("📄 Original Document"):
                        original_preview = gr.HTML()

                    with gr.TabItem("📝 Markdown Output"):
                        markdown_output = gr.Code(
                            language="markdown",
                            show_label=False,
                            interactive=False
                        )

                    with gr.TabItem("📈 Quick Metrics"):
                        quick_metrics = gr.JSON()

        return {
            'file_upload': file_upload,
            'gemini_api_key': gemini_api_key,
            'analysis_type': analysis_type,
            'model_preference': model_preference,
            'process_btn': process_btn,
            'clear_btn': clear_btn,
            'status_display': status_display,
            'original_preview': original_preview,
            'markdown_output': markdown_output,
            'quick_metrics': quick_metrics
        }

    def _create_analytics_interface(self, gr_state: gr.State) -> Dict[str, Any]:
        """Analytics dashboard interface.

        Returns:
            Dict of created components for _wire_event_handlers.
        """

        gr.Markdown("### 📊 Document Analysis Dashboard")

        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Dashboard", variant="secondary")

        with gr.Row():
            quality_dashboard = gr.Plot(label="Quality Analysis Dashboard")

        with gr.Row():
            with gr.Column():
                analysis_summary = gr.Markdown("*Process a document to see analysis results*")
            with gr.Column():
                structure_metrics = gr.JSON(label="Structure Analysis")

        return {
            'refresh_btn': refresh_btn,
            'quality_dashboard': quality_dashboard,
            'analysis_summary': analysis_summary,
            'structure_metrics': structure_metrics
        }

    def _create_status_interface(self) -> None:
        """System status and health monitoring interface (static snapshot at build time)."""

        gr.Markdown("### ⚙️ System Status & Health")

        with gr.Row():
            with gr.Column():
                system_health = gr.JSON(
                    label="System Health Metrics",
                    value=self._get_system_status()
                )

            with gr.Column():
                processing_stats = gr.JSON(
                    label="Processing Statistics",
                    value=self.orchestrator.get_processing_status()
                )

    def _create_application_footer(self) -> None:
        """Professional application footer with attribution links."""

        gr.HTML("""
        <div style="text-align: center; padding: 1rem; color: #6c757d; border-top: 1px solid #dee2e6; margin-top: 2rem;">
            <p>Built with enterprise-grade architecture principles |
            <a href="https://github.com/microsoft/markitdown">Microsoft MarkItDown</a> |
            <a href="https://ai.google.dev/">Google Gemini</a></p>
        </div>
        """)

    def _wire_event_handlers(
        self,
        processing_components: Dict[str, Any],
        analytics_components: Dict[str, Any],
        gr_state: gr.State
    ) -> None:
        """Wire event handlers with clean separation of concerns.

        Connects the three interactive controls (process, clear, refresh) to
        their handler methods; all business logic lives in the handlers.
        """

        # Document processing handler
        processing_components['process_btn'].click(
            fn=self._handle_document_processing,
            inputs=[
                processing_components['file_upload'],
                processing_components['gemini_api_key'],
                processing_components['analysis_type'],
                processing_components['model_preference'],
                gr_state
            ],
            outputs=[
                processing_components['status_display'],
                processing_components['original_preview'],
                processing_components['markdown_output'],
                processing_components['quick_metrics'],
                gr_state
            ],
            show_progress="full"
        )

        # Clear session handler
        processing_components['clear_btn'].click(
            fn=self._handle_session_clear,
            inputs=[gr_state],
            outputs=[
                processing_components['status_display'],
                processing_components['original_preview'],
                processing_components['markdown_output'],
                processing_components['quick_metrics'],
                gr_state
            ]
        )

        # Analytics refresh handler
        analytics_components['refresh_btn'].click(
            fn=self._handle_analytics_refresh,
            inputs=[gr_state],
            outputs=[
                analytics_components['quality_dashboard'],
                analytics_components['analysis_summary'],
                analytics_components['structure_metrics']
            ]
        )

    async def _handle_document_processing(
        self,
        file_obj,
        gemini_api_key: str,
        analysis_type: str,
        model_preference: str,
        current_state: ApplicationState
    ) -> Tuple[str, str, str, JSONDict, ApplicationState]:
        """
        Clean event handler - delegates to orchestrator

        Strategic Design:
        - Input Validation: Comprehensive request validation
        - Business Logic Delegation: All processing logic in orchestrator
        - Error Handling: User-friendly error presentation
        - State Management: Immutable state updates

        Returns:
            (status_html, preview_html, markdown, metrics_dict, new_state)
        """

        # Input validation
        if not file_obj:
            error_response = self.ui_factory.create_error_response(
                "No file uploaded. Please select a document to process."
            )
            return (*error_response, current_state)

        try:
            # Extract file content and metadata. With gr.File(type="binary")
            # file_obj is raw bytes, but a file-like object is also tolerated.
            file_content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
            if isinstance(file_content, str):
                file_content = file_content.encode('utf-8')

            file_metadata = {
                'filename': getattr(file_obj, 'name', 'uploaded_file'),
                'size': len(file_content),
                'extension': Path(getattr(file_obj, 'name', 'file.txt')).suffix.lower(),
                'upload_timestamp': datetime.now().isoformat()
            }

            # Create processing request (empty API key normalized to None)
            processing_request = ProcessingRequest(
                file_content=file_content,
                file_metadata=file_metadata,
                gemini_api_key=gemini_api_key.strip() if gemini_api_key else None,
                analysis_type=analysis_type,
                model_preference=model_preference,
                session_context={'session_id': current_state.session_id}
            )

            # Execute processing through orchestrator
            processing_response = await self.orchestrator.process_document(processing_request)

            # Update application state (immutable add; original state untouched)
            updated_state = current_state.add_processing_result(processing_response)

            # Generate UI response
            if processing_response.success:
                ui_response = self.ui_factory.create_success_response(processing_response)
            else:
                ui_response = self.ui_factory.create_error_response(
                    processing_response.error_details,
                    processing_response.processing_metadata
                )

            return (*ui_response, updated_state)

        except Exception as e:
            logger.error(f"Event handler error: {str(e)}")
            error_response = self.ui_factory.create_error_response(
                f"System error during processing: {str(e)}"
            )
            return (*error_response, current_state)

    def _handle_session_clear(
        self,
        current_state: ApplicationState
    ) -> Tuple[str, str, str, JSONDict, ApplicationState]:
        """Clear session with clean state reset.

        Returns:
            Cleared UI components plus a brand-new ApplicationState.
        """

        # Create fresh application state
        fresh_state = ApplicationState(
            session_id=datetime.now().isoformat()
        )

        # Clear UI components
        clear_html = """
        <div style="background: #e3f2fd; border: 1px solid #2196f3; color: #1976d2;
                    padding: 15px; border-radius: 8px; margin: 10px 0;">
            <h4 style="margin: 0;">🔄 Session Cleared</h4>
            <p style="margin: 5px 0 0 0;">Ready for new document processing.</p>
        </div>
        """

        return (
            clear_html,
            "",  # Clear preview
            "",  # Clear markdown
            {},  # Clear metrics
            fresh_state
        )

    @staticmethod
    def _build_empty_dashboard_figure():
        """Build the placeholder Plotly figure shown when no analytics exist."""
        import plotly.graph_objects as go  # local import matches original style

        fig = go.Figure()
        fig.add_annotation(
            x=0.5, y=0.5,
            xref="paper", yref="paper",
            text="Process documents to see analytics",
            showarrow=False,
            font=dict(size=16, color="gray")
        )
        fig.update_layout(
            title="Analytics Dashboard",
            height=400
        )
        return fig

    def _handle_analytics_refresh(
        self,
        current_state: ApplicationState
    ) -> Tuple[Any, str, JSONDict]:
        """Refresh analytics dashboard with latest data.

        Returns:
            (plotly_figure, summary_markdown, structure_metrics_dict)
        """

        # BUG FIX: the placeholder figure is now built up front. Previously it
        # was only created inside the empty-history branch, so the
        # "no successful result" return and the exception fallback raised
        # NameError on `empty_fig` instead of degrading gracefully.
        empty_fig = self._build_empty_dashboard_figure()

        if not current_state.processing_history:
            # Empty state visualization
            return (
                empty_fig,
                "*Process documents to see detailed analysis*",
                {}
            )

        # Get latest successful processing result
        latest_result = next(
            (result for result in reversed(current_state.processing_history) if result.success),
            None
        )

        if not latest_result:
            return (
                empty_fig,
                "*No successful processing results available*",
                {}
            )

        try:
            # Generate dashboard visualization
            quality_dashboard = self.ui_factory.viz_engine.create_quality_dashboard(
                latest_result.conversion_result,
                latest_result.analysis_result
            )

            # Generate analysis summary
            if latest_result.analysis_result:
                analysis_summary = self._format_analysis_summary(latest_result.analysis_result)
            else:
                analysis_summary = "**Basic conversion completed.** Add Gemini API key for AI-powered analysis."

            # Generate structure metrics
            structure_metrics = latest_result.quality_metrics.get('structural_metrics', {})

            return (
                quality_dashboard,
                analysis_summary,
                structure_metrics
            )

        except Exception as e:
            logger.error(f"Analytics refresh error: {str(e)}")
            return (
                empty_fig,
                f"*Analytics refresh failed: {str(e)}*",
                {"error": str(e)}
            )

    def _format_analysis_summary(self, analysis_result) -> str:
        """Format AI analysis results for user presentation.

        Args:
            analysis_result: Object with .success, .content (dict),
                .analysis_type.value, .model_used.value, .processing_time.

        Returns:
            Markdown summary; a placeholder string when analysis is absent
            or failed.
        """

        if not analysis_result or not analysis_result.success:
            return "*AI analysis not available*"

        content = analysis_result.content
        analysis_type = analysis_result.analysis_type.value.replace('_', ' ').title()

        summary = f"## 🤖 {analysis_type}\n\n"
        summary += f"**Model:** {analysis_result.model_used.value} \n"
        summary += f"**Processing Time:** {analysis_result.processing_time:.2f}s\n\n"

        # Extract key insights based on analysis type
        if 'overall_score' in content:
            summary += f"### 📊 Quality Assessment\n"
            summary += f"**Overall Score:** {content.get('overall_score', 0)}/10\n\n"

            scores = []
            if 'structure_score' in content:
                scores.append(f"Structure: {content['structure_score']}/10")
            if 'completeness_score' in content:
                scores.append(f"Completeness: {content['completeness_score']}/10")
            if 'accuracy_score' in content:
                scores.append(f"Accuracy: {content['accuracy_score']}/10")

            if scores:
                summary += "**Detailed Scores:** " + " | ".join(scores) + "\n\n"

        if 'executive_summary' in content:
            summary += f"### 📋 Executive Summary\n{content['executive_summary']}\n\n"

        if 'detailed_feedback' in content:
            # Truncate long feedback to keep the dashboard readable
            feedback = content['detailed_feedback'][:300]
            summary += f"### 💡 Key Insights\n{feedback}{'...' if len(content['detailed_feedback']) > 300 else ''}\n\n"

        if 'recommendations' in content and content['recommendations']:
            summary += f"### 🎯 Recommendations\n"
            for i, rec in enumerate(content['recommendations'][:3], 1):
                summary += f"{i}. {rec}\n"

        return summary

    def _get_system_status(self) -> JSONDict:
        """Get comprehensive system status information.

        Uses psutil when available; any failure degrades to an 'Unknown'
        status payload rather than raising.
        """

        try:
            import psutil
            memory = psutil.virtual_memory()
            cpu_percent = psutil.cpu_percent(interval=1)

            return {
                'system': {
                    'status': 'Operational',
                    'cpu_usage_percent': cpu_percent,
                    'memory_usage_percent': memory.percent,
                    'available_memory_gb': round(memory.available / (1024**3), 2),
                    'platform': os.name
                },
                'application': {
                    'version': self.config['version'],
                    'max_file_size_mb': self.config['max_file_size_mb'],
                    'supported_formats': len(self.config['supported_formats']),
                    'session_id': self.app_state.session_id
                },
                'processing': self.orchestrator.get_processing_status()
            }
        except Exception as e:
            return {
                'system': {'status': 'Unknown', 'error': str(e)},
                'application': {'version': self.config['version']},
                'processing': {'status': 'Unknown'}
            }
|
| 1050 |
+
|
| 1051 |
+
|
| 1052 |
+
# ==================== APPLICATION FACTORY & COMPOSITION ROOT ====================
|
| 1053 |
+
|
| 1054 |
+
class ApplicationFactory:
    """
    Strategic application composition - dependency injection container

    Design Principles:
    - Composition Root: Single location for all dependency wiring
    - Environment Awareness: Different configurations for different environments
    - Component Lifecycle: Proper initialization order and cleanup
    - Configuration Management: Centralized configuration with validation
    """

    @staticmethod
    def _assemble_app(processing_config: ProcessingConfig) -> MarkItDownTestingApp:
        """Wire every component around the given processing configuration.

        Single composition point shared by all environment-specific
        factories, so per-environment configuration differences actually
        take effect instead of being discarded.
        """
        # Resource management
        resource_manager = ResourceManager(processing_config)

        # Document processing components
        file_handler = StreamlineFileHandler(resource_manager)
        conversion_engine = HFConversionEngine(resource_manager, processing_config)

        # AI analysis components
        gemini_manager = GeminiConnectionManager()

        # Analytics and visualization
        viz_config = VisualizationConfig(
            theme=VisualizationConfig.VisualizationTheme.CORPORATE,
            width=800,
            height=600
        )
        viz_engine = InteractiveVisualizationEngine(viz_config)
        quality_calculator = QualityMetricsCalculator()

        # Core orchestrator
        orchestrator = DocumentProcessingOrchestrator(
            file_handler=file_handler,
            conversion_engine=conversion_engine,
            gemini_manager=gemini_manager,
            viz_engine=viz_engine,
            quality_calculator=quality_calculator
        )

        # UI presentation layer
        ui_factory = GradioResponseFactory(viz_engine)

        # Application assembly
        return MarkItDownTestingApp(
            orchestrator=orchestrator,
            ui_factory=ui_factory
        )

    @staticmethod
    def create_hf_spaces_app() -> MarkItDownTestingApp:
        """
        Factory method for HF Spaces optimized application

        Optimizations:
        - Resource Management: Configured for 16GB memory limit
        - Processing Timeouts: Appropriate for shared infrastructure
        - Error Recovery: Graceful degradation under resource pressure
        - Logging Configuration: Production-appropriate logging levels
        """
        logger.info("Initializing MarkItDown Testing Platform for HF Spaces deployment")

        # Core configuration tuned for the HF Spaces shared tier.
        processing_config = ProcessingConfig(
            max_file_size_mb=50,
            max_memory_usage_gb=12.0,
            processing_timeout=300,
            max_concurrent_processes=2
        )

        app = ApplicationFactory._assemble_app(processing_config)

        logger.info("Application initialized successfully - Ready for HF Spaces deployment")
        return app

    @staticmethod
    def create_local_development_app() -> MarkItDownTestingApp:
        """Factory method for local development with enhanced debugging"""

        # Enhanced configuration for local development (larger files,
        # more memory headroom, longer timeouts, more parallelism).
        processing_config = ProcessingConfig(
            max_file_size_mb=100,
            max_memory_usage_gb=32.0,
            processing_timeout=600,
            max_concurrent_processes=4
        )

        # Enable debug logging for development
        logging.getLogger().setLevel(logging.DEBUG)

        # BUG FIX: this method previously delegated to create_hf_spaces_app(),
        # silently discarding the development configuration built above.
        return ApplicationFactory._assemble_app(processing_config)
|
| 1144 |
+
|
| 1145 |
+
|
| 1146 |
+
# ==================== ENVIRONMENT SETUP & CONFIGURATION ====================
|
| 1147 |
+
|
| 1148 |
+
def setup_production_environment() -> None:
    """Configure the production environment for optimal performance.

    Sets HF-Spaces-friendly environment defaults, initialises logging,
    and logs a warning when available system memory looks tight.
    """

    # Environment variables for HF Spaces (setdefault keeps user overrides).
    defaults = (
        ('GRADIO_TEMP_DIR', '/tmp'),
        ('HF_HOME', '/tmp'),
        ('PYTHONUNBUFFERED', '1'),
    )
    for key, value in defaults:
        os.environ.setdefault(key, value)

    # Logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # System resource verification (best effort; psutil is optional).
    try:
        import psutil

        memory = psutil.virtual_memory()
        logger.info(f"Production environment initialized - Available memory: {memory.available / (1024**3):.2f} GB")

        if memory.available < 2 * (1024**3):  # Less than 2GB available
            logger.warning("Low memory detected - enabling aggressive cleanup policies")

    except ImportError:
        logger.warning("psutil not available - resource monitoring disabled")
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
def create_gradio_app() -> gr.Blocks:
    """
    Main application factory for Gradio deployment.

    Primary entry point intended to be called by Gradio's deployment
    infrastructure: prepares the environment, assembles the application,
    and returns its Blocks interface.
    """
    setup_production_environment()

    # Build the fully wired application, then expose its UI.
    application = ApplicationFactory.create_hf_spaces_app()
    return application.create_interface()
|
| 1192 |
+
|
| 1193 |
+
|
| 1194 |
+
# ==================== MAIN ENTRY POINT ====================
|
| 1195 |
+
|
| 1196 |
+
def main():
    """
    Main application entry point for direct execution.

    Prepares the production environment, assembles the HF Spaces
    application, and launches the Gradio interface with deployment
    settings appropriate for that platform.
    """
    setup_production_environment()

    # Create and configure application
    app = ApplicationFactory.create_hf_spaces_app()
    interface = app.create_interface()

    # Launch configuration optimized for HF Spaces
    port = int(os.environ.get('PORT', 7860))
    launch_kwargs = {
        'server_name': '0.0.0.0',
        'server_port': port,
        'share': False,  # HF Spaces handles sharing
        'show_error': True,
        'max_file_size': f"{50 * 1024 * 1024}b",  # 50MB limit
        'allowed_paths': ['/tmp'],
        'root_path': os.environ.get('GRADIO_ROOT_PATH', ''),
    }

    # Launch application; re-raise so the process exits non-zero on failure.
    try:
        logger.info(f"Launching MarkItDown Testing Platform on port {launch_kwargs['server_port']}")
        interface.launch(**launch_kwargs)
    except Exception as e:
        logger.error(f"Application launch failed: {str(e)}")
        raise
|
| 1228 |
+
|
| 1229 |
+
|
| 1230 |
+
# ==================== MODULE INTERFACE ====================
|
| 1231 |
+
|
| 1232 |
+
# Public API for external integration
|
| 1233 |
+
__all__ = [
|
| 1234 |
+
'MarkItDownTestingApp',
|
| 1235 |
+
'ApplicationFactory',
|
| 1236 |
+
'ProcessingRequest',
|
| 1237 |
+
'ProcessingResponse',
|
| 1238 |
+
'create_gradio_app',
|
| 1239 |
+
'main'
|
| 1240 |
+
]
|
| 1241 |
+
|
| 1242 |
+
|
| 1243 |
+
if __name__ == "__main__":
|
| 1244 |
+
main()
|
core/modules.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enterprise-Grade Core Modules for MarkItDown Testing Platform
|
| 3 |
+
|
| 4 |
+
Strategic Design Philosophy:
|
| 5 |
+
- Stateless architecture for HF Spaces optimization
|
| 6 |
+
- Resource-aware processing with automatic cleanup
|
| 7 |
+
- Comprehensive error handling and recovery mechanisms
|
| 8 |
+
- Modular design enabling easy component replacement
|
| 9 |
+
|
| 10 |
+
This module implements the foundational processing layer with strict
|
| 11 |
+
separation of concerns and enterprise-grade error handling.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import asyncio
|
| 15 |
+
import tempfile
|
| 16 |
+
import shutil
|
| 17 |
+
import os
|
| 18 |
+
import gc
|
| 19 |
+
import json
|
| 20 |
+
import logging
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Dict, Optional, List, Union, AsyncGenerator
|
| 24 |
+
from dataclasses import dataclass, asdict
|
| 25 |
+
from contextlib import asynccontextmanager
|
| 26 |
+
|
| 27 |
+
import aiofiles
|
| 28 |
+
from markitdown import MarkItDown
|
| 29 |
+
import google.generativeai as genai
|
| 30 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 31 |
+
try:
|
| 32 |
+
import magic
|
| 33 |
+
except ImportError:
|
| 34 |
+
magic = None
|
| 35 |
+
import mimetypes
|
| 36 |
+
import psutil
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Shared JSON type alias used by processing results and metadata payloads
|
| 40 |
+
from pydantic import JsonValue
|
| 41 |
+
|
| 42 |
+
JSONDict = Dict[str, JsonValue]
|
| 43 |
+
|
| 44 |
+
# Strategic Configuration Management
|
| 45 |
+
@dataclass
class ProcessingConfig:
    """Centralized configuration for processing parameters."""

    # File and memory ceilings (tuned for the HF Spaces shared tier).
    max_file_size_mb: int = 50
    max_memory_usage_gb: float = 12.0
    # Housekeeping cadence and concurrency limit.
    temp_cleanup_interval: int = 300  # seconds
    max_concurrent_processes: int = 3
    # Timeouts (seconds) and retry policy.
    processing_timeout: int = 300
    gemini_timeout: int = 60
    retry_attempts: int = 3
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass
class ProcessingResult:
    """Standardized result container for all processing operations."""

    # Outcome flag plus the produced payload ('' on failure).
    success: bool
    content: str
    # Operation-specific details (file info, conversion stats, ...).
    metadata: JSONDict
    # Populated only when success is False.
    error_message: Optional[str] = None
    # Wall-clock duration in seconds, when measured.
    processing_time: Optional[float] = None
    # Snapshot of system resource usage, when captured.
    resource_usage: Optional[JSONDict] = None
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ResourceManager:
    """
    Enterprise-grade resource management for HF Spaces constraints.

    Tracks in-flight processing tasks and temporary directories, enforces
    file-size / memory / concurrency ceilings before work begins, and
    guarantees temp-directory cleanup through async context management.
    """

    def __init__(self, config: ProcessingConfig):
        self.config = config
        self.active_processes = set()    # ids of tasks currently converting
        self.temp_directories = set()    # temp dirs awaiting cleanup

    def check_resource_availability(self, file_size_bytes: int) -> bool:
        """Validate resource availability before processing.

        Raises ResourceError if any ceiling (file size, process memory,
        free system memory, or concurrency) would be exceeded; returns
        True otherwise.
        """
        size_mb = file_size_bytes / (1024 * 1024)
        if size_mb > self.config.max_file_size_mb:
            raise ResourceError(
                f"File size {size_mb:.2f}MB exceeds limit {self.config.max_file_size_mb}MB"
            )

        vm = psutil.virtual_memory()
        rss_gb = psutil.Process(os.getpid()).memory_info().rss / (1024**3)

        if rss_gb > self.config.max_memory_usage_gb:
            raise ResourceError(
                f"Process memory usage {rss_gb:.2f}GB exceeds limit {self.config.max_memory_usage_gb:.2f}GB"
            )

        free_gb = vm.available / (1024**3)
        if free_gb < 1.0:
            raise ResourceError(
                f"Low system memory available: {free_gb:.2f}GB"
            )

        if len(self.active_processes) >= self.config.max_concurrent_processes:
            raise ResourceError("Maximum concurrent processes exceeded")

        return True

    @asynccontextmanager
    async def managed_temp_directory(self):
        """Yield a fresh temp directory and remove it on exit, even on error."""
        temp_dir = tempfile.mkdtemp(prefix="markitdown_")
        self.temp_directories.add(temp_dir)
        try:
            yield temp_dir
        finally:
            await self._cleanup_directory(temp_dir)
            self.temp_directories.discard(temp_dir)

    async def _cleanup_directory(self, directory: str):
        """Best-effort removal of a temporary directory (never raises)."""
        try:
            if os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
        except Exception as e:
            logging.warning(f"Cleanup warning for {directory}: {e}")

    async def force_cleanup(self):
        """Emergency cleanup of all managed resources."""
        pending = [
            self._cleanup_directory(directory)
            for directory in list(self.temp_directories)
        ]
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)

        # Force garbage collection so memory is reclaimed promptly.
        gc.collect()

        self.temp_directories.clear()
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class StreamlineFileHandler:
    """
    Memory-efficient file processing optimized for HF Spaces.

    Validates uploads (size, format, MIME consistency), reads their raw
    bytes, and records per-operation resource usage. All failures are
    funneled into a non-raising ProcessingResult.
    """

    def __init__(self, resource_manager: ResourceManager):
        self.resource_manager = resource_manager
        # Extensions this handler will pass on to MarkItDown.
        self.supported_formats = {
            '.pdf', '.docx', '.pptx', '.xlsx', '.txt',
            '.html', '.htm', '.csv', '.json', '.xml', '.rtf'
        }

    async def process_upload(self, file_obj, metadata_override: Optional[JSONDict] = None) -> ProcessingResult:
        """Validate and ingest an uploaded file, returning a ProcessingResult."""
        started = datetime.now()

        try:
            info = self._extract_file_metadata(file_obj)

            if metadata_override:
                # Merge provided metadata, prioritising supplied values.
                for key, value in metadata_override.items():
                    if value in (None, ""):
                        continue
                    info[key] = value

                # Recalculate the support flag using the final extension.
                ext = info.get('extension', '').lower()
                if ext:
                    if not ext.startswith('.'):
                        ext = f'.{ext}'
                    info['extension'] = ext
                info['supported'] = info.get('extension') in self.supported_formats

            # Resource availability check (raises ResourceError on violation).
            self.resource_manager.check_resource_availability(info['size'])

            # Security validation (raises SecurityError for bad formats).
            await self._validate_file_security(file_obj, info)

            # Read file content efficiently.
            payload = await self._read_file_content(file_obj)

            elapsed = (datetime.now() - started).total_seconds()
            return ProcessingResult(
                success=True,
                content=payload,
                metadata=info,
                processing_time=elapsed,
                resource_usage=self._get_current_resource_usage()
            )

        except Exception as e:
            return ProcessingResult(
                success=False,
                content="",
                metadata={},
                error_message=str(e),
                processing_time=(datetime.now() - started).total_seconds()
            )

    def _extract_file_metadata(self, file_obj) -> JSONDict:
        """Derive filename, extension, size, MIME type, and support flag."""
        path = Path(file_obj.name) if hasattr(file_obj, 'name') else Path("unknown")
        suffix = path.suffix.lower()

        return {
            'filename': path.name,
            'extension': suffix,
            'size': getattr(file_obj, 'size', 0),
            'mime_type': self._detect_mime_type(file_obj),
            'timestamp': datetime.now().isoformat(),
            'supported': suffix in self.supported_formats
        }

    def _detect_mime_type(self, file_obj) -> str:
        """Detect MIME type, preferring python-magic, then filename guess."""
        detected = None

        if magic is not None and hasattr(file_obj, 'read'):
            try:
                position = file_obj.tell() if hasattr(file_obj, 'tell') else 0
                head = file_obj.read(1024)
                if hasattr(file_obj, 'seek'):
                    # Restore the stream so later readers see the full file.
                    file_obj.seek(position)
                detected = magic.from_buffer(head, mime=True) if head else None
            except Exception:
                detected = None

        if not detected:
            name = getattr(file_obj, 'name', None)
            if name:
                detected = mimetypes.guess_type(name)[0]

        return detected or 'application/octet-stream'

    async def _validate_file_security(self, file_obj, file_info: JSONDict):
        """Reject unsupported formats; log (not fail) MIME inconsistencies."""
        # File extension validation
        if not file_info['supported']:
            raise SecurityError(f"Unsupported file format: {file_info['extension']}")

        # MIME type consistency check (top-level type only, e.g. 'text').
        expected_mimes = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.txt': 'text/plain',
            '.html': 'text/html',
            '.htm': 'text/html'
        }
        expected = expected_mimes.get(file_info['extension'])
        if expected and not file_info['mime_type'].startswith(expected.split('/')[0]):
            logging.warning(f"MIME type mismatch for {file_info['extension']}")

    async def _read_file_content(self, file_obj) -> bytes:
        """Read the complete raw payload from a variety of file-like objects."""
        if hasattr(file_obj, 'read'):
            # Rewind first (if possible) so the full payload is captured.
            if hasattr(file_obj, 'seek'):
                file_obj.seek(0)
            return file_obj.read()

        # Handle wrappers that expose an inner file (e.g. upload objects).
        if hasattr(file_obj, 'file'):
            return file_obj.file.read()

        raise ValueError("Unable to read file content")

    def _get_current_resource_usage(self) -> JSONDict:
        """Snapshot current system memory and CPU usage."""
        vm = psutil.virtual_memory()
        cpu_load = psutil.cpu_percent(interval=0.1)

        return {
            'memory_used_gb': vm.used / (1024**3),
            'memory_available_gb': vm.available / (1024**3),
            'cpu_percent': cpu_load,
            'timestamp': datetime.now().isoformat()
        }
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
class HFConversionEngine:
    """
    MarkItDown wrapper optimized for stateless HF Spaces execution.

    Strategic Design Features:
    - Async processing with progress tracking
    - Automatic resource cleanup and memory management
    - Comprehensive error handling and retry mechanisms
    - Performance monitoring and optimization
    """

    def __init__(self, resource_manager: ResourceManager, config: ProcessingConfig):
        self.resource_manager = resource_manager
        self.config = config
        # One MarkItDown instance is reused across conversions.
        self.md = MarkItDown()

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10)
    )
    async def convert_stream(self, file_content: bytes, file_metadata: JSONDict) -> ProcessingResult:
        """Stream-based conversion with automatic cleanup and retry logic.

        Writes the payload into a managed temp directory, runs MarkItDown
        in a worker thread under a timeout, and returns a ProcessingResult
        (failures are captured, never raised to the caller).
        """
        start_time = datetime.now()
        process_id = id(asyncio.current_task())
        self.resource_manager.active_processes.add(process_id)

        try:
            async with self.resource_manager.managed_temp_directory() as temp_dir:
                # Create temporary file for MarkItDown processing
                temp_file_path = await self._create_temp_file(
                    temp_dir, file_content, file_metadata
                )

                # BUG FIX: conversion was previously bounded by
                # config.gemini_timeout (60s), which is the budget for
                # Gemini API calls. Document conversion gets the dedicated
                # processing_timeout (300s) so large PDFs no longer time
                # out spuriously.
                result = await asyncio.wait_for(
                    self._execute_conversion(temp_file_path),
                    timeout=self.config.processing_timeout
                )

                processing_time = (datetime.now() - start_time).total_seconds()

                return ProcessingResult(
                    success=True,
                    content=result.text_content,
                    metadata={
                        'original_file': file_metadata,
                        'conversion_time': processing_time,
                        'content_length': len(result.text_content),
                        'conversion_metadata': self._extract_conversion_metadata(result)
                    },
                    processing_time=processing_time
                )

        except Exception as e:
            return ProcessingResult(
                success=False,
                content="",
                metadata=file_metadata,
                error_message=f"Conversion failed: {str(e)}",
                processing_time=(datetime.now() - start_time).total_seconds()
            )

        finally:
            self.resource_manager.active_processes.discard(process_id)

    async def _create_temp_file(self, temp_dir: str, content: bytes, metadata: JSONDict) -> str:
        """Write the raw upload bytes into temp_dir and return the file path."""
        filename = metadata.get('filename', 'temp_file')
        temp_file_path = os.path.join(temp_dir, filename)

        async with aiofiles.open(temp_file_path, 'wb') as temp_file:
            await temp_file.write(content)

        return temp_file_path

    async def _execute_conversion(self, file_path: str):
        """Run the (blocking) MarkItDown conversion in the default executor."""
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None, self.md.convert, file_path
        )

    def _extract_conversion_metadata(self, result) -> JSONDict:
        """Derive simple structural statistics from the converted markdown."""
        content = result.text_content

        return {
            'lines_count': len(content.split('\n')),
            'word_count': len(content.split()),
            'character_count': len(content),
            'has_tables': '|' in content,          # crude pipe-table heuristic
            'has_headers': content.count('#') > 0,
            'has_lists': content.count('-') > 0 or content.count('*') > 0,
            'has_links': '[' in content and '](' in content
        }
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
# Custom exception hierarchy: each class marks a distinct failure domain
# so callers can catch precisely what they can handle.

class ResourceError(Exception):
    """Raised when a file-size, memory, or concurrency ceiling is exceeded."""


class SecurityError(Exception):
    """Raised when an upload fails format or MIME validation."""


class ConversionError(Exception):
    """Raised when MarkItDown fails to convert a document."""
|
examples/usage_examples.py
ADDED
|
@@ -0,0 +1,1159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MarkItDown Testing Platform - Usage Examples and Testing Suite
|
| 3 |
+
|
| 4 |
+
This module provides comprehensive examples and testing capabilities for the
|
| 5 |
+
MarkItDown Testing Platform, demonstrating various use cases and validation scenarios.
|
| 6 |
+
|
| 7 |
+
Strategic Examples Coverage:
|
| 8 |
+
- Basic document conversion workflows
|
| 9 |
+
- Advanced AI analysis integration
|
| 10 |
+
- Performance benchmarking and optimization
|
| 11 |
+
- Enterprise integration patterns
|
| 12 |
+
- Error handling and recovery scenarios
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import asyncio
|
| 16 |
+
import tempfile
|
| 17 |
+
import json
|
| 18 |
+
import time
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Dict, List, Any, Optional, Tuple
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
# Import platform components
|
| 25 |
+
from core.modules import (
|
| 26 |
+
StreamlineFileHandler, HFConversionEngine, ResourceManager,
|
| 27 |
+
ProcessingConfig, ProcessingResult
|
| 28 |
+
)
|
| 29 |
+
from llm.gemini_connector import (
|
| 30 |
+
GeminiAnalysisEngine, GeminiConfig, AnalysisRequest,
|
| 31 |
+
AnalysisType, GeminiModel, create_analysis_request
|
| 32 |
+
)
|
| 33 |
+
from visualization.analytics_engine import (
|
| 34 |
+
InteractiveVisualizationEngine, QualityMetricsCalculator,
|
| 35 |
+
VisualizationConfig
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# Configure logging
# Module-wide logging configured once at import time; INFO level keeps the
# test-suite progress messages visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class DocumentSampleGenerator:
    """Generate test documents for comprehensive platform testing.

    All generators are pure static factories: they take no arguments and
    return the document body as a string, so callers decide where (and
    whether) to persist it to disk.
    """

    @staticmethod
    def create_test_html() -> str:
        """Create comprehensive HTML test document.

        The document deliberately exercises headers, nested lists, tables,
        code blocks, links, inline formatting, and non-ASCII characters
        (emoji status markers) so conversion quality checks have something
        to measure. The footer timestamp is injected via ``str.replace``
        rather than an f-string because the literal is full of brace
        characters (CSS rules) that would otherwise need escaping.
        """

        return """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MarkItDown Test Document</title>
    <style>
        .highlight { background-color: yellow; }
        .important { font-weight: bold; color: red; }
    </style>
</head>
<body>
    <h1>Enterprise Document Conversion Test</h1>
    <p class="important">This is a comprehensive test document for MarkItDown platform validation.</p>

    <h2>Document Structure Testing</h2>
    <p>This section tests various structural elements and their conversion accuracy.</p>

    <h3>List Testing</h3>
    <h4>Unordered Lists</h4>
    <ul>
        <li>Primary list item with <strong>bold text</strong></li>
        <li>Secondary item with <em>italic formatting</em></li>
        <li>Nested list testing:
            <ul>
                <li>Nested item 1</li>
                <li>Nested item 2 with <a href="https://example.com">external link</a></li>
            </ul>
        </li>
        <li>Code reference: <code>function processDocument()</code></li>
    </ul>

    <h4>Ordered Lists</h4>
    <ol>
        <li>First priority task</li>
        <li>Second priority with emphasis: <span class="highlight">critical deadline</span></li>
        <li>Third priority item</li>
    </ol>

    <h3>Table Structure Testing</h3>
    <table border="1" style="border-collapse: collapse; width: 100%;">
        <thead>
            <tr style="background-color: #f2f2f2;">
                <th>Feature</th>
                <th>Status</th>
                <th>Priority</th>
                <th>Notes</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Document Conversion</td>
                <td>✅ Complete</td>
                <td>High</td>
                <td>Core functionality working</td>
            </tr>
            <tr>
                <td>AI Analysis</td>
                <td>🔄 In Progress</td>
                <td>High</td>
                <td>Gemini integration active</td>
            </tr>
            <tr>
                <td>Visualization</td>
                <td>✅ Complete</td>
                <td>Medium</td>
                <td>Interactive dashboards ready</td>
            </tr>
            <tr>
                <td>Export Features</td>
                <td>⏳ Planned</td>
                <td>Low</td>
                <td>Multiple format support</td>
            </tr>
        </tbody>
    </table>

    <h3>Code Block Testing</h3>
    <p>Example Python integration code:</p>
    <pre><code>
from markitdown import MarkItDown
from gemini_connector import GeminiAnalysisEngine

async def process_document(file_path, api_key):
    # Initialize components
    md = MarkItDown()
    gemini = GeminiAnalysisEngine(api_key)

    # Convert document
    result = md.convert(file_path)

    # Analyze with AI
    analysis = await gemini.analyze_content(result.text_content)

    return result, analysis
    </code></pre>

    <h3>Link and Reference Testing</h3>
    <p>This section contains various types of references:</p>
    <ul>
        <li>External link: <a href="https://github.com/microsoft/markitdown">Microsoft MarkItDown Repository</a></li>
        <li>Email reference: <a href="mailto:[email protected]">Technical Support</a></li>
        <li>Internal reference: <a href="#document-structure-testing">Jump to Structure Section</a></li>
        <li>Document reference: See the <a href="./documentation.pdf">full documentation</a> for details</li>
    </ul>

    <h3>Special Formatting Testing</h3>
    <div>
        <p><strong>Bold text emphasis</strong> and <em>italic styling</em> combined with <u>underlined content</u>.</p>
        <p><del>Strikethrough text</del> and <mark>highlighted content</mark> for attention.</p>
        <p>Mathematical notation: E = mc<sup>2</sup> and chemical formula: H<sub>2</sub>O.</p>
    </div>

    <h2>Content Quality Assessment</h2>
    <blockquote style="border-left: 4px solid #ccc; padding-left: 16px; font-style: italic;">
        "Quality is not an act, it is a habit. The systematic approach to document conversion
        and analysis ensures consistent, reliable results across diverse content types and formats."
    </blockquote>

    <h3>Technical Specifications</h3>
    <div style="background-color: #f9f9f9; padding: 15px; border-radius: 5px;">
        <h4>Processing Requirements:</h4>
        <ul>
            <li><strong>Maximum File Size:</strong> 50MB (HF Spaces limit)</li>
            <li><strong>Supported Formats:</strong> PDF, DOCX, PPTX, XLSX, HTML, TXT, CSV, JSON, XML</li>
            <li><strong>Processing Timeout:</strong> 5 minutes maximum</li>
            <li><strong>Memory Usage:</strong> Optimized for 16GB constraint</li>
        </ul>
    </div>

    <h2>Integration Examples</h2>
    <p>The following examples demonstrate enterprise integration patterns:</p>

    <h3>Batch Processing Workflow</h3>
    <ol>
        <li>Document ingestion from multiple sources</li>
        <li>Automated quality validation pipeline</li>
        <li>AI-powered content analysis and enhancement</li>
        <li>Structured output generation for downstream systems</li>
        <li>Comprehensive reporting and analytics</li>
    </ol>

    <footer style="margin-top: 50px; padding-top: 20px; border-top: 1px solid #ccc;">
        <p><em>Generated for MarkItDown Testing Platform - Version 1.0.0</em></p>
        <p><strong>Document ID:</strong> TEST-DOC-001 | <strong>Created:</strong> {timestamp}</p>
    </footer>
</body>
</html>
""".replace('{timestamp}', datetime.now().isoformat())

    @staticmethod
    def create_test_json() -> str:
        """Create structured JSON test data.

        Returns a pretty-printed (indent=2) JSON string describing the test
        configuration, scenarios, quality-metric weights, and the outputs
        the suite expects. The ``created`` field is freshly stamped at call
        time, so two invocations are not byte-identical.
        """

        return json.dumps({
            "document_metadata": {
                "title": "MarkItDown Test Configuration",
                "version": "1.0.0",
                # Fresh timestamp per call — output is intentionally non-deterministic here.
                "created": datetime.now().isoformat(),
                "description": "Comprehensive test data for platform validation"
            },
            "processing_config": {
                "max_file_size_mb": 50,
                "timeout_seconds": 300,
                "supported_formats": [
                    "pdf", "docx", "pptx", "xlsx",
                    "html", "txt", "csv", "json", "xml"
                ],
                "ai_analysis": {
                    "enabled": True,
                    "models": ["gemini-1.5-pro", "gemini-1.5-flash"],
                    "analysis_types": [
                        "quality_analysis",
                        "structure_review",
                        "content_summary",
                        "extraction_quality"
                    ]
                }
            },
            "test_scenarios": [
                {
                    "name": "Basic Document Conversion",
                    "description": "Test core MarkItDown functionality",
                    "expected_elements": [
                        "headers", "paragraphs", "lists", "tables", "links"
                    ],
                    "quality_threshold": 7.0
                },
                {
                    "name": "AI Analysis Integration",
                    "description": "Test Gemini API integration",
                    "required_api_key": True,
                    "expected_analysis": [
                        "overall_score", "detailed_feedback", "recommendations"
                    ],
                    "quality_threshold": 8.0
                },
                {
                    "name": "Performance Benchmarking",
                    "description": "Test processing speed and resource usage",
                    "metrics": [
                        "processing_time", "memory_usage", "cpu_utilization"
                    ],
                    "performance_threshold": {
                        "processing_time_seconds": 60,
                        "memory_usage_mb": 1000
                    }
                }
            ],
            # Weights sum to 1.0 across the four quality dimensions.
            "quality_metrics": {
                "structural_integrity": {
                    "weight": 0.3,
                    "components": ["headers", "lists", "tables", "formatting"]
                },
                "content_preservation": {
                    "weight": 0.25,
                    "components": ["text_accuracy", "link_preservation", "data_integrity"]
                },
                "ai_analysis_quality": {
                    "weight": 0.25,
                    "components": ["insight_depth", "recommendation_quality", "accuracy"]
                },
                "processing_efficiency": {
                    "weight": 0.2,
                    "components": ["speed", "resource_usage", "reliability"]
                }
            },
            "expected_outputs": {
                "markdown_conversion": {
                    "min_length": 1000,
                    "required_elements": ["# ", "## ", "- ", "| "],
                    "quality_indicators": ["proper_escaping", "structure_preservation"]
                },
                "ai_analysis": {
                    "required_fields": ["overall_score", "detailed_feedback"],
                    "score_range": [0, 10],
                    "feedback_min_length": 100
                },
                "visualization": {
                    "chart_types": ["radar", "bar", "treemap", "line"],
                    "interactive_elements": True,
                    "export_formats": ["html", "png", "svg"]
                }
            }
        }, indent=2)

    @staticmethod
    def create_test_csv() -> str:
        """Create CSV test data with various data types.

        Ten employee rows mixing strings, ints, floats, ISO dates, and a
        quoted free-text Notes column (embedded commas) to exercise CSV
        parsing during conversion. Includes a header row and a trailing
        newline.
        """

        return """Name,Age,Department,Salary,Join Date,Performance Rating,Notes
John Smith,34,Engineering,75000,2023-01-15,4.5,"Excellent problem solver, team lead"
Maria Garcia,28,Marketing,62000,2023-03-20,4.2,"Creative campaigns, social media expert"
David Chen,41,Finance,82000,2022-08-10,4.8,"CPA certified, process optimization"
Sarah Johnson,29,Engineering,68000,2023-02-28,4.3,"Full-stack developer, agile advocate"
Michael Brown,36,Sales,71000,2022-11-05,4.6,"Top performer, client relationship expert"
Lisa Wang,32,Product,78000,2023-01-08,4.4,"UX specialist, user research focused"
Robert Davis,45,Operations,69000,2022-07-22,4.1,"Supply chain optimization, vendor management"
Jennifer Wilson,33,HR,59000,2023-04-12,4.3,"Talent acquisition, employee engagement"
James Anderson,38,Engineering,81000,2022-09-18,4.7,"Senior architect, technical mentoring"
Emily Taylor,27,Marketing,57000,2023-05-01,4.0,"Digital marketing, content strategy"
"""
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
class PlatformTester:
|
| 316 |
+
"""Comprehensive testing suite for the MarkItDown Testing Platform"""
|
| 317 |
+
|
| 318 |
+
    def __init__(self):
        # Initialize platform components.
        # The processing config is created first because both the resource
        # manager and the conversion engine are constructed from it.
        self.processing_config = ProcessingConfig()
        self.resource_manager = ResourceManager(self.processing_config)
        # File handling and conversion share the same resource manager instance.
        self.file_handler = StreamlineFileHandler(self.resource_manager)
        self.conversion_engine = HFConversionEngine(self.resource_manager, self.processing_config)
        # Visualization and quality scoring take no configuration here.
        self.viz_engine = InteractiveVisualizationEngine()
        self.quality_calculator = QualityMetricsCalculator()

        # Test results storage — populated by the run_* methods.
        # NOTE(review): neither list is written to by the methods visible in
        # this file; presumably reserved for callers — confirm before removal.
        self.test_results = []
        self.performance_metrics = []
|
| 330 |
+
|
| 331 |
+
async def run_basic_conversion_test(self) -> Dict[str, Any]:
|
| 332 |
+
"""Test basic document conversion functionality"""
|
| 333 |
+
|
| 334 |
+
logger.info("Running basic conversion test...")
|
| 335 |
+
|
| 336 |
+
test_start = time.time()
|
| 337 |
+
|
| 338 |
+
try:
|
| 339 |
+
# Create test HTML document
|
| 340 |
+
html_content = DocumentSampleGenerator.create_test_html()
|
| 341 |
+
|
| 342 |
+
# Create temporary file
|
| 343 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as temp_file:
|
| 344 |
+
temp_file.write(html_content)
|
| 345 |
+
temp_file_path = temp_file.name
|
| 346 |
+
|
| 347 |
+
# Simulate file upload
|
| 348 |
+
class MockFile:
|
| 349 |
+
def __init__(self, path):
|
| 350 |
+
self.name = path
|
| 351 |
+
with open(path, 'rb') as f:
|
| 352 |
+
self.content = f.read()
|
| 353 |
+
self.size = len(self.content)
|
| 354 |
+
|
| 355 |
+
def read(self):
|
| 356 |
+
return self.content
|
| 357 |
+
|
| 358 |
+
mock_file = MockFile(temp_file_path)
|
| 359 |
+
|
| 360 |
+
# Process file
|
| 361 |
+
file_result = await self.file_handler.process_upload(mock_file)
|
| 362 |
+
|
| 363 |
+
if not file_result.success:
|
| 364 |
+
return {
|
| 365 |
+
'test_name': 'basic_conversion',
|
| 366 |
+
'status': 'failed',
|
| 367 |
+
'error': file_result.error_message,
|
| 368 |
+
'duration': time.time() - test_start
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
# Convert document
|
| 372 |
+
conversion_result = await self.conversion_engine.convert_stream(
|
| 373 |
+
mock_file.content, file_result.metadata
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
if not conversion_result.success:
|
| 377 |
+
return {
|
| 378 |
+
'test_name': 'basic_conversion',
|
| 379 |
+
'status': 'failed',
|
| 380 |
+
'error': conversion_result.error_message,
|
| 381 |
+
'duration': time.time() - test_start
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
# Validate conversion results
|
| 385 |
+
validation_results = self._validate_conversion_output(conversion_result)
|
| 386 |
+
|
| 387 |
+
# Calculate quality metrics
|
| 388 |
+
quality_metrics = self.quality_calculator.calculate_conversion_quality_metrics(
|
| 389 |
+
conversion_result
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
test_duration = time.time() - test_start
|
| 393 |
+
|
| 394 |
+
# Clean up
|
| 395 |
+
Path(temp_file_path).unlink(missing_ok=True)
|
| 396 |
+
|
| 397 |
+
return {
|
| 398 |
+
'test_name': 'basic_conversion',
|
| 399 |
+
'status': 'passed',
|
| 400 |
+
'duration': test_duration,
|
| 401 |
+
'validation': validation_results,
|
| 402 |
+
'quality_metrics': quality_metrics,
|
| 403 |
+
'performance': {
|
| 404 |
+
'processing_time': conversion_result.processing_time,
|
| 405 |
+
'content_length': len(conversion_result.content),
|
| 406 |
+
'throughput': len(conversion_result.content) / test_duration
|
| 407 |
+
}
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
except Exception as e:
|
| 411 |
+
return {
|
| 412 |
+
'test_name': 'basic_conversion',
|
| 413 |
+
'status': 'error',
|
| 414 |
+
'error': str(e),
|
| 415 |
+
'duration': time.time() - test_start
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
    async def run_ai_analysis_test(self, gemini_api_key: str) -> Dict[str, Any]:
        """Test AI analysis integration with Gemini.

        Runs three analysis types over a fixed markdown sample and reports a
        per-type success/timing breakdown. Returns a dict whose 'status' is
        'skipped' (no API key), 'passed' (at least one analysis succeeded),
        'failed', or 'error'.
        """

        logger.info("Running AI analysis test...")

        # Without credentials the test is skipped, not failed.
        if not gemini_api_key:
            return {
                'test_name': 'ai_analysis',
                'status': 'skipped',
                'reason': 'No API key provided'
            }

        test_start = time.time()

        try:
            # Create Gemini engine
            gemini_config = GeminiConfig(api_key=gemini_api_key)
            gemini_engine = GeminiAnalysisEngine(gemini_config)

            # Fixed markdown sample exercising headers, lists, and a table so
            # every analysis type has structure to inspect.
            test_content = """
# Test Document for AI Analysis

This is a comprehensive test document designed to evaluate the AI analysis capabilities
of the MarkItDown Testing Platform.

## Document Structure

### Headers and Organization
This document contains multiple heading levels to test structure recognition.

### Content Quality
The content includes various elements:
- Technical terminology and concepts
- Business-oriented language and metrics
- Complex sentence structures
- Tables and structured data

| Metric | Value | Status |
|--------|-------|--------|
| Conversion Quality | 8.5/10 | Excellent |
| Processing Speed | 2.3s | Good |
| Resource Usage | 45% | Optimal |

## Analysis Requirements

This content should trigger comprehensive analysis covering:
1. **Structure Assessment**: Header hierarchy and organization
2. **Content Quality**: Information density and clarity
3. **Technical Accuracy**: Preservation of data and formatting
4. **Readability**: AI-friendly output optimization

The analysis should provide actionable insights and recommendations
for improving document conversion processes.
"""

            # Test different analysis types sequentially against the same content.
            analysis_types = [
                AnalysisType.QUALITY_ANALYSIS,
                AnalysisType.STRUCTURE_REVIEW,
                AnalysisType.CONTENT_SUMMARY
            ]

            analysis_results = {}

            for analysis_type in analysis_types:
                analysis_request = AnalysisRequest(
                    content=test_content,
                    analysis_type=analysis_type,
                    model=GeminiModel.PRO
                )

                result = await gemini_engine.analyze_content(analysis_request)
                # Compact per-type summary keyed by the enum's string value.
                analysis_results[analysis_type.value] = {
                    'success': result.success,
                    'processing_time': result.processing_time,
                    'content_length': len(str(result.content)) if result.success else 0,
                    'error': result.error_message if not result.success else None
                }

            test_duration = time.time() - test_start

            # Calculate success rate across all requested analysis types.
            successful_analyses = sum(1 for r in analysis_results.values() if r['success'])
            success_rate = successful_analyses / len(analysis_types) * 100

            return {
                'test_name': 'ai_analysis',
                # NOTE(review): a single successful analysis marks the whole
                # test 'passed' — confirm this lenient criterion is intended.
                'status': 'passed' if success_rate > 0 else 'failed',
                'duration': test_duration,
                'success_rate': success_rate,
                'analysis_results': analysis_results,
                'performance_metrics': gemini_engine.get_performance_metrics()
            }

        except Exception as e:
            return {
                'test_name': 'ai_analysis',
                'status': 'error',
                'error': str(e),
                'duration': time.time() - test_start
            }
|
| 520 |
+
|
| 521 |
+
async def run_performance_benchmark(self) -> Dict[str, Any]:
|
| 522 |
+
"""Run comprehensive performance benchmark"""
|
| 523 |
+
|
| 524 |
+
logger.info("Running performance benchmark...")
|
| 525 |
+
|
| 526 |
+
benchmark_start = time.time()
|
| 527 |
+
benchmark_results = {
|
| 528 |
+
'test_name': 'performance_benchmark',
|
| 529 |
+
'start_time': benchmark_start,
|
| 530 |
+
'scenarios': []
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
# Test scenarios with different file sizes and types
|
| 534 |
+
test_scenarios = [
|
| 535 |
+
{
|
| 536 |
+
'name': 'Small HTML Document',
|
| 537 |
+
'content': DocumentSampleGenerator.create_test_html()[:1000],
|
| 538 |
+
'format': 'html'
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
'name': 'Medium HTML Document',
|
| 542 |
+
'content': DocumentSampleGenerator.create_test_html(),
|
| 543 |
+
'format': 'html'
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
'name': 'Large HTML Document',
|
| 547 |
+
'content': DocumentSampleGenerator.create_test_html() * 3,
|
| 548 |
+
'format': 'html'
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
'name': 'Structured JSON Data',
|
| 552 |
+
'content': DocumentSampleGenerator.create_test_json(),
|
| 553 |
+
'format': 'json'
|
| 554 |
+
},
|
| 555 |
+
{
|
| 556 |
+
'name': 'CSV Data Table',
|
| 557 |
+
'content': DocumentSampleGenerator.create_test_csv(),
|
| 558 |
+
'format': 'csv'
|
| 559 |
+
}
|
| 560 |
+
]
|
| 561 |
+
|
| 562 |
+
for scenario in test_scenarios:
|
| 563 |
+
scenario_start = time.time()
|
| 564 |
+
|
| 565 |
+
try:
|
| 566 |
+
# Create temporary file
|
| 567 |
+
suffix = f".{scenario['format']}"
|
| 568 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False) as temp_file:
|
| 569 |
+
temp_file.write(scenario['content'])
|
| 570 |
+
temp_file_path = temp_file.name
|
| 571 |
+
|
| 572 |
+
# Simulate file processing
|
| 573 |
+
class MockFile:
|
| 574 |
+
def __init__(self, path, content):
|
| 575 |
+
self.name = path
|
| 576 |
+
self.content = content.encode('utf-8')
|
| 577 |
+
self.size = len(self.content)
|
| 578 |
+
|
| 579 |
+
def read(self):
|
| 580 |
+
return self.content
|
| 581 |
+
|
| 582 |
+
mock_file = MockFile(temp_file_path, scenario['content'])
|
| 583 |
+
|
| 584 |
+
# Measure processing steps
|
| 585 |
+
step_timings = {}
|
| 586 |
+
|
| 587 |
+
# File handling
|
| 588 |
+
step_start = time.time()
|
| 589 |
+
file_result = await self.file_handler.process_upload(mock_file)
|
| 590 |
+
step_timings['file_handling'] = time.time() - step_start
|
| 591 |
+
|
| 592 |
+
if file_result.success:
|
| 593 |
+
# Document conversion
|
| 594 |
+
step_start = time.time()
|
| 595 |
+
conversion_result = await self.conversion_engine.convert_stream(
|
| 596 |
+
mock_file.content, file_result.metadata
|
| 597 |
+
)
|
| 598 |
+
step_timings['conversion'] = time.time() - step_start
|
| 599 |
+
|
| 600 |
+
if conversion_result.success:
|
| 601 |
+
# Quality metrics calculation
|
| 602 |
+
step_start = time.time()
|
| 603 |
+
quality_metrics = self.quality_calculator.calculate_conversion_quality_metrics(
|
| 604 |
+
conversion_result
|
| 605 |
+
)
|
| 606 |
+
step_timings['quality_calculation'] = time.time() - step_start
|
| 607 |
+
|
| 608 |
+
scenario_duration = time.time() - scenario_start
|
| 609 |
+
|
| 610 |
+
scenario_result = {
|
| 611 |
+
'name': scenario['name'],
|
| 612 |
+
'status': 'success',
|
| 613 |
+
'duration': scenario_duration,
|
| 614 |
+
'step_timings': step_timings,
|
| 615 |
+
'content_stats': {
|
| 616 |
+
'input_size': len(scenario['content']),
|
| 617 |
+
'output_size': len(conversion_result.content),
|
| 618 |
+
'compression_ratio': len(conversion_result.content) / len(scenario['content'])
|
| 619 |
+
},
|
| 620 |
+
'performance_metrics': {
|
| 621 |
+
'throughput_chars_per_sec': len(scenario['content']) / scenario_duration,
|
| 622 |
+
'processing_efficiency': quality_metrics.get('composite_score', 0) / scenario_duration
|
| 623 |
+
}
|
| 624 |
+
}
|
| 625 |
+
else:
|
| 626 |
+
scenario_result = {
|
| 627 |
+
'name': scenario['name'],
|
| 628 |
+
'status': 'conversion_failed',
|
| 629 |
+
'error': conversion_result.error_message,
|
| 630 |
+
'duration': time.time() - scenario_start
|
| 631 |
+
}
|
| 632 |
+
else:
|
| 633 |
+
scenario_result = {
|
| 634 |
+
'name': scenario['name'],
|
| 635 |
+
'status': 'file_handling_failed',
|
| 636 |
+
'error': file_result.error_message,
|
| 637 |
+
'duration': time.time() - scenario_start
|
| 638 |
+
}
|
| 639 |
+
|
| 640 |
+
benchmark_results['scenarios'].append(scenario_result)
|
| 641 |
+
|
| 642 |
+
# Clean up
|
| 643 |
+
Path(temp_file_path).unlink(missing_ok=True)
|
| 644 |
+
|
| 645 |
+
except Exception as e:
|
| 646 |
+
benchmark_results['scenarios'].append({
|
| 647 |
+
'name': scenario['name'],
|
| 648 |
+
'status': 'error',
|
| 649 |
+
'error': str(e),
|
| 650 |
+
'duration': time.time() - scenario_start
|
| 651 |
+
})
|
| 652 |
+
|
| 653 |
+
# Calculate overall benchmark metrics
|
| 654 |
+
successful_scenarios = [s for s in benchmark_results['scenarios'] if s['status'] == 'success']
|
| 655 |
+
total_duration = time.time() - benchmark_start
|
| 656 |
+
|
| 657 |
+
benchmark_results.update({
|
| 658 |
+
'total_duration': total_duration,
|
| 659 |
+
'scenarios_total': len(test_scenarios),
|
| 660 |
+
'scenarios_successful': len(successful_scenarios),
|
| 661 |
+
'success_rate': len(successful_scenarios) / len(test_scenarios) * 100,
|
| 662 |
+
'average_processing_time': sum(s['duration'] for s in successful_scenarios) / len(successful_scenarios) if successful_scenarios else 0,
|
| 663 |
+
'total_throughput': sum(s.get('performance_metrics', {}).get('throughput_chars_per_sec', 0) for s in successful_scenarios),
|
| 664 |
+
'status': 'passed' if len(successful_scenarios) > len(test_scenarios) / 2 else 'failed'
|
| 665 |
+
})
|
| 666 |
+
|
| 667 |
+
return benchmark_results
|
| 668 |
+
|
| 669 |
+
    async def run_visualization_test(self) -> Dict[str, Any]:
        """Test visualization generation capabilities.

        Feeds a mock conversion result through the three visualization entry
        points (quality dashboard, structural analysis, export report) and
        records per-chart success/failure. The test 'passes' if at least one
        visualization renders without raising.
        """

        logger.info("Running visualization test...")

        test_start = time.time()

        try:
            # Create mock conversion result for testing — no real document
            # conversion is needed to exercise the visualization layer.
            mock_conversion_result = ProcessingResult(
                success=True,
                content=DocumentSampleGenerator.create_test_html(),
                metadata={
                    'original_file': {
                        'filename': 'test_document.html',
                        'size': 5000,
                        'extension': '.html'
                    }
                },
                processing_time=2.5
            )

            # Each sub-test appends one record; failures are captured
            # individually so later sub-tests still run.
            visualization_tests = []

            # Quality Dashboard Test
            try:
                dashboard_start = time.time()
                quality_dashboard = self.viz_engine.create_quality_dashboard(mock_conversion_result)
                dashboard_duration = time.time() - dashboard_start

                visualization_tests.append({
                    'name': 'quality_dashboard',
                    'status': 'success',
                    'duration': dashboard_duration,
                    'chart_type': 'multi-chart dashboard',
                    # Figure objects with a .data attribute report a count;
                    # anything else gets the placeholder string 'multiple'.
                    'data_points': len(quality_dashboard.data) if hasattr(quality_dashboard, 'data') else 'multiple'
                })
            except Exception as e:
                visualization_tests.append({
                    'name': 'quality_dashboard',
                    'status': 'failed',
                    'error': str(e)
                })

            # Structure Analysis Test
            try:
                structure_start = time.time()
                structure_viz = self.viz_engine.create_structural_analysis_viz(mock_conversion_result)
                structure_duration = time.time() - structure_start

                visualization_tests.append({
                    'name': 'structure_analysis',
                    'status': 'success',
                    'duration': structure_duration,
                    'chart_type': 'structural analysis',
                    'components': 'treemap, pie, bar, scatter'
                })
            except Exception as e:
                visualization_tests.append({
                    'name': 'structure_analysis',
                    'status': 'failed',
                    'error': str(e)
                })

            # Export Ready Report Test
            try:
                report_start = time.time()
                export_report = self.viz_engine.create_export_ready_report(mock_conversion_result)
                report_duration = time.time() - report_start

                visualization_tests.append({
                    'name': 'export_report',
                    'status': 'success',
                    'duration': report_duration,
                    # export_report is treated as a mapping of report name -> chart.
                    'chart_count': len(export_report),
                    'report_types': list(export_report.keys())
                })
            except Exception as e:
                visualization_tests.append({
                    'name': 'export_report',
                    'status': 'failed',
                    'error': str(e)
                })

            test_duration = time.time() - test_start
            successful_tests = [t for t in visualization_tests if t['status'] == 'success']

            return {
                'test_name': 'visualization',
                'status': 'passed' if len(successful_tests) > 0 else 'failed',
                'duration': test_duration,
                'tests_run': len(visualization_tests),
                'tests_successful': len(successful_tests),
                'success_rate': len(successful_tests) / len(visualization_tests) * 100,
                'test_details': visualization_tests
            }

        except Exception as e:
            return {
                'test_name': 'visualization',
                'status': 'error',
                'error': str(e),
                'duration': time.time() - test_start
            }
|
| 774 |
+
|
| 775 |
+
async def run_comprehensive_test_suite(self, gemini_api_key: Optional[str] = None) -> Dict[str, Any]:
|
| 776 |
+
"""Run complete test suite with all components"""
|
| 777 |
+
|
| 778 |
+
logger.info("Starting comprehensive test suite...")
|
| 779 |
+
|
| 780 |
+
suite_start = time.time()
|
| 781 |
+
|
| 782 |
+
# Run all tests
|
| 783 |
+
test_results = []
|
| 784 |
+
|
| 785 |
+
# Basic conversion test
|
| 786 |
+
basic_test = await self.run_basic_conversion_test()
|
| 787 |
+
test_results.append(basic_test)
|
| 788 |
+
|
| 789 |
+
# AI analysis test (if API key provided)
|
| 790 |
+
if gemini_api_key:
|
| 791 |
+
ai_test = await self.run_ai_analysis_test(gemini_api_key)
|
| 792 |
+
test_results.append(ai_test)
|
| 793 |
+
|
| 794 |
+
# Performance benchmark
|
| 795 |
+
perf_test = await self.run_performance_benchmark()
|
| 796 |
+
test_results.append(perf_test)
|
| 797 |
+
|
| 798 |
+
# Visualization test
|
| 799 |
+
viz_test = await self.run_visualization_test()
|
| 800 |
+
test_results.append(viz_test)
|
| 801 |
+
|
| 802 |
+
# Calculate overall results
|
| 803 |
+
suite_duration = time.time() - suite_start
|
| 804 |
+
passed_tests = [t for t in test_results if t.get('status') == 'passed']
|
| 805 |
+
failed_tests = [t for t in test_results if t.get('status') in ['failed', 'error']]
|
| 806 |
+
|
| 807 |
+
# Generate comprehensive report
|
| 808 |
+
comprehensive_report = {
|
| 809 |
+
'test_suite': 'MarkItDown Platform Comprehensive Test',
|
| 810 |
+
'timestamp': datetime.now().isoformat(),
|
| 811 |
+
'duration': suite_duration,
|
| 812 |
+
'summary': {
|
| 813 |
+
'total_tests': len(test_results),
|
| 814 |
+
'passed': len(passed_tests),
|
| 815 |
+
'failed': len(failed_tests),
|
| 816 |
+
'skipped': len([t for t in test_results if t.get('status') == 'skipped']),
|
| 817 |
+
'success_rate': len(passed_tests) / len(test_results) * 100
|
| 818 |
+
},
|
| 819 |
+
'test_results': test_results,
|
| 820 |
+
'system_info': self._get_system_info(),
|
| 821 |
+
'recommendations': self._generate_recommendations(test_results),
|
| 822 |
+
'overall_status': 'PASSED' if len(passed_tests) > len(test_results) / 2 else 'FAILED'
|
| 823 |
+
}
|
| 824 |
+
|
| 825 |
+
return comprehensive_report
|
| 826 |
+
|
| 827 |
+
def _validate_conversion_output(self, conversion_result: ProcessingResult) -> Dict[str, Any]:
    """Validate conversion output quality and completeness.

    Runs a set of cheap heuristic checks over the converted Markdown and
    derives an overall score (fraction of checks passed) plus a coarse
    'passed' / 'warning' / 'failed' status.
    """

    md_text = conversion_result.content

    # Individual boolean quality checks over the converted Markdown.
    checks = {
        'content_length_ok': len(md_text) > 100,
        'has_headers': md_text.count('#') > 0,
        'has_lists': md_text.count('- ') > 0 or md_text.count('* ') > 0,
        'has_tables': md_text.count('|') > 0,
        'has_links': md_text.count('](') > 0,
        # ASCII check on a leading sample only (first 1000 chars).
        'proper_encoding': all(ord(ch) < 128 for ch in md_text[:1000]),
        'no_empty_sections': not bool(md_text.count('##\n\n##')),
    }

    # The score is the fraction of heuristics that passed (booleans sum as 0/1).
    score = sum(checks.values()) / len(checks)
    checks['overall_score'] = score
    if score > 0.7:
        checks['status'] = 'passed'
    elif score > 0.5:
        checks['status'] = 'warning'
    else:
        checks['status'] = 'failed'

    return checks
|
| 847 |
+
|
| 848 |
+
def _get_system_info(self) -> Dict[str, Any]:
|
| 849 |
+
"""Get system information for test report"""
|
| 850 |
+
|
| 851 |
+
try:
|
| 852 |
+
import psutil
|
| 853 |
+
import platform
|
| 854 |
+
|
| 855 |
+
memory = psutil.virtual_memory()
|
| 856 |
+
|
| 857 |
+
return {
|
| 858 |
+
'platform': platform.platform(),
|
| 859 |
+
'python_version': platform.python_version(),
|
| 860 |
+
'cpu_count': psutil.cpu_count(),
|
| 861 |
+
'memory_total_gb': memory.total / (1024**3),
|
| 862 |
+
'memory_available_gb': memory.available / (1024**3),
|
| 863 |
+
'architecture': platform.architecture()[0]
|
| 864 |
+
}
|
| 865 |
+
except Exception as e:
|
| 866 |
+
return {'error': f'Could not gather system info: {e}'}
|
| 867 |
+
|
| 868 |
+
def _generate_recommendations(self, test_results: List[Dict[str, Any]]) -> List[str]:
|
| 869 |
+
"""Generate recommendations based on test results"""
|
| 870 |
+
|
| 871 |
+
recommendations = []
|
| 872 |
+
|
| 873 |
+
# Analyze test results for recommendations
|
| 874 |
+
for test in test_results:
|
| 875 |
+
if test.get('status') == 'failed':
|
| 876 |
+
test_name = test.get('test_name', 'unknown')
|
| 877 |
+
recommendations.append(f"❌ {test_name.title()} test failed - investigate {test.get('error', 'unknown error')}")
|
| 878 |
+
|
| 879 |
+
elif test.get('status') == 'passed':
|
| 880 |
+
test_name = test.get('test_name', 'unknown')
|
| 881 |
+
|
| 882 |
+
# Performance recommendations
|
| 883 |
+
if 'duration' in test and test['duration'] > 30:
|
| 884 |
+
recommendations.append(f"⚠️ {test_name.title()} test took {test['duration']:.2f}s - consider optimization")
|
| 885 |
+
|
| 886 |
+
# Success rate recommendations
|
| 887 |
+
if 'success_rate' in test and test['success_rate'] < 90:
|
| 888 |
+
recommendations.append(f"⚠️ {test_name.title()} success rate is {test['success_rate']:.1f}% - investigate reliability issues")
|
| 889 |
+
|
| 890 |
+
# General recommendations
|
| 891 |
+
if not any('ai_analysis' in str(test) for test in test_results):
|
| 892 |
+
recommendations.append("💡 Consider adding Gemini API key for AI analysis testing")
|
| 893 |
+
|
| 894 |
+
if not recommendations:
|
| 895 |
+
recommendations.append("✅ All tests passed successfully - platform ready for production use")
|
| 896 |
+
|
| 897 |
+
return recommendations
|
| 898 |
+
|
| 899 |
+
|
| 900 |
+
class UsageExamples:
    """Practical usage examples for different scenarios.

    Each example is a self-contained async demo that prints progress to
    stdout and swallows its own exceptions, so one failing example never
    stops the others.
    """

    @staticmethod
    async def example_basic_usage():
        """Example: Basic document conversion (upload -> convert -> score)."""

        print("=== Basic Document Conversion Example ===")

        # Initialize components
        config = ProcessingConfig()
        resource_manager = ResourceManager(config)
        file_handler = StreamlineFileHandler(resource_manager)
        conversion_engine = HFConversionEngine(resource_manager, config)

        # Create sample document
        sample_html = DocumentSampleGenerator.create_test_html()

        # Simulate file upload
        class MockFile:
            # Minimal stand-in for an uploaded-file object: exposes the
            # name/content/size/read() surface that process_upload reads.
            def __init__(self, content):
                self.name = "sample.html"
                self.content = content.encode('utf-8')
                self.size = len(self.content)

            def read(self):
                return self.content

        mock_file = MockFile(sample_html)

        try:
            # Process file
            print("1. Processing uploaded file...")
            file_result = await file_handler.process_upload(mock_file)

            if file_result.success:
                print(f" ✅ File processed: {file_result.metadata['filename']}")

                # Convert document
                print("2. Converting to Markdown...")
                conversion_result = await conversion_engine.convert_stream(
                    mock_file.content, file_result.metadata
                )

                if conversion_result.success:
                    print(f" ✅ Conversion successful in {conversion_result.processing_time:.2f}s")
                    print(f" 📄 Generated {len(conversion_result.content)} characters")
                    print(f" 📋 Preview: {conversion_result.content[:200]}...")

                    # Calculate quality metrics
                    print("3. Calculating quality metrics...")
                    quality_calculator = QualityMetricsCalculator()
                    metrics = quality_calculator.calculate_conversion_quality_metrics(conversion_result)

                    print(f" 📊 Composite Score: {metrics.get('composite_score', 0):.1f}/10")
                    print(f" 📈 Word Count: {metrics.get('basic_metrics', {}).get('total_words', 0)}")
                    print(f" 🏗️ Structure Elements: {metrics.get('structural_metrics', {}).get('header_count', 0)} headers")

                else:
                    print(f" ❌ Conversion failed: {conversion_result.error_message}")
            else:
                print(f" ❌ File processing failed: {file_result.error_message}")

        except Exception as e:
            # Demo-level catch-all: report and continue with the next example.
            print(f" ❌ Example failed: {e}")

        print("\n" + "="*50 + "\n")

    @staticmethod
    async def example_ai_integration(api_key: str):
        """Example: AI-powered analysis integration.

        Skips (with a message) when no API key is supplied.
        """

        if not api_key:
            print("=== AI Integration Example (Skipped - No API Key) ===\n")
            return

        print("=== AI-Powered Analysis Example ===")

        try:
            # Initialize Gemini engine
            print("1. Initializing Gemini AI...")
            gemini_config = GeminiConfig(api_key=api_key)
            gemini_engine = GeminiAnalysisEngine(gemini_config)

            # Sample content for analysis
            sample_content = """
# Enterprise Document Management Strategy

## Executive Summary
This document outlines our comprehensive approach to modernizing document
management processes through automated conversion and AI-powered analysis.

## Key Objectives
1. **Standardization**: Convert legacy formats to modern, searchable formats
2. **Quality Assurance**: Implement AI-driven quality validation
3. **Efficiency**: Reduce manual processing time by 75%
4. **Scalability**: Handle 10,000+ documents monthly

## Implementation Timeline

| Phase | Duration | Deliverables |
|-------|----------|--------------|
| Phase 1 | 2 months | Platform deployment |
| Phase 2 | 3 months | AI integration |
| Phase 3 | 1 month | Quality validation |

## Expected ROI
- Processing time reduction: 75%
- Quality improvement: 40%
- Cost savings: $50,000 annually
"""

            # Test different analysis types
            analysis_types = [
                (AnalysisType.QUALITY_ANALYSIS, "Quality Assessment"),
                (AnalysisType.CONTENT_SUMMARY, "Content Summary"),
                (AnalysisType.STRUCTURE_REVIEW, "Structure Analysis")
            ]

            for analysis_type, description in analysis_types:
                print(f"\n2. Running {description}...")

                request = AnalysisRequest(
                    content=sample_content,
                    analysis_type=analysis_type,
                    model=GeminiModel.PRO
                )

                result = await gemini_engine.analyze_content(request)

                if result.success:
                    print(f" ✅ {description} completed in {result.processing_time:.2f}s")

                    # Print a type-specific digest of the analysis payload.
                    if analysis_type == AnalysisType.QUALITY_ANALYSIS:
                        content = result.content
                        print(f" 📊 Overall Score: {content.get('overall_score', 0)}/10")
                        print(f" 🏗️ Structure Score: {content.get('structure_score', 0)}/10")
                        print(f" 📋 Completeness: {content.get('completeness_score', 0)}/10")

                    elif analysis_type == AnalysisType.CONTENT_SUMMARY:
                        summary = result.content.get('executive_summary', '')[:200]
                        print(f" 📝 Summary: {summary}...")

                else:
                    print(f" ❌ {description} failed: {result.error_message}")

            # Performance metrics
            print(f"\n3. Performance Metrics:")
            perf_metrics = gemini_engine.get_performance_metrics()
            print(f" 📈 Total Requests: {perf_metrics['total_requests']}")
            print(f" ⏱️ Average Time: {perf_metrics['average_processing_time']:.2f}s")
            print(f" ✅ Success Rate: {perf_metrics['success_rate_percent']:.1f}%")

        except Exception as e:
            print(f" ❌ AI Integration example failed: {e}")

        print("\n" + "="*50 + "\n")

    @staticmethod
    async def example_visualization_generation():
        """Example: Generate interactive visualizations from a mock result."""

        print("=== Visualization Generation Example ===")

        try:
            # Create mock results for visualization
            mock_result = ProcessingResult(
                success=True,
                content=DocumentSampleGenerator.create_test_html(),
                metadata={
                    'original_file': {'filename': 'test.html', 'size': 5000}
                },
                processing_time=2.3
            )

            # Initialize visualization engine
            print("1. Initializing visualization engine...")
            viz_engine = InteractiveVisualizationEngine()

            # Generate quality dashboard
            print("2. Creating quality dashboard...")
            dashboard_start = time.time()
            quality_dashboard = viz_engine.create_quality_dashboard(mock_result)
            dashboard_time = time.time() - dashboard_start

            print(f" ✅ Quality dashboard generated in {dashboard_time:.2f}s")
            print(f" 📊 Chart components: {len(quality_dashboard.data)} data traces")

            # Generate structure analysis
            print("3. Creating structure analysis...")
            structure_start = time.time()
            structure_viz = viz_engine.create_structural_analysis_viz(mock_result)
            structure_time = time.time() - structure_start

            print(f" ✅ Structure analysis generated in {structure_time:.2f}s")

            # Generate export report
            print("4. Creating export-ready report...")
            report_start = time.time()
            export_report = viz_engine.create_export_ready_report(mock_result)
            report_time = time.time() - report_start

            print(f" ✅ Export report generated in {report_time:.2f}s")
            print(f" 📈 Report components: {list(export_report.keys())}")

            total_time = dashboard_time + structure_time + report_time
            print(f"\n 📊 Total visualization time: {total_time:.2f}s")

        except Exception as e:
            print(f" ❌ Visualization example failed: {e}")

        print("\n" + "="*50 + "\n")
|
| 1112 |
+
|
| 1113 |
+
|
| 1114 |
+
async def main():
    """Run the usage examples, then the full test suite, and persist results.

    Interactive: prompts on stdin for an optional Gemini API key; AI examples
    and AI tests are skipped when none is given. Writes a timestamped JSON
    results file into the current working directory.
    """

    print("🚀 MarkItDown Testing Platform - Examples & Testing Suite")
    print("=" * 60)

    # Run usage examples
    await UsageExamples.example_basic_usage()

    # Ask for Gemini API key for AI examples
    api_key = input("Enter Gemini API key for AI examples (press Enter to skip): ").strip()
    if api_key:
        await UsageExamples.example_ai_integration(api_key)

    await UsageExamples.example_visualization_generation()

    # Run comprehensive test suite
    print("🧪 Running Comprehensive Test Suite...")
    print("=" * 40)

    tester = PlatformTester()
    test_results = await tester.run_comprehensive_test_suite(api_key if api_key else None)

    # Display test results
    print(f"\n📊 Test Suite Results:")
    print(f" Status: {test_results['overall_status']}")
    print(f" Duration: {test_results['duration']:.2f}s")
    print(f" Success Rate: {test_results['summary']['success_rate']:.1f}%")
    print(f" Tests: {test_results['summary']['passed']}/{test_results['summary']['total_tests']} passed")

    if test_results['recommendations']:
        print(f"\n💡 Recommendations:")
        for rec in test_results['recommendations'][:5]:  # Show top 5
            print(f" {rec}")

    # Save detailed results
    # default=str lets non-JSON types (datetimes, enums) serialize as strings.
    results_file = f"test_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(results_file, 'w') as f:
        json.dump(test_results, f, indent=2, default=str)

    print(f"\n📁 Detailed results saved to: {results_file}")
    print("\n✅ Examples and testing complete!")
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
# Script entry point: run the interactive examples plus the full test suite.
if __name__ == "__main__":
    asyncio.run(main())
|
llm/gemini_connector.py
ADDED
|
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enterprise-Grade Gemini Integration Layer
|
| 3 |
+
|
| 4 |
+
Strategic Design Philosophy:
|
| 5 |
+
- Multi-model orchestration for diverse analysis needs
|
| 6 |
+
- Robust error handling with graceful degradation
|
| 7 |
+
- Configurable analysis pipelines for different use cases
|
| 8 |
+
- Performance optimization for HF Spaces constraints
|
| 9 |
+
|
| 10 |
+
This module provides a comprehensive Gemini API integration designed for
|
| 11 |
+
enterprise-scale document analysis with focus on reliability and extensibility.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import asyncio
|
| 15 |
+
import json
|
| 16 |
+
import logging
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from typing import Dict, Any, List, Optional, Union, AsyncGenerator
|
| 19 |
+
from dataclasses import dataclass, asdict
|
| 20 |
+
from enum import Enum
|
| 21 |
+
|
| 22 |
+
import google.generativeai as genai
|
| 23 |
+
from google.generativeai.types import HarmCategory, HarmBlockThreshold
|
| 24 |
+
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
| 25 |
+
from pydantic import BaseModel, Field, validator, JsonValue
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
JSONDict = Dict[str, JsonValue]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Strategic Configuration Classes
|
| 32 |
+
class AnalysisType(Enum):
    """Enumeration of available analysis types.

    Values double as wire-level identifiers for requests/responses.
    """
    QUALITY_ANALYSIS = "quality_analysis"        # conversion quality scoring
    STRUCTURE_REVIEW = "structure_review"        # document hierarchy/structure audit
    # NOTE(review): COMPARATIVE_ANALYSIS has no entry in
    # GeminiAnalysisEngine.ANALYSIS_PROMPTS — confirm intended handling.
    CONTENT_SUMMARY = "content_summary"          # thematic summary
    COMPARATIVE_ANALYSIS = "comparative_analysis"
    EXTRACTION_QUALITY = "extraction_quality"    # information-preservation check
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class GeminiModel(Enum):
    """Available Gemini models with strategic use case mapping.

    Values are the model identifier strings passed to the Gemini API.
    """
    PRO = "gemini-1.5-pro"  # Complex analysis, reasoning
    FLASH = "gemini-1.5-flash"  # Fast processing, summaries
    PRO_VISION = "gemini-1.5-pro-vision"  # Multimodal content analysis
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class GeminiConfig:
    """Comprehensive Gemini API configuration.

    When ``safety_settings`` is not supplied, every harm category defaults
    to blocking at medium-and-above severity.
    """
    api_key: Optional[str] = None
    default_model: GeminiModel = GeminiModel.PRO
    max_tokens: int = 8192
    temperature: float = 0.1  # Low temperature for consistent analysis
    timeout_seconds: int = 60
    max_retry_attempts: int = 3
    safety_settings: Optional[Dict] = None

    def __post_init__(self):
        # Leave caller-provided settings untouched; only fill in defaults.
        if self.safety_settings is not None:
            return
        harm_categories = (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        self.safety_settings = {
            category: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
            for category in harm_categories
        }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class AnalysisRequest(BaseModel):
    """Structured request for document analysis.

    Validated by pydantic; ``content`` must carry at least 10 non-whitespace
    characters so we never spend API quota on empty payloads.
    """

    content: str = Field(..., description="Markdown content to analyze")
    analysis_type: AnalysisType = Field(..., description="Type of analysis to perform")
    model: GeminiModel = Field(default=GeminiModel.PRO, description="Gemini model to use")
    custom_instructions: Optional[str] = Field(None, description="Additional analysis instructions")
    context: Optional[JSONDict] = Field(default_factory=dict, description="Additional context")

    # NOTE(review): @validator is pydantic v1 API (deprecated in v2, where
    # @field_validator replaces it) — confirm the pinned pydantic version.
    @validator('content')
    def validate_content(cls, v):
        # Reject empty / near-empty content before any API call is made.
        if not v or len(v.strip()) < 10:
            raise ValueError("Content must be at least 10 characters long")
        return v
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class AnalysisResponse(BaseModel):
    """Standardized analysis response structure.

    Returned by GeminiAnalysisEngine.analyze_content for both success and
    failure paths; on failure ``success`` is False and ``error_message`` set.
    """

    success: bool  # True when the analysis completed without error
    analysis_type: AnalysisType
    model_used: GeminiModel
    content: JSONDict  # parsed analysis payload; shape depends on analysis_type
    metadata: JSONDict  # timing, content length, request id, timestamp
    error_message: Optional[str] = None
    processing_time: Optional[float] = None  # seconds, wall clock
    token_usage: Optional[Dict[str, int]] = None  # not populated in visible code — confirm
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class GeminiAnalysisEngine:
|
| 99 |
+
"""
|
| 100 |
+
Comprehensive Gemini-powered analysis system
|
| 101 |
+
|
| 102 |
+
Strategic Architecture:
|
| 103 |
+
- Multi-model orchestration for optimal performance vs cost
|
| 104 |
+
- Prompt engineering templates for consistent results
|
| 105 |
+
- Error handling with intelligent retry mechanisms
|
| 106 |
+
- Performance monitoring and optimization
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
    # Strategic Prompt Templates for Different Analysis Types
    # Each entry maps an AnalysisType to a system persona plus a template
    # whose single placeholder, {content}, receives the Markdown to analyze.
    # NOTE(review): AnalysisType.COMPARATIVE_ANALYSIS has no template here —
    # confirm how that analysis type is dispatched.
    ANALYSIS_PROMPTS = {
        AnalysisType.QUALITY_ANALYSIS: {
            "system": """You are an expert document conversion analyst specializing in evaluating
the quality of document-to-Markdown conversions.""",
            "template": """
Analyze the quality of this Markdown conversion from a document.

**Analysis Focus Areas:**
1. **Structure Preservation**: How well are headers, lists, tables maintained?
2. **Content Completeness**: Is all information preserved from the original?
3. **Formatting Accuracy**: Are formatting elements correctly converted?
4. **Information Hierarchy**: Is the document structure logical and clear?
5. **Readability**: How accessible is the converted content?

**Content to Analyze:**
```markdown
{content}
```

**Provide your analysis as a structured JSON response with these fields:**
- overall_score: (1-10 scale)
- structure_score: (1-10 scale)
- completeness_score: (1-10 scale)
- accuracy_score: (1-10 scale)
- readability_score: (1-10 scale)
- detailed_feedback: (string with specific observations)
- recommendations: (array of improvement suggestions)
- detected_elements: (object listing found structural elements)

Focus on actionable insights and specific examples from the content.
""",
        },

        AnalysisType.STRUCTURE_REVIEW: {
            "system": """You are a document structure specialist analyzing Markdown
document organization and hierarchy.""",
            "template": """
Conduct a comprehensive structural analysis of this Markdown document.

**Structure Analysis Requirements:**
1. **Hierarchy Analysis**: Map all heading levels (H1, H2, H3, etc.)
2. **List Structures**: Identify and categorize all lists (ordered, unordered, nested)
3. **Table Analysis**: Evaluate table formatting and completeness
4. **Content Organization**: Assess logical flow and organization
5. **Special Elements**: Identify code blocks, links, images, etc.

**Content to Analyze:**
```markdown
{content}
```

**Provide a structured JSON response with:**
- document_outline: (hierarchical structure map)
- heading_analysis: (object with heading counts and levels)
- list_analysis: (detailed list structure information)
- table_analysis: (table count, structure, formatting quality)
- special_elements: (code blocks, links, images, etc.)
- organization_score: (1-10 scale)
- structure_recommendations: (array of specific improvements)
- accessibility_notes: (readability and navigation considerations)

Provide specific examples and actionable structural insights.
""",
        },

        AnalysisType.CONTENT_SUMMARY: {
            "system": """You are a content analysis expert specializing in document
summarization and thematic analysis.""",
            "template": """
Create a comprehensive content summary and thematic analysis of this document.

**Summary Requirements:**
1. **Executive Summary**: 2-3 sentence overview of main content
2. **Key Topics**: Primary themes and subjects covered
3. **Content Classification**: Document type, purpose, target audience
4. **Information Density**: Assessment of content richness and depth
5. **Actionable Insights**: Key takeaways and important information

**Content to Analyze:**
```markdown
{content}
```

**Provide a structured JSON response with:**
- executive_summary: (brief overview)
- main_topics: (array of key themes)
- document_classification: (type, purpose, audience)
- content_metrics: (word count estimates, complexity level)
- key_information: (array of important facts/insights)
- content_quality: (1-10 scale for informativeness)
- summary_recommendations: (suggestions for content improvement)
- thematic_analysis: (deeper dive into content themes)

Focus on extracting actionable intelligence from the content.
""",
        },

        AnalysisType.EXTRACTION_QUALITY: {
            "system": """You are a data extraction quality specialist evaluating how well
information was preserved during document conversion.""",
            "template": """
Evaluate the extraction quality and information preservation in this converted document.

**Quality Assessment Areas:**
1. **Data Preservation**: Are numbers, dates, names preserved accurately?
2. **Formatting Retention**: How well were original formatting cues maintained?
3. **Context Preservation**: Is the meaning and context clear?
4. **Information Completeness**: Are there signs of missing information?
5. **Conversion Artifacts**: Any obvious conversion errors or artifacts?

**Content to Analyze:**
```markdown
{content}
```

**Provide a structured JSON response with:**
- extraction_score: (1-10 overall quality)
- data_accuracy: (assessment of numerical/factual data)
- context_preservation: (meaning and relationships maintained)
- formatting_quality: (original structure maintained)
- completeness_indicators: (signs of missing content)
- conversion_artifacts: (errors or issues detected)
- quality_recommendations: (specific improvement suggestions)
- confidence_level: (confidence in the analysis)

Identify specific examples of good and poor extraction quality.
""",
        }
    }
|
| 239 |
+
|
| 240 |
+
    def __init__(self, config: GeminiConfig):
        """Initialize Gemini Analysis Engine with configuration.

        Raises whatever _initialize_client raises when the API key is
        missing or client setup fails.
        """

        self.config = config
        # NOTE(review): self.client is set to None and never reassigned in
        # the visible code — the genai module is configured globally instead.
        self.client = None
        self._initialize_client()

        # Performance tracking counters, aggregated across analyze calls.
        self.request_count = 0
        self.total_processing_time = 0.0
        self.error_count = 0
|
| 251 |
+
|
| 252 |
+
    def _initialize_client(self):
        """Initialize Gemini client with error handling.

        Configures the module-level genai client with the API key and runs a
        list_models() call as a credentials/connectivity smoke test.

        Raises:
            ValueError: if no API key is present in the config.
            Exception: re-raises any configuration failure after logging it.
        """

        if not self.config.api_key:
            raise ValueError("Gemini API key is required")

        try:
            genai.configure(api_key=self.config.api_key)
            # Test client initialization with a simple call
            # (network round-trip at construction time).
            models = genai.list_models()
            logging.info(f"Gemini client initialized successfully. Available models: {len(list(models))}")

        except Exception as e:
            logging.error(f"Failed to initialize Gemini client: {e}")
            raise
|
| 267 |
+
|
| 268 |
+
@retry(
|
| 269 |
+
stop=stop_after_attempt(3),
|
| 270 |
+
wait=wait_exponential(multiplier=1, min=4, max=10),
|
| 271 |
+
retry=retry_if_exception_type((Exception,))
|
| 272 |
+
)
|
| 273 |
+
async def analyze_content(self, request: AnalysisRequest) -> AnalysisResponse:
|
| 274 |
+
"""
|
| 275 |
+
Execute comprehensive content analysis with retry logic
|
| 276 |
+
|
| 277 |
+
Strategic Processing Approach:
|
| 278 |
+
1. Validate request and prepare prompt
|
| 279 |
+
2. Execute analysis with appropriate model
|
| 280 |
+
3. Parse and validate response
|
| 281 |
+
4. Return structured results with metadata
|
| 282 |
+
"""
|
| 283 |
+
|
| 284 |
+
start_time = datetime.now()
|
| 285 |
+
self.request_count += 1
|
| 286 |
+
|
| 287 |
+
try:
|
| 288 |
+
# Prepare analysis prompt
|
| 289 |
+
prompt = self._build_analysis_prompt(request)
|
| 290 |
+
|
| 291 |
+
# Select optimal model for analysis type
|
| 292 |
+
model_name = self._select_optimal_model(request.analysis_type, request.model)
|
| 293 |
+
|
| 294 |
+
# Execute analysis
|
| 295 |
+
response = await self._execute_analysis(model_name, prompt)
|
| 296 |
+
|
| 297 |
+
# Parse and structure response
|
| 298 |
+
analysis_content = self._parse_analysis_response(response.text, request.analysis_type)
|
| 299 |
+
|
| 300 |
+
processing_time = (datetime.now() - start_time).total_seconds()
|
| 301 |
+
self.total_processing_time += processing_time
|
| 302 |
+
|
| 303 |
+
return AnalysisResponse(
|
| 304 |
+
success=True,
|
| 305 |
+
analysis_type=request.analysis_type,
|
| 306 |
+
model_used=GeminiModel(model_name),
|
| 307 |
+
content=analysis_content,
|
| 308 |
+
metadata={
|
| 309 |
+
'processing_time': processing_time,
|
| 310 |
+
'content_length': len(request.content),
|
| 311 |
+
'prompt_tokens': len(prompt.split()), # Rough estimate
|
| 312 |
+
'timestamp': start_time.isoformat(),
|
| 313 |
+
'request_id': self.request_count
|
| 314 |
+
},
|
| 315 |
+
processing_time=processing_time
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
except Exception as e:
|
| 319 |
+
self.error_count += 1
|
| 320 |
+
processing_time = (datetime.now() - start_time).total_seconds()
|
| 321 |
+
|
| 322 |
+
logging.error(f"Analysis failed for {request.analysis_type}: {e}")
|
| 323 |
+
|
| 324 |
+
return AnalysisResponse(
|
| 325 |
+
success=False,
|
| 326 |
+
analysis_type=request.analysis_type,
|
| 327 |
+
model_used=request.model,
|
| 328 |
+
content={},
|
| 329 |
+
metadata={'error_timestamp': datetime.now().isoformat()},
|
| 330 |
+
error_message=str(e),
|
| 331 |
+
processing_time=processing_time
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
+
def _build_analysis_prompt(self, request: AnalysisRequest) -> str:
|
| 335 |
+
"""Build comprehensive analysis prompt from template"""
|
| 336 |
+
|
| 337 |
+
prompt_config = self.ANALYSIS_PROMPTS.get(request.analysis_type)
|
| 338 |
+
if not prompt_config:
|
| 339 |
+
raise ValueError(f"Unsupported analysis type: {request.analysis_type}")
|
| 340 |
+
|
| 341 |
+
# Build complete prompt with system context
|
| 342 |
+
system_context = prompt_config["system"]
|
| 343 |
+
main_prompt = prompt_config["template"].format(content=request.content)
|
| 344 |
+
|
| 345 |
+
# Add custom instructions if provided
|
| 346 |
+
if request.custom_instructions:
|
| 347 |
+
main_prompt += f"\n\n**Additional Instructions:**\n{request.custom_instructions}"
|
| 348 |
+
|
| 349 |
+
# Add context if available
|
| 350 |
+
if request.context:
|
| 351 |
+
context_str = "\n".join([f"- {k}: {v}" for k, v in request.context.items()])
|
| 352 |
+
main_prompt += f"\n\n**Context:**\n{context_str}"
|
| 353 |
+
|
| 354 |
+
return f"{system_context}\n\n{main_prompt}"
|
| 355 |
+
|
| 356 |
+
def _select_optimal_model(self, analysis_type: AnalysisType, requested_model: GeminiModel) -> str:
|
| 357 |
+
"""Select optimal Gemini model based on analysis requirements"""
|
| 358 |
+
|
| 359 |
+
# Strategic model selection based on analysis complexity
|
| 360 |
+
model_recommendations = {
|
| 361 |
+
AnalysisType.QUALITY_ANALYSIS: GeminiModel.PRO, # Complex reasoning
|
| 362 |
+
AnalysisType.STRUCTURE_REVIEW: GeminiModel.PRO, # Detailed analysis
|
| 363 |
+
AnalysisType.CONTENT_SUMMARY: GeminiModel.FLASH, # Fast processing
|
| 364 |
+
AnalysisType.COMPARATIVE_ANALYSIS: GeminiModel.PRO, # Complex comparison
|
| 365 |
+
AnalysisType.EXTRACTION_QUALITY: GeminiModel.PRO, # Detailed quality assessment
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
# Use recommended model unless specifically overridden
|
| 369 |
+
recommended_model = model_recommendations.get(analysis_type, requested_model)
|
| 370 |
+
return recommended_model.value
|
| 371 |
+
|
| 372 |
+
async def _execute_analysis(self, model_name: str, prompt: str):
|
| 373 |
+
"""Execute analysis using Gemini API with timeout and error handling"""
|
| 374 |
+
|
| 375 |
+
try:
|
| 376 |
+
model = genai.GenerativeModel(
|
| 377 |
+
model_name=model_name,
|
| 378 |
+
safety_settings=self.config.safety_settings
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
# Configure generation parameters
|
| 382 |
+
generation_config = genai.GenerationConfig(
|
| 383 |
+
max_output_tokens=self.config.max_tokens,
|
| 384 |
+
temperature=self.config.temperature,
|
| 385 |
+
)
|
| 386 |
+
|
| 387 |
+
# Execute with timeout
|
| 388 |
+
response = await asyncio.wait_for(
|
| 389 |
+
asyncio.to_thread(
|
| 390 |
+
model.generate_content,
|
| 391 |
+
prompt,
|
| 392 |
+
generation_config=generation_config
|
| 393 |
+
),
|
| 394 |
+
timeout=self.config.timeout_seconds
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
return response
|
| 398 |
+
|
| 399 |
+
except asyncio.TimeoutError:
|
| 400 |
+
raise TimeoutError(f"Gemini API request timed out after {self.config.timeout_seconds} seconds")
|
| 401 |
+
except Exception as e:
|
| 402 |
+
raise RuntimeError(f"Gemini API error: {str(e)}")
|
| 403 |
+
|
| 404 |
+
def _parse_analysis_response(self, response_text: str, analysis_type: AnalysisType) -> JSONDict:
|
| 405 |
+
"""Parse and validate Gemini response into structured format"""
|
| 406 |
+
|
| 407 |
+
try:
|
| 408 |
+
# Try to extract JSON from response
|
| 409 |
+
json_start = response_text.find('{')
|
| 410 |
+
json_end = response_text.rfind('}') + 1
|
| 411 |
+
|
| 412 |
+
if json_start >= 0 and json_end > json_start:
|
| 413 |
+
json_content = response_text[json_start:json_end]
|
| 414 |
+
parsed_response = json.loads(json_content)
|
| 415 |
+
|
| 416 |
+
# Validate required fields based on analysis type
|
| 417 |
+
validated_response = self._validate_response_structure(parsed_response, analysis_type)
|
| 418 |
+
return validated_response
|
| 419 |
+
|
| 420 |
+
else:
|
| 421 |
+
# Fallback: structure unstructured response
|
| 422 |
+
return self._structure_unstructured_response(response_text, analysis_type)
|
| 423 |
+
|
| 424 |
+
except json.JSONDecodeError:
|
| 425 |
+
# Handle non-JSON response
|
| 426 |
+
return self._structure_unstructured_response(response_text, analysis_type)
|
| 427 |
+
|
| 428 |
+
def _validate_response_structure(self, response: JSONDict, analysis_type: AnalysisType) -> JSONDict:
|
| 429 |
+
"""Validate and ensure response contains required fields"""
|
| 430 |
+
|
| 431 |
+
# Define required fields for each analysis type
|
| 432 |
+
required_fields = {
|
| 433 |
+
AnalysisType.QUALITY_ANALYSIS: [
|
| 434 |
+
'overall_score', 'structure_score', 'completeness_score',
|
| 435 |
+
'accuracy_score', 'readability_score', 'detailed_feedback'
|
| 436 |
+
],
|
| 437 |
+
AnalysisType.STRUCTURE_REVIEW: [
|
| 438 |
+
'document_outline', 'heading_analysis', 'organization_score'
|
| 439 |
+
],
|
| 440 |
+
AnalysisType.CONTENT_SUMMARY: [
|
| 441 |
+
'executive_summary', 'main_topics', 'content_quality'
|
| 442 |
+
],
|
| 443 |
+
AnalysisType.EXTRACTION_QUALITY: [
|
| 444 |
+
'extraction_score', 'data_accuracy', 'completeness_indicators'
|
| 445 |
+
]
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
expected_fields = required_fields.get(analysis_type, [])
|
| 449 |
+
|
| 450 |
+
# Ensure all required fields are present with defaults
|
| 451 |
+
validated_response = response.copy()
|
| 452 |
+
for field in expected_fields:
|
| 453 |
+
if field not in validated_response:
|
| 454 |
+
validated_response[field] = self._get_default_field_value(field)
|
| 455 |
+
|
| 456 |
+
return validated_response
|
| 457 |
+
|
| 458 |
+
def _get_default_field_value(self, field_name: str) -> Any:
|
| 459 |
+
"""Get default value for missing response fields"""
|
| 460 |
+
|
| 461 |
+
if field_name.endswith('_score'):
|
| 462 |
+
return 0
|
| 463 |
+
elif field_name in ['detailed_feedback', 'executive_summary']:
|
| 464 |
+
return "Analysis incomplete - field not provided"
|
| 465 |
+
elif field_name.endswith('_analysis') or field_name == 'document_outline':
|
| 466 |
+
return {}
|
| 467 |
+
elif field_name in ['main_topics', 'recommendations']:
|
| 468 |
+
return []
|
| 469 |
+
else:
|
| 470 |
+
return None
|
| 471 |
+
|
| 472 |
+
def _structure_unstructured_response(self, response_text: str, analysis_type: AnalysisType) -> JSONDict:
|
| 473 |
+
"""Structure unstructured response text into expected format"""
|
| 474 |
+
|
| 475 |
+
# Basic structuring based on analysis type
|
| 476 |
+
base_structure = {
|
| 477 |
+
'raw_response': response_text,
|
| 478 |
+
'structured': False,
|
| 479 |
+
'analysis_timestamp': datetime.now().isoformat()
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
# Add type-specific default structure
|
| 483 |
+
if analysis_type == AnalysisType.QUALITY_ANALYSIS:
|
| 484 |
+
base_structure.update({
|
| 485 |
+
'overall_score': 5, # Neutral default
|
| 486 |
+
'detailed_feedback': response_text,
|
| 487 |
+
'recommendations': []
|
| 488 |
+
})
|
| 489 |
+
elif analysis_type == AnalysisType.CONTENT_SUMMARY:
|
| 490 |
+
base_structure.update({
|
| 491 |
+
'executive_summary': response_text[:200] + "..." if len(response_text) > 200 else response_text,
|
| 492 |
+
'content_quality': 5
|
| 493 |
+
})
|
| 494 |
+
|
| 495 |
+
return base_structure
|
| 496 |
+
|
| 497 |
+
async def batch_analyze(self, requests: List[AnalysisRequest]) -> List[AnalysisResponse]:
|
| 498 |
+
"""Execute multiple analyses concurrently with rate limiting"""
|
| 499 |
+
|
| 500 |
+
# Implement concurrent processing with semaphore for rate limiting
|
| 501 |
+
semaphore = asyncio.Semaphore(3) # Max 3 concurrent requests
|
| 502 |
+
|
| 503 |
+
async def limited_analyze(request):
|
| 504 |
+
async with semaphore:
|
| 505 |
+
return await self.analyze_content(request)
|
| 506 |
+
|
| 507 |
+
# Execute all requests concurrently
|
| 508 |
+
tasks = [limited_analyze(request) for request in requests]
|
| 509 |
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 510 |
+
|
| 511 |
+
# Convert exceptions to error responses
|
| 512 |
+
processed_results = []
|
| 513 |
+
for i, result in enumerate(results):
|
| 514 |
+
if isinstance(result, Exception):
|
| 515 |
+
error_response = AnalysisResponse(
|
| 516 |
+
success=False,
|
| 517 |
+
analysis_type=requests[i].analysis_type,
|
| 518 |
+
model_used=requests[i].model,
|
| 519 |
+
content={},
|
| 520 |
+
metadata={'batch_error': True},
|
| 521 |
+
error_message=str(result)
|
| 522 |
+
)
|
| 523 |
+
processed_results.append(error_response)
|
| 524 |
+
else:
|
| 525 |
+
processed_results.append(result)
|
| 526 |
+
|
| 527 |
+
return processed_results
|
| 528 |
+
|
| 529 |
+
def get_performance_metrics(self) -> JSONDict:
|
| 530 |
+
"""Get comprehensive performance metrics"""
|
| 531 |
+
|
| 532 |
+
avg_processing_time = (
|
| 533 |
+
self.total_processing_time / self.request_count
|
| 534 |
+
if self.request_count > 0 else 0
|
| 535 |
+
)
|
| 536 |
+
|
| 537 |
+
success_rate = (
|
| 538 |
+
(self.request_count - self.error_count) / self.request_count * 100
|
| 539 |
+
if self.request_count > 0 else 0
|
| 540 |
+
)
|
| 541 |
+
|
| 542 |
+
return {
|
| 543 |
+
'total_requests': self.request_count,
|
| 544 |
+
'total_errors': self.error_count,
|
| 545 |
+
'success_rate_percent': success_rate,
|
| 546 |
+
'average_processing_time': avg_processing_time,
|
| 547 |
+
'total_processing_time': self.total_processing_time,
|
| 548 |
+
'requests_per_minute': self.request_count / max(self.total_processing_time / 60, 1)
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
class GeminiConnectionManager:
|
| 553 |
+
"""
|
| 554 |
+
Enterprise-grade connection and configuration management for Gemini
|
| 555 |
+
|
| 556 |
+
Strategic Features:
|
| 557 |
+
- API key validation and secure storage
|
| 558 |
+
- Connection health monitoring
|
| 559 |
+
- Automatic reconnection and failover
|
| 560 |
+
- Usage tracking and optimization recommendations
|
| 561 |
+
"""
|
| 562 |
+
|
| 563 |
+
def __init__(self):
|
| 564 |
+
self.engines: Dict[str, GeminiAnalysisEngine] = {}
|
| 565 |
+
self.connection_health = {}
|
| 566 |
+
|
| 567 |
+
async def create_engine(self, api_key: str, config: Optional[GeminiConfig] = None) -> str:
|
| 568 |
+
"""Create and validate new Gemini engine instance"""
|
| 569 |
+
|
| 570 |
+
if not api_key or not api_key.strip():
|
| 571 |
+
raise ValueError("Valid API key is required")
|
| 572 |
+
|
| 573 |
+
# Create configuration
|
| 574 |
+
if config is None:
|
| 575 |
+
config = GeminiConfig(api_key=api_key)
|
| 576 |
+
else:
|
| 577 |
+
config.api_key = api_key
|
| 578 |
+
|
| 579 |
+
# Generate unique engine ID
|
| 580 |
+
engine_id = f"gemini_{hash(api_key) % 10000}"
|
| 581 |
+
|
| 582 |
+
try:
|
| 583 |
+
# Create and test engine
|
| 584 |
+
engine = GeminiAnalysisEngine(config)
|
| 585 |
+
await self._test_engine_connection(engine)
|
| 586 |
+
|
| 587 |
+
# Store engine and mark as healthy
|
| 588 |
+
self.engines[engine_id] = engine
|
| 589 |
+
self.connection_health[engine_id] = {
|
| 590 |
+
'status': 'healthy',
|
| 591 |
+
'last_check': datetime.now().isoformat(),
|
| 592 |
+
'consecutive_failures': 0
|
| 593 |
+
}
|
| 594 |
+
|
| 595 |
+
logging.info(f"Gemini engine {engine_id} created and validated successfully")
|
| 596 |
+
return engine_id
|
| 597 |
+
|
| 598 |
+
except Exception as e:
|
| 599 |
+
logging.error(f"Failed to create Gemini engine: {e}")
|
| 600 |
+
raise
|
| 601 |
+
|
| 602 |
+
async def _test_engine_connection(self, engine: GeminiAnalysisEngine):
|
| 603 |
+
"""Test engine connection with minimal request"""
|
| 604 |
+
|
| 605 |
+
test_request = AnalysisRequest(
|
| 606 |
+
content="# Test Document\n\nThis is a test.",
|
| 607 |
+
analysis_type=AnalysisType.CONTENT_SUMMARY,
|
| 608 |
+
model=GeminiModel.FLASH
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
+
response = await engine.analyze_content(test_request)
|
| 612 |
+
if not response.success:
|
| 613 |
+
raise RuntimeError(f"Engine connection test failed: {response.error_message}")
|
| 614 |
+
|
| 615 |
+
def get_engine(self, engine_id: str) -> Optional[GeminiAnalysisEngine]:
|
| 616 |
+
"""Get engine instance by ID"""
|
| 617 |
+
return self.engines.get(engine_id)
|
| 618 |
+
|
| 619 |
+
def list_engines(self) -> Dict[str, JSONDict]:
|
| 620 |
+
"""List all available engines with health status"""
|
| 621 |
+
|
| 622 |
+
result = {}
|
| 623 |
+
for engine_id, engine in self.engines.items():
|
| 624 |
+
health = self.connection_health.get(engine_id, {})
|
| 625 |
+
metrics = engine.get_performance_metrics()
|
| 626 |
+
|
| 627 |
+
result[engine_id] = {
|
| 628 |
+
'health_status': health,
|
| 629 |
+
'performance_metrics': metrics,
|
| 630 |
+
'config': {
|
| 631 |
+
'default_model': engine.config.default_model.value,
|
| 632 |
+
'max_tokens': engine.config.max_tokens,
|
| 633 |
+
'temperature': engine.config.temperature
|
| 634 |
+
}
|
| 635 |
+
}
|
| 636 |
+
|
| 637 |
+
return result
|
| 638 |
+
|
| 639 |
+
async def health_check_all(self) -> Dict[str, bool]:
|
| 640 |
+
"""Perform health check on all engines"""
|
| 641 |
+
|
| 642 |
+
health_results = {}
|
| 643 |
+
|
| 644 |
+
for engine_id, engine in self.engines.items():
|
| 645 |
+
try:
|
| 646 |
+
await self._test_engine_connection(engine)
|
| 647 |
+
self.connection_health[engine_id].update({
|
| 648 |
+
'status': 'healthy',
|
| 649 |
+
'last_check': datetime.now().isoformat(),
|
| 650 |
+
'consecutive_failures': 0
|
| 651 |
+
})
|
| 652 |
+
health_results[engine_id] = True
|
| 653 |
+
|
| 654 |
+
except Exception as e:
|
| 655 |
+
self.connection_health[engine_id]['consecutive_failures'] += 1
|
| 656 |
+
self.connection_health[engine_id]['status'] = 'unhealthy'
|
| 657 |
+
self.connection_health[engine_id]['last_error'] = str(e)
|
| 658 |
+
health_results[engine_id] = False
|
| 659 |
+
|
| 660 |
+
logging.warning(f"Health check failed for engine {engine_id}: {e}")
|
| 661 |
+
|
| 662 |
+
return health_results
|
| 663 |
+
|
| 664 |
+
|
| 665 |
+
# Utility Functions for External Integration
|
| 666 |
+
def create_analysis_request(
|
| 667 |
+
content: str,
|
| 668 |
+
analysis_type: str,
|
| 669 |
+
model: str = "gemini-1.5-pro",
|
| 670 |
+
custom_instructions: Optional[str] = None
|
| 671 |
+
) -> AnalysisRequest:
|
| 672 |
+
"""Factory function for creating analysis requests"""
|
| 673 |
+
|
| 674 |
+
return AnalysisRequest(
|
| 675 |
+
content=content,
|
| 676 |
+
analysis_type=AnalysisType(analysis_type),
|
| 677 |
+
model=GeminiModel(model),
|
| 678 |
+
custom_instructions=custom_instructions
|
| 679 |
+
)
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
def extract_key_insights(analysis_response: AnalysisResponse) -> JSONDict:
|
| 683 |
+
"""Extract key insights from analysis response for UI display"""
|
| 684 |
+
|
| 685 |
+
if not analysis_response.success:
|
| 686 |
+
return {
|
| 687 |
+
'error': True,
|
| 688 |
+
'message': analysis_response.error_message,
|
| 689 |
+
'analysis_type': analysis_response.analysis_type.value
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
content = analysis_response.content
|
| 693 |
+
insights = {
|
| 694 |
+
'analysis_type': analysis_response.analysis_type.value,
|
| 695 |
+
'model_used': analysis_response.model_used.value,
|
| 696 |
+
'processing_time': analysis_response.processing_time,
|
| 697 |
+
'success': True
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
# Extract type-specific insights
|
| 701 |
+
if analysis_response.analysis_type == AnalysisType.QUALITY_ANALYSIS:
|
| 702 |
+
insights.update({
|
| 703 |
+
'overall_score': content.get('overall_score', 0),
|
| 704 |
+
'key_scores': {
|
| 705 |
+
'structure': content.get('structure_score', 0),
|
| 706 |
+
'completeness': content.get('completeness_score', 0),
|
| 707 |
+
'accuracy': content.get('accuracy_score', 0),
|
| 708 |
+
'readability': content.get('readability_score', 0)
|
| 709 |
+
},
|
| 710 |
+
'summary': content.get('detailed_feedback', '')[:200] + '...' if content.get('detailed_feedback', '') else ''
|
| 711 |
+
})
|
| 712 |
+
|
| 713 |
+
elif analysis_response.analysis_type == AnalysisType.CONTENT_SUMMARY:
|
| 714 |
+
insights.update({
|
| 715 |
+
'summary': content.get('executive_summary', ''),
|
| 716 |
+
'topics': content.get('main_topics', []),
|
| 717 |
+
'quality_score': content.get('content_quality', 0)
|
| 718 |
+
})
|
| 719 |
+
|
| 720 |
+
return insights
|
| 721 |
+
JSONDict = Dict[str, JsonValue]
|
requirements.txt
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MarkItDown Testing Platform - HF Spaces Optimized Dependencies
|
| 2 |
+
# Strategic dependency selection for enterprise-grade reliability
|
| 3 |
+
|
| 4 |
+
# Core Framework Dependencies
|
| 5 |
+
gradio>=4.0.0,<5.0.0 # UI framework - pinned major version for stability
|
| 6 |
+
markitdown[all]>=0.1.0 # Microsoft's document conversion engine
|
| 7 |
+
|
| 8 |
+
# LLM Integration - Gemini Focus
|
| 9 |
+
google-generativeai>=0.3.0,<1.0.0 # Google Gemini API client
|
| 10 |
+
google-auth>=2.0.0 # Authentication for Google services
|
| 11 |
+
|
| 12 |
+
# Data Processing & Visualization
|
| 13 |
+
plotly>=5.17.0,<6.0.0 # Interactive visualizations
|
| 14 |
+
pandas>=1.5.0,<3.0.0 # Data manipulation and analysis
|
| 15 |
+
numpy>=1.21.0,<2.0.0 # Numerical computing foundation
|
| 16 |
+
|
| 17 |
+
# Async Processing & File Handling
|
| 18 |
+
aiofiles>=22.0.0 # Async file operations
|
| 19 |
+
python-multipart>=0.0.6 # Multipart form data handling
|
| 20 |
+
async-timeout>=4.0.0 # Timeout management for async operations
|
| 21 |
+
|
| 22 |
+
# Image Processing (for multimodal capabilities)
|
| 23 |
+
Pillow>=9.0.0,<11.0.0 # Image processing library
|
| 24 |
+
python-magic>=0.4.27 # File type detection
|
| 25 |
+
|
| 26 |
+
# Utilities & Performance
|
| 27 |
+
pydantic>=2.0.0,<3.0.0 # Data validation and settings management
|
| 28 |
+
python-dotenv>=1.0.0 # Environment variable management
|
| 29 |
+
tenacity>=8.0.0 # Retry mechanisms for API calls
|
| 30 |
+
|
| 31 |
+
# Optional Dependencies for Advanced Features
|
| 32 |
+
openpyxl>=3.1.0 # Excel file processing
|
| 33 |
+
python-docx>=0.8.11 # Word document processing
|
| 34 |
+
PyPDF2>=3.0.0 # PDF processing backup
|
| 35 |
+
|
| 36 |
+
# Security & Monitoring (Production considerations)
|
| 37 |
+
cryptography>=41.0.0 # Secure API key handling
|
| 38 |
+
psutil>=5.9.0 # System resource monitoring
|
| 39 |
+
|
| 40 |
+
# Development & Testing Dependencies
|
| 41 |
+
pytest>=7.0.0 # Testing framework
|
| 42 |
+
black>=23.0.0 # Code formatting
|
| 43 |
+
flake8>=6.0.0 # Code linting
|
spaces_metadata.yaml
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces Configuration
|
| 2 |
+
# MarkItDown Testing Platform Metadata
|
| 3 |
+
|
| 4 |
+
title: "MarkItDown Testing Platform"
|
| 5 |
+
emoji: "🚀"
|
| 6 |
+
colorFrom: "blue"
|
| 7 |
+
colorTo: "purple"
|
| 8 |
+
sdk: "gradio"
|
| 9 |
+
sdk_version: "4.0.0"
|
| 10 |
+
app_file: "app.py"
|
| 11 |
+
python_version: "3.10"
|
| 12 |
+
|
| 13 |
+
# Space configuration
|
| 14 |
+
models:
|
| 15 |
+
- google/gemini-pro
|
| 16 |
+
- microsoft/markitdown
|
| 17 |
+
|
| 18 |
+
datasets: []
|
| 19 |
+
|
| 20 |
+
# Space settings
|
| 21 |
+
pinned: false
|
| 22 |
+
license: "mit"
|
| 23 |
+
duplicated_from: null
|
| 24 |
+
|
| 25 |
+
# Hardware requirements (for paid tiers)
|
| 26 |
+
# hardware: "t4-medium" # Uncomment for GPU acceleration
|
| 27 |
+
|
| 28 |
+
# Environment variables (public - no secrets here)
|
| 29 |
+
variables:
|
| 30 |
+
GRADIO_THEME: "soft"
|
| 31 |
+
MAX_FILE_SIZE_MB: "50"
|
| 32 |
+
PROCESSING_TIMEOUT: "300"
|
| 33 |
+
APP_VERSION: "1.0.0"
|
| 34 |
+
|
| 35 |
+
# App metadata
|
| 36 |
+
short_description: "Enterprise-grade document conversion testing with AI-powered analysis using Microsoft MarkItDown and Google Gemini"
|
| 37 |
+
|
| 38 |
+
# Tags for discoverability
|
| 39 |
+
tags:
|
| 40 |
+
- document-processing
|
| 41 |
+
- ai-analysis
|
| 42 |
+
- markdown-conversion
|
| 43 |
+
- enterprise-tools
|
| 44 |
+
- quality-assessment
|
| 45 |
+
- microsoft-markitdown
|
| 46 |
+
- google-gemini
|
| 47 |
+
- document-conversion
|
| 48 |
+
- pdf-processing
|
| 49 |
+
- office-documents
|
| 50 |
+
|
| 51 |
+
# Custom configuration for the space
|
| 52 |
+
custom:
|
| 53 |
+
features:
|
| 54 |
+
- "Multi-format document conversion (PDF, DOCX, PPTX, XLSX, HTML, TXT, CSV, JSON, XML)"
|
| 55 |
+
- "AI-powered quality analysis with Google Gemini"
|
| 56 |
+
- "Interactive visualization dashboards"
|
| 57 |
+
- "Real-time processing metrics"
|
| 58 |
+
- "Export capabilities (Markdown, HTML, JSON, PDF)"
|
| 59 |
+
- "Enterprise-grade error handling and recovery"
|
| 60 |
+
- "Performance optimization and monitoring"
|
| 61 |
+
|
| 62 |
+
supported_formats:
|
| 63 |
+
documents: ["PDF", "DOCX", "PPTX", "XLSX"]
|
| 64 |
+
web: ["HTML", "HTM"]
|
| 65 |
+
text: ["TXT", "CSV", "JSON", "XML", "RTF"]
|
| 66 |
+
|
| 67 |
+
analysis_types:
|
| 68 |
+
- "Quality Analysis: Comprehensive conversion assessment"
|
| 69 |
+
- "Structure Review: Document hierarchy evaluation"
|
| 70 |
+
- "Content Summary: Thematic analysis and insights"
|
| 71 |
+
- "Extraction Quality: Data preservation assessment"
|
| 72 |
+
|
| 73 |
+
technical_specs:
|
| 74 |
+
max_file_size: "50MB (HF Spaces free tier)"
|
| 75 |
+
processing_timeout: "5 minutes"
|
| 76 |
+
memory_optimization: "Stateless architecture with automatic cleanup"
|
| 77 |
+
concurrent_processing: "Async pipeline with resource management"
|
utils/deployment.py
ADDED
|
@@ -0,0 +1,609 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deployment Utilities for MarkItDown Testing Platform
|
| 3 |
+
|
| 4 |
+
Strategic deployment tools for various environments:
|
| 5 |
+
- Hugging Face Spaces optimization
|
| 6 |
+
- Local development setup
|
| 7 |
+
- Production environment configuration
|
| 8 |
+
- Health monitoring and diagnostics
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
import platform
|
| 16 |
+
import subprocess
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Dict, Any, List, Optional
|
| 20 |
+
import psutil
|
| 21 |
+
|
| 22 |
+
# Configure logging
|
| 23 |
+
logging.basicConfig(level=logging.INFO)
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class EnvironmentDetector:
|
| 28 |
+
"""Detect and configure for different deployment environments"""
|
| 29 |
+
|
| 30 |
+
@staticmethod
|
| 31 |
+
def detect_environment() -> str:
|
| 32 |
+
"""Detect the current deployment environment"""
|
| 33 |
+
|
| 34 |
+
# Check for Hugging Face Spaces
|
| 35 |
+
if os.environ.get('SPACE_ID'):
|
| 36 |
+
return 'hf_spaces'
|
| 37 |
+
|
| 38 |
+
# Check for Docker environment
|
| 39 |
+
if os.path.exists('/.dockerenv'):
|
| 40 |
+
return 'docker'
|
| 41 |
+
|
| 42 |
+
# Check for common cloud providers
|
| 43 |
+
if os.environ.get('HEROKU_APP_NAME'):
|
| 44 |
+
return 'heroku'
|
| 45 |
+
|
| 46 |
+
if os.environ.get('AWS_EXECUTION_ENV'):
|
| 47 |
+
return 'aws'
|
| 48 |
+
|
| 49 |
+
if os.environ.get('GOOGLE_CLOUD_PROJECT'):
|
| 50 |
+
return 'gcp'
|
| 51 |
+
|
| 52 |
+
# Default to local development
|
| 53 |
+
return 'local'
|
| 54 |
+
|
| 55 |
+
@staticmethod
|
| 56 |
+
def get_environment_config(env_type: str) -> Dict[str, Any]:
|
| 57 |
+
"""Get configuration for specific environment"""
|
| 58 |
+
|
| 59 |
+
configs = {
|
| 60 |
+
'hf_spaces': {
|
| 61 |
+
'max_file_size_mb': 50,
|
| 62 |
+
'processing_timeout': 300,
|
| 63 |
+
'max_memory_gb': 16,
|
| 64 |
+
'temp_dir': '/tmp',
|
| 65 |
+
'enable_analytics': True,
|
| 66 |
+
'log_level': 'INFO',
|
| 67 |
+
'gradio_config': {
|
| 68 |
+
'server_name': '0.0.0.0',
|
| 69 |
+
'server_port': 7860,
|
| 70 |
+
'share': False,
|
| 71 |
+
'enable_queue': True,
|
| 72 |
+
'max_file_size': '50mb'
|
| 73 |
+
}
|
| 74 |
+
},
|
| 75 |
+
'docker': {
|
| 76 |
+
'max_file_size_mb': 100,
|
| 77 |
+
'processing_timeout': 600,
|
| 78 |
+
'max_memory_gb': 32,
|
| 79 |
+
'temp_dir': '/tmp',
|
| 80 |
+
'enable_analytics': True,
|
| 81 |
+
'log_level': 'INFO',
|
| 82 |
+
'gradio_config': {
|
| 83 |
+
'server_name': '0.0.0.0',
|
| 84 |
+
'server_port': int(os.environ.get('PORT', 7860)),
|
| 85 |
+
'share': False,
|
| 86 |
+
'enable_queue': True,
|
| 87 |
+
'max_file_size': '100mb'
|
| 88 |
+
}
|
| 89 |
+
},
|
| 90 |
+
'local': {
|
| 91 |
+
'max_file_size_mb': 200,
|
| 92 |
+
'processing_timeout': 900,
|
| 93 |
+
'max_memory_gb': 64,
|
| 94 |
+
'temp_dir': './temp',
|
| 95 |
+
'enable_analytics': True,
|
| 96 |
+
'log_level': 'DEBUG',
|
| 97 |
+
'gradio_config': {
|
| 98 |
+
'server_name': '127.0.0.1',
|
| 99 |
+
'server_port': 7860,
|
| 100 |
+
'share': True,
|
| 101 |
+
'enable_queue': False,
|
| 102 |
+
'max_file_size': '200mb'
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
return configs.get(env_type, configs['local'])
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class SystemHealthChecker:
    """System health monitoring and diagnostics.

    All methods are stateless ``@staticmethod``s so the checker can be
    used without instantiation (e.g. from the CLI in ``main()``).
    """

    # PyPI distribution name -> importable module name, for packages whose
    # import name cannot be derived by replacing '-' with '_'.
    # 'pip install google-generativeai' is imported as 'google.generativeai'.
    _IMPORT_NAME_OVERRIDES = {
        'google-generativeai': 'google.generativeai',
    }

    @staticmethod
    def check_system_resources() -> Dict[str, Any]:
        """Snapshot memory, CPU, disk and platform information.

        Returns a dict with a coarse ``status`` flag ('healthy' when both
        memory and CPU usage are below 80%, otherwise 'warning'). On any
        failure an error payload with ``status='error'`` is returned
        instead of raising.
        """
        try:
            # Memory information
            memory = psutil.virtual_memory()

            # CPU information (interval=1 samples usage over one second)
            cpu_count = psutil.cpu_count()
            cpu_percent = psutil.cpu_percent(interval=1)

            # Disk information for the root filesystem
            disk = psutil.disk_usage('/')

            # Host / interpreter information
            system_info = {
                'platform': platform.platform(),
                'python_version': platform.python_version(),
                'architecture': platform.architecture()[0]
            }

            return {
                'timestamp': datetime.now().isoformat(),
                'memory': {
                    'total_gb': memory.total / (1024**3),
                    'available_gb': memory.available / (1024**3),
                    'used_percent': memory.percent,
                    'free_gb': memory.free / (1024**3)
                },
                'cpu': {
                    'count': cpu_count,
                    'usage_percent': cpu_percent,
                    # os.getloadavg is unavailable on Windows
                    'load_average': os.getloadavg() if hasattr(os, 'getloadavg') else None
                },
                'disk': {
                    'total_gb': disk.total / (1024**3),
                    'free_gb': disk.free / (1024**3),
                    'used_percent': (disk.used / disk.total) * 100
                },
                'system': system_info,
                'status': 'healthy' if memory.percent < 80 and cpu_percent < 80 else 'warning'
            }

        except Exception as e:
            logger.error(f"Health check failed: {e}")
            return {
                'timestamp': datetime.now().isoformat(),
                'status': 'error',
                'error': str(e)
            }

    @staticmethod
    def check_dependencies() -> Dict[str, Any]:
        """Check that all required dependencies are importable.

        Bug fix: the original derived every import name with
        ``package.replace('-', '_')``, which is wrong for
        ``google-generativeai`` (imported as ``google.generativeai``),
        so that package was always reported missing even when installed.
        ``_IMPORT_NAME_OVERRIDES`` now maps such distribution names to
        their real import names.
        """
        required_packages = [
            'gradio',
            'markitdown',
            'google-generativeai',
            'plotly',
            'pandas',
            'numpy',
            'aiofiles',
            'tenacity',
            'psutil',
            'magic'
        ]

        dependency_status = {}
        all_available = True

        for package in required_packages:
            import_name = SystemHealthChecker._IMPORT_NAME_OVERRIDES.get(
                package, package.replace('-', '_')
            )
            try:
                # __import__ accepts dotted names ('google.generativeai')
                # and raises ImportError when any component is missing.
                __import__(import_name)
                dependency_status[package] = {'available': True, 'error': None}
            except ImportError as e:
                dependency_status[package] = {'available': False, 'error': str(e)}
                all_available = False

        return {
            'timestamp': datetime.now().isoformat(),
            'all_dependencies_available': all_available,
            'packages': dependency_status,
            'status': 'ready' if all_available else 'missing_dependencies'
        }

    @staticmethod
    def run_comprehensive_health_check() -> Dict[str, Any]:
        """Run the full health check: environment, resources, dependencies.

        Aggregates the individual checks into a single report with an
        ``overall_status`` of 'healthy', 'warning' (resource pressure) or
        'error' (missing dependencies), plus actionable recommendations.
        """
        logger.info("Starting comprehensive health check...")

        # Detect environment and its configuration profile
        env_type = EnvironmentDetector.detect_environment()
        env_config = EnvironmentDetector.get_environment_config(env_type)

        # Check system resources
        resource_check = SystemHealthChecker.check_system_resources()

        # Check dependencies
        dependency_check = SystemHealthChecker.check_dependencies()

        # Overall health assessment; missing dependencies outrank warnings.
        overall_status = 'healthy'
        issues = []

        if resource_check.get('status') != 'healthy':
            overall_status = 'warning'
            issues.append('System resources under pressure')

        if not dependency_check.get('all_dependencies_available'):
            overall_status = 'error'
            issues.append('Missing required dependencies')

        return {
            'timestamp': datetime.now().isoformat(),
            'environment': {
                'type': env_type,
                'config': env_config
            },
            'system_resources': resource_check,
            'dependencies': dependency_check,
            'overall_status': overall_status,
            'issues': issues,
            'recommendations': SystemHealthChecker._generate_recommendations(
                env_type, resource_check, dependency_check
            )
        }

    @staticmethod
    def _generate_recommendations(
        env_type: str,
        resource_check: Dict[str, Any],
        dependency_check: Dict[str, Any]
    ) -> List[str]:
        """Generate remediation advice from health-check results.

        Inputs are the dicts produced by check_system_resources() and
        check_dependencies(); missing keys default to healthy values.
        """
        recommendations = []

        # Memory recommendations (threshold matches the 'warning' status)
        memory_percent = resource_check.get('memory', {}).get('used_percent', 0)
        if memory_percent > 80:
            recommendations.append("High memory usage detected. Consider reducing file sizes or processing batch sizes.")

        # CPU recommendations
        cpu_percent = resource_check.get('cpu', {}).get('usage_percent', 0)
        if cpu_percent > 80:
            recommendations.append("High CPU usage detected. Consider enabling async processing or reducing concurrent operations.")

        # Environment-specific recommendations
        if env_type == 'hf_spaces':
            recommendations.extend([
                "Optimize for HF Spaces: Keep file sizes under 50MB",
                "Use stateless processing to avoid memory leaks",
                "Implement proper cleanup in temporary directories"
            ])

        # Dependency recommendations
        if not dependency_check.get('all_dependencies_available'):
            recommendations.append("Install missing dependencies using: pip install -r requirements.txt")

        return recommendations
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
class DeploymentConfigGenerator:
    """Generate configuration files for different deployment environments."""

    @staticmethod
    def generate_hf_spaces_config() -> Dict[str, str]:
        """Generate configuration files for HF Spaces.

        Returns a mapping of filename -> file content for the README.md
        (with the HF Spaces YAML front matter) and a Dockerfile.
        """
        # README.md content (front matter consumed by HF Spaces)
        readme_content = """---
title: MarkItDown Testing Platform
emoji: 🚀
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 4.0.0
app_file: app.py
pinned: false
---

# MarkItDown Testing Platform

Enterprise-grade document conversion testing with AI-powered analysis.

## Features
- Multi-format document conversion
- Google Gemini AI analysis
- Interactive dashboards
- Quality metrics and reporting

## Usage
1. Upload your document
2. Configure analysis settings
3. Enter Gemini API key (optional)
4. Process and analyze results
"""

        # Dockerfile content (libmagic needed by the 'magic' dependency)
        dockerfile_content = """FROM python:3.10-slim

ENV PYTHONUNBUFFERED=1
WORKDIR /app

RUN apt-get update && apt-get install -y gcc g++ libmagic1 libmagic-dev

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["python", "app.py"]
"""

        return {
            'README.md': readme_content,
            'Dockerfile': dockerfile_content
        }

    @staticmethod
    def save_deployment_configs(output_dir: str = "."):
        """Save all deployment configuration files into *output_dir*.

        The directory is created when it does not exist; each generated
        file overwrites any existing file with the same name.
        """
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)

        # Generate HF Spaces configs
        hf_configs = DeploymentConfigGenerator.generate_hf_spaces_config()

        for filename, content in hf_configs.items():
            file_path = output_path / filename
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

            # Bug fix: the per-file log message had lost its filename
            # placeholder and literally printed "(unknown)".
            logger.info(f"Generated {filename} in {output_dir}")

        logger.info("All deployment configurations generated successfully")
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
class DeploymentValidator:
    """Validate deployment readiness.

    Both methods inspect the *current working directory*, so they must be
    run from the project root to produce meaningful results.
    """

    @staticmethod
    def validate_for_hf_spaces() -> Dict[str, Any]:
        """Validate configuration for HF Spaces deployment.

        Checks required files, key markers inside app.py and
        requirements.txt, and the total repository size. Returns a dict
        with per-check booleans under 'checks', human-readable 'issues',
        and an 'overall_status' of 'ready' / 'warning' / 'error'.
        """
        validation_results = {
            'timestamp': datetime.now().isoformat(),
            'environment': 'hf_spaces',
            'checks': {},
            'overall_status': 'ready',
            'issues': []
        }

        # Check required files; any missing file is a hard error.
        required_files = ['app.py', 'requirements.txt', 'README.md']
        for file in required_files:
            if os.path.exists(file):
                validation_results['checks'][f'{file}_exists'] = True
            else:
                validation_results['checks'][f'{file}_exists'] = False
                validation_results['issues'].append(f"Missing required file: {file}")
                validation_results['overall_status'] = 'error'

        # Check app.py structure: substring markers HF Spaces relies on.
        # A missing marker only downgrades the status to 'warning'.
        if os.path.exists('app.py'):
            try:
                with open('app.py', 'r') as f:
                    content = f.read()

                # Check for required components
                required_components = [
                    'gradio',
                    'launch',
                    'if __name__ == "__main__"'
                ]

                for component in required_components:
                    if component in content:
                        validation_results['checks'][f'app_{component}'] = True
                    else:
                        validation_results['checks'][f'app_{component}'] = False
                        validation_results['issues'].append(f"Missing component in app.py: {component}")
                        validation_results['overall_status'] = 'warning'

            except Exception as e:
                # Unreadable app.py is a hard error.
                validation_results['checks']['app_readable'] = False
                validation_results['issues'].append(f"Cannot read app.py: {e}")
                validation_results['overall_status'] = 'error'

        # Check requirements.txt lists the essential packages (substring
        # match, so pinned versions like 'gradio==4.0.0' also pass).
        if os.path.exists('requirements.txt'):
            try:
                with open('requirements.txt', 'r') as f:
                    requirements = f.read()

                # Check for essential packages
                essential_packages = ['gradio', 'markitdown', 'google-generativeai']
                for package in essential_packages:
                    if package in requirements:
                        validation_results['checks'][f'req_{package}'] = True
                    else:
                        validation_results['checks'][f'req_{package}'] = False
                        validation_results['issues'].append(f"Missing package in requirements.txt: {package}")
                        validation_results['overall_status'] = 'warning'

            except Exception as e:
                validation_results['checks']['requirements_readable'] = False
                validation_results['issues'].append(f"Cannot read requirements.txt: {e}")
                validation_results['overall_status'] = 'error'

        # Check file sizes (HF Spaces has limits). NOTE(review): the walk
        # includes hidden dirs such as .git, which can inflate the total —
        # confirm whether those should be excluded.
        total_size = 0
        for root, dirs, files in os.walk('.'):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.exists(file_path):
                    total_size += os.path.getsize(file_path)

        total_size_mb = total_size / (1024 * 1024)
        validation_results['checks']['total_size_mb'] = total_size_mb

        if total_size_mb > 500:  # HF Spaces limit
            validation_results['issues'].append(f"Total size ({total_size_mb:.2f}MB) exceeds HF Spaces limit")
            validation_results['overall_status'] = 'error'

        return validation_results

    @staticmethod
    def generate_deployment_report() -> str:
        """Generate a comprehensive deployment-readiness report.

        Runs the system health check and the HF Spaces validation, then
        renders both as a Markdown document (returned as a string).
        """
        # Run health check
        health_check = SystemHealthChecker.run_comprehensive_health_check()

        # Run HF Spaces validation
        hf_validation = DeploymentValidator.validate_for_hf_spaces()

        # Generate report header and summary sections.
        report = f"""
# MarkItDown Testing Platform - Deployment Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Environment Information
- **Type**: {health_check['environment']['type']}
- **Platform**: {health_check['system_resources']['system']['platform']}
- **Python**: {health_check['system_resources']['system']['python_version']}

## System Health Status: {health_check['overall_status'].upper()}

### System Resources
- **Memory**: {health_check['system_resources']['memory']['available_gb']:.2f}GB available ({health_check['system_resources']['memory']['used_percent']:.1f}% used)
- **CPU**: {health_check['system_resources']['cpu']['count']} cores, {health_check['system_resources']['cpu']['usage_percent']:.1f}% usage
- **Disk**: {health_check['system_resources']['disk']['free_gb']:.2f}GB free

### Dependencies Status: {"✅ READY" if health_check['dependencies']['all_dependencies_available'] else "❌ MISSING"}
"""

        # Add one line per dependency with a pass/fail icon.
        for package, status in health_check['dependencies']['packages'].items():
            status_icon = "✅" if status['available'] else "❌"
            report += f"- {status_icon} {package}\n"

        # Add HF Spaces validation section.
        report += f"""
## HF Spaces Deployment Readiness: {hf_validation['overall_status'].upper()}

### File Validation
"""

        for check, result in hf_validation['checks'].items():
            status_icon = "✅" if result else "❌"
            report += f"- {status_icon} {check}\n"

        # Add issues and recommendations (sections only when non-empty).
        if health_check['issues']:
            report += "\n### Issues Identified:\n"
            for issue in health_check['issues']:
                report += f"- ⚠️ {issue}\n"

        if hf_validation['issues']:
            report += "\n### HF Spaces Issues:\n"
            for issue in hf_validation['issues']:
                report += f"- ⚠️ {issue}\n"

        if health_check['recommendations']:
            report += "\n### Recommendations:\n"
            for rec in health_check['recommendations']:
                report += f"- 💡 {rec}\n"

        # Add static deployment-commands reference.
        report += f"""
## Deployment Commands

### Local Development
```bash
python app.py
```

### Docker Deployment
```bash
docker build -t markitdown-platform .
docker run -p 7860:7860 markitdown-platform
```

### HF Spaces Deployment
1. Create new Space on Hugging Face
2. Upload files or connect GitHub repository
3. Configure Space settings:
- SDK: Gradio
- Python version: 3.10
- Hardware: CPU (free tier)

---
Report generated by MarkItDown Testing Platform Deployment Utils
"""

        return report
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
def main():
    """Command-line entry point for the deployment utilities.

    Dispatches one of four sub-commands (health-check, validate,
    generate-configs, report) and prints either human-readable text or
    JSON depending on --format.
    """
    import argparse

    parser = argparse.ArgumentParser(description='MarkItDown Platform Deployment Utilities')
    parser.add_argument(
        'command',
        choices=['health-check', 'validate', 'generate-configs', 'report'],
        help='Command to execute'
    )
    parser.add_argument(
        '--output',
        default='.',
        help='Output directory for generated files'
    )
    parser.add_argument(
        '--format',
        choices=['json', 'text'],
        default='text',
        help='Output format'
    )

    args = parser.parse_args()
    emit_json = args.format == 'json'

    if args.command == 'health-check':
        health = SystemHealthChecker.run_comprehensive_health_check()
        if emit_json:
            print(json.dumps(health, indent=2))
        else:
            # Human-readable one-line summary per metric.
            print(f"System Status: {health['overall_status'].upper()}")
            print(f"Environment: {health['environment']['type']}")
            print(f"Memory Available: {health['system_resources']['memory']['available_gb']:.2f}GB")
            print(f"Dependencies: {'OK' if health['dependencies']['all_dependencies_available'] else 'MISSING'}")

            if health['issues']:
                print("\nIssues:")
                for issue in health['issues']:
                    print(f"  - {issue}")

    elif args.command == 'validate':
        validation = DeploymentValidator.validate_for_hf_spaces()
        if emit_json:
            print(json.dumps(validation, indent=2))
        else:
            print(f"HF Spaces Validation: {validation['overall_status'].upper()}")
            if validation['issues']:
                print("Issues found:")
                for issue in validation['issues']:
                    print(f"  - {issue}")
            else:
                print("✅ Ready for HF Spaces deployment!")

    elif args.command == 'generate-configs':
        DeploymentConfigGenerator.save_deployment_configs(args.output)
        print(f"Configuration files generated in {args.output}")

    elif args.command == 'report':
        report = DeploymentValidator.generate_deployment_report()

        # Write to a file only when a non-default output dir was given.
        if args.output != '.':
            os.makedirs(args.output, exist_ok=True)
            report_file = os.path.join(args.output, 'deployment_report.md')
            with open(report_file, 'w') as f:
                f.write(report)
            print(f"Deployment report saved to {report_file}")
        else:
            print(report)
|
| 606 |
+
|
| 607 |
+
|
| 608 |
+
# Allow running this module directly as a standalone CLI utility.
if __name__ == "__main__":
    main()
|
visualization/analytics_engine.py
ADDED
|
@@ -0,0 +1,1393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enterprise Visualization Architecture - Strategic Refactoring Implementation
|
| 3 |
+
|
| 4 |
+
Core Design Philosophy:
|
| 5 |
+
"Complexity is the enemy of reliable software"
|
| 6 |
+
|
| 7 |
+
Architectural Principles Applied:
|
| 8 |
+
- Single Responsibility: Each component handles one concern
|
| 9 |
+
- Dependency Inversion: Abstract interfaces eliminate tight coupling
|
| 10 |
+
- Human-Scale Modularity: Components fit in developer working memory
|
| 11 |
+
- Testable Design: Every component can be unit tested independently
|
| 12 |
+
|
| 13 |
+
Strategic Benefits:
|
| 14 |
+
- Maintainability: Clear component boundaries enable team collaboration
|
| 15 |
+
- Extensibility: Plugin architecture supports future requirements
|
| 16 |
+
- Performance: Optimized algorithms with caching strategies
|
| 17 |
+
- Reliability: Comprehensive error boundaries with graceful degradation
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import logging
|
| 21 |
+
from abc import ABC, abstractmethod
|
| 22 |
+
from dataclasses import dataclass, field
|
| 23 |
+
from datetime import datetime
|
| 24 |
+
from typing import Dict, Any, List, Optional, Tuple, Union, Protocol
|
| 25 |
+
from enum import Enum
|
| 26 |
+
import json
|
| 27 |
+
from pydantic import JsonValue
|
| 28 |
+
|
| 29 |
+
JSONDict = Dict[str, JsonValue]
|
| 30 |
+
|
| 31 |
+
# Strategic import approach - minimal external dependencies
|
| 32 |
+
import plotly.graph_objects as go
|
| 33 |
+
import plotly.express as px
|
| 34 |
+
from plotly.subplots import make_subplots
|
| 35 |
+
import pandas as pd
|
| 36 |
+
import numpy as np
|
| 37 |
+
|
| 38 |
+
# Configure enterprise logging
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ==================== STRATEGIC DATA ABSTRACTIONS ====================
|
| 43 |
+
|
| 44 |
+
@dataclass(frozen=True)
class DocumentAnalysisData:
    """
    Immutable, self-contained snapshot of a document conversion result.

    Being a frozen dataclass keeps instances hashable-by-identity and
    side-effect free, and avoids coupling the visualization layer to the
    processing modules that produced the data.
    """
    content: str
    metadata: JSONDict
    processing_metrics: JSONDict = field(default_factory=dict)
    ai_analysis_data: Optional[JSONDict] = None

    @classmethod
    def from_processing_result(cls, conversion_result, analysis_result=None) -> 'DocumentAnalysisData':
        """Build an instance from external processing result objects.

        Attributes are read defensively with getattr so any duck-typed
        result object works; AI data is attached only when the analysis
        result reports success.
        """
        # Pull out text and metadata, tolerating None values.
        text = getattr(conversion_result, 'content', '') or ''
        meta = getattr(conversion_result, 'metadata', {}) or {}

        # Summarize how the conversion went.
        metrics = {
            'processing_time': getattr(conversion_result, 'processing_time', 0),
            'success': getattr(conversion_result, 'success', False),
            'content_length': len(text),
        }

        # Attach AI analysis only for a truthy, successful result.
        ai_payload = None
        if analysis_result and getattr(analysis_result, 'success', False):
            ai_payload = {
                'analysis_type': getattr(analysis_result, 'analysis_type', None),
                'model_used': getattr(analysis_result, 'model_used', None),
                'content': getattr(analysis_result, 'content', {}),
                'processing_time': getattr(analysis_result, 'processing_time', 0),
            }

        return cls(
            content=text,
            metadata=meta,
            processing_metrics=metrics,
            ai_analysis_data=ai_payload,
        )
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@dataclass(frozen=True)
class StructuralMetrics:
    """Immutable container for document structural analysis results."""
    header_count: int = 0
    list_items: int = 0
    table_rows: int = 0
    code_blocks: int = 0
    links: int = 0
    max_header_depth: int = 0
    structure_density: float = 0.0

    def to_dict(self) -> JSONDict:
        """Return the metrics as a plain dict for external consumers."""
        return dict(
            header_count=self.header_count,
            list_items=self.list_items,
            table_rows=self.table_rows,
            code_blocks=self.code_blocks,
            links=self.links,
            max_header_depth=self.max_header_depth,
            structure_density=self.structure_density,
        )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@dataclass(frozen=True)
class QualityAssessment:
    """Comprehensive quality metrics container (scores on a 0-10 scale)."""

    composite_score: float = 0.0     # weighted blend of the dimension scores
    structural_score: float = 0.0    # markdown-structure quality
    content_score: float = 0.0       # length / word-count heuristics
    ai_score: float = 0.0            # score extracted from AI analysis
    performance_score: float = 0.0   # processing-speed based score

    def to_dict(self) -> JSONDict:
        """Expose the scores as a plain dictionary."""
        score_names = (
            'composite_score', 'structural_score', 'content_score',
            'ai_score', 'performance_score',
        )
        return {name: getattr(self, name) for name in score_names}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@dataclass(frozen=True)
class VisualizationRequest:
    """Request abstraction for visualization generation.

    Bundles the analysis payload together with rendering options so a single
    object can be passed through the visualization pipeline.
    """
    # Document analysis payload the visualization is built from.
    analysis_data: DocumentAnalysisData
    # Identifier of the chart to produce (e.g. 'radar', 'bar', 'treemap').
    chart_type: str
    # Chart-specific options; empty dict means "use renderer defaults".
    configuration: JSONDict = field(default_factory=dict)
    # Plotly layout template name.
    theme: str = 'plotly_white'
    # (width, height) in pixels.
    dimensions: Tuple[int, int] = (800, 600)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ==================== COMPONENT INTERFACES ====================
|
| 146 |
+
|
| 147 |
+
class ContentAnalyzer(Protocol):
    """Structural interface for content analysis components.

    Implementations are duck-typed via ``typing.Protocol`` — no inheritance
    is required (see ``OptimizedContentAnalyzer`` in this module).
    """

    def analyze_structure(self, content: str) -> StructuralMetrics:
        """Analyze document structural elements (headers, lists, tables, ...)."""
        ...

    def calculate_quality_metrics(self, analysis_data: DocumentAnalysisData) -> QualityAssessment:
        """Calculate a comprehensive, multi-dimensional quality assessment."""
        ...
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class ChartRenderer(Protocol):
    """Structural interface for chart generation components.

    Each method returns a ready-to-display Plotly figure; ``**kwargs`` carry
    renderer-specific options such as ``title``, ``width`` and ``height``.
    """

    def render_radar_chart(self, data: Dict[str, float], **kwargs) -> go.Figure:
        """Render a radar/polar chart from metric-name -> score pairs."""
        ...

    def render_bar_chart(self, data: Dict[str, float], **kwargs) -> go.Figure:
        """Render a bar chart from category -> value pairs."""
        ...

    def render_treemap(self, data: Dict[str, Any], **kwargs) -> go.Figure:
        """Render a treemap from 'labels'/'values'/'parents' lists."""
        ...
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
class DashboardComposer(Protocol):
    """Structural interface for multi-panel dashboard composition."""

    def compose_quality_dashboard(
        self,
        quality_metrics: QualityAssessment,
        structural_metrics: StructuralMetrics,
        **kwargs
    ) -> go.Figure:
        """Compose a comprehensive quality dashboard from precomputed metrics."""
        ...
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ==================== CORE IMPLEMENTATION COMPONENTS ====================
|
| 189 |
+
|
| 190 |
+
class OptimizedContentAnalyzer:
    """
    High-performance content analysis with single-pass parsing

    Strategic Design:
    - Single Responsibility: Content analysis only
    - Performance Optimized: O(n) complexity for all operations
    - Memory Efficient: Minimal object allocation during parsing
    - Error Resilient: Handles malformed content gracefully
    """

    # Upper bound on cached analyses. Prevents unbounded memory growth in a
    # long-running process; oldest entries (insertion order) are evicted first.
    _MAX_CACHE_ENTRIES = 1024

    def __init__(self):
        # content-hash -> StructuralMetrics memoization plus hit/miss counters.
        self._analysis_cache: Dict[str, StructuralMetrics] = {}
        self._cache_hit_count = 0
        self._cache_miss_count = 0

    @staticmethod
    def _is_ordered_list_item(line: str) -> bool:
        """Return True for ordered-list markers such as '1. ' or '12. step'.

        The previous check (``line[1:3] == '. '``) only recognised
        single-digit prefixes, so items like '10. step' were not counted.
        """
        prefix, sep, _rest = line.partition('. ')
        return bool(sep) and prefix.isdigit()

    def analyze_structure(self, content: str) -> StructuralMetrics:
        """
        Single-pass structural analysis with caching.

        Performance Strategy:
        - Cache results by content hash for identical documents
        - Single iteration through content lines
        - Efficient pattern matching with early termination

        Args:
            content: Markdown-ish document text.

        Returns:
            StructuralMetrics describing headers, lists, tables, code blocks,
            links, header depth and overall structure density.
        """
        # md5 is used purely as a cache key, not for security.
        import hashlib
        content_hash = hashlib.md5(content.encode()).hexdigest()

        if content_hash in self._analysis_cache:
            self._cache_hit_count += 1
            logger.debug("Cache hit for content analysis - %d hits", self._cache_hit_count)
            return self._analysis_cache[content_hash]

        self._cache_miss_count += 1
        logger.debug("Cache miss - analyzing content structure")

        lines = content.split('\n')
        total_lines = len(lines)

        header_count = 0
        list_items = 0
        table_rows = 0
        code_blocks = 0
        links = 0
        max_header_depth = 0
        structural_elements = 0

        in_code_block = False

        for line in lines:
            stripped_line = line.strip()

            if not stripped_line:
                continue

            # Code fences toggle state; a block is counted when its closing
            # fence is seen.
            if stripped_line.startswith('```'):
                if in_code_block:
                    code_blocks += 1
                in_code_block = not in_code_block
                structural_elements += 1
                continue

            # Content inside fenced code blocks is not structural markdown.
            if in_code_block:
                continue

            # Header analysis: level = number of leading '#'.
            if stripped_line.startswith('#'):
                header_level = len(stripped_line) - len(stripped_line.lstrip('#'))
                header_count += 1
                max_header_depth = max(max_header_depth, header_level)
                structural_elements += 1
                continue

            # List items: unordered bullets or ordered 'N. ' markers
            # (multi-digit numbers are now recognised).
            if stripped_line.startswith(('- ', '* ', '+ ')) or self._is_ordered_list_item(stripped_line):
                list_items += 1
                structural_elements += 1
                continue

            # Table rows: at least two pipe characters on the line.
            if stripped_line.count('|') >= 2:
                table_rows += 1
                structural_elements += 1

            # Markdown links ('](') can coexist with other elements.
            links += stripped_line.count('](')

        structure_density = structural_elements / total_lines if total_lines > 0 else 0.0

        metrics = StructuralMetrics(
            header_count=header_count,
            list_items=list_items,
            table_rows=table_rows,
            code_blocks=code_blocks,
            links=links,
            max_header_depth=max_header_depth,
            structure_density=structure_density
        )

        # Evict the oldest entry when full, then cache the new result.
        if len(self._analysis_cache) >= self._MAX_CACHE_ENTRIES:
            self._analysis_cache.pop(next(iter(self._analysis_cache)))
        self._analysis_cache[content_hash] = metrics

        return metrics

    def calculate_quality_metrics(self, analysis_data: DocumentAnalysisData) -> QualityAssessment:
        """
        Comprehensive quality assessment with weighted scoring.

        Dimensions (each normalized to a 0-10 scale):
        - structural: density and variety of markdown structure
        - content: length and word-count heuristics
        - ai: score taken from the AI analysis payload (when present)
        - performance: penalizes slow conversions

        Weights shift toward structural/content scoring when no AI analysis
        is available so the composite score stays comparable.
        """
        structural_metrics = self.analyze_structure(analysis_data.content)

        # Structural quality: weighted element counts, capped at 10.
        structural_score = min(10.0, (
            (structural_metrics.header_count * 1.0) +
            (structural_metrics.list_items * 0.5) +
            (structural_metrics.table_rows * 0.8) +
            (structural_metrics.code_blocks * 0.6) +
            (structural_metrics.links * 0.3) +
            (structural_metrics.structure_density * 10.0)
        ))

        # Content quality: length, word density, plus a structure bonus.
        content_length = len(analysis_data.content)
        word_count = len(analysis_data.content.split()) if analysis_data.content else 0

        content_score = min(10.0, (
            (min(content_length / 1000, 5.0)) +   # length factor (up to 5 points)
            (min(word_count / 200, 3.0)) +        # word density (up to 3 points)
            (2.0 if structural_metrics.structure_density > 0.1 else 0.0)  # structure bonus
        ))

        # AI analysis score integration.
        ai_score = 0.0
        if analysis_data.ai_analysis_data:
            ai_content = analysis_data.ai_analysis_data.get('content', {})
            if not isinstance(ai_content, dict):
                # Defensive: upstream may hand us raw text instead of a dict;
                # previously this would raise on .get().
                ai_content = {}
            ai_score = ai_content.get('overall_score', 0.0)

            # Fallback: average per-dimension scores when no overall score.
            if ai_score == 0.0:
                ai_score = (
                    ai_content.get('structure_score', 0.0) +
                    ai_content.get('completeness_score', 0.0) +
                    ai_content.get('accuracy_score', 0.0) +
                    ai_content.get('readability_score', 0.0)
                ) / 4.0

        # Performance: 10 minus a mild penalty per second of processing.
        processing_time = analysis_data.processing_metrics.get('processing_time', 0)
        performance_score = max(0.0, min(10.0, 10.0 - (processing_time * 0.1)))

        # Composite score calculation with dimension weights.
        weights = {
            'structural': 0.3,
            'content': 0.25,
            'ai': 0.3,
            'performance': 0.15
        }

        # Redistribute the AI weight when no AI analysis is available.
        if ai_score == 0.0:
            weights = {
                'structural': 0.45,
                'content': 0.35,
                'ai': 0.0,
                'performance': 0.2
            }

        composite_score = (
            structural_score * weights['structural'] +
            content_score * weights['content'] +
            ai_score * weights['ai'] +
            performance_score * weights['performance']
        )

        return QualityAssessment(
            composite_score=round(composite_score, 2),
            structural_score=round(structural_score, 2),
            content_score=round(content_score, 2),
            ai_score=round(ai_score, 2),
            performance_score=round(performance_score, 2)
        )

    def get_cache_statistics(self) -> JSONDict:
        """Return cache performance statistics (hits, misses, hit rate, size)."""
        total_requests = self._cache_hit_count + self._cache_miss_count
        hit_rate = self._cache_hit_count / total_requests if total_requests > 0 else 0.0

        return {
            'cache_hits': self._cache_hit_count,
            'cache_misses': self._cache_miss_count,
            'hit_rate_percent': hit_rate * 100,
            'cache_size': len(self._analysis_cache)
        }
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
class PlotlyChartRenderer:
    """
    Professional chart rendering with consistent styling

    Strategic Design:
    - Single Responsibility: Chart generation only
    - Consistent Theming: Enterprise-appropriate visual standards
    - Performance Optimized: Efficient Plotly figure generation
    - Accessibility Compliant: Color-blind friendly palettes
    """

    def __init__(self, theme: str = 'plotly_white'):
        """
        Args:
            theme: Plotly layout template name applied to every figure.
        """
        self.theme = theme
        # Categorical palette (Plotly's default qualitative colors).
        self.color_palette = [
            '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
            '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
        ]
        # Semantic colors used for score-based highlighting.
        self.enterprise_colors = {
            'primary': '#667eea',
            'secondary': '#764ba2',
            'success': '#28a745',
            'warning': '#ffc107',
            'danger': '#dc3545',
            'info': '#17a2b8'
        }

    def _score_color(self, value: float) -> str:
        """Map a 0-10 score onto a semantic color (success/info/warning/danger)."""
        if value >= 8:
            return self.enterprise_colors['success']
        if value >= 6:
            return self.enterprise_colors['info']
        if value >= 4:
            return self.enterprise_colors['warning']
        return self.enterprise_colors['danger']

    def render_radar_chart(self, data: Dict[str, float], **kwargs) -> go.Figure:
        """
        Professional radar chart with enterprise styling.

        Args:
            data: Metric-name -> score (0-10) mapping.
            **kwargs: ``title``, ``width``, ``height`` overrides.

        Returns:
            A Plotly figure with a single filled Scatterpolar trace.
        """
        title = kwargs.get('title', 'Quality Assessment Radar')
        categories = list(data.keys())
        values = list(data.values())

        fig = go.Figure()

        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name='Quality Metrics',
            line=dict(color=self.enterprise_colors['primary'], width=3),
            # Translucent fill of the 'primary' color (was a no-op f-string).
            fillcolor="rgba(102, 126, 234, 0.3)"
        ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 10],
                    tickfont=dict(size=12),
                    gridcolor='rgba(128, 128, 128, 0.3)'
                ),
                angularaxis=dict(
                    tickfont=dict(size=12, color='#333333')
                )
            ),
            title=dict(
                text=title,
                x=0.5,
                font=dict(size=16, color='#333333')
            ),
            template=self.theme,
            showlegend=False,
            width=kwargs.get('width', 600),
            height=kwargs.get('height', 600)
        )

        return fig

    def render_bar_chart(self, data: Dict[str, float], **kwargs) -> go.Figure:
        """
        Professional bar chart with enterprise styling.

        Args:
            data: Category -> value mapping; values drive per-bar coloring.
            **kwargs: ``title``, ``orientation`` ('v' or 'h'), ``x_title``,
                ``y_title``, ``width``, ``height``.
        """
        title = kwargs.get('title', 'Metrics Comparison')
        orientation = kwargs.get('orientation', 'v')  # 'v' vertical, 'h' horizontal

        categories = list(data.keys())
        values = list(data.values())

        # Color each bar by its score band.
        colors = [self._score_color(value) for value in values]

        fig = go.Figure()

        if orientation == 'h':
            fig.add_trace(go.Bar(
                x=values,
                y=categories,
                orientation='h',
                marker=dict(color=colors),
                text=[f'{v:.1f}' for v in values],
                textposition='inside',
                textfont=dict(color='white', size=12)
            ))
        else:
            fig.add_trace(go.Bar(
                x=categories,
                y=values,
                marker=dict(color=colors),
                text=[f'{v:.1f}' for v in values],
                textposition='outside',
                textfont=dict(color='#333333', size=12)
            ))

        fig.update_layout(
            title=dict(
                text=title,
                x=0.5,
                font=dict(size=16, color='#333333')
            ),
            template=self.theme,
            showlegend=False,
            xaxis=dict(title=kwargs.get('x_title', '')),
            yaxis=dict(title=kwargs.get('y_title', '')),
            width=kwargs.get('width', 800),
            height=kwargs.get('height', 500)
        )

        return fig

    def render_treemap(self, data: Dict[str, Any], **kwargs) -> go.Figure:
        """
        Professional treemap visualization.

        Args:
            data: Dict with parallel ``labels``, ``values`` and ``parents``
                lists. Falls back to a placeholder hierarchy when empty.
        """
        title = kwargs.get('title', 'Structure Analysis')

        labels = data.get('labels', [])
        values = data.get('values', [])
        parents = data.get('parents', [])

        if not labels or not values:
            # Placeholder treemap so the figure is never blank.
            labels = ['Content', 'Headers', 'Lists', 'Tables']
            values = [100, 20, 15, 10]
            parents = ['', 'Content', 'Content', 'Content']
        elif len(parents) != len(labels):
            # Plotly requires labels/parents of equal length; when the caller
            # supplied no usable hierarchy, treat every node as a root.
            parents = [''] * len(labels)

        fig = go.Figure(go.Treemap(
            labels=labels,
            values=values,
            parents=parents,
            textinfo="label+value+percent parent",
            textfont=dict(size=12),
            marker=dict(
                colorscale='Viridis',
                showscale=True
            )
        ))

        fig.update_layout(
            title=dict(
                text=title,
                x=0.5,
                font=dict(size=16, color='#333333')
            ),
            template=self.theme,
            width=kwargs.get('width', 800),
            height=kwargs.get('height', 600)
        )

        return fig

    def render_gauge_chart(self, value: float, **kwargs) -> go.Figure:
        """
        Professional gauge chart for a single metric.

        Args:
            value: Current score to display.
            **kwargs: ``title``, ``max_value`` (default 10), ``reference``
                (delta baseline, default 7.0), ``width``, ``height``.
        """
        title = kwargs.get('title', 'Quality Score')
        max_value = kwargs.get('max_value', 10)

        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=value,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': title, 'font': {'size': 16}},
            delta={'reference': kwargs.get('reference', 7.0)},
            gauge={
                'axis': {'range': [None, max_value], 'tickcolor': '#333333'},
                'bar': {'color': self.enterprise_colors['primary']},
                'steps': [
                    {'range': [0, max_value * 0.5], 'color': "lightgray"},
                    {'range': [max_value * 0.5, max_value * 0.8], 'color': "gray"}
                ],
                # Red threshold marker at 90% of the scale.
                'threshold': {
                    'line': {'color': self.enterprise_colors['danger'], 'width': 4},
                    'thickness': 0.75,
                    'value': max_value * 0.9
                }
            }
        ))

        fig.update_layout(
            template=self.theme,
            width=kwargs.get('width', 400),
            height=kwargs.get('height', 400)
        )

        return fig
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
class EnterpriseDashboardComposer:
    """
    Strategic dashboard composition with enterprise-grade layouts

    Design Philosophy:
    - Executive-Friendly Layouts: Information hierarchy for decision makers
    - Responsive Design: Works across different screen sizes
    - Performance Optimized: Efficient subplot generation
    - Accessibility Compliant: Clear navigation and labeling
    """

    def __init__(self, chart_renderer: PlotlyChartRenderer):
        # Renderer kept for palette/theme reuse by callers of this composer.
        self.chart_renderer = chart_renderer

    def compose_quality_dashboard(
        self,
        quality_metrics: QualityAssessment,
        structural_metrics: StructuralMetrics,
        **kwargs
    ) -> go.Figure:
        """
        Comprehensive quality dashboard with executive summary layout.

        Builds a 2x3 subplot grid: gauge, radar and treemap on the top row;
        two bar charts and a summary table on the bottom row. ``kwargs``
        currently supports a ``height`` override (default 800).
        """

        # Create subplot layout with strategic positioning. The `specs` grid
        # must match the trace types added below, cell by cell.
        fig = make_subplots(
            rows=2, cols=3,
            subplot_titles=(
                'Quality Overview', 'Detailed Scores', 'Document Structure',
                'Performance Metrics', 'Structural Elements', 'Analysis Summary'
            ),
            specs=[
                [{"type": "indicator"}, {"type": "polar"}, {"type": "treemap"}],
                [{"type": "bar"}, {"type": "bar"}, {"type": "table"}]
            ],
            vertical_spacing=0.12,
            horizontal_spacing=0.08
        )

        # 1. Overall Quality Gauge (Executive Summary).
        # Delta is measured against a fixed reference of 7.0.
        fig.add_trace(
            go.Indicator(
                mode="gauge+number+delta",
                value=quality_metrics.composite_score,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': "Overall Quality Score"},
                delta={'reference': 7.0},
                gauge={
                    'axis': {'range': [None, 10]},
                    'bar': {'color': "#667eea"},
                    'steps': [
                        {'range': [0, 5], 'color': "lightgray"},
                        {'range': [5, 8], 'color': "gray"}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 9
                    }
                }
            ),
            row=1, col=1
        )

        # 2. Quality Breakdown Radar Chart — one axis per score dimension.
        quality_data = {
            'Structural': quality_metrics.structural_score,
            'Content': quality_metrics.content_score,
            'AI Analysis': quality_metrics.ai_score,
            'Performance': quality_metrics.performance_score
        }

        fig.add_trace(
            go.Scatterpolar(
                r=list(quality_data.values()),
                theta=list(quality_data.keys()),
                fill='toself',
                name='Quality Breakdown',
                line=dict(color='#764ba2', width=2),
                fillcolor="rgba(118, 75, 162, 0.3)"
            ),
            row=1, col=2
        )

        # 3. Document Structure Treemap (hierarchy built by the helper below).
        structure_data = self._prepare_structure_treemap_data(structural_metrics)
        fig.add_trace(
            go.Treemap(
                labels=structure_data['labels'],
                values=structure_data['values'],
                parents=structure_data['parents'],
                textinfo="label+value",
                textfont=dict(size=10)
            ),
            row=1, col=3
        )

        # 4. Performance Metrics Bar Chart. Structure density is rescaled to
        # the same 0-10 range as the other scores.
        perf_data = {
            'Processing Speed': quality_metrics.performance_score,
            'Structure Density': min(structural_metrics.structure_density * 10, 10),
            'Content Quality': quality_metrics.content_score
        }

        fig.add_trace(
            go.Bar(
                x=list(perf_data.keys()),
                y=list(perf_data.values()),
                marker=dict(color=['#28a745', '#17a2b8', '#ffc107']),
                name='Performance Metrics'
            ),
            row=2, col=1
        )

        # 5. Structural Elements Breakdown (horizontal bars, raw counts).
        structure_breakdown = {
            'Headers': structural_metrics.header_count,
            'Lists': structural_metrics.list_items,
            'Tables': structural_metrics.table_rows,
            'Code Blocks': structural_metrics.code_blocks,
            'Links': structural_metrics.links
        }

        fig.add_trace(
            go.Bar(
                x=list(structure_breakdown.values()),
                y=list(structure_breakdown.keys()),
                orientation='h',
                marker=dict(color='#667eea'),
                name='Structural Elements'
            ),
            row=2, col=2
        )

        # 6. Analysis Summary Table — rows of [metric, formatted value].
        summary_data = [
            ['Overall Score', f"{quality_metrics.composite_score:.1f}/10"],
            ['Structure Elements', f"{sum(structure_breakdown.values())} items"],
            ['Max Header Depth', f"{structural_metrics.max_header_depth} levels"],
            ['Structure Density', f"{structural_metrics.structure_density:.1%}"]
        ]

        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Metric', 'Value'],
                    fill_color='#667eea',
                    font=dict(color='white', size=12),
                    align='left'
                ),
                cells=dict(
                    # zip(*rows) transposes row-wise data into the
                    # column-wise layout go.Table expects.
                    values=list(zip(*summary_data)),
                    fill_color='#f8f9fa',
                    font=dict(color='#333333', size=11),
                    align='left'
                )
            ),
            row=2, col=3
        )

        # Update layout with enterprise styling.
        fig.update_layout(
            title=dict(
                text="Document Conversion Quality Dashboard",
                x=0.5,
                font=dict(size=20, color='#333333')
            ),
            template='plotly_white',
            height=kwargs.get('height', 800),
            showlegend=False,
            margin=dict(t=100, b=50, l=50, r=50)
        )

        # Pin the radar subplot's radial axis to the 0-10 score scale.
        fig.update_polars(
            radialaxis=dict(
                visible=True,
                range=[0, 10],
                tickfont=dict(size=10)
            )
        )

        return fig

    def _prepare_structure_treemap_data(self, metrics: StructuralMetrics) -> Dict[str, List]:
        """Prepare parallel labels/values/parents lists for the treemap.

        Returns a placeholder Document->Content hierarchy when the document
        has no structural elements at all.
        """

        total_elements = (
            metrics.header_count + metrics.list_items +
            metrics.table_rows + metrics.code_blocks + metrics.links
        )

        if total_elements == 0:
            return {
                'labels': ['Document', 'Content'],
                'values': [100, 100],
                'parents': ['', 'Document']
            }

        # NOTE(review): max(..., 1) keeps zero-count categories visible as
        # minimal tiles, so child values may exceed the 'Document' total —
        # presumably intentional for display purposes.
        return {
            'labels': [
                'Document', 'Headers', 'Lists', 'Tables', 'Code Blocks', 'Links'
            ],
            'values': [
                total_elements,
                max(metrics.header_count, 1),
                max(metrics.list_items, 1),
                max(metrics.table_rows, 1),
                max(metrics.code_blocks, 1),
                max(metrics.links, 1)
            ],
            'parents': [
                '', 'Document', 'Document', 'Document', 'Document', 'Document'
            ]
        }
|
| 838 |
+
|
| 839 |
+
|
| 840 |
+
# ==================== FACADE ORCHESTRATOR ====================
|
| 841 |
+
|
| 842 |
+
class VisualizationOrchestrator:
|
| 843 |
+
"""
|
| 844 |
+
Strategic orchestration layer - coordinates visualization components
|
| 845 |
+
|
| 846 |
+
Design Philosophy:
|
| 847 |
+
- Facade Pattern: Simple interface hiding complex component interactions
|
| 848 |
+
- Dependency Injection: All components provided at construction
|
| 849 |
+
- Error Boundary: Comprehensive error handling with graceful degradation
|
| 850 |
+
- Performance Monitoring: Built-in metrics and optimization
|
| 851 |
+
"""
|
| 852 |
+
|
| 853 |
+
def __init__(
|
| 854 |
+
self,
|
| 855 |
+
content_analyzer: Optional[ContentAnalyzer] = None,
|
| 856 |
+
chart_renderer: Optional[ChartRenderer] = None,
|
| 857 |
+
dashboard_composer: Optional[DashboardComposer] = None
|
| 858 |
+
):
|
| 859 |
+
# Use default implementations if not provided
|
| 860 |
+
self.content_analyzer = content_analyzer or OptimizedContentAnalyzer()
|
| 861 |
+
self.chart_renderer = chart_renderer or PlotlyChartRenderer()
|
| 862 |
+
self.dashboard_composer = dashboard_composer or EnterpriseDashboardComposer(
|
| 863 |
+
self.chart_renderer
|
| 864 |
+
)
|
| 865 |
+
|
| 866 |
+
# Performance metrics
|
| 867 |
+
self.visualization_count = 0
|
| 868 |
+
self.error_count = 0
|
| 869 |
+
self.total_processing_time = 0.0
|
| 870 |
+
|
| 871 |
+
def create_quality_dashboard(self, conversion_result, analysis_result=None) -> go.Figure:
|
| 872 |
+
"""
|
| 873 |
+
Primary interface for quality dashboard generation
|
| 874 |
+
|
| 875 |
+
Strategic Approach:
|
| 876 |
+
- Input Validation: Comprehensive parameter checking
|
| 877 |
+
- Data Transformation: Convert external formats to internal abstractions
|
| 878 |
+
- Component Coordination: Orchestrate analysis and visualization
|
| 879 |
+
- Error Recovery: Graceful degradation for failed components
|
| 880 |
+
"""
|
| 881 |
+
|
| 882 |
+
start_time = datetime.now()
|
| 883 |
+
self.visualization_count += 1
|
| 884 |
+
|
| 885 |
+
try:
|
| 886 |
+
# Convert external data to internal abstraction
|
| 887 |
+
analysis_data = DocumentAnalysisData.from_processing_result(
|
| 888 |
+
conversion_result, analysis_result
|
| 889 |
+
)
|
| 890 |
+
|
| 891 |
+
# Generate quality assessment
|
| 892 |
+
quality_metrics = self.content_analyzer.calculate_quality_metrics(analysis_data)
|
| 893 |
+
|
| 894 |
+
# Analyze document structure
|
| 895 |
+
structural_metrics = self.content_analyzer.analyze_structure(analysis_data.content)
|
| 896 |
+
|
| 897 |
+
# Create comprehensive dashboard
|
| 898 |
+
dashboard = self.dashboard_composer.compose_quality_dashboard(
|
| 899 |
+
quality_metrics, structural_metrics
|
| 900 |
+
)
|
| 901 |
+
|
| 902 |
+
# Track performance
|
| 903 |
+
processing_time = (datetime.now() - start_time).total_seconds()
|
| 904 |
+
self.total_processing_time += processing_time
|
| 905 |
+
|
| 906 |
+
logger.info(f"Quality dashboard generated in {processing_time:.2f}s")
|
| 907 |
+
return dashboard
|
| 908 |
+
|
| 909 |
+
except Exception as e:
|
| 910 |
+
self.error_count += 1
|
| 911 |
+
logger.error(f"Quality dashboard generation failed: {str(e)}")
|
| 912 |
+
|
| 913 |
+
# Return fallback visualization
|
| 914 |
+
return self._create_error_fallback_dashboard(str(e))
|
| 915 |
+
|
| 916 |
+
def create_structural_analysis_viz(self, conversion_result) -> go.Figure:
|
| 917 |
+
"""Generate detailed structural analysis visualization"""
|
| 918 |
+
|
| 919 |
+
try:
|
| 920 |
+
analysis_data = DocumentAnalysisData.from_processing_result(conversion_result)
|
| 921 |
+
structural_metrics = self.content_analyzer.analyze_structure(analysis_data.content)
|
| 922 |
+
|
| 923 |
+
# Create detailed structural visualization
|
| 924 |
+
return self._create_structure_analysis_dashboard(structural_metrics)
|
| 925 |
+
|
| 926 |
+
except Exception as e:
|
| 927 |
+
logger.error(f"Structural analysis visualization failed: {str(e)}")
|
| 928 |
+
return self._create_error_fallback_dashboard(str(e))
|
| 929 |
+
|
| 930 |
+
def create_export_ready_report(self, conversion_result, analysis_result=None) -> Dict[str, go.Figure]:
|
| 931 |
+
"""Generate comprehensive export-ready report with multiple visualizations"""
|
| 932 |
+
|
| 933 |
+
try:
|
| 934 |
+
analysis_data = DocumentAnalysisData.from_processing_result(
|
| 935 |
+
conversion_result, analysis_result
|
| 936 |
+
)
|
| 937 |
+
|
| 938 |
+
quality_metrics = self.content_analyzer.calculate_quality_metrics(analysis_data)
|
| 939 |
+
structural_metrics = self.content_analyzer.analyze_structure(analysis_data.content)
|
| 940 |
+
|
| 941 |
+
# Generate multiple visualization components
|
| 942 |
+
report_figures = {
|
| 943 |
+
'executive_dashboard': self.dashboard_composer.compose_quality_dashboard(
|
| 944 |
+
quality_metrics, structural_metrics
|
| 945 |
+
),
|
| 946 |
+
'quality_breakdown': self.chart_renderer.render_radar_chart(
|
| 947 |
+
quality_metrics.to_dict(),
|
| 948 |
+
title="Quality Assessment Breakdown"
|
| 949 |
+
),
|
| 950 |
+
'structural_analysis': self._create_structure_analysis_dashboard(structural_metrics),
|
| 951 |
+
'performance_summary': self.chart_renderer.render_gauge_chart(
|
| 952 |
+
quality_metrics.composite_score,
|
| 953 |
+
title="Overall Quality Score"
|
| 954 |
+
)
|
| 955 |
+
}
|
| 956 |
+
|
| 957 |
+
logger.info(f"Export report generated with {len(report_figures)} visualizations")
|
| 958 |
+
return report_figures
|
| 959 |
+
|
| 960 |
+
except Exception as e:
|
| 961 |
+
logger.error(f"Export report generation failed: {str(e)}")
|
| 962 |
+
return {
|
| 963 |
+
'error_report': self._create_error_fallback_dashboard(str(e))
|
| 964 |
+
}
|
| 965 |
+
|
| 966 |
+
def _create_structure_analysis_dashboard(self, structural_metrics: StructuralMetrics) -> go.Figure:
    """Create detailed structural analysis dashboard.

    Four-panel layout: element composition (donut), hierarchy summary
    (bars), content density (single colour-scaled point), and a 0-10
    structure-quality gauge derived from structure_density.
    """
    dashboard = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Element Distribution', 'Structure Hierarchy',
            'Content Density', 'Quality Assessment'
        ),
        specs=[
            [{"type": "pie"}, {"type": "bar"}],
            [{"type": "scatter"}, {"type": "indicator"}]
        ]
    )

    element_counts = {
        'Headers': structural_metrics.header_count,
        'Lists': structural_metrics.list_items,
        'Tables': structural_metrics.table_rows,
        'Code': structural_metrics.code_blocks,
        'Links': structural_metrics.links
    }

    # Panel 1: donut of element composition; zero-count slices are dropped
    # so the chart stays readable.
    present = {name: count for name, count in element_counts.items() if count > 0}
    if present:
        dashboard.add_trace(
            go.Pie(
                labels=list(present),
                values=list(present.values()),
                hole=0.3,
                marker=dict(colors=self.chart_renderer.color_palette[:len(present)])
            ),
            row=1, col=1
        )

    # Panel 2: coarse hierarchy summary bars (depth, element total, and a
    # density-derived score capped at 10).
    hierarchy = {
        'Max Depth': structural_metrics.max_header_depth,
        'Total Elements': sum(element_counts.values()),
        'Structure Score': min(structural_metrics.structure_density * 10, 10)
    }
    dashboard.add_trace(
        go.Bar(
            x=list(hierarchy),
            y=list(hierarchy.values()),
            marker=dict(color='#667eea'),
            name='Structure Metrics'
        ),
        row=1, col=2
    )

    # Panel 3: single-point density marker with a colour scale for context.
    dashboard.add_trace(
        go.Scatter(
            x=['Structure Density'],
            y=[structural_metrics.structure_density],
            mode='markers',
            marker=dict(
                size=30,
                color=structural_metrics.structure_density,
                colorscale='Viridis',
                showscale=True
            ),
            name='Density Score'
        ),
        row=2, col=1
    )

    # Panel 4: gauge of overall structure quality, capped at 10.
    gauge_value = min(structural_metrics.structure_density * 10, 10)
    dashboard.add_trace(
        go.Indicator(
            mode="gauge+number",
            value=gauge_value,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': "Structure Quality"},
            gauge={
                'axis': {'range': [None, 10]},
                'bar': {'color': "#28a745"},
                'steps': [
                    {'range': [0, 5], 'color': "lightgray"},
                    {'range': [5, 8], 'color': "gray"}
                ]
            }
        ),
        row=2, col=2
    )

    dashboard.update_layout(
        title="Document Structure Analysis",
        height=700,
        showlegend=True,
        template='plotly_white'
    )

    return dashboard
def _create_error_fallback_dashboard(self, error_message: str) -> go.Figure:
    """Create fallback visualization for error scenarios.

    Renders the (truncated) error message as a red annotation so that a
    failed pipeline still produces a figure the UI can display.
    """
    # Truncate long messages to 100 chars, marking the cut with an ellipsis.
    snippet = error_message[:100]
    if len(error_message) > 100:
        snippet += '...'

    fallback = go.Figure()
    fallback.add_annotation(
        x=0.5, y=0.5,
        xref="paper", yref="paper",
        text=f"Visualization Error<br>{snippet}",
        showarrow=False,
        font=dict(size=16, color="red"),
        bgcolor="rgba(255, 0, 0, 0.1)",
        bordercolor="red",
        borderwidth=2
    )
    fallback.update_layout(
        title="Visualization Generation Error",
        height=400,
        template='plotly_white'
    )
    return fallback
def get_performance_metrics(self) -> JSONDict:
    """Get comprehensive performance metrics for monitoring.

    Returns counters, derived rates, optional analyzer cache statistics,
    and a coarse health status ('healthy' > 90% success, 'degraded'
    > 70%, otherwise 'unhealthy'). With zero visualizations recorded,
    rates default to 0 and the status reads 'unhealthy'.
    """
    total = self.visualization_count
    if total > 0:
        avg_time = self.total_processing_time / total
        success_pct = (total - self.error_count) / total * 100
    else:
        avg_time = 0
        success_pct = 0

    # Cache statistics are optional on the analyzer; probe before calling.
    if hasattr(self.content_analyzer, 'get_cache_statistics'):
        cache_stats = self.content_analyzer.get_cache_statistics()
    else:
        cache_stats = {}

    if success_pct > 90:
        health = 'healthy'
    elif success_pct > 70:
        health = 'degraded'
    else:
        health = 'unhealthy'

    return {
        'visualizations_generated': total,
        'error_count': self.error_count,
        'success_rate_percent': success_pct,
        'average_processing_time': avg_time,
        'total_processing_time': self.total_processing_time,
        'cache_statistics': cache_stats,
        'status': health
    }
+
# ==================== BACKWARDS COMPATIBILITY LAYER ====================
|
| 1123 |
+
|
| 1124 |
+
class InteractiveVisualizationEngine:
    """
    Backwards compatibility facade for existing code.

    Keeps the legacy public API intact while forwarding every call to
    the new VisualizationOrchestrator, giving existing consumers a
    zero-change migration path onto the refactored architecture.
    """

    def __init__(self, config=None):
        # All real work is delegated; this facade only holds the
        # orchestrator and the (optional) legacy config mapping.
        self.orchestrator = VisualizationOrchestrator()
        self.config = config or {}
        logger.info("InteractiveVisualizationEngine initialized with new architecture")

    def create_quality_dashboard(self, conversion_result, analysis_result=None):
        """Legacy API compatibility method"""
        return self.orchestrator.create_quality_dashboard(conversion_result, analysis_result)

    def create_structural_analysis_viz(self, conversion_result):
        """Legacy API compatibility method"""
        return self.orchestrator.create_structural_analysis_viz(conversion_result)

    def create_export_ready_report(self, conversion_result, analysis_result=None):
        """Legacy API compatibility method"""
        return self.orchestrator.create_export_ready_report(conversion_result, analysis_result)

    def create_comparison_analysis(self, results):
        """Placeholder for comparison analysis - future implementation"""
        logger.warning("Comparison analysis not yet implemented in refactored architecture")

        # Emit a "coming soon" figure plus an empty frame so callers that
        # unpack (figure, dataframe) keep working.
        placeholder = go.Figure()
        placeholder.add_annotation(
            x=0.5, y=0.5,
            xref="paper", yref="paper",
            text="Comparison Analysis<br/>Coming Soon in Next Release",
            showarrow=False,
            font=dict(size=16, color="gray")
        )
        placeholder.update_layout(title="Feature Under Development", height=400)

        return placeholder, pd.DataFrame()
|
| 1172 |
+
class QualityMetricsCalculator:
    """
    Backwards compatibility wrapper for quality metrics calculation.

    Presents the legacy static interface while delegating the actual
    analysis to OptimizedContentAnalyzer.
    """

    def __init__(self):
        # Held for API parity with the legacy class; the static entry
        # point below constructs its own analyzer instance.
        self.analyzer = OptimizedContentAnalyzer()
        logger.info("QualityMetricsCalculator initialized with optimized backend")

    @staticmethod
    def calculate_conversion_quality_metrics(conversion_result, analysis_result=None):
        """Legacy API method - delegates to new architecture"""
        backend = OptimizedContentAnalyzer()

        # Bridge the legacy inputs into the new data container.
        doc_data = DocumentAnalysisData.from_processing_result(
            conversion_result, analysis_result
        )

        assessment = backend.calculate_quality_metrics(doc_data)
        structure = backend.analyze_structure(doc_data.content)

        text = doc_data.content
        # Reproduce the legacy nested-dict layout expected by old callers.
        return {
            'composite_score': assessment.composite_score,
            'basic_metrics': {
                'total_words': len(text.split()) if text else 0,
                'total_lines': len(text.split('\n')) if text else 0,
                'total_characters': len(text)
            },
            'structural_metrics': structure.to_dict(),
            'content_metrics': {
                'information_density': structure.structure_density
            },
            'performance_metrics': {
                'processing_time_seconds': doc_data.processing_metrics.get('processing_time', 0),
                'efficiency_score': assessment.performance_score
            },
            'ai_analysis_metrics': {
                'overall_ai_score': assessment.ai_score,
                'analysis_available': doc_data.ai_analysis_data is not None
            }
        }
| 1221 |
+
|
| 1222 |
+
# ==================== CONFIGURATION CLASSES ====================
|
| 1223 |
+
|
| 1224 |
+
@dataclass
class VisualizationConfig:
    """Configuration container for visualization settings."""

    class VisualizationTheme(Enum):
        # Values are Plotly template names.
        CORPORATE = "plotly_white"
        DARK_MODERN = "plotly_dark"
        MINIMAL = "simple_white"
        PRESENTATION = "presentation"

    # Rendering defaults; each field may be overridden per instance.
    theme: VisualizationTheme = VisualizationTheme.CORPORATE
    width: int = 800
    height: int = 600
    show_legend: bool = True
    interactive: bool = True
    export_format: str = "html"
    # default_factory gives every instance its own palette list.
    color_palette: List[str] = field(
        default_factory=lambda: [
            '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
            '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
        ]
    )
|
| 1245 |
+
|
| 1246 |
+
class ReportGenerator:
    """
    Enterprise report generation with multiple output formats.

    Backwards compatibility wrapper around the new architecture:
    accepts either the legacy facade or a bare orchestrator.
    """

    def __init__(self, viz_engine):
        # Unwrap the legacy facade to its orchestrator; anything else is
        # assumed to already be an orchestrator-compatible object.
        if isinstance(viz_engine, InteractiveVisualizationEngine):
            self.orchestrator = viz_engine.orchestrator
        else:
            self.orchestrator = viz_engine

    def generate_executive_report(self, conversion_result, analysis_result=None, export_format="html"):
        """Generate comprehensive executive report with new architecture.

        Returns a dict with 'metadata', 'executive_summary',
        'visualizations', 'quality_metrics' and 'export_format' keys;
        on failure the same shape is returned with empty sections and
        the error recorded in metadata.
        """
        try:
            figures = self.orchestrator.create_export_ready_report(
                conversion_result, analysis_result
            )

            doc_data = DocumentAnalysisData.from_processing_result(
                conversion_result, analysis_result
            )
            quality = OptimizedContentAnalyzer().calculate_quality_metrics(doc_data)

            summary = self._generate_executive_summary(quality, analysis_result)

            return {
                'metadata': {
                    'generated_at': datetime.now().isoformat(),
                    'document_name': doc_data.metadata.get('original_file', {}).get('filename', 'Unknown'),
                    'overall_score': quality.composite_score
                },
                'executive_summary': summary,
                'visualizations': figures,
                'quality_metrics': quality.to_dict(),
                'export_format': export_format
            }

        except Exception as exc:
            logger.error(f"Executive report generation failed: {str(exc)}")
            # Degrade to an empty-but-well-formed report payload.
            return {
                'metadata': {'generated_at': datetime.now().isoformat(), 'error': str(exc)},
                'executive_summary': {'error': 'Report generation failed'},
                'visualizations': {},
                'quality_metrics': {},
                'export_format': export_format
            }

    def _generate_executive_summary(self, quality_metrics: QualityAssessment, analysis_result):
        """Generate executive summary with business-friendly language."""
        score = quality_metrics.composite_score

        # Map the composite score onto a rating band and its advice text.
        for threshold, label, advice in (
            (8, "Excellent",
             "Document conversion achieved outstanding quality. Ready for production deployment."),
            (6, "Good",
             "Document conversion quality is good with minor optimization opportunities."),
            (4, "Acceptable",
             "Document conversion quality is acceptable. Consider improvements for enhanced results."),
        ):
            if score >= threshold:
                quality_assessment = label
                recommendation = advice
                break
        else:
            quality_assessment = "Needs Improvement"
            recommendation = "Document conversion quality requires attention. Review source document and processing settings."

        # Collect positive highlights for scores above the 7-point bar.
        key_insights = []
        if quality_metrics.structural_score > 7:
            key_insights.append("Strong document structure with well-organized content hierarchy.")
        if quality_metrics.ai_score > 7:
            key_insights.append("AI analysis confirms high-quality content extraction and processing.")
        if quality_metrics.performance_score > 7:
            key_insights.append("Efficient processing with optimal resource utilization.")

        return {
            'quality_assessment': quality_assessment,
            'overall_score': f"{score:.1f}/10",
            'recommendation': recommendation,
            'key_insights': key_insights,
            'executive_summary': f"""
            Document conversion analysis completed with an overall quality score of {score:.1f}/10,
            rated as {quality_assessment}. {recommendation}

            Key performance indicators show {len(key_insights)} positive quality factors identified
            during comprehensive analysis.
            """
        }
+
|
| 1341 |
+
|
| 1342 |
+
# ==================== PUBLIC API EXPORTS ====================
|
| 1343 |
+
|
| 1344 |
+
# Public API surface. Fix: 'VisualizationConfig' was previously listed twice
# (under both "Backwards compatibility" and "Configuration"); it is exported
# once here.
__all__ = [
    # Core abstractions
    'DocumentAnalysisData',
    'StructuralMetrics',
    'QualityAssessment',
    'VisualizationRequest',

    # Primary components
    'OptimizedContentAnalyzer',
    'PlotlyChartRenderer',
    'EnterpriseDashboardComposer',
    'VisualizationOrchestrator',

    # Backwards compatibility
    'InteractiveVisualizationEngine',
    'QualityMetricsCalculator',
    'ReportGenerator',

    # Configuration
    'VisualizationConfig',
]
|
| 1366 |
+
|
| 1367 |
+
|
| 1368 |
+
# ==================== MODULE INITIALIZATION ====================
|
| 1369 |
+
|
| 1370 |
+
if __name__ == "__main__":
    # Smoke-test the refactored architecture when the module is run directly.
    logger.info("MarkItDown Visualization Engine - Architecture Validation")

    try:
        # Wire the full pipeline: analyzer -> renderer -> composer -> orchestrator.
        content_analyzer = OptimizedContentAnalyzer()
        chart_renderer = PlotlyChartRenderer()
        dashboard_composer = EnterpriseDashboardComposer(chart_renderer)
        pipeline = VisualizationOrchestrator(content_analyzer, chart_renderer, dashboard_composer)

        logger.info("✅ All components initialized successfully")

        # Exercise the legacy facades as well.
        legacy_engine = InteractiveVisualizationEngine()
        legacy_calculator = QualityMetricsCalculator()

        logger.info("✅ Backwards compatibility layer functional")
        logger.info("🚀 Visualization engine ready for production deployment")

    except Exception as exc:
        # Surface the failure loudly: log, then re-raise for a non-zero exit.
        logger.error(f"❌ Component initialization failed: {str(exc)}")
        raise
|
| 1393 |
+
# NOTE(review): JSONDict is used earlier in this file as a return annotation
# (get_performance_metrics -> JSONDict) but is only defined here, at the very
# end of the module. Unless `from __future__ import annotations` is active at
# the top of the file (not visible in this chunk), the earlier annotation
# would raise NameError at class-definition time — confirm, and consider
# moving this alias next to the imports. `JsonValue` is presumably defined
# elsewhere in the file; verify.
JSONDict = Dict[str, JsonValue]
|