Roman committed on
Commit 070b924 · 1 parent: 6b2b081

Add DeepSeek-OCR Gradio app for HF Space

Files changed (4)
  1. .gitignore +34 -0
  2. README.md +39 -1
  3. app.py +364 -0
  4. requirements.txt +14 -0
.gitignore ADDED
@@ -0,0 +1,34 @@
+ __pycache__/
+ *.py[cod]
+ *.so
+ *.dylib
+ *.pyd
+
+ .DS_Store
+ .env
+ .env.*
+ .venv/
+ venv/
+
+ # Python cache & build artifacts
+ build/
+ dist/
+ *.egg-info/
+
+ # Hugging Face caches
+ /.cache/
+ /cache/
+ /huggingface/
+ /.huggingface/
+ /logs/
+
+ # Temporary OCR outputs
+ deepseek_ocr_out_*/
+ deepseek_upload_*
+ tmp/
+
+ # Notebooks & checkpoints
+ *.ipynb_checkpoints
+ *.ckpt
+ *.safetensors
+
README.md CHANGED
@@ -8,6 +8,44 @@ sdk_version: 5.49.1
  app_file: app.py
  pinned: false
  short_description: OCR interface for your PDF files
+ python_version: 3.10
+ hardware: t4-small
+ license: mit
+ tags:
+ - ocr
+ - pdf
+ - gradio
+ - deepseek
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DeepSeek-OCR PDF & Image Interface
+
+ This Space wraps [`deepseek-ai/DeepSeek-OCR`](https://huggingface.co/deepseek-ai/DeepSeek-OCR) with a polished Gradio UI that can transcribe both individual images and multi-page PDFs into clean Markdown. It targets the free T4 GPU tier for fast startup while enabling flash-attention and optional vLLM acceleration for multi-page batching.
+
+ ## Features
+
+ - Support for `.png`, `.jpg`, `.jpeg`, `.webp`, `.tiff`, and `.pdf`
+ - Automatic PDF page conversion with PyMuPDF at 192 DPI
+ - Gundam mode defaults (`base_size=1024`, `image_size=640`, `crop_mode=True`) for balanced speed and accuracy
+ - Markdown-formatted output with per-page sections
+ - Optional custom prompt to tailor extraction instructions
+
+ ## Running Locally
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ The interface launches on `http://127.0.0.1:7860` by default. Set the environment variable `USE_VLLM=0` to disable the vLLM backend, or leave it enabled to leverage faster batching when the dependency is available.
+
+ ## Space Configuration
+
+ - **Hardware**: `t4-small`
+ - **Python**: `3.10`
+ - **SDK**: `Gradio 5.49.1`
+ - **Model**: `deepseek-ai/DeepSeek-OCR`
+
+ Refer to the [Spaces configuration reference](https://huggingface.co/docs/hub/spaces-config-reference) for additional customization options.
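
For reference, the Gundam-mode defaults listed in the README map one-to-one onto the kwargs of the model's `infer` method, which app.py below wires up. A minimal standalone sketch (assuming a CUDA machine with flash-attn installed and a placeholder `sample.png`; not part of this commit):

```python
# Sketch of the Gundam-mode call this Space makes; kwargs mirror GUNDAM_CONFIG in app.py.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "deepseek-ai/DeepSeek-OCR",
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation="flash_attention_2",  # as app.py requests
    torch_dtype=torch.bfloat16,
).eval().cuda()

text = model.infer(
    tokenizer,
    prompt="<image>\n<|grounding|>Convert the document to markdown.",
    image_file="sample.png",   # placeholder input
    output_path="out",
    base_size=1024,            # Gundam mode: global view resolution
    image_size=640,            # per-crop resolution
    crop_mode=True,            # tile large pages into crops
    save_results=False,
)
print(text)
```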
app.py ADDED
@@ -0,0 +1,364 @@
+ """Gradio interface for DeepSeek-OCR on Hugging Face Spaces.
+
+ This application loads the `deepseek-ai/DeepSeek-OCR` vision-language model
+ and exposes a simple interface capable of processing both image and PDF
+ documents. The implementation targets the Hugging Face free T4 GPU runtime and
+ optimizes throughput with bfloat16 precision, flash-attention, and optional
+ vLLM acceleration when available.
+ """
+
+ from __future__ import annotations
+
+ import contextlib
+ import dataclasses
+ import logging
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import List, Optional
+
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+
+ try:  # Optional dependency for faster batching
+     from vllm import LLM, SamplingParams  # type: ignore
+
+     _HAS_VLLM = True
+ except Exception:  # pragma: no cover - optional path
+     LLM = None  # type: ignore
+     SamplingParams = None  # type: ignore
+     _HAS_VLLM = False
+
+ try:
+     import fitz  # type: ignore[attr-defined]
+ except Exception as exc:  # pragma: no cover - ensures import error is visible
+     raise RuntimeError(
+         "PyMuPDF (fitz) is required for PDF processing. Install pymupdf."
+     ) from exc
+
+
+ logging.basicConfig(level=logging.INFO)
+ LOGGER = logging.getLogger("deepseek_ocr_app")
+
+
+ MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
+ DEFAULT_PROMPT = "<image>\n<|grounding|>Convert the document to markdown."
+ GUNDAM_CONFIG = {
+     "base_size": 1024,
+     "image_size": 640,
+     "crop_mode": True,
+     "test_compress": True,
+ }
+
+
+ @dataclasses.dataclass
+ class PageResult:
+     """Result for a single page processed by DeepSeek-OCR."""
+
+     index: int
+     text: str
+
+
+ @dataclasses.dataclass
+ class DocumentResult:
+     """Aggregate OCR result for an input document."""
+
+     filename: str
+     page_results: List[PageResult]
+
+     def to_markdown(self) -> str:
+         sections = []
+         for page in self.page_results:
+             heading = f"### Page {page.index}"
+             sections.append(f"{heading}\n\n{page.text.strip()}".strip())
+         return "\n\n".join(sections).strip()
+
+
+ def has_cuda() -> bool:
+     return torch.cuda.is_available()
+
+
+ class DeepSeekOCREngine:
+     """Wrapper around the DeepSeek-OCR model for document processing."""
+
+     def __init__(
+         self,
+         model_name: str = MODEL_NAME,
+         prompt: str = DEFAULT_PROMPT,
+         config: Optional[dict] = None,
+         enable_vllm: bool = True,
+     ) -> None:
+         self.model_name = model_name
+         self.prompt_template = prompt
+         self.config = {**GUNDAM_CONFIG, **(config or {})}
+         self.enable_vllm = enable_vllm and _HAS_VLLM
+         self.device = torch.device("cuda" if has_cuda() else "cpu")
+         self._model = None
+         self._tokenizer = None
+         self._vllm_engine = None
+         self._vllm_sampling_params = None
+         self._output_root = Path(tempfile.mkdtemp(prefix="deepseek_ocr_out_"))
+         self._load_model()
+
+     @property
+     def tokenizer(self):
+         if self._tokenizer is None:
+             raise RuntimeError("Tokenizer not initialized")
+         return self._tokenizer
+
+     @property
+     def model(self):
+         if self._model is None:
+             raise RuntimeError("Model not initialized")
+         return self._model
+
+     def _load_model(self) -> None:
+         torch.backends.cudnn.allow_tf32 = True
+         torch.backends.cuda.matmul.allow_tf32 = True
+
+         if self.enable_vllm:
+             try:
+                 LOGGER.info("Initializing DeepSeek-OCR with vLLM backend")
+                 self._vllm_engine = LLM(
+                     model=self.model_name,
+                     dtype="bfloat16" if has_cuda() else "float32",
+                     tokenizer=self.model_name,
+                     trust_remote_code=True,
+                 )
+                 self._vllm_sampling_params = SamplingParams(
+                     temperature=0.0,
+                     top_p=0.9,
+                     max_tokens=4096,
+                 )
+             except Exception as vllm_error:
+                 LOGGER.warning(
+                     "vLLM initialization failed (%s). Falling back to HF AutoModel.",
+                     vllm_error,
+                 )
+                 self.enable_vllm = False
+
+         if not self.enable_vllm:
+             LOGGER.info("Loading DeepSeek-OCR with transformers backend")
+             self._tokenizer = AutoTokenizer.from_pretrained(
+                 self.model_name, trust_remote_code=True
+             )
+             torch_dtype = torch.bfloat16 if self.device.type == "cuda" else torch.float32
+             self._model = AutoModel.from_pretrained(
+                 self.model_name,
+                 trust_remote_code=True,
+                 use_safetensors=True,
+                 _attn_implementation="flash_attention_2",
+                 torch_dtype=torch_dtype,
+             )
+             self._model = self._model.eval().to(self.device)
+
+     def cleanup(self) -> None:
+         if self._output_root.exists():
+             shutil.rmtree(self._output_root, ignore_errors=True)
+
+     def _infer_transformers(self, image_path: Path, prompt: str) -> str:
+         result = self.model.infer(
+             self.tokenizer,
+             prompt=prompt,
+             image_file=str(image_path),
+             output_path=str(self._output_root),
+             base_size=self.config["base_size"],
+             image_size=self.config["image_size"],
+             crop_mode=self.config["crop_mode"],
+             save_results=False,
+             test_compress=self.config.get("test_compress", True),
+         )
+
+         if isinstance(result, dict):
+             for key in ("text", "markdown", "raw_text", "result"):
+                 if key in result and isinstance(result[key], str):
+                     return result[key]
+             return "\n".join(str(v) for v in result.values())
+         if isinstance(result, (list, tuple)):
+             return "\n".join(str(item) for item in result)
+         return str(result)
+
+     def _infer_vllm(self, image_path: Path, prompt: str) -> str:
+         if not self.enable_vllm or self._vllm_engine is None:
+             raise RuntimeError("vLLM backend is not initialized")
+
+         formatted_prompt = f"<image>{prompt.replace('<image>', '').strip()}"
+         # vLLM takes multimodal inputs as prompt dicts carrying
+         # `multi_modal_data`; it has no `image_data` keyword.
+         outputs = self._vllm_engine.generate(
+             {
+                 "prompt": formatted_prompt,
+                 "multi_modal_data": {"image": Image.open(image_path)},
+             },
+             sampling_params=self._vllm_sampling_params,
+         )
+         return outputs[0].outputs[0].text if outputs else ""
+
+     def _infer(self, image_path: Path, prompt: str) -> str:
+         if self.enable_vllm:
+             try:
+                 return self._infer_vllm(image_path, prompt)
+             except Exception as error:
+                 LOGGER.warning(
+                     "Falling back to transformers backend after vLLM error: %s",
+                     error,
+                 )
+                 self.enable_vllm = False
+                 # The transformers weights were never loaded while vLLM owned
+                 # inference, so load them before falling back.
+                 self._load_model()
+         return self._infer_transformers(image_path, prompt)
+
+     def _convert_pdf_to_images(
+         self, pdf_path: Path, output_dir: Path, dpi: int = 192
+     ) -> List[Path]:
+         document = fitz.open(pdf_path)
+         image_paths: List[Path] = []
+         zoom = dpi / 72  # Default PDF DPI is 72
+         matrix = fitz.Matrix(zoom, zoom)
+
+         for page_index in range(len(document)):
+             page = document.load_page(page_index)
+             pixmap = page.get_pixmap(matrix=matrix, alpha=False)
+             page_path = output_dir / f"page-{page_index + 1:04d}.png"
+             pixmap.save(page_path)
+             image_paths.append(page_path)
+
+         document.close()
+         return image_paths
+
+     def process_document(
+         self,
+         file_path: Path,
+         prompt: Optional[str] = None,
+         progress: Optional[gr.Progress] = None,
+     ) -> DocumentResult:
+         prompt_to_use = prompt.strip() if prompt and prompt.strip() else self.prompt_template
+         suffix = file_path.suffix.lower()
+
+         with tempfile.TemporaryDirectory(prefix="deepseek_ocr_tmp_") as tmp_dir:
+             tmp_dir_path = Path(tmp_dir)
+             if suffix in {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff"}:
+                 image_paths = [self._ensure_rgb_image(file_path, tmp_dir_path)]
+             elif suffix == ".pdf":
+                 if progress:
+                     progress(0.0, desc="Converting PDF pages")
+                 image_paths = self._convert_pdf_to_images(file_path, tmp_dir_path)
+             else:
+                 raise ValueError("Unsupported file format. Please upload an image or PDF.")
+
+             total_pages = len(image_paths)
+             page_results: List[PageResult] = []
+
+             for idx, image_path in enumerate(image_paths, start=1):
+                 if progress:
+                     progress(
+                         (idx - 1) / max(total_pages, 1),
+                         desc=f"Processing page {idx}/{total_pages}",
+                     )
+                 text = self._infer(image_path, prompt_to_use)
+                 page_results.append(PageResult(index=idx, text=text))
+             if progress:
+                 progress(1.0, desc="Completed")
+
+         return DocumentResult(filename=file_path.name, page_results=page_results)
+
+     def _ensure_rgb_image(self, image_path: Path, output_dir: Path) -> Path:
+         """Ensure the provided image is saved as RGB PNG for the model."""
+
+         image = Image.open(image_path)
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+         output_path = output_dir / f"image-{image_path.stem}.png"
+         image.save(output_path, format="PNG", optimize=True)
+         return output_path
+
+
+ @contextlib.contextmanager
+ def progress_tracker(progress: Optional[gr.Progress]):
+     yield progress if progress else None
+
+
+ ENGINE: Optional[DeepSeekOCREngine] = None
+
+
+ def get_engine() -> DeepSeekOCREngine:
+     global ENGINE
+     if ENGINE is None:
+         use_vllm_env = os.getenv("USE_VLLM", "1").strip().lower()
+         enable_vllm = use_vllm_env not in {"0", "false", "no"}
+         LOGGER.info("Instantiating DeepSeek-OCR engine (vLLM=%s)", enable_vllm)
+         ENGINE = DeepSeekOCREngine(enable_vllm=enable_vllm)
+     return ENGINE
+
+
+ def handle_upload(
+     file: str | None,
+     prompt: str,
+     progress: gr.Progress = gr.Progress(track_tqdm=True),
+ ) -> str:
+     if file is None:
+         raise gr.Error("Please upload an image or PDF file to start OCR.")
+
+     # gr.File(type="filepath") passes the upload to us as a path string.
+     uploaded_path = Path(file)
+     fd, tmp_path_str = tempfile.mkstemp(
+         prefix="deepseek_upload_",
+         suffix=uploaded_path.suffix,
+     )
+     os.close(fd)
+     tmp_copy = Path(tmp_path_str)
+     shutil.copy(uploaded_path, tmp_copy)
+
+     engine = get_engine()
+
+     try:
+         with progress_tracker(progress) as tracker:
+             result = engine.process_document(tmp_copy, prompt=prompt, progress=tracker)
+     finally:
+         tmp_copy.unlink(missing_ok=True)
+
+     return result.to_markdown()
+
+
+ def build_interface() -> gr.Blocks:
+     description = (
+         "Upload an image or PDF and DeepSeek-OCR will transcribe it into Markdown. "
+         "Optimized for Hugging Face free T4 GPU Spaces with flash-attention and "
+         "optional vLLM acceleration."
+     )
+
+     with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# DeepSeek-OCR PDF & Image Reader")
+         gr.Markdown(description)
+
+         with gr.Row(equal_height=False):
+             with gr.Column(scale=1):
+                 file_input = gr.File(
+                     label="Upload document",
+                     file_count="single",
+                     type="filepath",  # Gradio 4+ removed type="file"
+                     file_types=[".png", ".jpg", ".jpeg", ".pdf", ".bmp", ".webp", ".tiff", ".tif"],
+                 )
+                 prompt_box = gr.Textbox(
+                     label="Prompt",
+                     value=DEFAULT_PROMPT,
+                     lines=3,
+                     show_label=True,
+                     placeholder="Enter the grounding instruction for OCR",
+                 )
+                 submit_btn = gr.Button("Run OCR", variant="primary")
+
+             with gr.Column(scale=1):
+                 result_output = gr.Markdown(label="OCR Markdown Output")
+
+         submit_btn.click(
+             fn=handle_upload,
+             inputs=[file_input, prompt_box],
+             outputs=[result_output],
+         )
+
+     return demo
+
+
+ demo = build_interface()
+
+
+ if __name__ == "__main__":
+     # Gradio 4+ renamed queue(concurrency_count=...) to
+     # queue(default_concurrency_limit=...) and removed status_tracker.
+     demo.queue(default_concurrency_limit=2).launch()
+
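
The engine above can also be exercised headlessly, without the Gradio UI. A small smoke-test sketch (it assumes this repo's app.py is importable as `app` and that a local `sample.pdf` exists; both are assumptions for illustration, not part of the commit):

```python
# Smoke test for the OCR engine without launching the web interface.
import os
from pathlib import Path

os.environ["USE_VLLM"] = "0"  # force the transformers backend, as the README describes

from app import get_engine  # assumes app.py is on the import path

engine = get_engine()
result = engine.process_document(Path("sample.pdf"))  # placeholder input file
print(result.to_markdown())  # one "### Page N" section per PDF page
engine.cleanup()             # remove the temporary output directory
```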
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch==2.6.0
+ transformers==4.46.3
+ tokenizers==0.20.3
+ einops==0.8.0
+ addict==2.4.0
+ easydict==1.11
+ flash-attn==2.7.3
+ gradio==5.49.1
+ pymupdf==1.24.11
+ pillow==10.4.0
+ numpy==2.1.2
+ vllm==0.6.1
+ uvicorn==0.30.6
+
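
Since `flash-attn` compiles against the local CUDA toolchain and app.py requests `_attn_implementation="flash_attention_2"` unconditionally, a quick import check (a sketch, not part of the commit) can surface a broken install before the Space boots:

```python
# Sanity-check the pinned stack before starting the app.
import torch

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
try:
    import flash_attn  # built from source against the local CUDA toolchain
    print("flash-attn", flash_attn.__version__)
except ImportError as err:
    # app.py loads the model with flash_attention_2, so a missing
    # flash-attn would otherwise surface only at model load time.
    print("flash-attn missing:", err)
```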