Roman committed on
Commit 070b924 · 1 parent: 6b2b081

Add DeepSeek-OCR Gradio app for HF Space

Files changed (4)
  1. .gitignore +34 -0
  2. README.md +39 -1
  3. app.py +364 -0
  4. requirements.txt +14 -0
.gitignore ADDED
@@ -0,0 +1,34 @@
+ __pycache__/
+ *.py[cod]
+ *.so
+ *.dylib
+ *.pyd
+
+ .DS_Store
+ .env
+ .env.*
+ .venv/
+ venv/
+
+ # Python cache & build artifacts
+ build/
+ dist/
+ *.egg-info/
+
+ # Hugging Face caches
+ /.cache/
+ /cache/
+ /huggingface/
+ /.huggingface/
+ /logs/
+
+ # Temporary OCR outputs
+ deepseek_ocr_out_*/
+ deepseek_upload_*
+ tmp/
+
+ # Notebooks & checkpoints
+ *.ipynb_checkpoints
+ *.ckpt
+ *.safetensors
+
README.md CHANGED
@@ -8,6 +8,44 @@ sdk_version: 5.49.1
  app_file: app.py
  pinned: false
  short_description: OCR interface for your PDF files
+ python_version: 3.10
+ hardware: t4-small
+ license: mit
+ tags:
+ - ocr
+ - pdf
+ - gradio
+ - deepseek
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DeepSeek-OCR PDF & Image Interface
+
+ This Space wraps [`deepseek-ai/DeepSeek-OCR`](https://huggingface.co/deepseek-ai/DeepSeek-OCR) with a polished Gradio UI that can transcribe both individual images and multi-page PDFs into clean Markdown. It targets the free T4 GPU tier for fast startup while enabling flash-attention and optional vLLM acceleration for multi-page batching.
+
+ ## Features
+
+ - Support for `.png`, `.jpg`, `.jpeg`, `.webp`, `.tiff`, and `.pdf`
+ - Automatic PDF page conversion with PyMuPDF at 192 DPI
+ - Gundam mode defaults (`base_size=1024`, `image_size=640`, `crop_mode=True`) for balanced speed and accuracy
+ - Markdown-formatted output with per-page sections
+ - Optional custom prompt to tailor extraction instructions
+
+ ## Running Locally
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ The interface launches on `http://127.0.0.1:7860` by default. Set the environment variable `USE_VLLM=0` to disable the vLLM backend, or leave it enabled to leverage faster batching when the dependency is available.
+
+ ## Space Configuration
+
+ - **Hardware**: `t4-small`
+ - **Python**: `3.10`
+ - **SDK**: `Gradio 5.49.1`
+ - **Model**: `deepseek-ai/DeepSeek-OCR`
+
+ Refer to the [Spaces configuration reference](https://huggingface.co/docs/hub/spaces-config-reference) for additional customization options.
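
For reference, the Gundam-mode defaults listed in the README map one-to-one onto the kwargs of the model's `infer` method, which app.py below wires up. A minimal standalone sketch (assuming a CUDA machine with flash-attn installed and a placeholder `sample.png`; not part of this commit):

```python
# Sketch of the Gundam-mode call this Space makes; kwargs mirror GUNDAM_CONFIG in app.py.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "deepseek-ai/DeepSeek-OCR",
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation="flash_attention_2",  # as app.py requests
    torch_dtype=torch.bfloat16,
).eval().cuda()

text = model.infer(
    tokenizer,
    prompt="<image>\n<|grounding|>Convert the document to markdown.",
    image_file="sample.png",   # placeholder input
    output_path="out",
    base_size=1024,            # Gundam mode: global view resolution
    image_size=640,            # per-crop resolution
    crop_mode=True,            # tile large pages into crops
    save_results=False,
)
print(text)
```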
app.py ADDED
@@ -0,0 +1,364 @@
+ """Gradio interface for DeepSeek-OCR on Hugging Face Spaces.
+
+ This application loads the `deepseek-ai/DeepSeek-OCR` vision-language model
+ and exposes a simple interface capable of processing both image and PDF
+ documents. The implementation targets the Hugging Face free T4 GPU runtime and
+ optimizes throughput with bfloat16 precision, flash-attention, and optional
+ vLLM acceleration when available.
+ """
+
+ from __future__ import annotations
+
+ import contextlib
+ import dataclasses
+ import logging
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import List, Optional
+
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+
+ try:  # Optional dependency for faster batching
+     from vllm import LLM, SamplingParams  # type: ignore
+
+     _HAS_VLLM = True
+ except Exception:  # pragma: no cover - optional path
+     LLM = None  # type: ignore
+     SamplingParams = None  # type: ignore
+     _HAS_VLLM = False
+
+ try:
+     import fitz  # type: ignore[attr-defined]
+ except Exception as exc:  # pragma: no cover - ensures import error is visible
+     raise RuntimeError(
+         "PyMuPDF (fitz) is required for PDF processing. Install pymupdf."
+     ) from exc
+
+
+ logging.basicConfig(level=logging.INFO)
+ LOGGER = logging.getLogger("deepseek_ocr_app")
+
+
+ MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
+ DEFAULT_PROMPT = "<image>\n<|grounding|>Convert the document to markdown."
+ GUNDAM_CONFIG = {
+     "base_size": 1024,
+     "image_size": 640,
+     "crop_mode": True,
+     "test_compress": True,
+ }
+
+
+ @dataclasses.dataclass
+ class PageResult:
+     """Result for a single page processed by DeepSeek-OCR."""
+
+     index: int
+     text: str
+
+
+ @dataclasses.dataclass
+ class DocumentResult:
+     """Aggregate OCR result for an input document."""
+
+     filename: str
+     page_results: List[PageResult]
+
+     def to_markdown(self) -> str:
+         sections = []
+         for page in self.page_results:
+             heading = f"### Page {page.index}"
+             sections.append(f"{heading}\n\n{page.text.strip()}".strip())
+         return "\n\n".join(sections).strip()
+
+
+ def has_cuda() -> bool:
+     return torch.cuda.is_available()
+
+
+ class DeepSeekOCREngine:
+     """Wrapper around the DeepSeek-OCR model for document processing."""
+
+     def __init__(
+         self,
+         model_name: str = MODEL_NAME,
+         prompt: str = DEFAULT_PROMPT,
+         config: Optional[dict] = None,
+         enable_vllm: bool = True,
+     ) -> None:
+         self.model_name = model_name
+         self.prompt_template = prompt
+         self.config = {**GUNDAM_CONFIG, **(config or {})}
+         self.enable_vllm = enable_vllm and _HAS_VLLM
+         self.device = torch.device("cuda" if has_cuda() else "cpu")
+         self._model = None
+         self._tokenizer = None
+         self._vllm_engine = None
+         self._vllm_sampling_params = None
+         self._output_root = Path(tempfile.mkdtemp(prefix="deepseek_ocr_out_"))
+         self._load_model()
+
+     @property
+     def tokenizer(self):
+         if self._tokenizer is None:
+             raise RuntimeError("Tokenizer not initialized")
+         return self._tokenizer
+
+     @property
+     def model(self):
+         if self._model is None:
+             raise RuntimeError("Model not initialized")
+         return self._model
+
+     def _load_model(self) -> None:
+         torch.backends.cudnn.allow_tf32 = True
+         torch.backends.cuda.matmul.allow_tf32 = True
+
+         if self.enable_vllm:
+             try:
+                 LOGGER.info("Initializing DeepSeek-OCR with vLLM backend")
+                 self._vllm_engine = LLM(
+                     model=self.model_name,
+                     dtype="bfloat16" if has_cuda() else "float32",
+                     tokenizer=self.model_name,
+                     trust_remote_code=True,
+                 )
+                 self._vllm_sampling_params = SamplingParams(
+                     temperature=0.0,
+                     top_p=0.9,
+                     max_tokens=4096,
+                 )
+             except Exception as vllm_error:
+                 LOGGER.warning(
+                     "vLLM initialization failed (%s). Falling back to HF AutoModel.",
+                     vllm_error,
+                 )
+                 self.enable_vllm = False
+
+         if not self.enable_vllm:
+             LOGGER.info("Loading DeepSeek-OCR with transformers backend")
+             self._tokenizer = AutoTokenizer.from_pretrained(
+                 self.model_name, trust_remote_code=True
+             )
+             torch_dtype = torch.bfloat16 if self.device.type == "cuda" else torch.float32
+             self._model = AutoModel.from_pretrained(
+                 self.model_name,
+                 trust_remote_code=True,
+                 use_safetensors=True,
+                 _attn_implementation="flash_attention_2",
+                 torch_dtype=torch_dtype,
+             )
+             self._model = self._model.eval().to(self.device)
+
+     def cleanup(self) -> None:
+         if self._output_root.exists():
+             shutil.rmtree(self._output_root, ignore_errors=True)
+
+     def _infer_transformers(self, image_path: Path, prompt: str) -> str:
+         result = self.model.infer(
+             self.tokenizer,
+             prompt=prompt,
+             image_file=str(image_path),
+             output_path=str(self._output_root),
+             base_size=self.config["base_size"],
+             image_size=self.config["image_size"],
+             crop_mode=self.config["crop_mode"],
+             save_results=False,
+             test_compress=self.config.get("test_compress", True),
+         )
+
+         if isinstance(result, dict):
+             for key in ("text", "markdown", "raw_text", "result"):
+                 if key in result and isinstance(result[key], str):
+                     return result[key]
+             return "\n".join(str(v) for v in result.values())
+         if isinstance(result, (list, tuple)):
+             return "\n".join(str(item) for item in result)
+         return str(result)
+
+     def _infer_vllm(self, image_path: Path, prompt: str) -> str:
+         if not self.enable_vllm or self._vllm_engine is None:
+             raise RuntimeError("vLLM backend is not initialized")
+
+         formatted_prompt = f"<image>{prompt.replace('<image>', '').strip()}"
+         # vLLM takes multimodal inputs as prompt dicts carrying
+         # `multi_modal_data`; it has no `image_data` keyword.
+         outputs = self._vllm_engine.generate(
+             {
+                 "prompt": formatted_prompt,
+                 "multi_modal_data": {"image": Image.open(image_path)},
+             },
+             sampling_params=self._vllm_sampling_params,
+         )
+         return outputs[0].outputs[0].text if outputs else ""
+
+     def _infer(self, image_path: Path, prompt: str) -> str:
+         if self.enable_vllm:
+             try:
+                 return self._infer_vllm(image_path, prompt)
+             except Exception as error:
+                 LOGGER.warning(
+                     "Falling back to transformers backend after vLLM error: %s",
+                     error,
+                 )
+                 self.enable_vllm = False
+                 # The transformers weights were never loaded while vLLM owned
+                 # inference, so load them before falling back.
+                 self._load_model()
+         return self._infer_transformers(image_path, prompt)
+
+     def _convert_pdf_to_images(
+         self, pdf_path: Path, output_dir: Path, dpi: int = 192
+     ) -> List[Path]:
+         document = fitz.open(pdf_path)
+         image_paths: List[Path] = []
+         zoom = dpi / 72  # Default PDF DPI is 72
+         matrix = fitz.Matrix(zoom, zoom)
+
+         for page_index in range(len(document)):
+             page = document.load_page(page_index)
+             pixmap = page.get_pixmap(matrix=matrix, alpha=False)
+             page_path = output_dir / f"page-{page_index + 1:04d}.png"
+             pixmap.save(page_path)
+             image_paths.append(page_path)
+
+         document.close()
+         return image_paths
+
+     def process_document(
+         self,
+         file_path: Path,
+         prompt: Optional[str] = None,
+         progress: Optional[gr.Progress] = None,
+     ) -> DocumentResult:
+         prompt_to_use = prompt.strip() if prompt and prompt.strip() else self.prompt_template
+         suffix = file_path.suffix.lower()
+
+         with tempfile.TemporaryDirectory(prefix="deepseek_ocr_tmp_") as tmp_dir:
+             tmp_dir_path = Path(tmp_dir)
+             if suffix in {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff"}:
+                 image_paths = [self._ensure_rgb_image(file_path, tmp_dir_path)]
+             elif suffix == ".pdf":
+                 if progress:
+                     progress(0.0, desc="Converting PDF pages")
+                 image_paths = self._convert_pdf_to_images(file_path, tmp_dir_path)
+             else:
+                 raise ValueError("Unsupported file format. Please upload an image or PDF.")
+
+             total_pages = len(image_paths)
+             page_results: List[PageResult] = []
+
+             for idx, image_path in enumerate(image_paths, start=1):
+                 if progress:
+                     progress(
+                         (idx - 1) / max(total_pages, 1),
+                         desc=f"Processing page {idx}/{total_pages}",
+                     )
+                 text = self._infer(image_path, prompt_to_use)
+                 page_results.append(PageResult(index=idx, text=text))
+             if progress:
+                 progress(1.0, desc="Completed")
+
+         return DocumentResult(filename=file_path.name, page_results=page_results)
+
+     def _ensure_rgb_image(self, image_path: Path, output_dir: Path) -> Path:
+         """Ensure the provided image is saved as RGB PNG for the model."""
+
+         image = Image.open(image_path)
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+         output_path = output_dir / f"image-{image_path.stem}.png"
+         image.save(output_path, format="PNG", optimize=True)
+         return output_path
+
+
+ @contextlib.contextmanager
+ def progress_tracker(progress: Optional[gr.Progress]):
+     yield progress if progress else None
+
+
+ ENGINE: Optional[DeepSeekOCREngine] = None
+
+
+ def get_engine() -> DeepSeekOCREngine:
+     global ENGINE
+     if ENGINE is None:
+         use_vllm_env = os.getenv("USE_VLLM", "1").strip().lower()
+         enable_vllm = use_vllm_env not in {"0", "false", "no"}
+         LOGGER.info("Instantiating DeepSeek-OCR engine (vLLM=%s)", enable_vllm)
+         ENGINE = DeepSeekOCREngine(enable_vllm=enable_vllm)
+     return ENGINE
+
+
+ def handle_upload(
+     file: str | None,
+     prompt: str,
+     progress: gr.Progress = gr.Progress(track_tqdm=True),
+ ) -> str:
+     if file is None:
+         raise gr.Error("Please upload an image or PDF file to start OCR.")
+
+     # gr.File(type="filepath") passes the upload to us as a path string.
+     uploaded_path = Path(file)
+     fd, tmp_path_str = tempfile.mkstemp(
+         prefix="deepseek_upload_",
+         suffix=uploaded_path.suffix,
+     )
+     os.close(fd)
+     tmp_copy = Path(tmp_path_str)
+     shutil.copy(uploaded_path, tmp_copy)
+
+     engine = get_engine()
+
+     try:
+         with progress_tracker(progress) as tracker:
+             result = engine.process_document(tmp_copy, prompt=prompt, progress=tracker)
+     finally:
+         tmp_copy.unlink(missing_ok=True)
+
+     return result.to_markdown()
+
+
+ def build_interface() -> gr.Blocks:
+     description = (
+         "Upload an image or PDF and DeepSeek-OCR will transcribe it into Markdown. "
+         "Optimized for Hugging Face free T4 GPU Spaces with flash-attention and "
+         "optional vLLM acceleration."
+     )
+
+     with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# DeepSeek-OCR PDF & Image Reader")
+         gr.Markdown(description)
+
+         with gr.Row(equal_height=False):
+             with gr.Column(scale=1):
+                 file_input = gr.File(
+                     label="Upload document",
+                     file_count="single",
+                     type="filepath",  # Gradio 4+ removed type="file"
+                     file_types=[".png", ".jpg", ".jpeg", ".pdf", ".bmp", ".webp", ".tiff", ".tif"],
+                 )
+                 prompt_box = gr.Textbox(
+                     label="Prompt",
+                     value=DEFAULT_PROMPT,
+                     lines=3,
+                     show_label=True,
+                     placeholder="Enter the grounding instruction for OCR",
+                 )
+                 submit_btn = gr.Button("Run OCR", variant="primary")
+
+             with gr.Column(scale=1):
+                 result_output = gr.Markdown(label="OCR Markdown Output")
+
+         submit_btn.click(
+             fn=handle_upload,
+             inputs=[file_input, prompt_box],
+             outputs=[result_output],
+         )
+
+     return demo
+
+
+ demo = build_interface()
+
+
+ if __name__ == "__main__":
+     # Gradio 4+ renamed queue(concurrency_count=...) to
+     # queue(default_concurrency_limit=...) and removed status_tracker.
+     demo.queue(default_concurrency_limit=2).launch()
+
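
The engine above can also be exercised headlessly, without the Gradio UI. A small smoke-test sketch (it assumes this repo's app.py is importable as `app` and that a local `sample.pdf` exists; both are assumptions for illustration, not part of the commit):

```python
# Smoke test for the OCR engine without launching the web interface.
import os
from pathlib import Path

os.environ["USE_VLLM"] = "0"  # force the transformers backend, as the README describes

from app import get_engine  # assumes app.py is on the import path

engine = get_engine()
result = engine.process_document(Path("sample.pdf"))  # placeholder input file
print(result.to_markdown())  # one "### Page N" section per PDF page
engine.cleanup()             # remove the temporary output directory
```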
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch==2.6.0
+ transformers==4.46.3
+ tokenizers==0.20.3
+ einops==0.8.0
+ addict==2.4.0
+ easydict==1.11
+ flash-attn==2.7.3
+ gradio==5.49.1
+ pymupdf==1.24.11
+ pillow==10.4.0
+ numpy==2.1.2
+ vllm==0.6.1
+ uvicorn==0.30.6
+
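
Since `flash-attn` compiles against the local CUDA toolchain and app.py requests `_attn_implementation="flash_attention_2"` unconditionally, a quick import check (a sketch, not part of the commit) can surface a broken install before the Space boots:

```python
# Sanity-check the pinned stack before starting the app.
import torch

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
try:
    import flash_attn  # built from source against the local CUDA toolchain
    print("flash-attn", flash_attn.__version__)
except ImportError as err:
    # app.py loads the model with flash_attention_2, so a missing
    # flash-attn would otherwise surface only at model load time.
    print("flash-attn missing:", err)
```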