Spaces:

YagndeepKukadiya
/

Deepseek-OCR-TUL

Running

File size: 6,624 Bytes

import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
from PIL import Image, ImageDraw
import re


# --------------------------------------------------------------
# Advanced monkey‑patch: force CPU + float32 everywhere
# --------------------------------------------------------------
import torch

# Keep original methods for later use
_original_to   = torch.Tensor.to
_original_half = torch.Tensor.half
_original_bf16 = torch.Tensor.bfloat16

def _patched_to(self, *args, **kwargs):
    """
    Intercept .to(device, dtype) calls.
    - Any device that starts with 'cuda' becomes 'cpu'.
    - Any dtype that is torch.bfloat16 or torch.float16 becomes torch.float32.
    """
    # ---- device handling -------------------------------------------------
    if 'device' in kwargs:
        dev = str(kwargs['device'])
        if dev.startswith('cuda'):
            kwargs['device'] = 'cpu'
    else:
        # positional args may contain a device string
        new_args = []
        for a in args:
            if isinstance(a, str) and a.startswith('cuda'):
                new_args.append('cpu')
            else:
                new_args.append(a)
        args = tuple(new_args)

    # ---- dtype handling --------------------------------------------------
    if 'dtype' in kwargs and kwargs['dtype'] in (torch.bfloat16, torch.float16):
        kwargs['dtype'] = torch.float32
    else:
        # positional dtype may appear after a device arg
        new_args = list(args)
        for i, a in enumerate(new_args):
            if isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16):
                new_args[i] = torch.float32
        args = tuple(new_args)

    return _original_to(self, *args, **kwargs)

# Replace the methods
torch.Tensor.to   = _patched_to
torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)  # .half() → float32
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)

# Also keep the old .cuda() as a no‑op (you already added this)
torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
# --------------------------------------------------------------
# --- End Monkey-Patch ---


# --- 1. Load Model and Tokenizer (Done only once at startup) ---
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("✅ Model loaded successfully.")

# --- 2. Main Processing Function (UPDATED) ---
def process_ocr_task(image, model_size, task_type):
    """
    Processes an image with DeepSeek-OCR for all supported tasks.
    Now draws ALL detected bounding boxes for ANY task.
    """
    if image is None:
        return "Please upload an image first.", None

    # 4️⃣ Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #model_gpu = model.eval().to(device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    model_cpu = model.to("cpu", dtype=torch.float32)

    if device == "cuda":
        print("✅ Model is on GPU.")
    else:
        print("✅ Model is on CPU.")

    with tempfile.TemporaryDirectory() as output_path:
        if task_type == "📝 Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "📄 Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "📈 Parse Figure":
            prompt = "<image>\nParse the figure."
        else:
            prompt = "<image>\nFree OCR."

        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)

        # Configure model size... (same as before)
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        print(f"🏃 Running inference with prompt: {prompt}")
        text_result = model_cpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

        print(f"====\n📄 Text Result: {text_result}\n====")
            
        return text_result

# --- 3. Build the Gradio Interface (UPDATED) ---
with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR X TUL

        **💡 How to use:**
        1.  **Upload an image** using the upload box.
        2.  Select a **Resolution**. `Gundam` is recommended for most documents.
        3.  Choose a **Task Type**:
            - **📝 Free OCR**: Extracts raw text from the image.
            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
            - **📈 Parse Figure**: Extracts structured data from charts and figures.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
            task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"], value="📝 Free OCR", label="🚀 Task Type")
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)

    submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type], outputs=[output_text])
    
# --- 4. Launch the App ---
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)