import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces  # kept for Hugging Face Spaces compatibility (no @spaces.GPU decorator is used here)
import os
import tempfile
from PIL import Image


# --------------------------------------------------------------
# Monkey-patch: force CPU + float32 everywhere
# --------------------------------------------------------------

# Keep the original .to() so the patched version can delegate to it
# (the other patched methods below are rewritten as plain .to() calls).
_original_to = torch.Tensor.to

def _patched_to(self, *args, **kwargs):
    """
    Intercept .to(...) calls:
    - Any device that resolves to 'cuda' becomes 'cpu'.
    - Any dtype of torch.bfloat16 or torch.float16 becomes torch.float32.
    """
    # ---- keyword arguments -----------------------------------------------
    if 'device' in kwargs and str(kwargs['device']).startswith('cuda'):
        kwargs['device'] = 'cpu'
    if kwargs.get('dtype') in (torch.bfloat16, torch.float16):
        kwargs['dtype'] = torch.float32

    # ---- positional arguments (device strings/objects and dtypes) ---------
    new_args = []
    for a in args:
        if isinstance(a, str) and a.startswith('cuda'):
            new_args.append('cpu')
        elif isinstance(a, torch.device) and a.type == 'cuda':
            new_args.append('cpu')  # also catch torch.device objects
        elif isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16):
            new_args.append(torch.float32)
        else:
            new_args.append(a)

    return _original_to(self, *new_args, **kwargs)

# Install the patched methods
torch.Tensor.to       = _patched_to
torch.Tensor.half     = lambda self, *a, **k: self.to(torch.float32)  # .half() -> float32
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)  # .bfloat16() -> float32
torch.Tensor.cuda     = lambda self, *a, **k: self.to("cpu")          # .cuda() -> stay on CPU
# --------------------------------------------------------------
# --- End Monkey-Patch ---
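
# ------------------------------------------------------------------
# Optional self-test for the patch: a minimal sketch, guarded by a
# PATCH_SELFTEST environment variable (an assumption, not part of the
# app flow). Any CUDA-targeted call should land on CPU in float32.
if os.environ.get("PATCH_SELFTEST"):
    _t = torch.zeros(2)
    assert _t.to("cuda").device.type == "cpu"   # device redirected to CPU
    assert _t.half().dtype == torch.float32     # .half() upcast to float32
    assert _t.cuda().device.type == "cpu"       # .cuda() stays on CPU
    print("✅ CPU/float32 patch verified.")
# ------------------------------------------------------------------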


# --- 1. Load Model and Tokenizer (Done only once at startup) ---
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU
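# Hedged note: with no torch_dtype passed, transformers loads the weights in
# float32 by default, matching the CPU-only patch above; _attn_implementation
# set to "eager" avoids the flash-attention path, which requires CUDA.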
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("βœ… Model loaded successfully.")

# --- 2. Main Processing Function ---
def process_ocr_task(image, model_size, task_type):
    """
    Run DeepSeek-OCR on the uploaded image for the selected task type
    and return the recognized text.
    """
    if image is None:
        return "Please upload an image first."

    # The monkey-patch above redirects any CUDA placement to CPU, so keep
    # the model explicitly on CPU in float32.
    model_cpu = model.to("cpu", dtype=torch.float32)
    print("✅ Model is on CPU.")

    with tempfile.TemporaryDirectory() as output_path:
        if task_type == "πŸ“ Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "πŸ“„ Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "πŸ“ˆ Parse Figure":
            prompt = "<image>\nParse the figure."
        else:
            prompt = "<image>\nFree OCR."

        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)

        # Resolution presets (base_size / image_size / crop_mode)
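        # Hedged note: per the DeepSeek-OCR model card, crop_mode=True
        # ("Gundam" mode) tiles the page into image_size crops plus a
        # base_size global view, which tends to work best on dense documents.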
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        print(f"πŸƒ Running inference with prompt: {prompt}")
        text_result = model_cpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

        print(f"====\nπŸ“„ Text Result: {text_result}\n====")
            
        return text_result

# --- 3. Build the Gradio Interface ---
with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR X TUL

        **💡 How to use:**
        1.  **Upload an image** using the upload box.
        2.  Select a **Resolution**. `Gundam` is recommended for most documents.
        3.  Choose a **Task Type**:
            - **📝 Free OCR**: Extracts raw text from the image.
            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
            - **📈 Parse Figure**: Extracts structured data from charts and figures.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
            task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"], value="📝 Free OCR", label="🚀 Task Type")
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)

    submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type], outputs=[output_text])
    
# --- 4. Launch the App ---
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)
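
# ------------------------------------------------------------------
# Headless usage sketch (the file name "sample.png" is hypothetical):
# the processing function can be exercised without the UI, e.g. for a
# quick smoke test:
#
#   from PIL import Image
#   text = process_ocr_task(Image.open("sample.png"),
#                           "Gundam (Recommended)", "📝 Free OCR")
#   print(text)
# ------------------------------------------------------------------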