import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
from PIL import Image
# --------------------------------------------------------------
# Advanced monkey-patch: force CPU + float32 everywhere
# --------------------------------------------------------------
# Keep the original implementations; _patched_to delegates to _original_to.
_original_to = torch.Tensor.to
_original_half = torch.Tensor.half
_original_bf16 = torch.Tensor.bfloat16
def _patched_to(self, *args, **kwargs):
    """
    Intercept .to(device, dtype) calls.
    - Any device that starts with 'cuda' becomes 'cpu'.
    - Any dtype that is torch.bfloat16 or torch.float16 becomes torch.float32.
    """
    # ---- device handling -------------------------------------------------
    if 'device' in kwargs:
        dev = str(kwargs['device'])
        if dev.startswith('cuda'):
            kwargs['device'] = 'cpu'
    else:
        # Positional args may contain a device string.
        new_args = []
        for a in args:
            if isinstance(a, str) and a.startswith('cuda'):
                new_args.append('cpu')
            else:
                new_args.append(a)
        args = tuple(new_args)
    # ---- dtype handling --------------------------------------------------
    if 'dtype' in kwargs and kwargs['dtype'] in (torch.bfloat16, torch.float16):
        kwargs['dtype'] = torch.float32
    else:
        # A positional dtype may appear after a device arg.
        new_args = list(args)
        for i, a in enumerate(new_args):
            if isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16):
                new_args[i] = torch.float32
        args = tuple(new_args)
    return _original_to(self, *args, **kwargs)
# Replace the methods: half-precision casts become float32 casts.
torch.Tensor.to = _patched_to
torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)      # .half() -> float32
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)  # .bfloat16() -> float32
# Keep .cuda() as a no-op that routes tensors to the CPU.
torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
# --------------------------------------------------------------
# --- End Monkey-Patch ---
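# Optional sanity check for the patch (illustrative, safe to remove): even an
# explicit .cuda().half() request should now land on CPU as float32.
_probe = torch.zeros(1).cuda().half()
assert _probe.device.type == "cpu" and _probe.dtype == torch.float32
del _probe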
# --- 1. Load Model and Tokenizer (done only once at startup) ---
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU (the monkey-patch above keeps it there in float32).
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("Model loaded successfully.")
# --- 2. Main Processing Function ---
def process_ocr_task(image, model_size, task_type):
    """
    Run DeepSeek-OCR on the uploaded image for the selected task and
    return the recognized text.
    """
    if image is None:
        return "Please upload an image first."
    # NOTE: even if CUDA is available, the monkey-patch above redirects every
    # .to()/.cuda() call to the CPU, so inference always runs on CPU/float32.
    model_cpu = model.to("cpu", dtype=torch.float32)
    print("Model is on CPU.")
    with tempfile.TemporaryDirectory() as output_path:
        # Map the UI task to its DeepSeek-OCR prompt.
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "Parse Figure":
            prompt = "<image>\nParse the figure."
        else:
            prompt = "<image>\nFree OCR."
        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)
        # Resolution presets: base_size/image_size control the vision-encoder
        # input; crop_mode tiles large pages into smaller crops.
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
print(f"π Running inference with prompt: {prompt}")
text_result = model_cpu.infer(
tokenizer,
prompt=prompt,
image_file=temp_image_path,
output_path=output_path,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
save_results=True,
test_compress=True,
eval_mode=True,
)
print(f"====\nπ Text Result: {text_result}\n====")
return text_result
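# Hypothetical direct call, bypassing the UI (the file name "sample.png" is an
# assumption; uncomment to try):
#   from PIL import Image
#   print(process_ocr_task(Image.open("sample.png"), "Gundam (Recommended)", "Free OCR"))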
# --- 3. Build the Gradio Interface ---
with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR X TUL
        **How to use:**
        1. **Upload an image** using the upload box.
        2. Select a **Resolution**. `Gundam` is recommended for most documents.
        3. Choose a **Task Type**:
           - **Free OCR**: Extracts raw text from the image.
           - **Convert to Markdown**: Converts the document into Markdown, preserving structure.
           - **Parse Figure**: Extracts structured data from charts and figures.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Resolution Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown", "Parse Figure"],
                value="Free OCR",
                label="Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="Text Result", lines=15, show_copy_button=True)
    submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type], outputs=[output_text])
# --- 4. Launch the App ---
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)
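# To run locally: `python <this file>`. share=True asks Gradio for a temporary
# public URL in addition to the local one; drop it if only local access is needed.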