import os
import tempfile

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer


# ---------------------------------------------------------------------------
# CPU-compatibility patches: the DeepSeek-OCR remote code hard-codes
# .cuda()/.half()/.bfloat16() calls, so redirect them all to CPU/float32.
# ---------------------------------------------------------------------------
_original_to = torch.Tensor.to


def _patched_to(self, *args, **kwargs):
    """
    Intercept .to(device, dtype) calls:
    - any device that starts with 'cuda' becomes 'cpu';
    - any dtype that is torch.bfloat16 or torch.float16 becomes torch.float32.
    """
    # Redirect the target device, whether passed as a keyword or positionally.
    if 'device' in kwargs:
        if str(kwargs['device']).startswith('cuda'):
            kwargs['device'] = 'cpu'
    else:
        args = tuple(
            'cpu' if isinstance(a, (str, torch.device)) and str(a).startswith('cuda') else a
            for a in args
        )

    # Upcast half-precision dtypes, whether passed as a keyword or positionally.
    if kwargs.get('dtype') in (torch.bfloat16, torch.float16):
        kwargs['dtype'] = torch.float32
    else:
        args = tuple(
            torch.float32 if isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16) else a
            for a in args
        )

    return _original_to(self, *args, **kwargs)


torch.Tensor.to = _patched_to
torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)
torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
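
# A minimal sanity check of the patches above (illustrative only; safe to
# delete): a tensor asked to move to CUDA at half precision should come back
# on the CPU in float32.
_probe = torch.zeros(2).cuda().half()
assert _probe.device.type == "cpu" and _probe.dtype == torch.float32
del _probe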


print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("✅ Model loaded successfully.")
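# Note: because torch.Tensor.to/.half/.bfloat16/.cuda are patched above, any
# device or dtype casts inside the model's trust_remote_code implementation
# are also redirected to CPU/float32.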


def process_ocr_task(image, model_size, task_type):
    """
    Process an image with DeepSeek-OCR for the selected task and return the
    extracted text.
    """
    if image is None:
        return "Please upload an image first."

    # The patches above force everything onto the CPU in float32, so the model
    # runs on CPU regardless of whether CUDA is available.
    model_cpu = model.to("cpu", dtype=torch.float32)
    print("✅ Model is on CPU.")

    with tempfile.TemporaryDirectory() as output_path:
        # Map the selected task to its DeepSeek-OCR prompt.
        if task_type == "📝 Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "📄 Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "📊 Parse Figure":
            prompt = "<image>\nParse the figure."
        else:
            prompt = "<image>\nFree OCR."

        # The model's infer() API reads the image from disk, so save it first.
        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)

        # Resolution presets: base_size/image_size set the vision-encoder input
        # resolution; crop_mode enables the tiled "Gundam" mode for large pages.
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        print(f"🚀 Running inference with prompt: {prompt}")
        text_result = model_cpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

        print(f"====\n📄 Text Result: {text_result}\n====")
        return text_result
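
# Example invocation outside the UI (illustrative; assumes a local file named
# "sample.png" exists):
#
#   text = process_ocr_task(Image.open("sample.png"), "Gundam (Recommended)", "📝 Free OCR")
#   print(text)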


with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR X TUL

        **💡 How to use:**
        1. **Upload an image** using the upload box.
        2. Select a **Resolution**. `Gundam` is recommended for most documents.
        3. Choose a **Task Type**:
            - **📝 Free OCR**: Extracts raw text from the image.
            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
            - **📊 Parse Figure**: Extracts structured data from charts and figures.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="⚙️ Resolution Size",
            )
            task_type = gr.Dropdown(
                choices=["📝 Free OCR", "📄 Convert to Markdown", "📊 Parse Figure"],
                value="📝 Free OCR",
                label="📋 Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)

    submit_btn.click(
        fn=process_ocr_task,
        inputs=[image_input, model_size, task_type],
        outputs=[output_text],
    )
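
# Note: share=True asks Gradio to create a temporary public *.gradio.live link
# in addition to the local server; set it to False to keep the demo local.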
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)