import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import os
import tempfile
from PIL import Image
# --------------------------------------------------------------
# Advanced monkey‑patch: force CPU + float32 everywhere
# --------------------------------------------------------------
# Keep original methods for later use
_original_to = torch.Tensor.to
_original_half = torch.Tensor.half
_original_bf16 = torch.Tensor.bfloat16

def _patched_to(self, *args, **kwargs):
    """
    Intercept .to(device, dtype) calls.
    - Any device that starts with 'cuda' becomes 'cpu'.
    - Any dtype that is torch.bfloat16 or torch.float16 becomes torch.float32.
    """
    # ---- device handling -------------------------------------------------
    if 'device' in kwargs:
        dev = str(kwargs['device'])
        if dev.startswith('cuda'):
            kwargs['device'] = 'cpu'
    else:
        # Positional args may contain a device string.
        new_args = []
        for a in args:
            if isinstance(a, str) and a.startswith('cuda'):
                new_args.append('cpu')
            else:
                new_args.append(a)
        args = tuple(new_args)
    # ---- dtype handling --------------------------------------------------
    if 'dtype' in kwargs and kwargs['dtype'] in (torch.bfloat16, torch.float16):
        kwargs['dtype'] = torch.float32
    else:
        # A positional dtype may appear after a device arg.
        new_args = list(args)
        for i, a in enumerate(new_args):
            if isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16):
                new_args[i] = torch.float32
        args = tuple(new_args)
    return _original_to(self, *args, **kwargs)

# Replace the methods: .half() and .bfloat16() now upcast to float32.
torch.Tensor.to = _patched_to
torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)      # .half() -> float32
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)  # .bfloat16() -> float32
# Keep .cuda() as a no-op that stays on the CPU.
torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
# --------------------------------------------------------------
# --- End Monkey-Patch ---
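
# Quick self-test of the patch above (a minimal sanity check; it only touches
# the patched tensor methods and is safe to remove).
_probe = torch.zeros(1)
assert _probe.to("cuda").device.type == "cpu", "patched .to() should redirect CUDA to CPU"
assert _probe.half().dtype is torch.float32, "patched .half() should upcast to float32"
assert _probe.cuda().device.type == "cpu", "patched .cuda() should stay on the CPU"
del _probe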
# --- 1. Load Model and Tokenizer (Done only once at startup) ---
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("✅ Model loaded successfully.")

# --- 2. Main Processing Function ---
def process_ocr_task(image, model_size, task_type):
    """
    Process an image with DeepSeek-OCR for any of the supported tasks
    and return the extracted text.
    """
    if image is None:
        return "Please upload an image first."

    # The monkey-patch above forces CPU + float32, so inference always runs
    # on the CPU even when CUDA is available.
    model_cpu = model.to("cpu", dtype=torch.float32)
    print("✅ Model is on CPU.")

    with tempfile.TemporaryDirectory() as output_path:
        if task_type == "📝 Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "📄 Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "📈 Parse Figure":
            prompt = "<image>\nParse the figure."
        else:
            prompt = "<image>\nFree OCR."

        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)
        # Map the chosen resolution to DeepSeek-OCR's size settings.
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
print(f"πŸƒ Running inference with prompt: {prompt}")
text_result = model_cpu.infer(
tokenizer,
prompt=prompt,
image_file=temp_image_path,
output_path=output_path,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
save_results=True,
test_compress=True,
eval_mode=True,
)
print(f"====\nπŸ“„ Text Result: {text_result}\n====")
return text_result
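
# Optional local smoke test, guarded by an environment variable so it never
# runs during normal Space startup. The file name "sample.png" is an
# assumption; point it at any test image on disk.
if os.environ.get("OCR_SMOKE_TEST"):
    sample_image = Image.open("sample.png")
    print(process_ocr_task(sample_image, "Gundam (Recommended)", "📝 Free OCR"))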

# --- 3. Build the Gradio Interface ---
with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR X TUL
        **💡 How to use:**
        1. **Upload an image** using the upload box.
        2. Select a **Resolution**. `Gundam` is recommended for most documents.
        3. Choose a **Task Type**:
           - **📝 Free OCR**: Extracts raw text from the image.
           - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
           - **📈 Parse Figure**: Extracts structured data from charts and figures.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
            task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"], value="📝 Free OCR", label="🚀 Task Type")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)

    submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type], outputs=[output_text])
# --- 4. Launch the App ---
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)