Spaces:

YagndeepKukadiya
/

Deepseek-OCR-TUL

Running

App Files Files Community

Deepseek-OCR-TUL / app.py

YagndeepKukadiya

Update app.py

a3a2414 verified 12 days ago

raw

history blame contribute delete

6.62 kB

	import gradio as gr
	import torch
	from transformers import AutoModel, AutoTokenizer
	import spaces
	import os
	import tempfile
	from PIL import Image, ImageDraw
	import re


	# --------------------------------------------------------------
	# Advanced monkey‑patch: force CPU + float32 everywhere
	# --------------------------------------------------------------
	import torch

	# Keep original methods for later use
	_original_to = torch.Tensor.to
	_original_half = torch.Tensor.half
	_original_bf16 = torch.Tensor.bfloat16

	def _patched_to(self, args, *kwargs):
	"""
	Intercept .to(device, dtype) calls.
	- Any device that starts with 'cuda' becomes 'cpu'.
	- Any dtype that is torch.bfloat16 or torch.float16 becomes torch.float32.
	"""
	# ---- device handling -------------------------------------------------
	if 'device' in kwargs:
	dev = str(kwargs['device'])
	if dev.startswith('cuda'):
	kwargs['device'] = 'cpu'
	else:
	# positional args may contain a device string
	new_args = []
	for a in args:
	if isinstance(a, str) and a.startswith('cuda'):
	new_args.append('cpu')
	else:
	new_args.append(a)
	args = tuple(new_args)

	# ---- dtype handling --------------------------------------------------
	if 'dtype' in kwargs and kwargs['dtype'] in (torch.bfloat16, torch.float16):
	kwargs['dtype'] = torch.float32
	else:
	# positional dtype may appear after a device arg
	new_args = list(args)
	for i, a in enumerate(new_args):
	if isinstance(a, torch.dtype) and a in (torch.bfloat16, torch.float16):
	new_args[i] = torch.float32
	args = tuple(new_args)

	return _original_to(self, args, *kwargs)

	# Replace the methods
	torch.Tensor.to = _patched_to
	torch.Tensor.half = lambda self, a, *k: self.to(torch.float32) # .half() → float32
	torch.Tensor.bfloat16 = lambda self, a, *k: self.to(torch.float32)

	# Also keep the old .cuda() as a no‑op (you already added this)
	torch.Tensor.cuda = lambda self, a, *k: self.to("cpu")
	# --------------------------------------------------------------
	# --- End Monkey-Patch ---


	# --- 1. Load Model and Tokenizer (Done only once at startup) ---
	print("Loading model and tokenizer...")
	model_name = "deepseek-ai/DeepSeek-OCR"
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	# Load the model to CPU
	model = AutoModel.from_pretrained(
	model_name,
	_attn_implementation="eager",
	trust_remote_code=True,
	use_safetensors=True,
	)
	model = model.eval()
	print("✅ Model loaded successfully.")

	# --- 2. Main Processing Function (UPDATED) ---
	def process_ocr_task(image, model_size, task_type):
	"""
	Processes an image with DeepSeek-OCR for all supported tasks.
	Now draws ALL detected bounding boxes for ANY task.
	"""
	if image is None:
	return "Please upload an image first.", None

	# 4️⃣ Move to GPU if available
	device = "cuda" if torch.cuda.is_available() else "cpu"
	#model_gpu = model.eval().to(device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32)
	model_cpu = model.to("cpu", dtype=torch.float32)

	if device == "cuda":
	print("✅ Model is on GPU.")
	else:
	print("✅ Model is on CPU.")

	with tempfile.TemporaryDirectory() as output_path:
	if task_type == "📝 Free OCR":
	prompt = "<image>\nFree OCR."
	elif task_type == "📄 Convert to Markdown":
	prompt = "<image>\n<\|grounding\|>Convert the document to markdown."
	elif task_type == "📈 Parse Figure":
	prompt = "<image>\nParse the figure."
	else:
	prompt = "<image>\nFree OCR."

	temp_image_path = os.path.join(output_path, "temp_image.png")
	image.save(temp_image_path)

	# Configure model size... (same as before)
	size_configs = {
	"Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
	"Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
	"Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
	"Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
	"Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
	}
	config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

	print(f"🏃 Running inference with prompt: {prompt}")
	text_result = model_cpu.infer(
	tokenizer,
	prompt=prompt,
	image_file=temp_image_path,
	output_path=output_path,
	base_size=config["base_size"],
	image_size=config["image_size"],
	crop_mode=config["crop_mode"],
	save_results=True,
	test_compress=True,
	eval_mode=True,
	)

	print(f"====\n📄 Text Result: {text_result}\n====")

	return text_result

	# --- 3. Build the Gradio Interface (UPDATED) ---
	with gr.Blocks(title="DeepSeek-OCR X (t)", theme=gr.themes.Monochrome()) as demo:
	gr.Markdown(
	"""
	# DeepSeek-OCR X TUL

	💡 How to use:
	1. Upload an image using the upload box.
	2. Select a Resolution. `Gundam` is recommended for most documents.
	3. Choose a Task Type:
	- 📝 Free OCR: Extracts raw text from the image.
	- 📄 Convert to Markdown: Converts the document into Markdown, preserving structure.
	- 📈 Parse Figure: Extracts structured data from charts and figures.
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
	model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
	task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"], value="📝 Free OCR", label="🚀 Task Type")
	submit_btn = gr.Button("Process Image", variant="primary")

	with gr.Column(scale=2):
	output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)

	submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type], outputs=[output_text])

	# --- 4. Launch the App ---
	if __name__ == "__main__":
	demo.queue(max_size=20).launch(share=True)