Spaces:

YagndeepKukadiya
/

Deepseek-OCR-TUL

Running

App Files Files Community

YagndeepKukadiya committed 13 days ago

Commit

7ee0b5a

verified ·

1 Parent(s): 8a0253d

Upload 2 files

Browse files

Added app.py & requirements.txt

Files changed (2) hide show

app.py +120 -0
requirements.txt +15 -0

app.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import gradio as gr
+import torch
+from transformers import AutoModel, AutoTokenizer
+import spaces
+import os
+import tempfile
+from PIL import Image, ImageDraw
+import re  # Regular expression library
+# --- 1. Load Model and Tokenizer (Done only once at startup) ---
+print("Loading model and tokenizer...")
+model_name = "deepseek-ai/DeepSeek-OCR"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Load the model to CPU first; it will be moved to GPU during processing
+model = AutoModel.from_pretrained(
+    model_name,
+    _attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+    use_safetensors=True,
+)
+model = model.eval()
+print("✅ Model loaded successfully.")
+# --- Helper function to find pre-generated result images ---
+def find_result_image(path):
+    for filename in os.listdir(path):
+        if "grounding" in filename or "result" in filename:
+            try:
+                image_path = os.path.join(path, filename)
+                return Image.open(image_path)
+            except Exception as e:
+                print(f"Error opening result image {filename}: {e}")
+    return None
+# --- 2. Main Processing Function (UPDATED for multi-bbox drawing) ---
+@spaces.GPU
+def process_ocr_task(image, model_size, task_type, ref_text):
+    """
+    Processes an image with DeepSeek-OCR for all supported tasks.
+    Now draws ALL detected bounding boxes for ANY task.
+    """
+    if image is None:
+        return "Please upload an image first.", None
+    print("🚀 Moving model to GPU...")
+    model_gpu = model.cuda().to(torch.bfloat16)
+    print("✅ Model is on GPU.")
+    with tempfile.TemporaryDirectory() as output_path:
+        # Build the prompt... (same as before)
+        if task_type == "📝 Free OCR":
+            prompt = "<image>\nFree OCR."
+        elif task_type == "📄 Convert to Markdown":
+            prompt = "<image>\n<|grounding|>Convert the document to markdown."
+        elif task_type == "📈 Parse Figure":
+            prompt = "<image>\nParse the figure."
+        else:
+            prompt = "<image>\nFree OCR."
+        temp_image_path = os.path.join(output_path, "temp_image.png")
+        image.save(temp_image_path)
+        # Configure model size... (same as before)
+        size_configs = {
+            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
+            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
+            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
+        }
+        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        print(f"🏃 Running inference with prompt: {prompt}")
+        text_result = model_gpu.infer(
+            tokenizer,
+            prompt=prompt,
+            image_file=temp_image_path,
+            output_path=output_path,
+            base_size=config["base_size"],
+            image_size=config["image_size"],
+            crop_mode=config["crop_mode"],
+            save_results=True,
+            test_compress=True,
+            eval_mode=True,
+        )
+        print(f"====\n📄 Text Result: {text_result}\n====")
+        return text_result
+# --- 3. Build the Gradio Interface (UPDATED) ---
+with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🐳 Full Demo of DeepSeek-OCR 🐳
+        **💡 How to use:**
+        1.  **Upload an image** using the upload box.
+        2.  Select a **Resolution**. `Gundam` is recommended for most documents.
+        3.  Choose a **Task Type**:
+            - **📝 Free OCR**: Extracts raw text from the image.
+            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
+            - **📈 Parse Figure**: Extracts structured data from charts and figures.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
+            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
+            task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"], value="📄 Convert to Markdown", label="🚀 Task Type")
+            submit_btn = gr.Button("Process Image", variant="primary")
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)
+            output_image = gr.Image(label="🖼️ Image Result (if any)", type="pil")
+# --- 4. Launch the App ---
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+torch==2.6.0
+transformers==4.46.3
+tokenizers==0.20.3
+einops
+addict
+easydict
+gradio>=4.0.0
+spaces>=0.20.0
+Pillow>=10.0.0
+safetensors>=0.4.0
+accelerate>=0.24.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0
+torchvision
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl