YagndeepKukadiya committed on
Commit
7ee0b5a
·
verified ·
1 Parent(s): 8a0253d

Upload 2 files

Browse files

Added app.py & requirements.txt

Files changed (2) hide show
  1. app.py +120 -0
  2. requirements.txt +15 -0
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
+ import spaces
5
+ import os
6
+ import tempfile
7
+ from PIL import Image, ImageDraw
8
+ import re # Import the regular expression library
9
+
10
# --- 1. Load Model and Tokenizer (Done only once at startup) ---
# Runs at import time so every request reuses the same tokenizer/model objects.
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
# trust_remote_code=True allows transformers to run the custom model code
# that ships inside the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU first; it will be moved to GPU during processing
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
# Inference only: switch off training-mode behaviour.
model = model.eval()
print("✅ Model loaded successfully.")
23
+
24
# --- Helper function to find pre-generated result images ---
def find_result_image(path):
    """Return the first readable result image found in ``path``.

    A file is a candidate when its name contains ``"grounding"`` or
    ``"result"``. Candidates that cannot be opened as an image are reported
    on stdout and skipped.

    Args:
        path: Directory to scan.

    Returns:
        A PIL image whose pixel data is already loaded into memory (so it
        stays usable after ``path`` is deleted), or ``None`` when no
        candidate could be opened.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.listdir order is arbitrary; sort so the chosen file is deterministic.
    for filename in sorted(os.listdir(path)):
        if "grounding" not in filename and "result" not in filename:
            continue
        image_path = os.path.join(path, filename)
        try:
            # Image.open is lazy and keeps the file open. copy() forces the
            # pixels to be read, and the context manager closes the handle,
            # so the returned image no longer depends on the file on disk.
            with Image.open(image_path) as opened:
                return opened.copy()
        except Exception as e:
            # Best effort: a non-image file whose name merely matches must
            # not abort the search.
            print(f"Error opening result image {filename}: {e}")
    return None
34
+
35
# --- 2. Main Processing Function ---
# Prompt sent to the model for each task label. The keys double as the
# choices of the "Task Type" dropdown, so the two can never drift apart.
_TASK_PROMPTS = {
    "📝 Free OCR": "<image>\nFree OCR.",
    "📄 Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown.",
    "📈 Parse Figure": "<image>\nParse the figure.",
}
# Used when the task label is not recognised.
_DEFAULT_PROMPT = "<image>\nFree OCR."

# Keyword arguments forwarded to ``model.infer`` for each resolution label.
# The keys double as the choices of the "Resolution Size" dropdown.
_SIZE_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
}
# Used when the resolution label is not recognised.
_DEFAULT_SIZE = "Gundam (Recommended)"


@spaces.GPU
def process_ocr_task(image, model_size, task_type, ref_text=""):
    """Run DeepSeek-OCR on one image and return its outputs.

    Args:
        image: PIL image to process, or ``None`` when nothing was uploaded.
        model_size: Resolution label; unknown labels fall back to
            ``"Gundam (Recommended)"``.
        task_type: Task label; unknown labels fall back to the free-OCR
            prompt.
        ref_text: Currently unused. Kept (now optional) so existing callers
            that pass four arguments keep working.

    Returns:
        A ``(text, image)`` tuple on every path. ``text`` is the value
        returned by ``model.infer`` (or a hint when no image was given).
        ``image`` is a result image found in the inference output directory,
        or ``None`` when there is none.
    """
    if image is None:
        return "Please upload an image first.", None

    print("🚀 Moving model to GPU...")
    model_gpu = model.cuda().to(torch.bfloat16)
    print("✅ Model is on GPU.")

    prompt = _TASK_PROMPTS.get(task_type, _DEFAULT_PROMPT)
    config = _SIZE_CONFIGS.get(model_size, _SIZE_CONFIGS[_DEFAULT_SIZE])

    result_image = None
    with tempfile.TemporaryDirectory() as output_path:
        # infer() takes a file path, so the upload is written to disk first.
        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)

        print(f"🏃 Running inference with prompt: {prompt}")
        text_result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

        print(f"====\n📄 Text Result: {text_result}\n====")

        # Pick up any result image before the temporary directory is removed.
        # copy() loads the pixels into memory and the context manager closes
        # the file handle, so the image stays valid after cleanup.
        found = find_result_image(output_path)
        if found is not None:
            with found:
                result_image = found.copy()

    return text_result, result_image


# --- 3. Build the Gradio Interface ---
with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🐳 Full Demo of DeepSeek-OCR 🐳

        **💡 How to use:**
        1. **Upload an image** using the upload box.
        2. Select a **Resolution**. `Gundam` is recommended for most documents.
        3. Choose a **Task Type**:
            - **📝 Free OCR**: Extracts raw text from the image.
            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
            - **📈 Parse Figure**: Extracts structured data from charts and figures.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_size = gr.Dropdown(choices=list(_SIZE_CONFIGS), value=_DEFAULT_SIZE, label="⚙️ Resolution Size")
            task_type = gr.Dropdown(choices=list(_TASK_PROMPTS), value="📄 Convert to Markdown", label="🚀 Task Type")
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)
            output_image = gr.Image(label="🖼️ Image Result (if any)", type="pil")

    # Without this listener the button does nothing. The two outputs match
    # the (text, image) tuple returned by process_ocr_task.
    submit_btn.click(
        fn=process_ocr_task,
        inputs=[image_input, model_size, task_type],
        outputs=[output_text, output_image],
    )
117
+
118
# --- 4. Launch the App ---
if __name__ == "__main__":
    # Enable request queuing (at most 20 waiting jobs) before starting the
    # server; share=True is passed through to launch() unchanged.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ transformers==4.46.3
3
+ tokenizers==0.20.3
4
+ einops
5
+ addict
6
+ easydict
7
+ gradio>=4.0.0
8
+ spaces>=0.20.0
9
+ Pillow>=10.0.0
10
+ safetensors>=0.4.0
11
+ accelerate>=0.24.0
12
+ sentencepiece>=0.1.99
13
+ protobuf>=3.20.0
14
+ torchvision
15
+ flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl