# Gradio demo: OCR on a single image with Nanonets-OCR-s, or on a sequence of page
# images (passed to the model as video frames) with Qwen3-VL-30B-A3B-Instruct.
import gradio as gr
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor, Qwen3VLMoeForConditionalGeneration
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
from datetime import datetime
import numpy as np
import os
import json
import tempfile
import zipfile


def array_to_image_path(image_array):
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    img.thumbnail((1024, 1024))

    # Generate a unique filename using timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"

    # Save the image
    img.save(filename)

    # Get the full path of the saved image
    full_path = os.path.abspath(filename)

    return full_path


models = {
    "nanonets/Nanonets-OCR-s": AutoModelForImageTextToText.from_pretrained(
        "nanonets/Nanonets-OCR-s",
        trust_remote_code=True,
        dtype="auto",
    ).cuda().eval(),
    # device_map="auto" already places the (possibly sharded) weights on the GPU,
    # so don't call .cuda() on top of it.
    "Qwen/Qwen3-VL-30B-A3B-Instruct": Qwen3VLMoeForConditionalGeneration.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct",
        trust_remote_code=True,
        dtype="auto",
        device_map="auto",
    ).eval(),
}

processors = {
    "nanonets/Nanonets-OCR-s": AutoProcessor.from_pretrained(
        "nanonets/Nanonets-OCR-s", trust_remote_code=True
    ),
    "Qwen/Qwen3-VL-30B-A3B-Instruct": AutoProcessor.from_pretrained(
        "Qwen/Qwen3-VL-30B-A3B-Instruct", trust_remote_code=True
    ),
}

DESCRIPTION = (
    "This demo uses [nanonets/Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) "
    "and [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)."
)

kwargs = {}
kwargs['dtype'] = torch.bfloat16

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"


@spaces.GPU
def run_example(
    image,
    model_id="nanonets/Nanonets-OCR-s",
    prompt="""Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes.""",
):
    image_path = array_to_image_path(image)

    model = models[model_id]
    processor = processors[model_id]

    image = Image.fromarray(image).convert("RGB")
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    ocr_text = output_text[0]
    return ocr_text, ocr_text
@spaces.GPU
def run_video(
    image_paths: list,
    model_id="Qwen/Qwen3-VL-30B-A3B-Instruct",
    prompt="""Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes.""",
):
    # The "Video Input" tab passes the textbox contents as a single string;
    # assume a comma-separated list of image URLs/paths and split it into a list.
    if isinstance(image_paths, str):
        image_paths = [p.strip() for p in image_paths.split(",") if p.strip()]

    # Request smaller IIIF renditions (400 px wide) instead of full-size images.
    image_paths = [p.replace('/full/full/', '/full/400,/') for p in image_paths]
    print('also image_paths:', image_paths)

    model = models[model_id]
    processor = processors[model_id]

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": image_paths,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, fps=1.0
    )
    images, videos, video_kwargs = process_vision_info(
        messages,
        image_patch_size=16,
        return_video_kwargs=True,
        return_video_metadata=True,
    )

    # Split the videos and their corresponding metadata
    if videos is not None:
        videos, video_metadatas = zip(*videos)
        videos, video_metadatas = list(videos), list(video_metadatas)
    else:
        video_metadatas = None

    inputs = processor(
        text=text,
        images=images,
        videos=videos,
        video_metadata=video_metadatas,
        return_tensors="pt",
        do_resize=False,
        **video_kwargs,
    )
    inputs = inputs.to("cuda")

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    ocr_text = output_text[0]
    return ocr_text, ocr_text


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    # State variable to store OCR results
    ocr_state = gr.State()

    with gr.Tab(label="Image Input", elem_classes="tabs"):
        with gr.Row():
            with gr.Column(elem_classes="input-container"):
                input_img = gr.Image(label="Input Picture", elem_classes="gr-image-input")
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="nanonets/Nanonets-OCR-s",
                    elem_classes="gr-dropdown",
                )
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")
                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
            with gr.Column(elem_classes="output-container"):
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        # The click handler also writes the OCR result into ocr_state
        submit_btn.click(
            run_example,
            inputs=[input_img, model_selector, prompt],
            outputs=[output_text, ocr_state],
        )

    with gr.Tab(label="Video Input", elem_classes="tabs"):
        with gr.Row():
            with gr.Column(elem_classes="input-container"):
                input_video = gr.Textbox(label="Input Video", elem_classes="gr-video-input")
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="Qwen/Qwen3-VL-30B-A3B-Instruct",
                    elem_classes="gr-dropdown",
                )
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", elem_classes="gr-textbox")
                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
            with gr.Column(elem_classes="output-container"):
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        # The click handler also writes the OCR result into ocr_state
        submit_btn.click(
            run_video,
            inputs=[input_video, model_selector, prompt],
            outputs=[output_text, ocr_state],
        )

demo.queue(api_open=False)
demo.launch(debug=True, show_error=True)