"""Gradio demo that runs the Qari OCR vision-language model on invoice images.

The processor and model are loaded once, at import time. Each button click
calls :func:`infer`, which is wrapped with ``spaces.GPU`` (Hugging Face
ZeroGPU), and shows both the raw model text and a best-effort JSON parse.

Environment variables:
    MODEL_NAME: model repository to load (defaults to the Qari OCR 2B model).
    MAX_NEW_TOKENS: generation budget per request (defaults to 1024).
"""

import os
import re
import json
import tempfile

# Third-party import order is kept exactly as in the original script.
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
import spaces  # Hugging Face ZeroGPU

MODEL_NAME = os.environ.get("MODEL_NAME", "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct")
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "1024"))

# ---- Device selection ----
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32
print(f"Device being used: {device}")

# ---- Load model & processor ----
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else {"": "cpu"},
)
print("Model loaded successfully!")


def _mk_messages(image_path: str, prompt_info: str) -> list:
    """Build the single-turn chat payload for one invoice image.

    Args:
        image_path: Filesystem path of the image; embedded as a ``file://`` URI.
        prompt_info: Optional free-text hints appended to the instructions.
            ``None`` is treated as an empty string so the prompt never
            contains the literal text "None".

    Returns:
        A one-element list holding a ``user`` message with an image part and
        a text part.
    """
    hints = prompt_info or ""
    # The final strip() removes surrounding whitespace, including any trailing
    # whitespace contributed by the hints themselves.
    prompt = (
        "You are an advanced invoice OCR system... \n"
        "(extractions instructions same as notebook) "
        f"Extra hints from user: {hints}"
    ).strip()
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]


def _extract_json(text: str) -> dict:
    """Return the JSON object embedded in *text*, or a fallback wrapper.

    Decoding starts at the first ``{`` and stops at the end of that object, so
    surrounding prose (including trailing text that itself contains braces)
    does not prevent extraction. Every input the previous first-brace-to-
    last-brace approach could parse yields the same result here.

    Args:
        text: Raw model output.

    Returns:
        The decoded object, or ``{"other_text": <stripped text>}`` when no
        object can be decoded starting at the first ``{``.
    """
    text = text.strip()
    start = text.find("{")
    if start != -1:
        try:
            parsed, _end = json.JSONDecoder().raw_decode(text, start)
        except (ValueError, RecursionError):
            # Malformed, truncated, or absurdly nested output: fall back below
            # instead of failing the whole request.
            pass
        else:
            return parsed
    return {"other_text": text}


@spaces.GPU(duration=120)  # Request ZeroGPU for 2 minutes
def infer(image: Image.Image, prompt_info: str):
    """Run OCR on one image and return the raw text plus parsed JSON.

    Args:
        image: Uploaded image, or ``None`` when nothing was uploaded.
        prompt_info: Optional extraction hints supplied by the user.

    Returns:
        A ``(text, parsed)`` tuple: the decoded model output and the result
        of :func:`_extract_json` on it. When *image* is ``None`` the tuple is
        ``("Please upload an image.", {})``.
    """
    if image is None:
        return "Please upload an image.", {}

    # Use a private temporary directory per request: a fixed file name in the
    # working directory would be shared (and overwritten) by concurrent
    # requests and would never be cleaned up.
    with tempfile.TemporaryDirectory() as workdir:
        tmp_path = os.path.join(workdir, "input_image.png")
        image.save(tmp_path)

        messages = _mk_messages(tmp_path, prompt_info)
        chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Close the reopened image deterministically once the processor has
        # converted it to tensors.
        with Image.open(tmp_path) as reloaded:
            inputs = processor(
                text=[chat_text],
                images=[reloaded],
                return_tensors="pt",
            )

    # Move tensor-like entries to the selected device; leave the rest as-is.
    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

    # Drop the prompt tokens so only newly generated tokens are decoded.
    gen_only = generated_ids[:, inputs["input_ids"].shape[1]:]
    text_out = processor.batch_decode(gen_only, skip_special_tokens=True)[0].strip()

    parsed = _extract_json(text_out)
    return text_out, parsed


with gr.Blocks(title="Qari OCR (ZeroGPU)") as demo:
    gr.Markdown("# Qari OCR · ZeroGPU\nUpload an invoice image and (optionally) add extraction hints.")
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Invoice Image")
            prompt_box = gr.Textbox(label="Extra hints (optional)")
            run_btn = gr.Button("Run OCR")
        with gr.Column():
            txt_out = gr.Textbox(label="Raw Model Output", lines=10)
            json_out = gr.JSON(label="Parsed JSON")

    run_btn.click(infer, inputs=[img_in, prompt_box], outputs=[txt_out, json_out])


if __name__ == "__main__":
    demo.launch()