import json
import os
import re
import tempfile

import gradio as gr
import spaces  # 👈 Hugging Face ZeroGPU
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Runtime configuration; both values can be overridden via environment variables.
MODEL_NAME = os.getenv("MODEL_NAME", "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))

# ---- Device selection ----
# float16 when CUDA is reported as available, float32 on CPU.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32

print(f"Device being used: {device}")

# ---- Load model & processor ----
# Both are loaded once at import time and shared by every call to `infer`.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    # Pin everything to CPU when no GPU is present; otherwise defer placement
    # to the "auto" device map.
    device_map={"": "cpu"} if device == "cpu" else "auto",
)

print("Model loaded successfully!")

def _mk_messages(image_path: str, prompt_info: str):
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": f"""
You are an advanced invoice OCR system...
(extractions instructions same as notebook)
Extra hints from user: {prompt_info}
""".strip()},
            ],
        }
    ]

def _extract_json(text: str):
    text = text.strip()
    if text.startswith("{") and text.endswith("}"):
        try:
            return json.loads(text)
        except Exception:
            pass
    m = re.search(r"\{[\s\S]*\}", text)
    if m:
        block = m.group(0)
        try:
            return json.loads(block)
        except Exception:
            pass
    return {"other_text": text}

@spaces.GPU(duration=120)  # 👈 Request ZeroGPU for 2 minutes
def infer(image: Image.Image, prompt_info: str):
    """Run OCR on one uploaded image and return the raw and parsed output.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        prompt_info: Optional extra extraction hints; ``None`` is treated as
            an empty string.

    Returns:
        A ``(text, parsed)`` tuple: the decoded model output, and the result
        of ``_extract_json`` on that output. When ``image`` is ``None`` the
        tuple is ``("Please upload an image.", {})``.
    """
    if image is None:
        return "Please upload an image.", {}

    # Avoid rendering the literal text "None" into the prompt.
    hints = prompt_info or ""

    # Each request gets its own scratch directory: a fixed filename in the
    # working directory would let concurrent requests overwrite each other's
    # image, and would never be cleaned up.
    with tempfile.TemporaryDirectory() as scratch_dir:
        tmp_path = os.path.join(scratch_dir, "input_image.png")
        image.save(tmp_path)

        messages = _mk_messages(tmp_path, hints)
        chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Image.open keeps the file handle until closed; the context manager
        # releases it before the scratch directory is removed.
        with Image.open(tmp_path) as reloaded:
            inputs = processor(
                text=[chat_text],
                images=[reloaded],
                return_tensors="pt",
            )

    # Move tensor entries onto the selected device; leave anything else as is.
    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated part is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    gen_only = generated_ids[:, prompt_len:]
    text_out = processor.batch_decode(gen_only, skip_special_tokens=True)[0].strip()
    parsed = _extract_json(text_out)

    return text_out, parsed

# ---- Gradio UI ----
# One row with two columns: the first holds the inputs (image, optional hints,
# run button), the second holds the outputs (raw model text and parsed JSON).
with gr.Blocks(title="Qari OCR (ZeroGPU)") as demo:
    gr.Markdown("# Qari OCR · ZeroGPU\nUpload an invoice image and (optionally) add extraction hints.")
    with gr.Row():
        with gr.Column():
            # type="pil" matches the PIL image parameter that `infer` expects.
            img_in = gr.Image(type="pil", label="Invoice Image")
            prompt_box = gr.Textbox(label="Extra hints (optional)")
            run_btn = gr.Button("Run OCR")
        with gr.Column():
            txt_out = gr.Textbox(label="Raw Model Output", lines=10)
            json_out = gr.JSON(label="Parsed JSON")

    # The two values returned by `infer` map, in order, onto the raw-text box
    # and the JSON viewer.
    run_btn.click(infer, inputs=[img_in, prompt_box], outputs=[txt_out, json_out])

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()