import os
import re

import numpy as np
import torch
from huggingface_hub import login
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor, pipeline

import draw_utils
import spaces

# Debug: list the names (not the values) of the available environment variables.
for variable_name in os.environ.keys():
    print(variable_name)

# Authenticate with the Hub so the gated Gemma weights can be downloaded.
login(token=os.environ.get('gemma_access_token'))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Gemma 3 vision-language pipeline: names the objects present in the image.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    # device="cuda:1",
    device_map=device,
    torch_dtype=torch.bfloat16,
)

# Grounding DINO: zero-shot detection of the labels proposed by Gemma.
model_id = "IDEA-Research/grounding-dino-tiny"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)


@spaces.GPU
def load_gdino(image):
    """Ask Gemma for the objects in `image` (a PIL image), then localize them with Grounding DINO."""
    messages = [
        {
            "role": "system",
            "content": [{
                "type": "text",
                "text": "Just give the list of objects in the given picture, separated by commas. Do not write anything else.",
            }],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "List the objects that you see in the given picture."},
                {"type": "image", "url": image},
            ],
        },
    ]

    output = pipe(text=messages, max_new_tokens=500)
    llm_response = output[0]["generated_text"][-1]["content"]
    print(llm_response)

    # Normalize the labels: lowercase and map person-like terms to "person".
    # Word boundaries prevent a plain substring replace of "man" from mangling
    # "woman" (or words such as "human").
    llm_response = llm_response.lower()
    for term in ('pedestrian', 'people', 'woman', 'man'):
        llm_response = re.sub(rf'\b{term}\b', 'person', llm_response)
    llm_labels = [label.strip() for label in llm_response.split(',') if label.strip()]
    print(llm_labels)
    llm_labels = [llm_labels]  # batch of one label list, as expected by the Grounding DINO processor

    inputs = processor(images=image, text=llm_labels, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        threshold=0.4,
        text_threshold=0.3,
        target_sizes=[image.size[::-1]],  # (height, width) of the input image
    )

    result = results[0]
    image = np.array(image)
    draw_results = [result["boxes"], result["scores"], result["labels"]]
    return draw_utils.visualize_detections(image, draw_results)
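

# --- Hypothetical wiring sketch, not part of the original file ---
# The @spaces.GPU decorator suggests this runs as a Hugging Face Spaces demo, which
# would normally expose load_gdino through a Gradio interface. The UI below is an
# assumption for illustration: the input type "pil" matches the PIL image the
# function expects, and the image output assumes draw_utils.visualize_detections
# returns an image array.
if __name__ == "__main__":
    import gradio as gr

    demo = gr.Interface(
        fn=load_gdino,
        inputs=gr.Image(type="pil", label="Input image"),
        outputs=gr.Image(label="Detected objects"),
        title="Gemma 3 + Grounding DINO object detection",
    )
    demo.launch()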