import os
import cv2
import json
from ultralytics import YOLO
import matplotlib.pyplot as plt
import torch
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoProcessor,
    LlavaForConditionalGeneration,
)
import requests
from huggingface_hub import hf_hub_download

device = "cuda" if torch.cuda.is_available() else "cpu"

# The Keras backend must be set before keras is imported.
os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import tensorflow as tf  # Or: from keras.utils import load_img, img_to_array

import threading
import subprocess
import glob
from io import BytesIO

import gradio as gr

# Import the classes stored inside the YOLO checkpoints so torch can
# deserialize them safely.
from ultralytics.nn.tasks import DetectionModel
from ultralytics.nn.modules.conv import Conv
from torch.nn import Sequential, Conv2d

torch.serialization.add_safe_globals([DetectionModel, Sequential, Conv, Conv2d])

# IMPORTING MODELS
description_model = "llava-hf/bakLlava-v1-hf"
print(f"Loading vision model: {description_model} (this should be much faster)...")
llava_model = LlavaForConditionalGeneration.from_pretrained(
    description_model,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    load_in_4bit=True,
)
llava_processor = AutoProcessor.from_pretrained(description_model)

repo_car = "thirarbi/vehicle_model"
car_file = "vehicle_model.keras"
car_model_path = hf_hub_download(repo_id=repo_car, filename=car_file)

repo_ethnicity = "thirarbi/ethnicity"
ethnicity_file = "ethnicity_model.keras"
ethnicity_model_path = hf_hub_download(repo_id=repo_ethnicity, filename=ethnicity_file)

repo_gender = "thirarbi/gender"
gender_file = "gender_model.keras"
gender_model_path = hf_hub_download(repo_id=repo_gender, filename=gender_file)

# Load the YOLO detectors and the fine-tuned Keras classifiers.
try:
    yolo_model = YOLO('yolo11n.pt')  # Using the base model as an example
    # general_model = YOLO('/content/yolov8n_plus_fruitsnveggies.pt')
    phone_model = YOLO('phone_model.pt')
    car_model = keras.saving.load_model(car_model_path)
    motorcycle_model = YOLO('motorcycle_model.pt')
    fruitsnveggy_model = YOLO('best_fruitveggy.pt')
    # Person.keras
    ethnicity_model = keras.saving.load_model(ethnicity_model_path)
    gender_model = keras.saving.load_model(gender_model_path)
    # age_model = coming soon
    yolo_model.to(device)
except Exception as e:
    print(f"Error loading model: {e}")
    exit()

# CLASS NAMES (OBJECT CLASSIFICATION MODELS)
car_class_names = [
    'Audi', 'Bentley', 'Benz', 'BMW', 'Cadillac', 'Dodge', 'Ferrari', 'Ford',
    'Ford Mustang', 'Kia', 'Lamborghini', 'Lexus', 'Maserati', 'Porsche',
    'Rolls-Royce', 'Tesla', 'Toyota', 'Alfa Romeo', 'Hyundai'
]
ethnicity_class_names = ['White', 'Black', 'Asian', 'Indian', 'Mexican']
gender_class_names = ['Male', 'Female']

# IMAGE PRE-PROCESSING
IMG_HEIGHT = 224
IMG_WIDTH = 224


def load_and_preprocess_image(img_path):
    """Loads an image from disk and preprocesses it for the 224x224 classifiers."""
    # 1. Load and Preprocess the Image
    # The image is loaded and resized to the target dimensions.
    img = tf.keras.utils.load_img(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
    img_array = tf.keras.utils.img_to_array(img)
    img_array = img_array / 255.0
    # The classifiers expect a shape of (1, 224, 224, 3).
    img_array = np.expand_dims(img_array, 0)
    return img_array


def preprocess_person_image(cropped_image):
    """
    Preprocesses a cropped person image for the ethnicity and gender models.

    Args:
        cropped_image: A PIL Image object of the cropped person.

    Returns:
        A numpy array of the preprocessed image with shape (1, 48, 48, 1).
""" # Convert to grayscale gray_image = cropped_image.convert('L') # Resize to 48x48 resized_image = gray_image.resize((48, 48)) # Convert to numpy array img_array = tf.keras.utils.img_to_array(resized_image) # Normalize pixel values (assuming the models were trained with normalized data) img_array = img_array / 255.0 # Add batch dimension img_array = np.expand_dims(img_array, 0) # Add channel dimension for grayscale (explicitly 1) img_array = np.expand_dims(img_array, -1) return img_array def car_pred(car_img_array): # 2. Make the Prediction img = car_img_array.resize((IMG_WIDTH, IMG_HEIGHT)) img_array = tf.keras.utils.img_to_array(img) img_array = img_array / 255.0 img_array = np.expand_dims(img_array, 0) # Create batch dimension # 3. Decode the Prediction # We find the index with the highest score and use it to get the brand name. predicted_index = np.argmax(prediction_scores) predicted_class_name = car_class_names[predicted_index] confidence_score = 100 * np.max(prediction_scores) return predicted_class_name #print(f"Confidence: {confidence_score:.2f}%") def person_pred(person_img_array): # Preprocess the cropped person image preprocessed_img = preprocess_person_image(person_img_array) # 2. Make the Prediction ethnicity_predictions = ethnicity_model.predict(preprocessed_img) ethnicity_prediction_scores = ethnicity_predictions[0] gender_predictions = gender_model.predict(preprocessed_img) gender_prediction_scores = gender_predictions[0] # 3. Decode the Prediction # We find the index with the highest score and use it to get the brand name. ethnicity_predicted_index = np.argmax(ethnicity_prediction_scores) ethnicity_predicted_class_name = ethnicity_class_names[ethnicity_predicted_index] ethnicity_confidence_score = 100 * np.max(ethnicity_prediction_scores) gender_predicted_index = np.argmax(gender_prediction_scores) gender_predicted_class_name = gender_class_names[gender_predicted_index] gender_confidence_score = 100 * np.max(gender_prediction_scores) return f"{ethnicity_predicted_class_name} {gender_predicted_class_name}" #print(f"Confidence: {confidence_score:.2f}%") def encode_image(filepath): """Encodes an image to a base64 string.""" with open(filepath, "rb") as image_file: return image_file.read() def final_description(image_file, text_prompt): try: raw_image = Image.open(image_file) prompt_template = f"USER: \n{text_prompt}\nASSISTANT:" inputs = llava_processor(prompt_template, images=raw_image, return_tensors="pt").to("cuda") # Ensure it runs on GPU output = llava_model.generate(**inputs, max_new_tokens=200, do_sample=False) response = llava_processor.decode(output[0], skip_special_tokens=True) assistant_response = response.split("ASSISTANT:")[-1].strip() return assistant_response except Exception as e: print(f"Error during Llava description generation: {e}") return "Sorry, I was unable to generate a description for this part of the image." 
# ======================= DETECTION & LOCALIZATION =======================
input_folder = "input_folder"
output_folder = "output_folder"
os.makedirs(output_folder, exist_ok=True)


def process_image(img_path, user_input, min_words):
    results = yolo_model(img_path)  # GENERAL OBJECT DETECTION
    annotated_img = results[0].plot()

    specialized_captions_list = []
    specialized_objects_list = []
    detections = []

    for i, box in enumerate(results[0].boxes):
        # Neat percentage
        cls = yolo_model.names[int(box.cls)]
        conf = round(float(box.conf) * 100, 2)

        # BOXING
        x1, y1, x2, y2 = map(float, box.xyxy[0])
        x1, y1, x2, y2 = round(x1, 2), round(y1, 2), round(x2, 2), round(y2, 2)
        detections.append({
            "label": cls,
            "confidence (%)": f"{conf}%",
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
        })

        # CROPPING
        # Open the image file inside the loop for each detected object
        i_image = Image.open(img_path)
        cropped = i_image.crop((int(x1), int(y1), int(x2), int(y2)))

        # Save the cropped image with a unique filename
        cropped_filename = f"{cls}_{i+1}.png"
        output_img_path = os.path.join(output_folder, cropped_filename)
        cropped.save(output_img_path)

        # SPECIALIZATION FILTER
        if cls == 'person':  # DONE
            prompt = ('Please describe the race and gender of this person, their outfit, '
                      'their emotion, what they are doing, and other details about them; '
                      'summarize it in 15 - 25 words.')
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            # specialized_objects_list.append(person_pred(cropped))
        elif cls == 'car':  # DONE
            car_brand = car_pred(cropped)
            prompt = f'Describe this {car_brand} car, its color, the details about it, with less than 15 words.'
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            specialized_objects_list.append(car_brand)
        elif cls == 'motorcycle':  # YOLO; just retrieve the class name.
            sub_results = motorcycle_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = motorcycle_model.names[class_id]  # Get the class name using the class ID
                # specialized_captions_list.append(f'[{i+1}] {specialized_caption(f"{class_name} motorcycle", cropped)}.')
                prompt = f'Describe the {class_name} motorcycle, its color, and the details about it, with less than 15 words.'
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        elif cls == 'cell phone':  # YOLO
            sub_results = phone_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = phone_model.names[class_id]  # Get the class name using the class ID
                prompt = f'Describe the {class_name} smartphone, its color, and the details about it, with less than 15 words.'
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        elif cls in ('banana', 'apple', 'carrot', 'broccoli'):  # fruits & vegetables, YOLO
            sub_results = fruitsnveggy_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = fruitsnveggy_model.names[class_id]  # Get the class name using the class ID
                prompt = ('Describe the vegetable(s) or fruit(s) in the picture: what they are, '
                          'how they are presented, and their details, with less than 15 words.')
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        else:
            prompt = f'Describe the object ({cls}) in this picture. Describe any details about it in less than 15 words.'
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            specialized_objects_list.append(cls)

    combined_captions = "\n".join(specialized_captions_list)  # list of LLaVA-aided specialized captions
    json_output = json.dumps(detections, indent=4)  # JSON report of all detections

    # The general prompt is built after the loop so it can include the
    # specialized captions as hints.
    general_prompt = f"""
Provide a highly detailed, long-form description of this image, suitable for a visually impaired person (include text you see in the image).
1. **Overall Summary:** A brief, one-paragraph overview of the entire scene.
2. **Foreground Elements:** Describe the objects, people, or elements closest to the viewer in meticulous detail. Mention their appearance, textures, and any actions they are performing.
3. **Midground and Background:** Detail the environment behind the main subject. Describe the setting, landscape, architecture, and any other distant elements.
4. **Atmosphere and Mood:** Analyze the image's lighting, color palette, and overall composition to describe the mood or feeling it evokes (e.g., joyful, somber, peaceful, chaotic).
5. **Hint:** The objects that are included in the picture are: {combined_captions}
6. In addition, address the following request: {user_input}
Ensure that your total description is at least {min_words} words.
"""

    general_caption = final_description(img_path, general_prompt)  # general caption
    caption = (f'List of Detected Objects: \n{combined_captions}\n'
               f'================== GENERAL CAPTION ==================\n{general_caption}')

    # results[0].plot() returns a BGR array; convert to RGB so PIL shows the
    # correct colors.
    annotated_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(annotated_rgb), caption, json_output


iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="filepath"),
        gr.Textbox(label="How do you want the image to be described?"),
        gr.Textbox(label="Minimum words?", value="300"),
    ],
    outputs=[
        gr.Image(label="Annotated Image"),
        gr.Textbox(label="Description of the image"),
        gr.JSON(label="Detected Objects (JSON)"),
    ],
    title="Image Object Detection & Captioning",
    description="Upload an image to detect objects, view bounding boxes, and get captions.",
)

iface.launch(share=True)
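# --- Optional: calling the pipeline without the Gradio UI (sketch) ----------
# A hedged example of programmatic use; it assumes a hypothetical local image
# "test.jpg" and would be run instead of (not after) the blocking launch above.
#
# annotated, caption, detections_json = process_image(
#     "test.jpg",                      # hypothetical input image path
#     "Focus on the people.",          # extra user request
#     "300",                           # minimum word count (string, as in the UI)
# )
# annotated.save("annotated_test.jpg")
# print(caption)
# print(detections_json)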