import os
import cv2
import json
from ultralytics import YOLO
import matplotlib.pyplot as plt
import torch
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoProcessor,
    LlavaForConditionalGeneration,
)
import requests
from huggingface_hub import hf_hub_download

device = "cuda" if torch.cuda.is_available() else "cpu"

# The Keras backend must be set before keras is imported.
os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import tensorflow as tf  # Or: from keras.utils import load_img, img_to_array

import threading
import subprocess
import glob
from io import BytesIO

import gradio as gr

# Import the classes stored inside the YOLO checkpoints so torch can
# deserialize them safely.
from ultralytics.nn.tasks import DetectionModel
from ultralytics.nn.modules.conv import Conv
from torch.nn import Sequential, Conv2d

torch.serialization.add_safe_globals([DetectionModel, Sequential, Conv, Conv2d])

# IMPORTING MODELS
description_model = "llava-hf/bakLlava-v1-hf"
print(f"Loading vision model: {description_model} (this should be much faster)...")
llava_model = LlavaForConditionalGeneration.from_pretrained(
    description_model,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    load_in_4bit=True,
)
llava_processor = AutoProcessor.from_pretrained(description_model)

repo_car = "thirarbi/vehicle_model"
car_file = "vehicle_model.keras"
car_model_path = hf_hub_download(repo_id=repo_car, filename=car_file)

repo_ethnicity = "thirarbi/ethnicity"
ethnicity_file = "ethnicity_model.keras"
ethnicity_model_path = hf_hub_download(repo_id=repo_ethnicity, filename=ethnicity_file)

repo_gender = "thirarbi/gender"
gender_file = "gender_model.keras"
gender_model_path = hf_hub_download(repo_id=repo_gender, filename=gender_file)

# Load the YOLO detectors and the fine-tuned Keras classifiers.
try:
    yolo_model = YOLO('yolo11n.pt')  # Using the base model as an example
    # general_model = YOLO('/content/yolov8n_plus_fruitsnveggies.pt')
    phone_model = YOLO('phone_model.pt')
    car_model = keras.saving.load_model(car_model_path)
    motorcycle_model = YOLO('motorcycle_model.pt')
    fruitsnveggy_model = YOLO('best_fruitveggy.pt')
    # Person.keras
    ethnicity_model = keras.saving.load_model(ethnicity_model_path)
    gender_model = keras.saving.load_model(gender_model_path)
    # age_model = coming soon
    yolo_model.to(device)
except Exception as e:
    print(f"Error loading model: {e}")
    exit()

# CLASS NAMES (OBJECT CLASSIFICATION MODELS)
car_class_names = [
    'Audi', 'Bentley', 'Benz', 'BMW', 'Cadillac', 'Dodge', 'Ferrari', 'Ford',
    'Ford Mustang', 'Kia', 'Lamborghini', 'Lexus', 'Maserati', 'Porsche',
    'Rolls-Royce', 'Tesla', 'Toyota', 'Alfa Romeo', 'Hyundai'
]
ethnicity_class_names = ['White', 'Black', 'Asian', 'Indian', 'Mexican']
gender_class_names = ['Male', 'Female']

# IMAGE PRE-PROCESSING
IMG_HEIGHT = 224
IMG_WIDTH = 224


def load_and_preprocess_image(img_path):
    """Loads an image from disk and preprocesses it for the 224x224 classifiers."""
    # 1. Load and Preprocess the Image
    # The image is loaded and resized to the target dimensions.
    img = tf.keras.utils.load_img(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
    img_array = tf.keras.utils.img_to_array(img)
    img_array = img_array / 255.0
    # The classifiers expect a shape of (1, 224, 224, 3).
    img_array = np.expand_dims(img_array, 0)
    return img_array


def preprocess_person_image(cropped_image):
    """
    Preprocesses a cropped person image for the ethnicity and gender models.

    Args:
        cropped_image: A PIL Image object of the cropped person.

    Returns:
        A numpy array of the preprocessed image with shape (1, 48, 48, 1).
""" # Convert to grayscale gray_image = cropped_image.convert('L') # Resize to 48x48 resized_image = gray_image.resize((48, 48)) # Convert to numpy array img_array = tf.keras.utils.img_to_array(resized_image) # Normalize pixel values (assuming the models were trained with normalized data) img_array = img_array / 255.0 # Add batch dimension img_array = np.expand_dims(img_array, 0) # Add channel dimension for grayscale (explicitly 1) img_array = np.expand_dims(img_array, -1) return img_array def car_pred(car_img_array): # 2. Make the Prediction img = car_img_array.resize((IMG_WIDTH, IMG_HEIGHT)) img_array = tf.keras.utils.img_to_array(img) img_array = img_array / 255.0 img_array = np.expand_dims(img_array, 0) # Create batch dimension # 3. Decode the Prediction # We find the index with the highest score and use it to get the brand name. predicted_index = np.argmax(prediction_scores) predicted_class_name = car_class_names[predicted_index] confidence_score = 100 * np.max(prediction_scores) return predicted_class_name #print(f"Confidence: {confidence_score:.2f}%") def person_pred(person_img_array): # Preprocess the cropped person image preprocessed_img = preprocess_person_image(person_img_array) # 2. Make the Prediction ethnicity_predictions = ethnicity_model.predict(preprocessed_img) ethnicity_prediction_scores = ethnicity_predictions[0] gender_predictions = gender_model.predict(preprocessed_img) gender_prediction_scores = gender_predictions[0] # 3. Decode the Prediction # We find the index with the highest score and use it to get the brand name. ethnicity_predicted_index = np.argmax(ethnicity_prediction_scores) ethnicity_predicted_class_name = ethnicity_class_names[ethnicity_predicted_index] ethnicity_confidence_score = 100 * np.max(ethnicity_prediction_scores) gender_predicted_index = np.argmax(gender_prediction_scores) gender_predicted_class_name = gender_class_names[gender_predicted_index] gender_confidence_score = 100 * np.max(gender_prediction_scores) return f"{ethnicity_predicted_class_name} {gender_predicted_class_name}" #print(f"Confidence: {confidence_score:.2f}%") def encode_image(filepath): """Encodes an image to a base64 string.""" with open(filepath, "rb") as image_file: return image_file.read() def final_description(image_file, text_prompt): try: raw_image = Image.open(image_file) prompt_template = f"USER: \n{text_prompt}\nASSISTANT:" inputs = llava_processor(prompt_template, images=raw_image, return_tensors="pt").to("cuda") # Ensure it runs on GPU output = llava_model.generate(**inputs, max_new_tokens=200, do_sample=False) response = llava_processor.decode(output[0], skip_special_tokens=True) assistant_response = response.split("ASSISTANT:")[-1].strip() return assistant_response except Exception as e: print(f"Error during Llava description generation: {e}") return "Sorry, I was unable to generate a description for this part of the image." 
# ======================= DETECTION & LOCALIZATION =======================
input_folder = "input_folder"
output_folder = "output_folder"
os.makedirs(output_folder, exist_ok=True)


def process_image(img_path, user_input, min_words):
    results = yolo_model(img_path)  # GENERAL OBJECT DETECTION
    annotated_img = results[0].plot()

    specialized_captions_list = []
    specialized_objects_list = []
    detections = []

    for i, box in enumerate(results[0].boxes):
        # Neat percentage
        cls = yolo_model.names[int(box.cls)]
        conf = round(float(box.conf) * 100, 2)

        # BOXING
        x1, y1, x2, y2 = map(float, box.xyxy[0])
        x1, y1, x2, y2 = round(x1, 2), round(y1, 2), round(x2, 2), round(y2, 2)
        detections.append({
            "label": cls,
            "confidence (%)": f"{conf}%",
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
        })

        # CROPPING
        # Open the image file inside the loop for each detected object
        i_image = Image.open(img_path)
        cropped = i_image.crop((int(x1), int(y1), int(x2), int(y2)))

        # Save the cropped image with a unique filename
        cropped_filename = f"{cls}_{i+1}.png"
        output_img_path = os.path.join(output_folder, cropped_filename)
        cropped.save(output_img_path)

        # SPECIALIZATION FILTER
        if cls == 'person':  # DONE
            prompt = ('Please describe the race and gender of this person, their outfit, '
                      'their emotion, what they are doing, and other details about them; '
                      'summarize it in 15 - 25 words.')
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            # specialized_objects_list.append(person_pred(cropped))
        elif cls == 'car':  # DONE
            car_brand = car_pred(cropped)
            prompt = f'Describe this {car_brand} car, its color, the details about it, with less than 15 words.'
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            specialized_objects_list.append(car_brand)
        elif cls == 'motorcycle':  # YOLO; just retrieve the class name.
            sub_results = motorcycle_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = motorcycle_model.names[class_id]  # Get the class name using the class ID
                # specialized_captions_list.append(f'[{i+1}] {specialized_caption(f"{class_name} motorcycle", cropped)}.')
                prompt = f'Describe the {class_name} motorcycle, its color, and the details about it, with less than 15 words.'
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        elif cls == 'cell phone':  # YOLO
            sub_results = phone_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = phone_model.names[class_id]  # Get the class name using the class ID
                prompt = f'Describe the {class_name} smartphone, its color, and the details about it, with less than 15 words.'
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        elif cls in ('banana', 'apple', 'carrot', 'broccoli'):  # fruits & vegetables, YOLO
            sub_results = fruitsnveggy_model(cropped)
            for sub_box in sub_results[0].boxes:
                class_id = int(sub_box.cls[0])
                class_name = fruitsnveggy_model.names[class_id]  # Get the class name using the class ID
                prompt = ('Describe the vegetable(s) or fruit(s) in the picture: what they are, '
                          'how they are presented, and their details, with less than 15 words.')
                specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
                specialized_objects_list.append(class_name)
        else:
            prompt = f'Describe the object ({cls}) in this picture. Describe any details about it in less than 15 words.'
            specialized_captions_list.append(f'[{i+1}] {final_description(output_img_path, prompt)}')
            specialized_objects_list.append(cls)

    combined_captions = "\n".join(specialized_captions_list)  # list of LLaVA-aided specialized captions
    json_output = json.dumps(detections, indent=4)  # JSON report of all detections

    # The general prompt is built after the loop so it can include the
    # specialized captions as hints.
    general_prompt = f"""
Provide a highly detailed, long-form description of this image, suitable for a visually impaired person (include text you see in the image).
1. **Overall Summary:** A brief, one-paragraph overview of the entire scene.
2. **Foreground Elements:** Describe the objects, people, or elements closest to the viewer in meticulous detail. Mention their appearance, textures, and any actions they are performing.
3. **Midground and Background:** Detail the environment behind the main subject. Describe the setting, landscape, architecture, and any other distant elements.
4. **Atmosphere and Mood:** Analyze the image's lighting, color palette, and overall composition to describe the mood or feeling it evokes (e.g., joyful, somber, peaceful, chaotic).
5. **Hint:** The objects that are included in the picture are: {combined_captions}
6. In addition, address the following request: {user_input}
Ensure that your total description is at least {min_words} words.
"""

    general_caption = final_description(img_path, general_prompt)  # general caption
    caption = (f'List of Detected Objects: \n{combined_captions}\n'
               f'================== GENERAL CAPTION ==================\n{general_caption}')

    # results[0].plot() returns a BGR array; convert to RGB so PIL shows the
    # correct colors.
    annotated_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(annotated_rgb), caption, json_output


iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="filepath"),
        gr.Textbox(label="How do you want the image to be described?"),
        gr.Textbox(label="Minimum words?", value="300"),
    ],
    outputs=[
        gr.Image(label="Annotated Image"),
        gr.Textbox(label="Description of the image"),
        gr.JSON(label="Detected Objects (JSON)"),
    ],
    title="Image Object Detection & Captioning",
    description="Upload an image to detect objects, view bounding boxes, and get captions.",
)

iface.launch(share=True)
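# --- Optional: calling the pipeline without the Gradio UI (sketch) ----------
# A hedged example of programmatic use; it assumes a hypothetical local image
# "test.jpg" and would be run instead of (not after) the blocking launch above.
#
# annotated, caption, detections_json = process_image(
#     "test.jpg",                      # hypothetical input image path
#     "Focus on the people.",          # extra user request
#     "300",                           # minimum word count (string, as in the UI)
# )
# annotated.save("annotated_test.jpg")
# print(caption)
# print(detections_json)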