import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline
# Load pipelines
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
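# Each detection from the object-detection pipeline is a dict of the form
# {"score": float, "label": str, "box": {"xmin", "ymin", "xmax", "ymax"}};
# the helper functions below assume this shape.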
# Helper to compute Intersection over Union (IoU) between one box and an
# array of candidate boxes
def compute_iou(box1, boxes):
    x1 = np.maximum(box1['xmin'], boxes[:, 0])
    y1 = np.maximum(box1['ymin'], boxes[:, 1])
    x2 = np.minimum(box1['xmax'], boxes[:, 2])
    y2 = np.minimum(box1['ymax'], boxes[:, 3])
    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union = box1_area + boxes_area - intersection
    return intersection / union
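# Illustrative check: boxes (0, 0, 10, 10) and (5, 5, 15, 15) overlap in a
# 5x5 patch, so IoU = 25 / (100 + 100 - 25) ≈ 0.14.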
# Non-Maximum Suppression (NMS): greedily keep the highest-scoring detection,
# then drop every remaining detection that overlaps it above the IoU threshold
def nms(detections, iou_threshold=0.5):
    if len(detections) == 0:
        return []
    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
    scores = np.array([d['score'] for d in detections])
    indices = np.argsort(scores)[::-1]
    keep = []
    while len(indices) > 0:
        current = indices[0]
        keep.append(current)
        rest = indices[1:]
        ious = compute_iou({
            'xmin': boxes[current, 0],
            'ymin': boxes[current, 1],
            'xmax': boxes[current, 2],
            'ymax': boxes[current, 3]
        }, boxes[rest])
        indices = rest[np.where(ious < iou_threshold)[0]]
    return [detections[i] for i in keep]
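# Illustrative check: two 'cat' detections at (0, 0, 10, 10) with score 0.9
# and (1, 1, 11, 11) with score 0.6 have IoU = 81 / 119 ≈ 0.68 > 0.5,
# so nms() keeps only the higher-scoring one.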
# Function to generate audio from text
def generate_audio(text):
    narrated_text = narrator(text)
    # The TTS pipeline returns a dict with the waveform under "audio" and its
    # "sampling_rate"; the first row of "audio" is the mono waveform
    wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
    return "output.wav"
# Function to read and summarize detected objects
def read_objects(detection_objects):
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1
    # Avoid narrating the dangling sentence "This picture contains."
    if not object_counts:
        return "This picture contains no recognizable objects."
    response = "This picture contains"
    labels = list(object_counts.keys())
    for i, label in enumerate(labels):
        response += f" {object_counts[label]} {label}"
        if object_counts[label] > 1:
            response += "s"
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"
    response += "."
    return response
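# Illustrative check: detections labelled ['cat', 'cat', 'dog'] yield
# "This picture contains 2 cats and 1 dog."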
# Function to draw bounding boxes on the image
def draw_bounding_boxes(image, detections):
    draw_image = image.copy()
    draw = ImageDraw.Draw(draw_image)
    font = ImageFont.load_default()
    for detection in detections:
        box = detection['box']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
        label = detection['label']
        score = detection['score']
        text = f"{label}: {score:.2f}"
        # Draw a filled background behind the label so the text stays readable
        text_size = draw.textbbox((xmin, ymin), text, font=font)
        draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
        draw.text((xmin, ymin), text, fill="white", font=font)
    return draw_image
# Main function to process the image
def detect_object(image):
    detections = object_detector(image)
    # Apply confidence threshold and NMS
    confidence_threshold = 0.5
    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
    filtered_detections = nms(filtered_detections)
    processed_image = draw_bounding_boxes(image, filtered_detections)
    description_text = read_objects(filtered_detections)
    processed_audio = generate_audio(description_text)
    return processed_image, processed_audio
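# Local sanity check (hypothetical path, for manual testing only):
#   annotated, wav_path = detect_object(Image.open("example.jpg"))
#   annotated.save("annotated.jpg")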
| description_text = """ | |
| Upload an image to detect objects and hear a natural language description. | |
| ### Credits: | |
| Developed by Taizun S | |
| """ | |
# Google Analytics script
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-WEYXHDZ3GQ');
</script>
"""
# Use Gradio Blocks to organize the layout
with gr.Blocks() as demo:
    gr.HTML(ga_script)  # Inject the Google Analytics script
    gr.Markdown(description_text)  # Add the description as Markdown
    # Define the Interface components within Blocks
    gr.Interface(
        fn=detect_object,
        inputs=gr.Image(label="Upload an Image", type="pil"),
        outputs=[
            gr.Image(label="Processed Image", type="pil"),
            gr.Audio(label="Generated Audio")
        ],
        title="Multi-Object Detection with Audio Narration",
    )
# Launch the Blocks interface
demo.launch()