Update app.py #1
by aleehassan - opened

app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+import numpy as np
 import scipy.io.wavfile as wavfile
 from transformers import pipeline
 
@@ -7,6 +8,45 @@ from transformers import pipeline
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
 
+# Function to apply Non-Maximum Suppression (NMS)
+def compute_iou(box1, boxes):
+    x1 = np.maximum(box1['xmin'], boxes[:, 0])
+    y1 = np.maximum(box1['ymin'], boxes[:, 1])
+    x2 = np.minimum(box1['xmax'], boxes[:, 2])
+    y2 = np.minimum(box1['ymax'], boxes[:, 3])
+
+    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
+    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+    union = box1_area + boxes_area - intersection
+    return intersection / union
+
+def nms(detections, iou_threshold=0.5):
+    if len(detections) == 0:
+        return []
+
+    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
+    scores = np.array([d['score'] for d in detections])
+    indices = np.argsort(scores)[::-1]
+
+    keep = []
+    while len(indices) > 0:
+        current = indices[0]
+        keep.append(current)
+        rest = indices[1:]
+
+        ious = compute_iou({
+            'xmin': boxes[current, 0],
+            'ymin': boxes[current, 1],
+            'xmax': boxes[current, 2],
+            'ymax': boxes[current, 3]
+        }, boxes[rest])
+
+        indices = rest[np.where(ious < iou_threshold)[0]]
+
+    return [detections[i] for i in keep]
+
 # Function to generate audio from text
 def generate_audio(text):
     narrated_text = narrator(text)
@@ -46,7 +86,7 @@ def draw_bounding_boxes(image, detections):
 
         label = detection['label']
         score = detection['score']
-        text = f"{label} {score:.2f}"
+        text = f"{label}: {score:.2f}"
         text_size = draw.textbbox((xmin, ymin), text, font=font)
         draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
         draw.text((xmin, ymin), text, fill="white", font=font)
@@ -56,27 +96,30 @@ def draw_bounding_boxes(image, detections):
 # Main function to process the image
 def detect_object(image):
     detections = object_detector(image)
-    processed_image = draw_bounding_boxes(image, detections)
-    description_text = read_objects(detections)
+
+    # Apply confidence threshold and NMS
+    confidence_threshold = 0.5
+    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
+    filtered_detections = nms(filtered_detections)
+
+    processed_image = draw_bounding_boxes(image, filtered_detections)
+    description_text = read_objects(filtered_detections)
     processed_audio = generate_audio(description_text)
     return processed_image, processed_audio
 
-
 description_text = """
 Upload an image to detect objects and hear a natural language description.
-
 ### Credits:
 Developed by Taizun S
 """
 
-#
+# Google Analytics script
 ga_script = """
 <script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
 <script>
 window.dataLayer = window.dataLayer || [];
 function gtag(){dataLayer.push(arguments);}
 gtag('js', new Date());
-
 gtag('config', 'G-WEYXHDZ3GQ');
 </script>
 """
@@ -99,4 +142,3 @@ with gr.Blocks() as demo:
 
 # Launch the Blocks interface
 demo.launch()
-
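
As a sanity check on the new helpers, here is a minimal standalone sketch: it copies compute_iou and nms exactly as added in this diff and runs them on toy detections in the dict format returned by the transformers object-detection pipeline. The boxes, labels, and scores are hypothetical, made up for illustration, not taken from the app.

import numpy as np

# compute_iou and nms copied from the diff above
def compute_iou(box1, boxes):
    # Intersection rectangle between box1 (dict) and each row of boxes (N x 4 array)
    x1 = np.maximum(box1['xmin'], boxes[:, 0])
    y1 = np.maximum(box1['ymin'], boxes[:, 1])
    x2 = np.minimum(box1['xmax'], boxes[:, 2])
    y2 = np.minimum(box1['ymax'], boxes[:, 3])

    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    union = box1_area + boxes_area - intersection
    return intersection / union

def nms(detections, iou_threshold=0.5):
    if len(detections) == 0:
        return []

    boxes = np.array([[d['box']['xmin'], d['box']['ymin'],
                       d['box']['xmax'], d['box']['ymax']] for d in detections])
    scores = np.array([d['score'] for d in detections])
    indices = np.argsort(scores)[::-1]  # highest-scoring detection first

    keep = []
    while len(indices) > 0:
        current = indices[0]
        keep.append(current)
        rest = indices[1:]

        # Drop every remaining box that overlaps the kept one too much
        ious = compute_iou({'xmin': boxes[current, 0], 'ymin': boxes[current, 1],
                            'xmax': boxes[current, 2], 'ymax': boxes[current, 3]},
                           boxes[rest])
        indices = rest[np.where(ious < iou_threshold)[0]]

    return [detections[i] for i in keep]

# Hypothetical detections: two overlapping 'cat' boxes plus one separate 'dog' box
detections = [
    {'label': 'cat', 'score': 0.95, 'box': {'xmin': 10, 'ymin': 10, 'xmax': 110, 'ymax': 110}},
    {'label': 'cat', 'score': 0.60, 'box': {'xmin': 20, 'ymin': 20, 'xmax': 115, 'ymax': 115}},
    {'label': 'dog', 'score': 0.90, 'box': {'xmin': 200, 'ymin': 50, 'xmax': 300, 'ymax': 150}},
]
print([d['label'] for d in nms(detections)])  # -> ['cat', 'dog']

The two 'cat' boxes overlap with an IoU of about 0.74, above the default threshold of 0.5, so the lower-scoring duplicate is suppressed, while the non-overlapping 'dog' box (IoU 0) survives.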