Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,32 +2,14 @@ import gradio as gr
|
|
2 |
from PIL import Image, ImageDraw, ImageFont
|
3 |
import scipy.io.wavfile as wavfile
|
4 |
|
5 |
-
|
6 |
-
# Use a pipeline as a high-level helper
|
7 |
from transformers import pipeline
|
8 |
|
9 |
-
# model_path = ("../Models/models--facebook--detr-resnet-50/snapshots"
|
10 |
-
# "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
|
11 |
-
#
|
12 |
-
# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
|
13 |
-
# "/3bcb8321394f671bd948ebf0d086d694dda95464")
|
14 |
-
|
15 |
-
|
16 |
narrator = pipeline("text-to-speech",
|
17 |
model="kakao-enterprise/vits-ljs")
|
18 |
|
19 |
object_detector = pipeline("object-detection",
|
20 |
model="facebook/detr-resnet-50")
|
21 |
|
22 |
-
# object_detector = pipeline("object-detection",
|
23 |
-
# model=model_path)
|
24 |
-
#
|
25 |
-
# narrator = pipeline("text-to-speech",
|
26 |
-
# model=tts_model_path)
|
27 |
-
|
28 |
-
# [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
|
29 |
-
|
30 |
-
# Define the function to generate audio from text
|
31 |
def generate_audio(text):
|
32 |
# Generate the narrated text
|
33 |
narrated_text = narrator(text)
|
@@ -39,10 +21,6 @@ def generate_audio(text):
|
|
39 |
# Return the path to the saved audio file
|
40 |
return "output.wav"
|
41 |
|
42 |
-
# Could you please write me a Python script that will take a list of detection objects as input and give a response that includes all the object labels provided in the input. For example, if the input is like this: [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
|
43 |
-
# The output should be: "This picture contains 1 person and 1 dog." If there are multiple objects, do not add 'and' between every pair of objects; 'and' should appear only before the last one.
|
44 |
-
|
45 |
-
|
46 |
def read_objects(detection_objects):
|
47 |
# Initialize counters for each object label
|
48 |
object_counts = {}
|
@@ -136,6 +114,5 @@ def detect_object(image):
|
|
136 |
demo = gr.Interface(fn=detect_object,
|
137 |
inputs=[gr.Image(label="Select Image",type="pil")],
|
138 |
outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
|
139 |
-
title="@GenAILearniverse Project 7: Object Detector with Audio",
|
140 |
description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
|
141 |
demo.launch()
|
|
|
2 |
from PIL import Image, ImageDraw, ImageFont
|
3 |
import scipy.io.wavfile as wavfile
|
4 |
|
|
|
|
|
5 |
from transformers import pipeline
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
narrator = pipeline("text-to-speech",
|
8 |
model="kakao-enterprise/vits-ljs")
|
9 |
|
10 |
object_detector = pipeline("object-detection",
|
11 |
model="facebook/detr-resnet-50")
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def generate_audio(text):
|
14 |
# Generate the narrated text
|
15 |
narrated_text = narrator(text)
|
|
|
21 |
# Return the path to the saved audio file
|
22 |
return "output.wav"
|
23 |
|
|
|
|
|
|
|
|
|
24 |
def read_objects(detection_objects):
|
25 |
# Initialize counters for each object label
|
26 |
object_counts = {}
|
|
|
114 |
demo = gr.Interface(fn=detect_object,
|
115 |
inputs=[gr.Image(label="Select Image",type="pil")],
|
116 |
outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
|
|
|
117 |
description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
|
118 |
demo.launch()
|