Ankitajadhav
committed on
Update app.py
app.py
CHANGED
@@ -86,3 +86,112 @@ collection_text.add(
     ids=loaded_ids
 )
 
+# Initialize the transcriber
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device='cuda')
+
+# Preload TTS models
+preload_models()
+
+image_path = "dom_bremen.jpg"
+absolute_path = os.path.abspath(image_path)
+
+def transcribe(audio):
+    sr, y = audio
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+    return transcriber({"sampling_rate": sr, "raw": y})["text"]
+
+fixed_prompt = "en_speaker_5"
+
+def generate_audio_output(text):
+    audio_arr = generate_audio(text, history_prompt=fixed_prompt)
+    audio_arr = (audio_arr * 32767).astype(np.int16)
+    return (SAMPLE_RATE, audio_arr)
+
+# Function to retrieve and generate text based on input query
+def generate_text(message, max_tokens=150, temperature=0.2, top_p=0.9):
+    try:
+        # Retrieve context and image from vector store
+        retrieved_image = collection_images.query(query_texts=message, include=['data'], n_results=1)
+        context_text = collection_text.query(query_texts=message, n_results=1)
+
+        context = context_text['documents'][0] if context_text else "No relevant context found."
+        image_data = retrieved_image['uris'][0] if retrieved_image else None
+        image_url = image_data if image_data else None
+
+        # Log the image URL for debugging
+        print(f"Retrieved image URL: {image_url}")
+
+        # Create prompt template for LLM
+        prompt_template = (
+            f"Context: {context}\n\n"
+            f"Question: {message}\n\n"
+            f"You are a guide to city of Bremen from Germany, generate response based on context."
+        )
+
+        # Generate text using the language model
+        output = llm(
+            prompt_template,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=50,
+            repeat_penalty=1.1,
+            max_tokens=max_tokens,
+        )
+
+        # Process the output
+        input_string = output['choices'][0]['text'].strip()
+        cleaned_text = input_string.strip("[]'").replace('\\n', '\n')
+        continuous_text = '\n'.join(cleaned_text.split('\n'))
+
+        return continuous_text, image_url[0]
+    except Exception as e:
+        return f"Error: {str(e)}", None
+
+# Function to load and display an image from a file path
+def load_image_from_path(file_path):
+    try:
+        img = Image.open(file_path)
+        return img
+    except Exception as e:
+        print(f"Error loading image: {str(e)}")
+        return None
+
+def process_audio(audio):
+    # Transcribe the audio
+    transcribed_text = transcribe(audio)
+    text_output, image_path = generate_text(transcribed_text)
+    if image_path:
+        image_output = load_image_from_path(image_path)
+    else:
+        image_output = None  # Handle cases where no image is retrieved
+    # return text_output, image_output
+    # Generate audio output
+    audio_output = generate_audio_output(text_output)
+    return text_output, audio_output, image_output
+
+def gen_tts(text):
+    audio_arr = generate_audio(text, history_prompt=fixed_prompt)
+    audio_arr = (audio_arr * 32767).astype(np.int16)
+    return (SAMPLE_RATE, audio_arr)
+
+# Define the Gradio interface
+# with gr.Blocks() as app:
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=gr.Audio(sources=["microphone"], label="Input Audio"),
+    outputs=[
+        gr.Textbox(label="Generated Text"),
+        gr.Audio(label="Generated Audio"),
+        gr.Image(label="Retrieved Image")  # New output component for the image
+    ],
+    title="moinBremen - Your Personal Tour Guide for our City of Bremen",
+    description="Ask your question about Bremen by speaking into the microphone. The system will transcribe your question, generate a response, and read it out loud.",
+    css=""".gradio-container {
+        background: url('file=/content/dom_bremen.jpg') no-repeat center center fixed;
+        background-size: cover;
+    }""",
+    cache_examples=False,
+)
+demo.launch(allowed_paths=[absolute_path])
+
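
Note: the added lines are only the runtime half of the Space; they assume that the unchanged, earlier part of app.py already defines llm, collection_images and collection_text, plus the imports used above. As orientation, here is a minimal sketch of those assumed prerequisites together with a hypothetical local smoke test of process_audio. The exact import sources, the chromadb/llama-cpp setup, and the file name question.wav are guesses for illustration, not part of the commit.

# Not part of the commit: objects the added code appears to rely on,
# inferred from the calls above and presumably defined earlier in app.py.
import os
import numpy as np
from PIL import Image
import gradio as gr
from transformers import pipeline                              # Whisper ASR pipeline
from bark import preload_models, generate_audio, SAMPLE_RATE   # Bark TTS helpers

# `llm`, `collection_images` and `collection_text` are also assumed to exist,
# e.g. a llama-cpp-python model and two chromadb collections built in the
# unchanged part of the file (see the `collection_text.add(...)` context above).

# Hypothetical smoke test: run the pipeline on a local WAV file without the UI.
from scipy.io import wavfile

if __name__ == "__main__":
    sr, samples = wavfile.read("question.wav")         # any short spoken question
    text, audio, image = process_audio((sr, samples))  # same (rate, array) tuple Gradio sends
    print(text)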