Pijush2023 committed on
Commit 4d969d0 · verified · 1 Parent(s): 69ce2dd

Update app.py

Files changed (1)
  1. app.py +46 -0
app.py CHANGED
@@ -243,6 +243,47 @@ def insert_prompt(current_text, prompt):
     return prompt[0] if prompt else current_text
 
 
+# Define the ASR model with Whisper
+model_id = 'openai/whisper-small'
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+
+pipe_asr = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=15,
+    batch_size=16,
+    torch_dtype=torch_dtype,
+    device=device,
+    return_timestamps=True
+)
+
+# Define the transcription function for streaming audio
+def transcribe_function(stream, new_chunk):
+    try:
+        sr, y = new_chunk[0], new_chunk[1]
+    except TypeError:
+        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+        return stream, "", None
+
+    y = y.astype(np.float32) / np.max(np.abs(y))
+
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+
+    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+    full_text = result.get("text", "")
+
+    return stream, full_text, full_text
+
+
 
 # Create the Gradio Blocks interface
 with gr.Blocks(theme="rawrsor1/Everforest") as demo:
@@ -250,6 +291,7 @@ with gr.Blocks(theme="rawrsor1/Everforest") as demo:
     with gr.Row():
         with gr.Column():
             question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...")
+            audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1, label="Speak to Ask")
 
 
 
@@ -276,6 +318,10 @@ with gr.Blocks(theme="rawrsor1/Everforest") as demo:
     # Define interaction for hitting the Enter key
     question_input.submit(fn=add_message, inputs=[chatbot, question_input], outputs=[chatbot, question_input])\
         .then(fn=chat_with_bot, inputs=[chatbot], outputs=chatbot)
+
+    # Speech-to-Text functionality
+    state = gr.State()
+    audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, question_input])
 
 
     generate_audio_btn.click(fn=generate_audio_from_last_response, inputs=chatbot, outputs=audio_output)
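
Note on the added code: as committed, transcribe_function returns three values (stream, full_text, full_text) while the audio_input.stream(...) call lists only two output components, and the normalization divides by np.max(np.abs(y)), which is zero for an all-silent chunk. Below is a minimal standalone sketch of the same streaming-transcription step; the zero-peak guard and the two-value return are editorial assumptions, not part of the commit:

import numpy as np
import torch
from transformers import pipeline

# Build the ASR pipeline from the model id used in the commit.
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

def transcribe_function(stream, new_chunk):
    # Gradio streams microphone audio as (sample_rate, numpy_array) tuples.
    try:
        sr, y = new_chunk
    except (TypeError, ValueError):
        print(f"Unexpected chunk structure: {type(new_chunk)}")
        return stream, ""

    # Normalize to [-1, 1]; the zero-peak guard is an assumption, not in the commit.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # Accumulate the waveform so each re-transcription sees all audio so far.
    stream = y if stream is None else np.concatenate([stream, y])

    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)

    # Two return values, matching outputs=[state, question_input].
    return stream, result.get("text", "")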
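
And the UI wiring in isolation, assuming the transcribe_function sketch above (component names follow the commit; the every=0.1 argument from the commit is omitted here):

import gradio as gr

with gr.Blocks() as demo:
    question_input = gr.Textbox(label="Ask a Question")
    # streaming=True makes the microphone component emit chunks while recording.
    audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Speak to Ask")
    # gr.State() carries the accumulated waveform between chunks.
    state = gr.State()
    audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, question_input])

demo.launch()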