# Convx / app.py
# Hugging Face Space file (uploaded by Fabriwin, "Upload 3 files",
# commit 3175dca verified, 2.49 kB).
import logging
import re
import time

import gradio as gr
import torch
from transformers import pipeline
# Configure logging
logging.basicConfig(level=logging.INFO)
# Model pipelines, created once at import time (each triggers a model download).
# ASR: Whisper-small; chunk_length_s=30 enables long-form transcription in 30 s windows.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)
# Text generation: SmolLM2-360M with nucleus-sampling defaults baked into the pipeline.
text_pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-360M", max_length=512, temperature=0.7, top_p=0.9)
# Text-to-speech: Swahili female-voice model.
tts_pipe = pipeline("text-to-speech", model="mussacharles60/swahili-tts-female-voice")
# Define conversation rules
MAX_INPUT_SIZE = 100  # NOTE(review): declared but never enforced anywhere in this file
PREDEFINED_ATTRIBUTES = ["name", "age", "location"]  # keywords that trigger a follow-up prompt
CONTEXT_HISTORY = []  # rolling window of recent prompts; trimmed to 5 entries in generate_text
# Define the function to recognize speech
def recognize_speech(audio):
    """Transcribe *audio* with the Whisper ASR pipeline, retrying on failure.

    Args:
        audio: Any input the ASR pipeline accepts (file path, waveform
            array, or dict with raw audio and sampling rate).

    Returns:
        The transcribed text, or "" if all attempts fail.
    """
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            result = asr_pipe(audio, return_timestamps=True)
            return result['text']
        except Exception as e:
            # Lazy %-formatting so the message is only built when emitted.
            logging.error("ASR attempt %d/%d failed: %s", attempt, retries, e)
            if attempt < retries:
                # Back off briefly before retrying; the original also slept
                # after the final failure, delaying the error response for nothing.
                time.sleep(1)
    return ""
# Define the function to generate text
def generate_text(prompt):
    """Append *prompt* to the rolling context window and generate a reply.

    The module-level CONTEXT_HISTORY keeps at most the 5 most recent
    prompts; the concatenated window is fed to the text-generation
    pipeline and the raw generated text is returned.
    """
    global CONTEXT_HISTORY
    CONTEXT_HISTORY.append(prompt)
    # Drop oldest entries until the window holds at most 5 prompts.
    while len(CONTEXT_HISTORY) > 5:
        CONTEXT_HISTORY.pop(0)
    window = " ".join(CONTEXT_HISTORY)
    completions = text_pipe(window, max_length=512, num_return_sequences=1)
    return completions[0]['generated_text']
# Define the function to synthesize speech
def synthesize_speech(text):
    """Convert *text* to speech with the Swahili TTS pipeline and return the result.

    Returns:
        Whatever the TTS pipeline emits for the given text (passed straight
        through to the caller).
    """
    # NOTE(review): output_format and sample_rate are not documented call
    # kwargs of the transformers text-to-speech pipeline — confirm they are
    # actually accepted by this model/pipeline version rather than raising
    # or being ignored.
    audio = tts_pipe(text, output_format="wav", sample_rate=16000)
    return audio
# Define the function to handle conversation
def handle_conversation(audio):
    """Run one voice-interaction turn: ASR -> text generation -> TTS.

    Args:
        audio: Audio input forwarded to recognize_speech.

    Returns:
        Tuple of (synthesized audio, generated response text).
    """
    recognized_text = recognize_speech(audio)
    # Match predefined attributes as whole words only. The original used a
    # substring test, which fired on false positives (e.g. "message"
    # contains "age").
    words = set(re.findall(r"[a-z]+", recognized_text.lower()))
    if any(attr in words for attr in PREDEFINED_ATTRIBUTES):
        generated_text = generate_text(f"Please provide your {recognized_text}")
    else:
        generated_text = generate_text(recognized_text)
    synthesized_audio = synthesize_speech(generated_text)
    return synthesized_audio, generated_text
# Build the Gradio app. Components and event listeners must be created
# inside the Blocks context manager so they are registered with the layout;
# the original created them outside `gr.Blocks()`, leaving the inputs,
# outputs, and button detached from the rendered app.
with gr.Blocks() as demo:
    input_audio = gr.Audio(label="Input Audio")
    output_audio = gr.Audio(label="Output Audio")
    output_text = gr.Textbox(label="Output Text")
    conversation_button = gr.Button("Start Conversation")
    conversation_button.click(
        handle_conversation,
        inputs=input_audio,
        outputs=[output_audio, output_text],
    )

# Launch the app
demo.launch()