# Voice chat demo: ASR (Whisper) -> text generation (SmolLM2) -> TTS, wired into a Gradio UI.
import gradio as gr
import torch
from transformers import pipeline
import time
import logging

logging.basicConfig(level=logging.INFO)
# Load the three pipelines: Whisper for speech recognition, SmolLM2 for text generation,
# and a Swahili female voice for text-to-speech.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)
# do_sample=True is needed for temperature/top_p to have any effect during generation.
text_pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-360M", max_length=512, temperature=0.7, top_p=0.9, do_sample=True)
tts_pipe = pipeline("text-to-speech", model="mussacharles60/swahili-tts-female-voice")

MAX_INPUT_SIZE = 100
PREDEFINED_ATTRIBUTES = ["name", "age", "location"]
CONTEXT_HISTORY = []


def recognize_speech(audio):
    """Transcribe audio with Whisper, retrying up to three times on transient failures."""
    retries = 3
    for _ in range(retries):
        try:
            result = asr_pipe(audio, return_timestamps=True)
            return result['text']
        except Exception as e:
            logging.error(f"ASR failed: {e}")
            time.sleep(1)
    return ""


def generate_text(prompt):
    """Generate a reply, keeping a rolling window of the last five prompts as context."""
    global CONTEXT_HISTORY
    CONTEXT_HISTORY.append(prompt)
    if len(CONTEXT_HISTORY) > 5:
        CONTEXT_HISTORY.pop(0)
    context = " ".join(CONTEXT_HISTORY)
    # return_full_text=False strips the echoed context so only the newly generated reply is returned.
    outputs = text_pipe(context, max_length=512, num_return_sequences=1, return_full_text=False)
    generated_text = outputs[0]['generated_text']
    return generated_text


def synthesize_speech(text):
    """Synthesize speech from text."""
    # The text-to-speech pipeline returns a dict with 'audio' and 'sampling_rate';
    # gr.Audio expects a (sample_rate, waveform) tuple, so unpack and flatten to 1-D.
    audio = tts_pipe(text)
    return audio["sampling_rate"], audio["audio"].squeeze()


def handle_conversation(audio):
    """Run one full turn: transcribe the input, generate a reply, and speak it."""
    recognized_text = recognize_speech(audio)
    if any(attr in recognized_text.lower() for attr in PREDEFINED_ATTRIBUTES):
        generated_text = generate_text(f"Please provide your {recognized_text}")
    else:
        generated_text = generate_text(recognized_text)
    synthesized_audio = synthesize_speech(generated_text)
    return synthesized_audio, generated_text


# Build the UI; components and event wiring must be created inside the Blocks context.
with gr.Blocks() as demo:
    # type="filepath" hands the recording to the ASR pipeline as a file path it can decode.
    input_audio = gr.Audio(label="Input Audio", type="filepath")
    output_audio = gr.Audio(label="Output Audio")
    output_text = gr.Textbox(label="Output Text")

    conversation_button = gr.Button("Start Conversation")
    conversation_button.click(handle_conversation, inputs=input_audio, outputs=[output_audio, output_text])

demo.launch()