"""Voice conversation demo: speech-to-text -> text generation -> Kokoro TTS.

Pipeline: Whisper transcribes microphone audio, a tiny GPT-2 generates a
reply, and Kokoro synthesizes the reply as speech, all wired into a
Gradio interface.
"""

import torch
import gradio as gr
from transformers import pipeline

# Kokoro TTS components (project-local modules)
from kokoro import generate
from models import build_model

# Use GPU when available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Kokoro v0.19 synthesizes at a fixed 24 kHz sample rate.
KOKORO_SAMPLE_RATE = 24000

# Load the Kokoro model and voicepack from the working directory.
MODEL = build_model('kokoro-v0_19.pth', device)
VOICE_NAME = 'af'  # default voice id; first letter encodes the language
VOICEPACK = torch.load(f'{VOICE_NAME}.pt', weights_only=True).to(device)

# Pre-trained pipelines for speech-to-text and text generation.
stt_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
nlp_model = pipeline("text-generation", model="sshleifer/tiny-gpt2")


def conversation(audio):
    """Run one voice-chat turn: transcribe, reply, synthesize.

    Parameters
    ----------
    audio : str
        Filepath of the recorded clip (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(transcription, reply_text, (sample_rate, waveform))`` — the
        last element is the tuple form Gradio's Audio output expects
        for raw waveform data.
    """
    # Step 1: speech -> text.
    text = stt_model(audio)["text"]

    # Step 2: generate a reply from the transcription.
    response = nlp_model(text, max_length=50)[0]["generated_text"]

    # Step 3: reply text -> speech.
    # FIX: Kokoro's `lang` argument is the language letter (the first
    # character of the voice id, 'a' = American English), not the full
    # voice id 'af' the original passed.
    audio_response, out_ps = generate(MODEL, response, VOICEPACK, lang=VOICE_NAME[0])

    # FIX: gr.Audio requires (sample_rate, numpy_array) for raw audio;
    # a bare array was returned before, which Gradio cannot interpret.
    return text, response, (KOKORO_SAMPLE_RATE, audio_response)


# Gradio UI: record from the microphone, show transcript, reply, and speech.
interface = gr.Interface(
    fn=conversation,
    # FIX: `sources=` is the Gradio 4.x parameter; the removed 3.x name
    # `source=` raises TypeError on current Gradio.
    # NOTE(review): revert to `source="microphone"` if this project pins gradio<4.
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Generated Speech"),
    ],
)

# Guard the launch so importing this module doesn't start a server.
if __name__ == "__main__":
    interface.launch()