File size: 1,423 Bytes
95daf41
8b34f22
 
95daf41
8b34f22
95daf41
8b34f22
1b48e7b
8b34f22
95daf41
 
8b34f22
 
 
 
95daf41
8b34f22
 
 
080b398
8b34f22
9a81fc4
 
8b34f22
 
95daf41
8b34f22
 
 
9a81fc4
 
 
 
8b34f22
9a81fc4
 
 
 
 
 
 
 
8b34f22
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import torch
from transformers import pipeline
import gradio as gr

# Import Kokoro components
from kokoro import generate
from models import build_model

# Set device (use GPU if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Kokoro model and voicepack from the root directory.
# The checkpoint and the per-voice embedding file are expected to sit next
# to this script ('kokoro-v0_19.pth' and 'af.pt').
MODEL = build_model('kokoro-v0_19.pth', device)
VOICE_NAME = 'af'  # Default voice ('af' = American-English female per Kokoro naming — TODO confirm)
# weights_only=True restricts torch.load to tensor data (safe-load);
# the voicepack tensor is moved to the same device as the model.
VOICEPACK = torch.load(f'{VOICE_NAME}.pt', weights_only=True).to(device)

# Load pre-trained models for speech-to-text and text generation.
# whisper-tiny / tiny-gpt2 are the smallest checkpoints — chosen for
# speed, presumably at the cost of quality; swap for larger models as needed.
stt_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
nlp_model = pipeline("text-generation", model="sshleifer/tiny-gpt2")

# Define the conversation function
def conversation(audio):
    """Run one voice-chat turn: transcribe audio, generate a reply, speak it.

    Parameters
    ----------
    audio : str
        Filepath of the recorded input clip (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(transcription, reply_text, (sample_rate, waveform))`` — the last
        element is the ``(int, ndarray)`` pair that ``gr.Audio`` expects for
        numpy output.
    """
    # Step 1: Convert speech to text
    text = stt_model(audio)["text"]
    # Step 2: Generate a response (note: generated_text includes the prompt
    # for HF text-generation pipelines).
    response = nlp_model(text, max_length=50)[0]["generated_text"]
    # Step 3: Convert response text to speech using Kokoro model.
    # Kokoro's `lang` is the single language-code letter ('a' = American
    # English), i.e. the first character of the voice name — not the full
    # voice name 'af'.
    audio_response, out_ps = generate(MODEL, response, VOICEPACK, lang=VOICE_NAME[0])
    # Kokoro synthesizes at 24 kHz; pair the rate with the waveform so
    # gr.Audio renders it at the correct speed.
    return text, response, (24000, audio_response)

# Create Gradio Interface: one microphone input mapped to the three outputs
# returned by conversation() (transcription, reply text, synthesized speech).
# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4.x renamed it to
# `sources=["microphone"]` — confirm the pinned gradio version.
interface = gr.Interface(
    fn=conversation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Generated Speech")
    ]
)

# Launch the app (blocks and serves the UI on the default local port)
interface.launch()