File size: 1,423 Bytes
95daf41
8b34f22
 
95daf41
8b34f22
95daf41
8b34f22
1b48e7b
8b34f22
95daf41
 
8b34f22
 
 
 
95daf41
8b34f22
 
 
080b398
8b34f22
9a81fc4
 
8b34f22
 
95daf41
8b34f22
 
 
9a81fc4
 
 
 
8b34f22
9a81fc4
 
 
 
 
 
 
 
8b34f22
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import torch
from transformers import pipeline
import gradio as gr

# Import Kokoro components
from kokoro import generate
from models import build_model

# Set device (use GPU if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Kokoro model and voicepack from the root directory.
# The checkpoint and the per-voice embedding file are expected to sit next
# to this script ('kokoro-v0_19.pth' and 'af.pt').
MODEL = build_model('kokoro-v0_19.pth', device)
VOICE_NAME = 'af'  # Default voice ('af' = American-English female per Kokoro naming — TODO confirm)
# weights_only=True restricts torch.load to tensor data (safe-load);
# the voicepack tensor is moved to the same device as the model.
VOICEPACK = torch.load(f'{VOICE_NAME}.pt', weights_only=True).to(device)

# Load pre-trained models for speech-to-text and text generation.
# whisper-tiny / tiny-gpt2 are the smallest checkpoints — chosen for
# speed, presumably at the cost of quality; swap for larger models as needed.
stt_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
nlp_model = pipeline("text-generation", model="sshleifer/tiny-gpt2")

# Define the conversation function
def conversation(audio):
    """Run one voice-chat turn: transcribe audio, generate a reply, speak it.

    Parameters
    ----------
    audio : str
        Filepath of the recorded input clip (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(transcription, reply_text, (sample_rate, waveform))`` — the last
        element is the ``(int, ndarray)`` pair that ``gr.Audio`` expects for
        numpy output.
    """
    # Step 1: Convert speech to text
    text = stt_model(audio)["text"]
    # Step 2: Generate a response (note: generated_text includes the prompt
    # for HF text-generation pipelines).
    response = nlp_model(text, max_length=50)[0]["generated_text"]
    # Step 3: Convert response text to speech using Kokoro model.
    # Kokoro's `lang` is the single language-code letter ('a' = American
    # English), i.e. the first character of the voice name — not the full
    # voice name 'af'.
    audio_response, out_ps = generate(MODEL, response, VOICEPACK, lang=VOICE_NAME[0])
    # Kokoro synthesizes at 24 kHz; pair the rate with the waveform so
    # gr.Audio renders it at the correct speed.
    return text, response, (24000, audio_response)

# Create Gradio Interface: one microphone input mapped to the three outputs
# returned by conversation() (transcription, reply text, synthesized speech).
# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4.x renamed it to
# `sources=["microphone"]` — confirm the pinned gradio version.
interface = gr.Interface(
    fn=conversation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Generated Speech")
    ]
)

# Launch the app (blocks and serves the UI on the default local port)
interface.launch()