Mendoza33 committed on
Commit
95daf41
·
verified ·
1 Parent(s): ac1d4d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -16
app.py CHANGED
@@ -1,35 +1,50 @@
1
- from transformers import pipeline
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
3
 
4
  # Load pre-trained models
5
  stt_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
6
  nlp_model = pipeline("text-generation", model="sshleifer/tiny-gpt2")
7
 
8
- # Attempt to load Kokoro-82M for TTS (this might require adjustments depending on model support)
9
- tts_model = pipeline("text-to-speech", model="hexgrad/Kokoro-82M")
 
 
 
 
 
 
 
 
 
 
10
 
11
- # Define a function to handle the workflow
12
  def conversation(audio):
13
  # Step 1: Convert speech to text
14
- text = stt_model(audio)["text"]
 
15
 
16
- # Step 2: Generate a response (contextual supermarket-related training)
17
- if "supermarket" in text.lower():
18
- # Simple supermarket-based response; this can be expanded with more specific data
19
- response = "Are you looking for something in particular at the supermarket?"
20
- else:
21
- # Default response generation (using GPT-2 model)
22
- response = nlp_model(text, max_length=50)[0]["generated_text"]
23
 
24
  # Step 3: Convert response text to speech using Kokoro-82M
25
- tts_audio = tts_model(response)
26
 
27
- return text, response, tts_audio
 
28
 
29
  # Create Gradio Interface
30
  interface = gr.Interface(
31
  fn=conversation,
32
- inputs=gr.Audio(source="microphone", type="filepath"),
33
  outputs=[
34
  gr.Textbox(label="Transcription"),
35
  gr.Textbox(label="AI Response"),
@@ -38,4 +53,4 @@ interface = gr.Interface(
38
  )
39
 
40
  # Launch the app
41
- interface.launch()
 
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ import librosa
5
+ import os
6
+
7
+ # Custom imports for Kokoro-82M
8
+ from models import build_model
9
+ from kokoro import generate
10
+ from IPython.display import Audio
11
 
12
  # Load pre-trained models
13
  stt_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
14
  nlp_model = pipeline("text-generation", model="sshleifer/tiny-gpt2")
15
 
16
+ # Device setup (use GPU if available, otherwise fall back to CPU)
17
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
18
+
19
+ # Load Kokoro-82M model and voicepack
20
+ def load_kokoro_model():
21
+ model = build_model('kokoro-v0_19.pth', device) # Adjust with the model path
22
+ voice_name = 'af' # Default voice is a 50-50 mix of Bella & Sarah
23
+ voicepack = torch.load(f'voices/{voice_name}.pt', weights_only=True).to(device)
24
+ return model, voicepack
25
+
26
+ # Load the Kokoro model once when the app starts
27
+ kokoro_model, kokoro_voicepack = load_kokoro_model()
28
 
29
+ # Define the function to handle the full workflow
30
  def conversation(audio):
31
  # Step 1: Convert speech to text
32
+ audio_input, _ = librosa.load(audio, sr=16000) # Ensure correct audio sample rate
33
+ text = stt_model(audio_input)["text"]
34
 
35
+ # Step 2: Generate a response using GPT-2
36
+ response = nlp_model(text, max_length=50)[0]["generated_text"]
 
 
 
 
 
37
 
38
  # Step 3: Convert response text to speech using Kokoro-82M
39
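+ audio_response, _ = generate(kokoro_model, response, kokoro_voicepack, lang='af') # NOTE(review): 'af' is the voice name, not a language code; Kokoro's usage example passes only the first letter of the voice name as lang ('a' = American English) - confirm 'af' is accepted here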
40
 
41
+ # Return transcription, AI response, and generated audio
42
+ return text, response, Audio(data=audio_response, rate=24000, autoplay=True)
43
 
44
  # Create Gradio Interface
45
  interface = gr.Interface(
46
  fn=conversation,
47
+ inputs=gr.Audio(source="microphone", type="filepath"), # Microphone input for live audio
48
  outputs=[
49
  gr.Textbox(label="Transcription"),
50
  gr.Textbox(label="AI Response"),
 
53
  )
54
 
55
  # Launch the app
56
+ interface.launch(share=True) # `share=True` additionally exposes the app through a temporary public link