File size: 3,803 Bytes
bacd0f5
 
 
1145213
bacd0f5
f975d86
bacd0f5
f975d86
bacd0f5
 
 
 
 
 
 
 
 
 
 
99b56c0
1145213
bacd0f5
 
 
 
 
 
f975d86
bacd0f5
 
 
f975d86
bacd0f5
 
 
 
 
 
 
 
 
 
f975d86
 
 
 
bacd0f5
f975d86
bacd0f5
 
 
f975d86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bacd0f5
f975d86
 
 
 
bacd0f5
f975d86
 
bacd0f5
f975d86
 
bacd0f5
f975d86
bacd0f5
f975d86
bacd0f5
f975d86
bacd0f5
 
f975d86
bacd0f5
 
 
 
 
 
 
 
f975d86
bacd0f5
f975d86
bacd0f5
 
 
f975d86
bacd0f5
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gradio as gr
import groq
import io
import numpy as np
import soundfile as sf
import requests

# Function to transcribe audio using Groq
def transcribe_audio(audio, api_key):
    """Transcribe a Gradio audio capture to text with Groq's Distil-Whisper.

    Args:
        audio: Gradio "numpy" audio input — a (sample_rate, np.ndarray)
            tuple, or None when nothing was recorded.
        api_key: Groq API key.

    Returns:
        The transcription text ("" for no audio), or an
        "Error in transcription: ..." string on failure.
    """
    if audio is None:
        return ""

    client = groq.Client(api_key=api_key)

    # Gradio "numpy" audio is (sample_rate, samples); re-encode it as an
    # in-memory WAV file for the transcription endpoint.
    sample_rate, audio_data = audio
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, sample_rate, format='wav')
    buffer.seek(0)

    try:
        # Use Distil-Whisper English powered by Groq for transcription
        completion = client.audio.transcriptions.create(
            model="distil-whisper-large-v3-en",
            file=("audio.wav", buffer),
            response_format="text"
        )
        # With response_format="text" the SDK returns a plain string, not a
        # dict — the original `completion.get('text', '')` raised
        # AttributeError here, turning every success into an error message.
        if isinstance(completion, str):
            return completion
        return getattr(completion, 'text', '')
    except Exception as e:
        return f"Error in transcription: {str(e)}"

# Function to generate AI response using Groq
def generate_response(transcription, api_key):
    """Generate an assistant reply for the transcribed user utterance.

    Args:
        transcription: Text produced by transcribe_audio; falsy values mean
            nothing usable was transcribed.
        api_key: Groq API key.

    Returns:
        The model's reply text, a prompt to retry when transcription is
        empty, or an "Error in response generation: ..." string on failure.
    """
    if not transcription:
        return "No transcription available. Please try speaking again."

    client = groq.Client(api_key=api_key)

    try:
        # Use Llama 3 70B powered by Groq for text generation
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": transcription}
            ],
        )
        # The SDK returns message objects, not dicts — subscripting with
        # ['content'] raised TypeError in the original; attribute access
        # is the documented API.
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

# VoiceRSS TTS function
def text_to_speech(text, tts_api_key):
    """Convert text to MP3 audio bytes via the VoiceRSS HTTP API.

    Args:
        text: The text to speak.
        tts_api_key: VoiceRSS API key.

    Returns:
        Raw MP3 audio bytes on success, or an
        "Error in TTS conversion: ..." string on failure.
    """
    url = "https://api.voicerss.org/"
    params = {
        'key': tts_api_key,
        'src': text,
        'hl': 'en-us',  # Language: English (US)
        'r': '0',  # Speech rate
        'c': 'mp3',  # Audio format (mp3)
        'f': '48khz_16bit_stereo'  # Frequency and bitrate
    }

    try:
        # Bound the request so a stalled API can't hang the UI forever.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            return f"Error in TTS conversion: {response.status_code}"
        # VoiceRSS reports failures (bad key, empty text, ...) as an HTTP
        # 200 whose body starts with "ERROR" — detect that instead of
        # handing the error text to the audio player as if it were MP3.
        if response.content.startswith(b'ERROR'):
            return f"Error in TTS conversion: {response.content.decode('utf-8', 'replace')}"
        return response.content  # Raw MP3 audio data
    except Exception as e:
        return f"Error in TTS conversion: {str(e)}"

# Process audio function to handle transcription, response generation, and TTS
def process_audio(audio, groq_api_key, tts_api_key):
    """Run the full pipeline: speech -> text -> AI reply -> speech.

    Returns a (transcription, response_text, response_audio) triple for the
    three Gradio output widgets.
    """
    # Guard clause: without a Groq key neither stage can run.
    if not groq_api_key:
        return "Please enter your Groq API key.", "API key is required.", None

    transcribed_text = transcribe_audio(audio, groq_api_key)
    assistant_reply = generate_response(transcribed_text, groq_api_key)

    # Convert the AI response to speech using VoiceRSS
    spoken_reply = text_to_speech(assistant_reply, tts_api_key)

    return transcribed_text, assistant_reply, spoken_reply

# Gradio interface with TTS
with gr.Blocks(theme=gr.themes.Default()) as demo:
    gr.Markdown("# 🎙️ Groq x Gradio Voice-Powered AI Assistant with TTS")

    # API credentials (masked input fields).
    groq_key_box = gr.Textbox(type="password", label="Enter your Groq API Key")
    voicerss_key_box = gr.Textbox(type="password", label="Enter your VoiceRSS API Key")

    # Microphone capture, delivered as a (sample_rate, ndarray) tuple.
    with gr.Row():
        mic_input = gr.Audio(label="Speak!", type="numpy")

    # Text results: what was heard, and what the assistant answered.
    with gr.Row():
        heard_text = gr.Textbox(label="Transcription")
        reply_text = gr.Textbox(label="AI Assistant Response")

    # Spoken version of the assistant's reply.
    reply_audio = gr.Audio(label="AI Response (Audio)", type="auto")

    run_button = gr.Button("Process", variant="primary")

    run_button.click(
        process_audio,
        inputs=[mic_input, groq_key_box, voicerss_key_box],
        outputs=[heard_text, reply_text, reply_audio]
    )

demo.launch()