Spaces:

Futuresony
/

Customer-service

Sleeping

File size: 3,224 Bytes

3da6c4d
d8b7ad8
 
 
 
 
 
3da6c4d
d8b7ad8
 
3da6c4d
d8b7ad8
 
3da6c4d
d8b7ad8
 
 
3da6c4d
d8b7ad8
 
 
3da6c4d
d8b7ad8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3da6c4d
d8b7ad8
 
 
 
 
 
 
 
 
 
3da6c4d
 
 
d8b7ad8
3da6c4d
 
 
d8b7ad8
 
 
 
 
3da6c4d
d8b7ad8
 
 
 
 
3da6c4d
d8b7ad8
 
 
 
 
3da6c4d

import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from asr import transcribe_auto  # ASR function

# ---------------------------------------------------------------------------
# Model setup: everything below runs once, when the module is imported.
# ---------------------------------------------------------------------------

# Identifiers of the remote resources used by the app.
_CHAT_MODEL_ID = "Futuresony/future_ai_12_10_2024.gguf"
_TTS_MODEL_ID = "Futuresony/Output"
_XVECTOR_DATASET_ID = "Matthijs/cmu-arctic-xvectors"
_XVECTOR_ROW = 7306  # row of the validation split used as the speaker voice

# Client used to request chat completions for the chat model.
chat_client = InferenceClient(_CHAT_MODEL_ID)

# Text-to-speech pipeline that turns the chat reply into audio.
tts_synthesizer = pipeline("text-to-speech", model=_TTS_MODEL_ID)

# Speaker embedding handed to the TTS pipeline: one x-vector from the dataset,
# with a leading dimension added by ``unsqueeze(0)``.
embeddings_dataset = load_dataset(_XVECTOR_DATASET_ID, split="validation")
_speaker_xvector = embeddings_dataset[_XVECTOR_ROW]["xvector"]
speaker_embedding = torch.tensor(_speaker_xvector).unsqueeze(0)

def _build_messages(history, system_message, user_text):
    """Assemble the message list sent to the chat model for one request.

    Args:
        history: Iterable of ``(user_text, assistant_text)`` pairs from
            earlier turns. Falsy entries of a pair are skipped.
        system_message: Content of the leading ``system`` message.
        user_text: Content of the final ``user`` message (the new turn).

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: the system message,
        the previous turns in order, then the new user message.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": user_text})
    return messages


def _collect_stream(stream):
    """Join the text deltas of a streamed chat completion into one string.

    Chunks without choices, or whose delta carries no text (``None`` or an
    empty string), are skipped instead of raising ``TypeError`` on
    concatenation.
    """
    parts = []
    for chunk in stream:
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            parts.append(token)
    return "".join(parts)


def speech_to_chat(audio, history, system_message, max_tokens, temperature, top_p):
    """Run one voice turn: transcribe audio, get a chat reply, synthesize speech.

    Args:
        audio: Recorded audio, passed unchanged to ``transcribe_auto``.
            ``None`` (nothing recorded) short-circuits the whole pipeline.
        history: List of ``(user_text, assistant_text)`` pairs from earlier
            turns, or ``None``. The caller's list is not modified.
        system_message: System prompt placed first in the conversation.
        max_tokens: Forwarded to ``chat_client.chat_completion``.
        temperature: Forwarded to ``chat_client.chat_completion``.
        top_p: Forwarded to ``chat_client.chat_completion``.

    Returns:
        A 4-tuple ``(transcribed_text, response, output_file, history)``.
        ``history`` is a new list with the new turn appended. When ``audio``
        is ``None`` the tuple is ``("", "", None, history)`` with the history
        unchanged.
    """
    # Work on a copy so the caller's list is never mutated; tolerate None.
    history = list(history or [])

    # Nothing recorded: skip transcription, chat and TTS entirely.
    if audio is None:
        return "", "", None, history

    # Step 1: Transcribe Speech to Text
    transcribed_text = transcribe_auto(audio)

    # Step 2: Generate Chat Response
    messages = _build_messages(history, system_message, transcribed_text)
    stream = chat_client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )
    response = _collect_stream(stream)

    # Step 3: Convert Chat Response to Speech
    speech = tts_synthesizer(response, forward_params={"speaker_embeddings": speaker_embedding})
    # NOTE(review): every call writes to the same path, so overlapping
    # requests would overwrite each other's audio -- confirm whether the app
    # can serve requests concurrently.
    output_file = "generated_speech.wav"
    sf.write(output_file, speech["audio"], samplerate=speech["sampling_rate"])

    # Update Chat History
    history.append((transcribed_text, response))

    # Return transcribed text, chatbot response, generated speech, and updated history
    return transcribed_text, response, output_file, history

# Gradio Interface
with gr.Blocks() as demo:
    # Page heading, given as centered HTML inside a Markdown component.
    gr.Markdown("<h2 style='text-align: center;'>Real-time ASR → Chat → TTS</h2>")

    # Single row: microphone input followed by the three per-turn outputs.
    with gr.Row():
        audio_input = gr.Audio(
            source="microphone",
            type="filepath",
            label="🎤 Speak Here",
        )
        transcribed_text_output = gr.Textbox(
            label="📝 Transcribed Text",
            interactive=False,
        )
        chat_response_output = gr.Textbox(
            label="🤖 AI Response",
            interactive=False,
        )
        audio_output = gr.Audio(label="🔊 AI Speech Output")

    # Button that triggers one full speech -> chat -> speech turn.
    submit_button = gr.Button("🎙️ Speak & Generate Response")

    # Generation settings forwarded to the handler on every click.
    system_msg = gr.Textbox(
        value="You are a friendly chatbot.",
        label="System Message",
    )
    max_tokens = gr.Slider(
        minimum=1,
        maximum=2048,
        value=512,
        step=1,
        label="Max New Tokens",
    )
    temperature = gr.Slider(
        minimum=0.1,
        maximum=4.0,
        value=0.7,
        step=0.1,
        label="Temperature",
    )
    top_p = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.95,
        step=0.05,
        label="Top-p",
    )

    # Conversation history kept in a State component, starting empty.
    chat_history = gr.State([])

    # Component order matches speech_to_chat's parameters and return tuple.
    click_inputs = [
        audio_input,
        chat_history,
        system_msg,
        max_tokens,
        temperature,
        top_p,
    ]
    click_outputs = [
        transcribed_text_output,
        chat_response_output,
        audio_output,
        chat_history,
    ]
    submit_button.click(fn=speech_to_chat, inputs=click_inputs, outputs=click_outputs)

# Run the App (only when executed as a script, not when imported)
if __name__ == "__main__":
    demo.launch()