File size: 4,663 Bytes
c1533fd
 
 
 
 
 
 
 
7a2a265
dfd2dd8
9b4fabe
dfd2dd8
 
 
 
 
c1533fd
02819d3
c1533fd
 
02819d3
 
 
 
c1533fd
 
 
 
 
 
 
 
02819d3
c1533fd
 
 
 
 
 
 
 
 
9b4fabe
c1533fd
 
 
 
02819d3
c1533fd
 
 
 
 
 
4d5fc75
 
 
9b4fabe
4d5fc75
02819d3
 
 
 
 
 
4d5fc75
c1533fd
 
 
 
 
 
9b4fabe
 
 
 
 
 
 
4d5fc75
c1533fd
 
02819d3
 
c1533fd
 
02819d3
 
c1533fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02819d3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
from llama_cpp import Llama
import whisper
from gtts import gTTS
import tempfile
import os
from huggingface_hub import hf_hub_download

# ----- Initialization ------
# Hugging Face repo and quantized GGUF weights file for the chat model.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q5_K_M.gguf"

# Download (or reuse from the local HF cache) the model weights; returns the
# local filesystem path to the .gguf file.
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename
)

# Initialize the LLAMA model.
# NOTE(review): n_gpu_layers=43 offloads layers to GPU — on a CPU-only host
# llama-cpp silently ignores this; confirm the deployment target has a GPU.
llm = Llama(
    model_path=model_path,
    n_threads=2,  # CPU cores
    n_batch=512,
    n_gpu_layers=43,
    n_ctx=4096,
)

# Load the Whisper model for speech-to-text transcription.
# "base" is the small multilingual checkpoint; loaded once at import time.
whisper_model = whisper.load_model("base")

# ----- Helper Functions -----

def transcribe_audio(audio_file):
    """Run Whisper speech-to-text on *audio_file*.

    Returns the transcript text, or an empty string when no audio
    file was provided.
    """
    if audio_file is not None:
        return whisper_model.transcribe(audio_file)["text"]
    return ""

def generate_response(prompt, max_tokens=150, temperature=0.7):
    """Complete *prompt* with the llama-cpp model.

    Generation halts at the "User:" stop marker; the returned text has
    surrounding whitespace stripped.
    """
    completion = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["User:"],
    )
    return completion["choices"][0]["text"].strip()

def text_to_speech(text):
    """Convert *text* to speech with gTTS and return the MP3 file path.

    The file is created via ``tempfile.mkstemp`` and is NOT deleted here;
    the caller (Gradio's audio component) reads it afterwards, so cleanup
    is the caller's responsibility.
    """
    tts = gTTS(text=text, lang="en")
    # The original NamedTemporaryFile(delete=False) left its descriptor open
    # forever (fd leak) and the open handle can block gTTS from writing the
    # same path on Windows. mkstemp + immediate close avoids both problems
    # while keeping the same return contract (a path to an .mp3 file).
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    tts.save(path)
    return path

def voice_chat(audio, text, history, max_tokens, temperature):
    """Process one stateless conversation turn.

    When an audio clip is supplied and the text box is effectively blank,
    the clip is transcribed and used as the user input; otherwise the
    typed text wins. The prompt combines a fixed system instruction with
    only the current input — prior turns are never replayed.

    Returns a 4-tuple:
      - chat history containing just this exchange,
      - the assistant's reply text,
      - the filepath of the reply rendered to speech,
      - the new state (identical to the history).
    """
    # Decide the effective user input: transcription is the fallback
    # only when nothing usable was typed.
    typed = text.strip() if text is not None else ""
    if audio is not None and typed == "":
        user_input = transcribe_audio(audio)
    else:
        user_input = text or ""

    # Fixed behavioral instructions prepended to every prompt.
    system_prompt = ("You are a helpful, knowledgeable, and concise assistant. "
                     "Provide accurate, factual, and polite responses. "
                     "Answer the user's question directly without unnecessary commentary.")

    prompt = system_prompt + "\n\nUser: " + user_input + "\nAssistant: "

    # Model completion for this turn only.
    response_text = generate_response(prompt, max_tokens=max_tokens, temperature=temperature)

    # Speech synthesis covers only the assistant's reply, not the prompt.
    audio_response = text_to_speech(response_text)

    # History is deliberately reset to the single current exchange.
    current_turn = [(user_input, response_text)]
    return current_turn, response_text, audio_response, current_turn

# ----- Gradio Interface -----
# Two-column layout: inputs + generation controls on the left, the chat
# transcript, reply text, and reply audio on the right.

with gr.Blocks() as demo:
    gr.Markdown("# Voice Chatbot with LLAMA‑CPP")
    
    with gr.Row():
        with gr.Column(scale=5):
            # User inputs: Audio input and/or text input.
            # type="filepath" hands voice_chat a path string, matching
            # what whisper_model.transcribe expects.
            audio_input = gr.Audio(type="filepath", label="Speak to Chatbot")
            text_input = gr.Textbox(placeholder="Or type your message", label="Your Message")
            send_btn = gr.Button("Send")
            # Sliders feed straight into generate_response's max_tokens /
            # temperature keyword arguments.
            max_tokens_slider = gr.Slider(50, 300, value=150, step=10, label="Max Tokens")
            temperature_slider = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature")
        with gr.Column(scale=7):
            # Display outputs: Chat history, assistant text response, and audio playback.
            chat_history = gr.Chatbot(label="Chat History")
            response_textbox = gr.Textbox(label="Assistant Response")
            audio_output = gr.Audio(label="Response Audio", type="filepath")
    
    # Gradio State to hold the conversation history.
    # Note: voice_chat overwrites it with only the latest turn, so the
    # stored history never grows beyond one exchange.
    state = gr.State([])

    # Thin pass-through wrapper; keeps the click handler local to the Blocks
    # context while the logic lives in voice_chat above.
    def run_voice_chat(audio, text, history, max_tokens, temperature):
        return voice_chat(audio, text, history, max_tokens, temperature)
    
    # On clicking the "Send" button, run the voice_chat function.
    # Outputs map 1:1 onto voice_chat's 4-tuple return value.
    send_btn.click(
        fn=run_voice_chat,
        inputs=[audio_input, text_input, state, max_tokens_slider, temperature_slider],
        outputs=[chat_history, response_textbox, audio_output, state]
    )

# Launch the app.
# NOTE(review): runs at import time — consider an `if __name__ == "__main__":`
# guard if this module is ever imported elsewhere.
demo.launch()