import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa

# Smolvox generates assistant replies directly from audio; trust_remote_code
# is required because the model ships its own custom pipeline code.
pipe = transformers.pipeline(
    model="reach-vb/smolvox-smollm2-whisper-turbo",
    trust_remote_code=True,
    device=torch.device("cuda"),
)
# Whisper is used separately to transcribe the user's speech for the chat log.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo", device=torch.device("cuda")
)
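# Note: both pipelines assume a CUDA GPU is available. On a CPU-only machine
# you could pass device=torch.device("cpu") instead, at the cost of much
# slower generation.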

# Twilio credentials (if set) are used to mint ephemeral TURN servers so the
# WebRTC connection works even when direct peer-to-peer traffic is blocked.
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None
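    # Hypothetical fallback (not in the original demo): a public STUN server
    # can help with NAT traversal when no TURN relay is available, e.g.
    # rtc_configuration = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}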


def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
    original_sr = audio[0]
    target_sr = 16000

    # WebRTC delivers int16 PCM at the browser's sample rate; convert to
    # float32 in [-1, 1] and resample to the 16 kHz both models expect.
    audio_sr = librosa.resample(
        audio[1].astype(np.float32) / 32768.0, orig_sr=original_sr, target_sr=target_sr
    )

    # Copy the history so the pipeline call cannot mutate the shared gr.State list.
    tf_input = list(transformers_chat)

    # Generate a response from the pipeline using the audio input
    output = pipe(
        {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
        max_new_tokens=2048,
    )
    # Transcribe the audio using Whisper
    transcription = whisper({"array": audio_sr.squeeze(), "sampling_rate": target_sr})

    # Update both conversation histories
    conversation.append({"role": "user", "content": transcription["text"]})
    conversation.append({"role": "assistant", "content": output})
    transformers_chat.append({"role": "user", "content": transcription["text"]})
    transformers_chat.append({"role": "assistant", "content": output})

    # AdditionalOutputs carries the updated histories out of the stream handler;
    # they are delivered to the audio.on_additional_outputs callback below.
    yield AdditionalOutputs(transformers_chat, conversation)


def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
    if not user_text.strip():
        return transformers_chat, conversation

    # Append the user message from the textbox
    conversation.append({"role": "user", "content": user_text})
    transformers_chat.append({"role": "user", "content": user_text})

    # Generate a response with the pipeline; we assume it also accepts plain
    # text input via the "text" key.
    output = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)

    conversation.append({"role": "assistant", "content": output})
    transformers_chat.append({"role": "assistant", "content": output})
    return transformers_chat, conversation


with gr.Blocks() as demo:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Talk to Smolvox SmolLM2 1.7B (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Smolvox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
    </p>
    """
    )

    # Model-side conversation history, seeded with the system prompt.
    transformers_chat = gr.State(
        value=[
            {
                "role": "system",
                "content": "You are a friendly and helpful character. You love to answer questions for people.",
            }
        ]
    )
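    # gr.State is per-session, so each visitor gets an independent history.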

    # Chat transcript at the top
    transcript = gr.Chatbot(label="Transcript", type="messages")

    # Lower row: text input and audio input side by side
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                placeholder="Type your message here and press Enter...", label="Your Message"
            )
        with gr.Column(scale=1):
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
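            # mode="send" streams microphone audio from the browser to the
            # server only; replies come back as text in the transcript.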

    # Audio stream: ReplyOnPause runs voice-activity detection and calls
    # transcribe once the user stops speaking.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
    audio.on_additional_outputs(
        lambda t, g: (t, g),  # pass the updated histories straight through to the UI
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

    # Text input: submit callback when pressing Enter.
    text_input.submit(
        respond_text,
        inputs=[text_input, transformers_chat, transcript],
        outputs=[transformers_chat, transcript],
    )
    # Clear text input after submission.
    text_input.submit(lambda: "", inputs=[], outputs=[text_input])

if __name__ == "__main__":
    demo.launch()
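    # demo.launch() serves the app locally; pass share=True for a temporary
    # public URL, or server_name="0.0.0.0" to listen on all interfaces.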