File size: 6,195 Bytes
0b1ae9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c230aed
84665a2
0b1ae9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18dbd32
0b1ae9e
 
18dbd32
0b1ae9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9076931
0b1ae9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84665a2
0b1ae9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import asyncio
import base64
import json
import os
import secrets
from pathlib import Path

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
    AdditionalOutputs,
    AsyncStreamHandler,
    Stream,
    get_cloudflare_turn_credentials_async,
    wait_for_item,
)
from gradio.utils import get_space
from websockets.asyncio.client import connect

# Load variables from a .env file (if any) before the environment is read below.
load_dotenv()

# Directory containing this script.
cur_dir = Path(__file__).parent

# Realtime API credentials and endpoint. A missing key falls back to the empty
# string, which yields a bare "Bearer " Authorization header.
API_KEY = os.environ.get("MODELSCOPE_API_KEY", "")
API_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen-omni-turbo-realtime-2025-03-26"
# Voice names offered in the UI dropdown.
VOICES = ["Chelsie", "Serena", "Ethan", "Cherry"]
# Headers sent with the websocket handshake.
headers = {"Authorization": f"Bearer {API_KEY}"}


class QwenOmniHandler(AsyncStreamHandler):
    """Relay audio between a fastrtc stream and the realtime websocket API.

    Frames passed to :meth:`receive` are base64-encoded and sent as
    ``input_audio_buffer.append`` events. ``response.audio.delta`` events read
    in :meth:`start_up` are decoded to int16 samples and queued so that
    :meth:`emit` can hand them back to the stream.
    """

    def __init__(
        self,
    ) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24_000,
            input_sample_rate=16_000,
        )
        # Open websocket to the realtime API; None whenever no session is live.
        self.connection = None
        # (sample_rate, samples) tuples waiting to be returned by emit().
        self.output_queue = asyncio.Queue()

    def copy(self):
        """Return a new handler with its own connection slot and output queue."""
        return QwenOmniHandler()

    @staticmethod
    def msg_id() -> str:
        """Return a random event id of the form ``event_<20 hex chars>``."""
        return f"event_{secrets.token_hex(10)}"

    async def start_up(
        self,
    ):
        """Connect to realtime API. Run forever in separate thread to keep connection open.

        Sends a ``session.update`` event, then consumes server events until the
        websocket closes. ``self.connection`` is cleared on the way out so that
        :meth:`receive` stops sending on a closed socket.
        """
        # The voice is fixed here; the "Voice" dropdown value is not read.
        voice_id = "Serena"
        print("voice_id", voice_id)
        async with connect(
            API_URL,
            additional_headers=headers,
        ) as conn:
            # Not read anywhere in this file; kept so the attribute still exists.
            self.client = conn
            await conn.send(
                json.dumps(
                    {
                        "event_id": self.msg_id(),
                        "type": "session.update",
                        "session": {
                            "modalities": [
                                "text",
                                "audio",
                            ],
                            "voice": voice_id,
                            "input_audio_format": "pcm16",
                        },
                    }
                )
            )
            # Publish the connection only after the session is configured, so
            # receive() cannot send audio before the session.update event.
            self.connection = conn
            try:
                async for data in conn:
                    event = json.loads(data)
                    if "type" not in event:
                        continue
                    event_type = event["type"]
                    if event_type == "input_audio_buffer.speech_started":
                        # Handle interruptions: drop audio that is still queued.
                        print("clear queue")
                        self.clear_queue()
                    elif event_type == "response.audio.delta":
                        print("putting output")
                        samples = np.frombuffer(
                            base64.b64decode(event["delta"]), dtype=np.int16
                        ).reshape(1, -1)
                        await self.output_queue.put(
                            (self.output_sample_rate, samples)
                        )
            finally:
                # The socket is closed (or closing) once the loop ends; without
                # this reset receive() would keep sending on a dead connection
                # and raise on every incoming frame.
                if self.connection is conn:
                    self.connection = None

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward one audio frame to the API; a no-op while disconnected.

        Args:
            frame: ``(sample_rate, samples)`` tuple. The sample rate is
                ignored; the samples are squeezed and sent as raw bytes.
        """
        conn = self.connection
        if not conn:
            return
        _, array = frame
        array = array.squeeze()
        audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
        await conn.send(
            json.dumps(
                {
                    "event_id": self.msg_id(),
                    "type": "input_audio_buffer.append",
                    "audio": audio_message,
                }
            )
        )

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
        """Return the next queued output item, obtained through ``wait_for_item``."""
        return await wait_for_item(self.output_queue)

    async def shutdown(self) -> None:
        """Close the websocket, if one is open, and forget it."""
        # Clear the attribute first so receive() stops sending while the close
        # handshake is still in progress.
        conn, self.connection = self.connection, None
        if conn:
            await conn.close()


# Voice selector passed to the stream as an additional input. Note that
# QwenOmniHandler.start_up currently hard-codes its voice and does not read it.
voice = gr.Dropdown(
    label="Voice",
    choices=VOICES,
    value=VOICES[0],
    type="value",
)

# Bidirectional audio stream backed by the realtime handler.
stream = Stream(
    QwenOmniHandler(),
    modality="audio",
    mode="send-receive",
    additional_inputs=[voice],
    additional_outputs=None,
    rtc_configuration=get_cloudflare_turn_credentials_async,
    # Upper bound on simultaneous connections; also consulted by the
    # /telephone/incoming route.
    concurrency_limit=20,
    # Per-connection limit (presumably seconds; confirm against fastrtc docs).
    time_limit=180,
)

# FastAPI application that hosts the stream and the custom routes below.
app = FastAPI()

@app.post("/telephone/incoming")
async def handle_incoming_call(request: Request):
    """
    Handle incoming telephone calls (e.g., via Twilio).

    Generates TwiML instructions to connect the incoming call to the
    WebSocket handler (`/telephone/handler`) for audio streaming. When the
    stream already holds as many connections as its concurrency limit, the
    caller hears a "busy" message instead of being connected.

    Args:
        request: The FastAPI Request object for the incoming call webhook.
            Its URL hostname is used to build the WebSocket URL.

    Returns:
        An HTMLResponse containing the TwiML instructions as XML.
    """
    from twilio.twiml.voice_response import Connect, VoiceResponse

    response = VoiceResponse()

    # Reject once the limit is reached, not only after it has been exceeded:
    # with `>` a call arriving while exactly `limit` connections are active
    # would still be admitted as connection number limit + 1.
    if len(stream.connections) >= (stream.concurrency_limit or 20):
        response.say("Qwen is busy please try again later!")
        return HTMLResponse(content=str(response), media_type="application/xml")

    response.say("Connecting to Qwen")
    # Named so that it does not shadow the module-level websockets `connect`.
    twiml_connect = Connect()
    print("request.url.hostname", request.url.hostname)
    twiml_connect.stream(url=f"wss://{request.url.hostname}/telephone/handler")
    response.append(twiml_connect)
    # Spoken after the <Connect> verb, i.e. once the media stream has ended.
    response.say("The call has been disconnected.")
    return HTMLResponse(content=str(response), media_type="application/xml")

# Attach the stream to the FastAPI app. This runs after the custom
# /telephone/incoming route above has been registered.
# NOTE(review): presumably mount() registers its own telephone/WebRTC routes
# and the earlier registration takes precedence for the same path — confirm.
stream.mount(app)


@app.get("/")
async def _():
    """Serve the static landing page that shows the phone number to call."""
    landing_page = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Qwen Phone Chat</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                max-width: 800px;
                margin: 0 auto;
                padding: 20px;
                line-height: 1.6;
            }
            pre {
                background-color: #f5f5f5;
                padding: 15px;
                border-radius: 5px;
                overflow-x: auto;
            }
            h1 {
                color: #333;
            }
        </style>
    </head>
    <body>
        <h1>Qwen Phone Chat</h1>
        <p>Call +1 (877) 853-7936</p>
    </body>
    </html>
    """
    page_response = HTMLResponse(content=landing_page)
    return page_response


if __name__ == "__main__":
    # Imported lazily so uvicorn is only needed when run as a script.
    from uvicorn import run as serve

    # Listen on all interfaces on port 7860.
    serve(app, host="0.0.0.0", port=7860)