Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
-
import os
|
2 |
-
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
3 |
import asyncio
|
4 |
import base64
|
5 |
import json
|
|
|
6 |
import pathlib
|
7 |
from typing import AsyncGenerator, Literal, List
|
8 |
|
@@ -10,11 +9,11 @@ import numpy as np
|
|
10 |
from dotenv import load_dotenv
|
11 |
from fastapi import FastAPI
|
12 |
from fastapi.responses import HTMLResponse
|
13 |
-
from fastrtc import AsyncStreamHandler, Stream,
|
14 |
from pydantic import BaseModel
|
15 |
import uvicorn
|
16 |
|
17 |
-
# --- Import get_space to detect Hugging Face Spaces ---
|
18 |
from gradio.utils import get_space
|
19 |
|
20 |
# --- Document processing and RAG libraries ---
|
@@ -116,7 +115,7 @@ def generate_answer(query: str) -> str:
|
|
116 |
# 2. Speech-to-Text and Text-to-Speech Functions
|
117 |
# ====================================================
|
118 |
|
119 |
-
#
|
120 |
stt_model = whisper.load_model("base", device="cpu")
|
121 |
|
122 |
def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
|
@@ -158,7 +157,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
|
|
158 |
self.last_input_time = asyncio.get_event_loop().time()
|
159 |
|
160 |
def copy(self) -> "RAGVoiceHandler":
|
161 |
-
# Return a new instance with the same configuration
|
162 |
return RAGVoiceHandler(
|
163 |
expected_layout="mono",
|
164 |
output_sample_rate=self.output_sample_rate,
|
@@ -166,7 +164,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
|
|
166 |
)
|
167 |
|
168 |
async def stream(self) -> AsyncGenerator[bytes, None]:
|
169 |
-
# Continuously check for new audio; if a short silence occurs (timeout), process the buffered utterance.
|
170 |
while not self.quit.is_set():
|
171 |
try:
|
172 |
audio_data = await asyncio.wait_for(self.input_queue.get(), timeout=0.5)
|
@@ -174,7 +171,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
|
|
174 |
self.last_input_time = asyncio.get_event_loop().time()
|
175 |
except asyncio.TimeoutError:
|
176 |
if self.input_buffer:
|
177 |
-
# Process the buffered utterance
|
178 |
audio_array = np.frombuffer(self.input_buffer, dtype=np.int16)
|
179 |
self.input_buffer = bytearray()
|
180 |
query_text = speech_to_text(audio_array, sample_rate=self.input_sample_rate)
|
@@ -187,7 +183,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
|
|
187 |
await asyncio.sleep(0.1)
|
188 |
|
189 |
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
190 |
-
# Each received frame is added as bytes to the input queue
|
191 |
sample_rate, audio_array = frame
|
192 |
audio_bytes = audio_array.tobytes()
|
193 |
await self.input_queue.put(audio_bytes)
|
@@ -202,13 +197,10 @@ class RAGVoiceHandler(AsyncStreamHandler):
|
|
202 |
# 4. Voice Streaming Setup & FastAPI Endpoints
|
203 |
# ====================================================
|
204 |
|
205 |
-
#
|
206 |
-
|
207 |
-
|
208 |
-
else:
|
209 |
-
rtc_config = get_twilio_turn_credentials()
|
210 |
|
211 |
-
# Create a Stream instance using our RAGVoiceHandler.
|
212 |
stream = Stream(
|
213 |
modality="audio",
|
214 |
mode="send-receive",
|
@@ -218,7 +210,6 @@ stream = Stream(
|
|
218 |
time_limit=90,
|
219 |
)
|
220 |
|
221 |
-
# Define a simple input hook (if needed by the client to initialize the call)
|
222 |
class InputData(BaseModel):
|
223 |
webrtc_id: str
|
224 |
|
@@ -230,13 +221,10 @@ async def input_hook(body: InputData):
|
|
230 |
stream.set_input(body.webrtc_id)
|
231 |
return {"status": "ok"}
|
232 |
|
233 |
-
# Endpoint to handle WebRTC offer from the client (for voice calls)
|
234 |
@app.post("/webrtc/offer")
|
235 |
async def webrtc_offer(offer: dict):
|
236 |
-
# This uses fastrtc's built-in handling of the offer to set up the connection.
|
237 |
return await stream.handle_offer(offer)
|
238 |
|
239 |
-
# Serve your existing HTML file (which contains your voice UI)
|
240 |
@app.get("/")
|
241 |
async def index():
|
242 |
index_path = current_dir / "index.html"
|
@@ -250,14 +238,12 @@ async def index():
|
|
250 |
if __name__ == "__main__":
|
251 |
mode = os.getenv("MODE", "PHONE")
|
252 |
if mode == "UI":
|
253 |
-
# Optionally launch a text-based Gradio interface for testing the RAG backend
|
254 |
import gradio as gr
|
255 |
def gradio_chat(user_input):
|
256 |
return generate_answer(user_input)
|
257 |
iface = gr.Interface(fn=gradio_chat, inputs="text", outputs="text", title="Customer Support Chatbot")
|
258 |
iface.launch(server_port=7860)
|
259 |
elif mode == "PHONE":
|
260 |
-
# Run the FastAPI app so that callers can use the voice functionality.
|
261 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
262 |
else:
|
263 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
|
|
|
1 |
import asyncio
|
2 |
import base64
|
3 |
import json
|
4 |
+
import os
|
5 |
import pathlib
|
6 |
from typing import AsyncGenerator, Literal, List
|
7 |
|
|
|
9 |
from dotenv import load_dotenv
|
10 |
from fastapi import FastAPI
|
11 |
from fastapi.responses import HTMLResponse
|
12 |
+
from fastrtc import AsyncStreamHandler, Stream, wait_for_item
|
13 |
from pydantic import BaseModel
|
14 |
import uvicorn
|
15 |
|
16 |
+
# --- Import get_space to detect Hugging Face Spaces (optional) ---
|
17 |
from gradio.utils import get_space
|
18 |
|
19 |
# --- Document processing and RAG libraries ---
|
|
|
115 |
# 2. Speech-to-Text and Text-to-Speech Functions
|
116 |
# ====================================================
|
117 |
|
118 |
+
# Force Whisper to load on CPU explicitly
|
119 |
stt_model = whisper.load_model("base", device="cpu")
|
120 |
|
121 |
def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
|
|
|
157 |
self.last_input_time = asyncio.get_event_loop().time()
|
158 |
|
159 |
def copy(self) -> "RAGVoiceHandler":
|
|
|
160 |
return RAGVoiceHandler(
|
161 |
expected_layout="mono",
|
162 |
output_sample_rate=self.output_sample_rate,
|
|
|
164 |
)
|
165 |
|
166 |
async def stream(self) -> AsyncGenerator[bytes, None]:
|
|
|
167 |
while not self.quit.is_set():
|
168 |
try:
|
169 |
audio_data = await asyncio.wait_for(self.input_queue.get(), timeout=0.5)
|
|
|
171 |
self.last_input_time = asyncio.get_event_loop().time()
|
172 |
except asyncio.TimeoutError:
|
173 |
if self.input_buffer:
|
|
|
174 |
audio_array = np.frombuffer(self.input_buffer, dtype=np.int16)
|
175 |
self.input_buffer = bytearray()
|
176 |
query_text = speech_to_text(audio_array, sample_rate=self.input_sample_rate)
|
|
|
183 |
await asyncio.sleep(0.1)
|
184 |
|
185 |
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
|
|
186 |
sample_rate, audio_array = frame
|
187 |
audio_bytes = audio_array.tobytes()
|
188 |
await self.input_queue.put(audio_bytes)
|
|
|
197 |
# 4. Voice Streaming Setup & FastAPI Endpoints
|
198 |
# ====================================================
|
199 |
|
200 |
+
# For ZeroGPU spaces, supply a static STUN-only RTC configuration (public Google STUN server, no TURN relay).
|
201 |
+
# (This avoids calling get_twilio_turn_credentials(), which depends on NVML.)
|
202 |
+
rtc_config = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}
|
|
|
|
|
203 |
|
|
|
204 |
stream = Stream(
|
205 |
modality="audio",
|
206 |
mode="send-receive",
|
|
|
210 |
time_limit=90,
|
211 |
)
|
212 |
|
|
|
213 |
class InputData(BaseModel):
|
214 |
webrtc_id: str
|
215 |
|
|
|
221 |
stream.set_input(body.webrtc_id)
|
222 |
return {"status": "ok"}
|
223 |
|
|
|
224 |
@app.post("/webrtc/offer")
|
225 |
async def webrtc_offer(offer: dict):
|
|
|
226 |
return await stream.handle_offer(offer)
|
227 |
|
|
|
228 |
@app.get("/")
|
229 |
async def index():
|
230 |
index_path = current_dir / "index.html"
|
|
|
238 |
if __name__ == "__main__":
|
239 |
mode = os.getenv("MODE", "PHONE")
|
240 |
if mode == "UI":
|
|
|
241 |
import gradio as gr
|
242 |
def gradio_chat(user_input):
|
243 |
return generate_answer(user_input)
|
244 |
iface = gr.Interface(fn=gradio_chat, inputs="text", outputs="text", title="Customer Support Chatbot")
|
245 |
iface.launch(server_port=7860)
|
246 |
elif mode == "PHONE":
|
|
|
247 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
248 |
else:
|
249 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|