Spaces:

scooter7
/

voicechat

Sleeping

App Files Community

scooter7 committed on Mar 2

Commit

59c59d4

verified ·

1 Parent(s): 697db26

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -21

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
-import os
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
 import asyncio
 import base64
 import json
 import pathlib
 from typing import AsyncGenerator, Literal, List
@@ -10,11 +9,11 @@ import numpy as np
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse
-from fastrtc import AsyncStreamHandler, Stream, get_twilio_turn_credentials, wait_for_item
 from pydantic import BaseModel
 import uvicorn
-# --- Import get_space to detect Hugging Face Spaces ---
 from gradio.utils import get_space
 # --- Document processing and RAG libraries ---
@@ -116,7 +115,7 @@ def generate_answer(query: str) -> str:
 # 2. Speech-to-Text and Text-to-Speech Functions
 # ====================================================
-# Load Whisper model for speech-to-text
 stt_model = whisper.load_model("base", device="cpu")
 def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
@@ -158,7 +157,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
         self.last_input_time = asyncio.get_event_loop().time()
     def copy(self) -> "RAGVoiceHandler":
-        # Return a new instance with the same configuration
         return RAGVoiceHandler(
             expected_layout="mono",
             output_sample_rate=self.output_sample_rate,
@@ -166,7 +164,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
         )
     async def stream(self) -> AsyncGenerator[bytes, None]:
-        # Continuously check for new audio; if a short silence occurs (timeout), process the buffered utterance.
         while not self.quit.is_set():
             try:
                 audio_data = await asyncio.wait_for(self.input_queue.get(), timeout=0.5)
@@ -174,7 +171,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
                 self.last_input_time = asyncio.get_event_loop().time()
             except asyncio.TimeoutError:
                 if self.input_buffer:
-                    # Process the buffered utterance
                     audio_array = np.frombuffer(self.input_buffer, dtype=np.int16)
                     self.input_buffer = bytearray()
                     query_text = speech_to_text(audio_array, sample_rate=self.input_sample_rate)
@@ -187,7 +183,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
             await asyncio.sleep(0.1)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        # Each received frame is added as bytes to the input queue
         sample_rate, audio_array = frame
         audio_bytes = audio_array.tobytes()
         await self.input_queue.put(audio_bytes)
@@ -202,13 +197,10 @@ class RAGVoiceHandler(AsyncStreamHandler):
 # 4. Voice Streaming Setup & FastAPI Endpoints
 # ====================================================
-# When running on Hugging Face Spaces, supply a dummy RTC configuration.
-if get_space():
-    rtc_config = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}
-else:
-    rtc_config = get_twilio_turn_credentials()
-# Create a Stream instance using our RAGVoiceHandler.
 stream = Stream(
     modality="audio",
     mode="send-receive",
@@ -218,7 +210,6 @@ stream = Stream(
     time_limit=90,
 )
-# Define a simple input hook (if needed by the client to initialize the call)
 class InputData(BaseModel):
     webrtc_id: str
@@ -230,13 +221,10 @@ async def input_hook(body: InputData):
     stream.set_input(body.webrtc_id)
     return {"status": "ok"}
-# Endpoint to handle WebRTC offer from the client (for voice calls)
 @app.post("/webrtc/offer")
 async def webrtc_offer(offer: dict):
-    # This uses fastrtc's built-in handling of the offer to set up the connection.
     return await stream.handle_offer(offer)
-# Serve your existing HTML file (which contains your voice UI)
 @app.get("/")
 async def index():
     index_path = current_dir / "index.html"
@@ -250,14 +238,12 @@ async def index():
 if __name__ == "__main__":
     mode = os.getenv("MODE", "PHONE")
     if mode == "UI":
-        # Optionally launch a text-based Gradio interface for testing the RAG backend
         import gradio as gr
         def gradio_chat(user_input):
             return generate_answer(user_input)
         iface = gr.Interface(fn=gradio_chat, inputs="text", outputs="text", title="Customer Support Chatbot")
         iface.launch(server_port=7860)
     elif mode == "PHONE":
-        # Run the FastAPI app so that callers can use the voice functionality.
         uvicorn.run(app, host="0.0.0.0", port=7860)
     else:
         uvicorn.run(app, host="0.0.0.0", port=7860)

 import asyncio
 import base64
 import json
+import os
 import pathlib
 from typing import AsyncGenerator, Literal, List
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse
+from fastrtc import AsyncStreamHandler, Stream, wait_for_item
 from pydantic import BaseModel
 import uvicorn
+# --- Import get_space to detect Hugging Face Spaces (optional) ---
 from gradio.utils import get_space
 # --- Document processing and RAG libraries ---
 # 2. Speech-to-Text and Text-to-Speech Functions
 # ====================================================
+# Force Whisper to load on CPU explicitly
 stt_model = whisper.load_model("base", device="cpu")
 def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
         self.last_input_time = asyncio.get_event_loop().time()
     def copy(self) -> "RAGVoiceHandler":
         return RAGVoiceHandler(
             expected_layout="mono",
             output_sample_rate=self.output_sample_rate,
         )
     async def stream(self) -> AsyncGenerator[bytes, None]:
         while not self.quit.is_set():
             try:
                 audio_data = await asyncio.wait_for(self.input_queue.get(), timeout=0.5)
                 self.last_input_time = asyncio.get_event_loop().time()
             except asyncio.TimeoutError:
                 if self.input_buffer:
                     audio_array = np.frombuffer(self.input_buffer, dtype=np.int16)
                     self.input_buffer = bytearray()
                     query_text = speech_to_text(audio_array, sample_rate=self.input_sample_rate)
             await asyncio.sleep(0.1)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         sample_rate, audio_array = frame
         audio_bytes = audio_array.tobytes()
         await self.input_queue.put(audio_bytes)
 # 4. Voice Streaming Setup & FastAPI Endpoints
 # ====================================================
+# For ZeroGPU spaces, supply a dummy RTC configuration.
+# (This avoids calling get_twilio_turn_credentials() which depends on NVML.)
+rtc_config = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}
 stream = Stream(
     modality="audio",
     mode="send-receive",
     time_limit=90,
 )
 class InputData(BaseModel):
     webrtc_id: str
     stream.set_input(body.webrtc_id)
     return {"status": "ok"}
 @app.post("/webrtc/offer")
 async def webrtc_offer(offer: dict):
     return await stream.handle_offer(offer)
 @app.get("/")
 async def index():
     index_path = current_dir / "index.html"
 if __name__ == "__main__":
     mode = os.getenv("MODE", "PHONE")
     if mode == "UI":
         import gradio as gr
         def gradio_chat(user_input):
             return generate_answer(user_input)
         iface = gr.Interface(fn=gradio_chat, inputs="text", outputs="text", title="Customer Support Chatbot")
         iface.launch(server_port=7860)
     elif mode == "PHONE":
         uvicorn.run(app, host="0.0.0.0", port=7860)
     else:
         uvicorn.run(app, host="0.0.0.0", port=7860)