Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- README.md +92 -6
- app.py +99 -0
- requirements.txt +136 -0
README.md
CHANGED
@@ -1,12 +1,98 @@
|
|
1 |
---
|
2 |
-
title: Talk
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Talk to Smolagents
|
3 |
+
emoji: 💻
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: red
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.16.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: mit
|
11 |
+
short_description: FastRTC Voice Agent with smolagents
|
12 |
+
tags: [webrtc, websocket, gradio, secret|HF_TOKEN]
|
13 |
---
|
14 |
|
15 |
+
# Voice LLM Agent with Image Generation
|
16 |
+
|
17 |
+
A voice-enabled AI assistant powered by FastRTC that can:
|
18 |
+
1. Stream audio in real-time using WebRTC
|
19 |
+
2. Listen and respond with natural pauses in conversation
|
20 |
+
3. Generate images based on your requests
|
21 |
+
4. Maintain conversation context across exchanges
|
22 |
+
|
23 |
+
This app combines the real-time communication capabilities of FastRTC with the powerful agent framework of smolagents.
|
24 |
+
|
25 |
+
## Key Features
|
26 |
+
|
27 |
+
- **Real-time Streaming**: Uses FastRTC's WebRTC-based audio streaming
|
28 |
+
- **Voice Activation**: Automatic detection of speech pauses to trigger responses
|
29 |
+
- **Multi-modal Interaction**: Combines voice and image generation in a single interface
|
30 |
+
|
31 |
+
## Setup
|
32 |
+
|
33 |
+
1. Install Python 3.9+ and create a virtual environment:
|
34 |
+
```bash
|
35 |
+
python -m venv .venv
|
36 |
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
37 |
+
```
|
38 |
+
|
39 |
+
2. Install dependencies:
|
40 |
+
```bash
|
41 |
+
pip install -r requirements.txt
|
42 |
+
```
|
43 |
+
|
44 |
+
3. Create a `.env` file with the following:
|
45 |
+
```
|
46 |
+
HF_TOKEN=your_huggingface_api_key
|
47 |
+
MODE=UI # Use 'UI' for Gradio interface, leave blank for HTML interface
|
48 |
+
```
|
49 |
+
|
50 |
+
## Running the App
|
51 |
+
|
52 |
+
### With Gradio UI (Recommended)
|
53 |
+
|
54 |
+
```bash
|
55 |
+
MODE=UI python app.py
|
56 |
+
```
|
57 |
+
|
58 |
+
This launches a Gradio UI at http://localhost:7860 with:
|
59 |
+
- FastRTC's built-in streaming audio components
|
60 |
+
- A chat interface showing the conversation
|
61 |
+
- An image display panel for generated images
|
62 |
+
|
63 |
+
## How to Use
|
64 |
+
|
65 |
+
1. Click the microphone button to start streaming your voice.
|
66 |
+
2. Speak naturally - the app will automatically detect when you pause.
|
67 |
+
3. Ask the agent to generate an image, for example:
|
68 |
+
- "Create an image of a magical forest with glowing mushrooms."
|
69 |
+
- "Generate a picture of a futuristic city with flying cars."
|
70 |
+
4. View the generated image and hear the agent's response.
|
71 |
+
|
72 |
+
## Technical Architecture
|
73 |
+
|
74 |
+
### FastRTC Components
|
75 |
+
|
76 |
+
- **Stream**: Core component that handles WebRTC connections and audio streaming
|
77 |
+
- **ReplyOnPause**: Detects when the user stops speaking to trigger a response
|
78 |
+
- **get_stt_model/get_tts_model**: Provides optimized speech-to-text and text-to-speech models
|
79 |
+
|
80 |
+
### smolagents Components
|
81 |
+
|
82 |
+
- **CodeAgent**: Intelligent agent that can use tools based on natural language inputs
|
83 |
+
- **Tool.from_space**: Integration with Hugging Face Spaces for image generation
|
84 |
+
- **HfApiModel**: Connection to powerful language models for understanding requests
|
85 |
+
|
86 |
+
### Integration Flow
|
87 |
+
|
88 |
+
1. FastRTC streams and processes audio input in real-time
|
89 |
+
2. Speech is converted to text and passed to the smolagents CodeAgent
|
90 |
+
3. The agent processes the request and calls tools when needed
|
91 |
+
4. Responses and generated images are streamed back through FastRTC
|
92 |
+
5. The UI updates to show both text responses and generated images
|
93 |
+
|
94 |
+
## Advanced Features
|
95 |
+
|
96 |
+
- Conversation history is maintained across exchanges
|
97 |
+
- Error handling ensures the app continues working even if agent processing fails
|
98 |
+
- The application leverages FastRTC's streaming capabilities for efficient audio transmission
|
app.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Dict, List
|
3 |
+
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from fastrtc import (
|
6 |
+
ReplyOnPause,
|
7 |
+
Stream,
|
8 |
+
get_stt_model,
|
9 |
+
get_tts_model,
|
10 |
+
get_twilio_turn_credentials,
|
11 |
+
)
|
12 |
+
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
13 |
+
|
14 |
+
# Load environment variables (HF_TOKEN etc.) from a local .env file.
load_dotenv()

# Directory containing this file; kept for locating bundled assets.
curr_dir = Path(__file__).parent

# Speech-to-text and text-to-speech models provided by fastrtc.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Conversation state intended to maintain history across exchanges.
# NOTE(review): currently never read or written by process_response —
# confirm whether history should be threaded into agent.run().
conversation_state: List[Dict[str, str]] = []

# System prompt steering the agent toward coworking-place search only.
# (Fixed: "can helps" -> "can help", fused "workremotely" -> "work remotely",
# mojibake "CafΓ©" -> "Café".)
system_prompt = """You are a helpful assistant that can help with finding places to
work remotely from. You should specifically check against reviews and ratings of the
place. You should use this criteria to find the best place to work from:
- Price
- Reviews
- Ratings
- Location
- WIFI
Only return the name, address of the place, and a short description of the place.
Always search for real places.
Only return real places, not fake ones.
If you receive anything other than a location, you should ask for a location.
<example>
User: I am in Paris, France. Can you find me a place to work from?
Assistant: I found a place called "Le Café de la Paix" at 123 Rue de la Paix,
Paris, France. It has good reviews and is in a great location.
</example>
<example>
User: I am in London, UK. Can you find me a place to work from?
Assistant: I found a place called "The London Coffee Company".
</example>
<example>
User: How many people are in the room?
Assistant: I only respond to requests about finding places to work from.
</example>

"""

# LLM backing the agent, served through the Together provider on HF.
model = HfApiModel(provider="together", model="Qwen/Qwen2.5-Coder-32B-Instruct")

# Code agent with web-search capability for finding real coworking spots.
agent = CodeAgent(
    tools=[
        DuckDuckGoSearchTool(),
    ],
    model=model,
    max_steps=10,
    verbosity_level=2,
    description="Search the web for cafes to work from.",
)
|
67 |
+
|
68 |
+
|
69 |
+
def process_response(audio):
    """Handle one user utterance: STT -> agent -> streamed TTS audio.

    Args:
        audio: Audio payload delivered by fastrtc's ReplyOnPause handler
            (presumably a (sample_rate, numpy array) pair — confirm against
            the fastrtc docs).

    Yields:
        Audio chunks produced by the TTS model, streamed back to the client.
    """
    # Convert speech to text using the STT model.
    text = stt_model.stt(audio)
    if not text.strip():
        # Nothing intelligible was said; produce no reply.
        return

    # Prepend the system prompt so the agent stays on-task.
    input_text = f"{system_prompt}\n\n{text}"

    # Run the agent. README promises the app keeps working even if agent
    # processing fails, so catch failures and speak a fallback instead of
    # letting the exception kill the audio handler.
    try:
        response_content = agent.run(input_text)
    except Exception:
        response_content = (
            "Sorry, I ran into an error processing that request. "
            "Please try again."
        )

    # Convert the response to audio and stream it chunk by chunk.
    # (agent.run may return None; substitute an empty string.)
    for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
        yield audio_chunk
|
84 |
+
|
85 |
+
|
86 |
+
# TURN credentials for WebRTC NAT traversal. get_twilio_turn_credentials()
# needs Twilio credentials in the environment; without them it raises at
# import time (the likely cause of the Space's "Runtime error"). Fall back
# to a direct connection (rtc_configuration=None) instead of crashing.
try:
    rtc_configuration = get_twilio_turn_credentials()
except Exception:
    rtc_configuration = None

# WebRTC audio stream: ReplyOnPause waits for the user to stop speaking,
# then feeds the utterance to process_response.
stream = Stream(
    handler=ReplyOnPause(process_response, input_sample_rate=16000),
    modality="audio",
    mode="send-receive",
    ui_args={
        "pulse_color": "rgb(255, 255, 255)",
        "icon_button_color": "rgb(255, 255, 255)",
        # NOTE(review): title looks mojibake-mangled (probably an emoji
        # misdecoded as Latin-1) — confirm the intended text.
        "title": "π§βπ»The Coworking Agent",
    },
    rtc_configuration=rtc_configuration,
)

if __name__ == "__main__":
    # Launch the Gradio UI on the port the Space expects.
    stream.ui.launch(server_port=7860)
|
requirements.txt
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This file was autogenerated by uv via the following command:
|
2 |
+
# uv export --format requirements-txt --no-hashes
|
3 |
+
aiofiles==23.2.1
|
4 |
+
aiohappyeyeballs==2.4.6
|
5 |
+
aiohttp==3.11.13
|
6 |
+
aiohttp-retry==2.9.1
|
7 |
+
aioice==0.9.0
|
8 |
+
aiortc==1.10.1
|
9 |
+
aiosignal==1.3.2
|
10 |
+
annotated-types==0.7.0
|
11 |
+
anyio==4.8.0
|
12 |
+
async-timeout==5.0.1 ; python_full_version < '3.11'
|
13 |
+
attrs==25.1.0
|
14 |
+
audioop-lts==0.2.1 ; python_full_version >= '3.13'
|
15 |
+
audioread==3.0.1
|
16 |
+
av==13.1.0
|
17 |
+
babel==2.17.0
|
18 |
+
beautifulsoup4==4.13.3
|
19 |
+
certifi==2025.1.31
|
20 |
+
cffi==1.17.1
|
21 |
+
charset-normalizer==3.4.1
|
22 |
+
click==8.1.8
|
23 |
+
colorama==0.4.6
|
24 |
+
coloredlogs==15.0.1
|
25 |
+
colorlog==6.9.0
|
26 |
+
cryptography==44.0.1
|
27 |
+
csvw==3.5.1
|
28 |
+
decorator==5.2.1
|
29 |
+
dlinfo==2.0.0
|
30 |
+
dnspython==2.7.0
|
31 |
+
duckduckgo-search==7.5.0
|
32 |
+
espeakng-loader==0.2.4
|
33 |
+
exceptiongroup==1.2.2 ; python_full_version < '3.11'
|
34 |
+
fastapi==0.115.8
|
35 |
+
fastrtc==0.0.8.post1
|
36 |
+
fastrtc-moonshine-onnx==20241016
|
37 |
+
ffmpy==0.5.0
|
38 |
+
filelock==3.17.0
|
39 |
+
flatbuffers==25.2.10
|
40 |
+
frozenlist==1.5.0
|
41 |
+
fsspec==2025.2.0
|
42 |
+
google-crc32c==1.6.0
|
43 |
+
gradio==5.19.0
|
44 |
+
gradio-client==1.7.2
|
45 |
+
h11==0.14.0
|
46 |
+
httpcore==1.0.7
|
47 |
+
httpx==0.28.1
|
48 |
+
huggingface-hub==0.29.1
|
49 |
+
humanfriendly==10.0
|
50 |
+
idna==3.10
|
51 |
+
ifaddr==0.2.0
|
52 |
+
isodate==0.7.2
|
53 |
+
jinja2==3.1.5
|
54 |
+
joblib==1.4.2
|
55 |
+
jsonschema==4.23.0
|
56 |
+
jsonschema-specifications==2024.10.1
|
57 |
+
kokoro-onnx==0.4.3
|
58 |
+
language-tags==1.2.0
|
59 |
+
lazy-loader==0.4
|
60 |
+
librosa==0.10.2.post1
|
61 |
+
llvmlite==0.44.0
|
62 |
+
lxml==5.3.1
|
63 |
+
markdown-it-py==3.0.0
|
64 |
+
markdownify==1.0.0
|
65 |
+
markupsafe==2.1.5
|
66 |
+
mdurl==0.1.2
|
67 |
+
mpmath==1.3.0
|
68 |
+
msgpack==1.1.0
|
69 |
+
multidict==6.1.0
|
70 |
+
numba==0.61.0
|
71 |
+
numpy==2.1.3
|
72 |
+
onnxruntime==1.20.1
|
73 |
+
orjson==3.10.15
|
74 |
+
packaging==24.2
|
75 |
+
pandas==2.2.3
|
76 |
+
phonemizer-fork==3.3.1
|
77 |
+
pillow==11.1.0
|
78 |
+
platformdirs==4.3.6
|
79 |
+
pooch==1.8.2
|
80 |
+
primp==0.14.0
|
81 |
+
propcache==0.3.0
|
82 |
+
protobuf==5.29.3
|
83 |
+
pycparser==2.22
|
84 |
+
pydantic==2.10.6
|
85 |
+
pydantic-core==2.27.2
|
86 |
+
pydub==0.25.1
|
87 |
+
pyee==12.1.1
|
88 |
+
pygments==2.19.1
|
89 |
+
pyjwt==2.10.1
|
90 |
+
pylibsrtp==0.11.0
|
91 |
+
pyopenssl==25.0.0
|
92 |
+
pyparsing==3.2.1
|
93 |
+
pyreadline3==3.5.4 ; sys_platform == 'win32'
|
94 |
+
python-dateutil==2.9.0.post0
|
95 |
+
python-dotenv==1.0.1
|
96 |
+
python-multipart==0.0.20
|
97 |
+
pytz==2025.1
|
98 |
+
pyyaml==6.0.2
|
99 |
+
rdflib==7.1.3
|
100 |
+
referencing==0.36.2
|
101 |
+
regex==2024.11.6
|
102 |
+
requests==2.32.3
|
103 |
+
rfc3986==1.5.0
|
104 |
+
rich==13.9.4
|
105 |
+
rpds-py==0.23.1
|
106 |
+
ruff==0.9.7 ; sys_platform != 'emscripten'
|
107 |
+
safehttpx==0.1.6
|
108 |
+
scikit-learn==1.6.1
|
109 |
+
scipy==1.15.2
|
110 |
+
segments==2.3.0
|
111 |
+
semantic-version==2.10.0
|
112 |
+
shellingham==1.5.4 ; sys_platform != 'emscripten'
|
113 |
+
six==1.17.0
|
114 |
+
smolagents==1.9.2
|
115 |
+
sniffio==1.3.1
|
116 |
+
soundfile==0.13.1
|
117 |
+
soupsieve==2.6
|
118 |
+
soxr==0.5.0.post1
|
119 |
+
standard-aifc==3.13.0 ; python_full_version >= '3.13'
|
120 |
+
standard-chunk==3.13.0 ; python_full_version >= '3.13'
|
121 |
+
standard-sunau==3.13.0 ; python_full_version >= '3.13'
|
122 |
+
starlette==0.45.3
|
123 |
+
sympy==1.13.3
|
124 |
+
threadpoolctl==3.5.0
|
125 |
+
tokenizers==0.21.0
|
126 |
+
tomlkit==0.13.2
|
127 |
+
tqdm==4.67.1
|
128 |
+
twilio==9.4.6
|
129 |
+
typer==0.15.1 ; sys_platform != 'emscripten'
|
130 |
+
typing-extensions==4.12.2
|
131 |
+
tzdata==2025.1
|
132 |
+
uritemplate==4.1.1
|
133 |
+
urllib3==2.3.0
|
134 |
+
uvicorn==0.34.0 ; sys_platform != 'emscripten'
|
135 |
+
websockets==15.0
|
136 |
+
yarl==1.18.3
|