freddyaboulton committed
Commit ff470a3
1 Parent(s): 73fffb1
Files changed (3)
  1. .gitignore +1 -0
  2. app.py +141 -0
  3. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
app.py ADDED
@@ -0,0 +1,141 @@
+ import gradio as gr
+ from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
+ import anthropic
+ from pyht import Client as PyHtClient, TTSOptions
+ import dataclasses
+ import os
+ import numpy as np
+ from huggingface_hub import InferenceClient
+ import io
+ from pydub import AudioSegment
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+ auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+
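+ # Twilio's token endpoint returns short-lived STUN/TURN credentials;
+ # forcing "relay" routes the WebRTC stream through Twilio's TURN servers,
+ # which lets it connect from behind restrictive NATs and proxies.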
+ if account_sid and auth_token:
+     from twilio.rest import Client
+     client = Client(account_sid, auth_token)
+
+     token = client.tokens.create()
+
+     rtc_configuration = {
+         "iceServers": token.ice_servers,
+         "iceTransportPolicy": "relay",
+     }
+ else:
+     rtc_configuration = None
+
+
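+ # Per-session bundle of API clients, stored in gr.State so that each
+ # visitor's credentials stay isolated.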
+ @dataclasses.dataclass
+ class Clients:
+     claude: anthropic.Anthropic
+     play_ht: PyHtClient
+     hf: InferenceClient
+
+
+ tts_options = TTSOptions(
+     voice="s3://voice-cloning-zero-shot/775ae416-49bb-4fb6-bd45-740f205d20a1/jennifersaad/manifest.json",
+     sample_rate=24000,
+ )
+
+
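+ # The TTS stream arrives as raw bytes, and a chunk boundary can split an
+ # int16 sample in half, so leftover bytes are carried into the next chunk.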
+ def aggregate_chunks(chunks_iterator):
+     leftover = b''  # store incomplete bytes between chunks
+
+     for chunk in chunks_iterator:
+         # Combine with any leftover bytes from the previous chunk
+         current_bytes = leftover + chunk
+
+         # Calculate how many complete samples we have
+         n_complete_samples = len(current_bytes) // 2  # int16 = 2 bytes
+         bytes_to_process = n_complete_samples * 2
+
+         # Split into complete samples and leftover
+         to_process = current_bytes[:bytes_to_process]
+         leftover = current_bytes[bytes_to_process:]
+
+         if to_process:  # only yield if we have complete samples
+             audio_array = np.frombuffer(to_process, dtype=np.int16).reshape(1, -1)
+             yield audio_array
+
+
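+ # Encode the (sample_rate, samples) tuple from the WebRTC component as MP3
+ # bytes for the speech-to-text request.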
+ def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
+     audio_buffer = io.BytesIO()
+     segment = AudioSegment(
+         audio[1].tobytes(),
+         frame_rate=audio[0],
+         sample_width=audio[1].dtype.itemsize,
+         channels=1,
+     )
+     segment.export(audio_buffer, format="mp3")
+     return audio_buffer.getvalue()
+
+
+ def set_api_key(claude_key, play_ht_username, play_ht_key):
+     claude_client = anthropic.Anthropic(api_key=claude_key)
+     play_ht_client = PyHtClient(user_id=play_ht_username, api_key=play_ht_key)
+     gr.Info("Successfully set API keys.", duration=3)
+     return Clients(claude=claude_client, play_ht=play_ht_client,
+                    hf=InferenceClient()), gr.skip()
+
+
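+ # One conversational turn: transcribe the user's speech, get Claude's reply,
+ # push the updated transcripts to the UI, then stream TTS audio back.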
+ def response(audio: tuple[int, np.ndarray], conversation_llm_format: list[dict],
+              chatbot: list[dict], client_state: Clients):
+     if not client_state:
+         raise gr.Error("Please set your API keys first.")
+     prompt = client_state.hf.automatic_speech_recognition(audio_to_bytes(audio)).text
+     conversation_llm_format.append({"role": "user", "content": prompt})
+     response = client_state.claude.messages.create(
+         model="claude-3-5-haiku-20241022",
+         max_tokens=512,
+         messages=conversation_llm_format,
+     )
+     response_text = " ".join(block.text for block in response.content
+                              if getattr(block, "type", None) == "text")
+     conversation_llm_format.append({"role": "assistant", "content": response_text})
+     chatbot.append({"role": "user", "content": prompt})
+     chatbot.append({"role": "assistant", "content": response_text})
+     yield AdditionalOutputs(conversation_llm_format, chatbot)
+     iterator = client_state.play_ht.tts(response_text, options=tts_options, voice_engine="Play3.0")
+     # aggregate_chunks already yields int16 arrays shaped (1, n), so decoding
+     # them again with np.frombuffer would be redundant; stream them directly.
+     for audio_array in aggregate_chunks(iterator):
+         yield (24000, audio_array, "mono")
+
+
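+ # Layout: conversation on top; key inputs on the left, audio stream on the right.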
+ with gr.Blocks() as demo:
+     with gr.Group():
+         with gr.Row():
+             chatbot = gr.Chatbot(label="Conversation", type="messages")
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 with gr.Row():
+                     claude_key = gr.Textbox(type="password", value=os.getenv("ANTHROPIC_API_KEY"),
+                                             label="Enter your Anthropic API Key")
+                     play_ht_username = gr.Textbox(type="password",
+                                                   value=os.getenv("PLAY_HT_USER_ID"),
+                                                   label="Enter your PlayHt Username")
+                     play_ht_key = gr.Textbox(type="password",
+                                              value=os.getenv("PLAY_HT_API_KEY"),
+                                              label="Enter your PlayHt API Key")
+                 with gr.Row():
+                     set_key_button = gr.Button("Set Keys", variant="primary")
+             with gr.Column(scale=5):
+                 audio = WebRTC(modality="audio", mode="send-receive",
+                                label="Audio Stream",
+                                rtc_configuration=rtc_configuration)
+
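+     # Session state: the API client bundle and the raw message list for Claude.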
+     client_state = gr.State(None)
+     conversation_llm_format = gr.State([])
+
+     set_key_button.click(set_api_key, inputs=[claude_key, play_ht_username, play_ht_key],
+                          outputs=[client_state, set_key_button])
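+     # ReplyOnPause runs voice-activity detection on the incoming stream and
+     # calls `response` once the speaker pauses; extra outputs yielded via
+     # AdditionalOutputs are routed by on_additional_outputs below.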
+     audio.stream(
+         ReplyOnPause(response),
+         inputs=[audio, conversation_llm_format, chatbot, client_state],
+         outputs=[audio]
+     )
+     audio.on_additional_outputs(lambda conv, chat: (conv, chat),
+                                 outputs=[conversation_llm_format, chatbot])
+
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio_webrtc[vad]==0.0.11
+ anthropic
+ pyht
+ twilio
+ python-dotenv