puzan789 committed
Commit 604255a · 1 Parent(s): 883010a
.env ADDED
@@ -0,0 +1,4 @@
+ GROQ_API_KEY= "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"
+ ELEVENLABS_API_KEY="sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2"
+ ELEVENLABS_VOICE_ID="9BWtsMINqrJLrRacOk9x"
+ TTS_MODEL_NAME="eleven_multilingual_v2"
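These variables are read at runtime with python-dotenv (listed in requirements.txt). A minimal sketch of how the services load them; the boolean print is deliberate so the secret itself is not echoed:

    from dotenv import load_dotenv
    import os

    load_dotenv()  # reads the .env file from the current working directory
    print(bool(os.getenv("GROQ_API_KEY")))   # True if the key was loaded
    print(os.getenv("TTS_MODEL_NAME"))       # eleven_multilingual_v2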
.gitignore ADDED
@@ -0,0 +1,40 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+
+ __pypackages__/
+
+ celerybeat-schedule
+ celerybeat.pid
+
+
+ .idea/
+ *.iml
+ *.iws
+ /out/
+
+ .jupyter/
+ .labwork/
+ secrets
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ # Use a slim Python base image
+ FROM python:3.9-slim
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Install ffmpeg plus the PortAudio headers and a compiler so PyAudio can build
+ RUN apt-get update && \
+     apt-get install -y ffmpeg gcc portaudio19-dev && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Copy the application files into the container
+ COPY . .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose the application port
+ EXPOSE 7860
+
+ # Set the default command to run the application
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1 @@
+ # ZODBERT
apo.py ADDED
@@ -0,0 +1,18 @@
+ import streamlit as st
+ import requests
+
+ # app.py mounts the conversation router at /conv and serves on port 7860
+ API_URL = "http://localhost:7860/conv/record"
+
+ st.title("Speech To Text Testing")
+ st.write("Click 'Start Recording' to capture and transcribe speech.")
+
+ if st.button("Start Recording"):
+     with st.spinner("Recording... Speak now."):
+         response = requests.get(API_URL)
+         if response.status_code == 200:
+             st.success("Recording complete!")
+             st.write("### Transcription:")
+             st.write(response.json()["transcription"])
+         else:
+             st.error("Error recording audio")
app.py ADDED
@@ -0,0 +1,19 @@
+ from fastapi import FastAPI, Request
+ from fastapi.responses import HTMLResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.staticfiles import StaticFiles
+ from src.api.api import conversation
+ from pathlib import Path
+
+ app = FastAPI()
+ app.include_router(conversation, prefix="/conv")
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     # This module is app.py, so the import string must be "app:app" rather than "main:app"
+     uvicorn.run(
+         "app:app",
+         host="0.0.0.0",
+         port=7860,
+     )
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ elevenlabs
+ fastapi
+ groq
+ langchain
+ langchain_core
+ langchain_groq
+ PyAudio
+ python-dotenv
+ Requests
+ streamlit
+ uvicorn
+ langchain_community
research.ipynb ADDED
@@ -0,0 +1,310 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1\n",
+ "2\n",
+ "3\n"
+ ]
+ }
+ ],
+ "source": [
+ "import queue\n",
+ "\n",
+ "# Create a FIFO queue\n",
+ "q = queue.Queue()\n",
+ "\n",
+ "# Put items in the queue\n",
+ "q.put(1)\n",
+ "q.put(2)\n",
+ "q.put(3)\n",
+ "\n",
+ "# Get items from the queue\n",
+ "print(q.get()) # prints: 1\n",
+ "print(q.get()) # prints: 2\n",
+ "print(q.get()) # prints: 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Alice\n",
+ "Bob\n",
+ "Charlie\n"
+ ]
+ }
+ ],
+ "source": [
+ "import queue\n",
+ "\n",
+ "# Create a line\n",
+ "line = queue.Queue()\n",
+ "\n",
+ "# People join the line\n",
+ "line.put(\"Alice\") # Alice joins first\n",
+ "line.put(\"Bob\") # Bob joins second\n",
+ "line.put(\"Charlie\") # Charlie joins last\n",
+ "\n",
+ "# People leave the line\n",
+ "print(line.get()) # Alice leaves first\n",
+ "print(line.get()) # Bob leaves second\n",
+ "print(line.get()) # Charlie leaves last"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ReturnPagedConfigs(page_number=0, page_size=10, total_pages=1, configs_page=[ReturnConfig(id='3bad47d1-bd76-48ed-b596-6c37a89b5570', version=0, evi_version='2', version_description='', name='2/17/2025, 03:24:50 PM', created_on=1739785191727, modified_on=1739785191727, prompt=ReturnPrompt(id='1d3c2d75-07fe-424b-97b7-04a8675a7318', version=0, version_type='FIXED', version_description='', name='prompt-phlhWtIJDokAdrbFERKnj', created_on=1739785190915, modified_on=1739785190915, text=''), voice=ReturnVoice(provider='HUME_AI', name=None, custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[])])"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from hume.client import HumeClient\n",
+ "import os \n",
+ "client = HumeClient(\n",
+ " api_key=os.getenv(\"HUME_API_KEY\"), # Defaults to HUME_API_KEY\n",
+ ")\n",
+ "client.empathic_voice.configs.list_configs()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Error: 404 \n",
+ " {\"code\":404, \"message\":\"Invalid URI Path\"}\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "import requests\n",
+ "import json\n",
+ "\n",
+ "\n",
+ "\n",
+ "# Function to connect to the EVI and send text for TTS\n",
+ "def send_text_for_tts(text):\n",
+ " url = 'https://api.hume.ai/v0/evi/chat'\n",
+ " \n",
+ " headers = {\n",
+ " 'X-Hume-Api-Key': API_KEY,\n",
+ " 'Content-Type': 'application/json'\n",
+ " }\n",
+ " \n",
+ " payload = {\n",
+ " 'configId': CONFIG_ID,\n",
+ " 'text': text\n",
+ " }\n",
+ " \n",
+ " response = requests.post(url, headers=headers, data=json.dumps(payload))\n",
+ " \n",
+ " if response.status_code == 200:\n",
+ " print('Response:', response.json())\n",
+ " else:\n",
+ " print('Error:', response.status_code, response.text)\n",
+ "\n",
+ "# Example usage\n",
+ "send_text_for_tts(\"Hello, this is a text to speech conversion using Hume.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ImportError",
+ "evalue": "cannot import name 'HumeStreamClient' from 'hume' (/home/puzan/miniconda3/envs/conversai/lib/python3.12/site-packages/hume/__init__.py)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhume\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m HumeStreamClient\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhume\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ProsodyConfig\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mrequests\u001b[39;00m\n",
+ "\u001b[0;31mImportError\u001b[0m: cannot import name 'HumeStreamClient' from 'hume' (/home/puzan/miniconda3/envs/conversai/lib/python3.12/site-packages/hume/__init__.py)"
+ ]
+ }
+ ],
+ "source": [
+ "from hume import HumeStreamClient\n",
+ "from hume.models.config import ProsodyConfig\n",
+ "import requests\n",
+ "\n",
+ "API_KEY = \"your_hume_api_key\"\n",
+ "\n",
+ "def text_to_audio(text):\n",
+ " client = HumeStreamClient(API_KEY)\n",
+ " config = ProsodyConfig()\n",
+ " \n",
+ " print(\"Generating audio...\")\n",
+ " response = client.submit([text], [config])\n",
+ " \n",
+ " for job in response:\n",
+ " audio_url = job[\"results\"][\"prosody\"][\"audio_url\"]\n",
+ " print(f\"Audio generated! You can download it here: {audio_url}\")\n",
+ " return audio_url\n",
+ "\n",
+ "def download_audio(audio_url, output_file=\"output_audio.wav\"):\n",
+ " print(\"Downloading audio...\")\n",
+ " response = requests.get(audio_url)\n",
+ " \n",
+ " if response.status_code == 200:\n",
+ " with open(output_file, \"wb\") as f:\n",
+ " f.write(response.content)\n",
+ " print(f\"Audio saved as {output_file}\")\n",
+ " else:\n",
+ " print(f\"Failed to download audio. Status code: {response.status_code}\")\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " text = \"Hello, this is a test of Hume's text-to-audio functionality.\"\n",
+ " audio_url = text_to_audio(text)\n",
+ " \n",
+ " if audio_url:\n",
+ " download_audio(audio_url)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ReturnPagedConfigs(page_number=0, page_size=10, total_pages=1, configs_page=[ReturnConfig(id='9207e0c5-c75b-4b11-bbae-f5914614c652', version=0, evi_version='2', version_description='', name='2/19/2025, 03:27:23 PM', created_on=1739958144472, modified_on=1739958144472, prompt=ReturnPrompt(id='ba17413a-8251-4f39-8719-2d37bca6377c', version=0, version_type='FIXED', version_description='', name='prompt-LdpKTJOFO5tziv2EdiR6y', created_on=1739958143755, modified_on=1739958143755, text=''), voice=ReturnVoice(provider='HUME_AI', name='AURA', custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[]), ReturnConfig(id='3bad47d1-bd76-48ed-b596-6c37a89b5570', version=0, evi_version='2', version_description='', name='2/17/2025, 03:24:50 PM', created_on=1739785191727, modified_on=1739785191727, prompt=ReturnPrompt(id='1d3c2d75-07fe-424b-97b7-04a8675a7318', version=0, version_type='FIXED', version_description='', name='prompt-phlhWtIJDokAdrbFERKnj', created_on=1739785190915, modified_on=1739785190915, text=''), voice=ReturnVoice(provider='HUME_AI', name=None, custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[])])"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from hume.client import HumeClient \n",
+ "# authenticate the synchronous client \n",
+ "client = HumeClient(api_key=API_KEY) \n",
+ "# list your configs \n",
+ "client.empathic_voice.configs.list_configs()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2\n"
+ ]
+ },
+ {
+ "ename": "ApiError",
+ "evalue": "status_code: 401, body: {'detail': {'status': 'detected_unusual_activity', 'message': 'Unusual activity detected. Free Tier usage disabled. If you are using a proxy/VPN you might need to purchase a Paid Plan to not trigger our abuse detectors. Free Tier only works if users do not abuse it, for example by creating multiple free accounts. If we notice that many people try to abuse it, we will need to reconsider Free Tier altogether. \\nPlease play fair and purchase any Paid Subscription to continue.'}}",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mApiError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[5], line 18\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(API_KEY)\n\u001b[1;32m 10\u001b[0m audio \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mtext_to_speech\u001b[38;5;241m.\u001b[39mconvert(\n\u001b[1;32m 11\u001b[0m \n\u001b[1;32m 12\u001b[0m text\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe first move is what sets everything in motion.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m output_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3_44100_128\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m )\n\u001b[0;32m---> 18\u001b[0m \u001b[43mplay\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniconda3/envs/conversai/lib/python3.12/site-packages/elevenlabs/play.py:19\u001b[0m, in \u001b[0;36mplay\u001b[0;34m(audio, notebook, use_ffmpeg)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mplay\u001b[39m(\n\u001b[1;32m 14\u001b[0m audio: Union[\u001b[38;5;28mbytes\u001b[39m, Iterator[\u001b[38;5;28mbytes\u001b[39m]], \n\u001b[1;32m 15\u001b[0m notebook: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \n\u001b[1;32m 16\u001b[0m use_ffmpeg: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 17\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(audio, Iterator):\n\u001b[0;32m---> 19\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[38;5;124;43mb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m notebook:\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+ "File \u001b[0;32m~/miniconda3/envs/conversai/lib/python3.12/site-packages/elevenlabs/text_to_speech/client.py:201\u001b[0m, in \u001b[0;36mTextToSpeechClient.convert\u001b[0;34m(self, voice_id, text, enable_logging, optimize_streaming_latency, output_format, model_id, language_code, voice_settings, pronunciation_dictionary_locators, seed, previous_text, next_text, previous_request_ids, next_request_ids, use_pvc_as_ivc, apply_text_normalization, request_options)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError:\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ApiError(status_code\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mstatus_code, body\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mtext)\n\u001b[0;32m--> 201\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ApiError(status_code\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mstatus_code, body\u001b[38;5;241m=\u001b[39m_response_json)\n",
+ "\u001b[0;31mApiError\u001b[0m: status_code: 401, body: {'detail': {'status': 'detected_unusual_activity', 'message': 'Unusual activity detected. Free Tier usage disabled. If you are using a proxy/VPN you might need to purchase a Paid Plan to not trigger our abuse detectors. Free Tier only works if users do not abuse it, for example by creating multiple free accounts. If we notice that many people try to abuse it, we will need to reconsider Free Tier altogether. \\nPlease play fair and purchase any Paid Subscription to continue.'}}"
+ ]
+ }
+ ],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "from elevenlabs.client import ElevenLabs\n",
+ "from elevenlabs import play\n",
+ "import os\n",
+ "\n",
+ "load_dotenv()\n",
+ "API_KEY=\"sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2\"\n",
+ "client = ElevenLabs(api_key=API_KEY)\n",
+ "print(API_KEY)\n",
+ "audio = client.text_to_speech.convert(\n",
+ "\n",
+ " text=\"The first move is what sets everything in motion.\",\n",
+ " voice_id=\"9BWtsMINqrJLrRacOk9x\",\n",
+ " model_id=\"eleven_multilingual_v2\",\n",
+ " output_format=\"mp3_44100_128\",\n",
+ ")\n",
+ "\n",
+ "play(audio)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sk_61687bfc187e7f403082805966a33bd4f147554a22427578\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(os.getenv('ELEVENLABS_API_KEY'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "conversai",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-10
+ """
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-10
+ """
src/api/api.py ADDED
@@ -0,0 +1,29 @@
+ from fastapi import FastAPI
+ import asyncio
+ from src.pipeline.pipeline import Pipeline
+ from fastapi.routing import APIRouter
+ from fastapi.responses import StreamingResponse
+ from fastapi.exceptions import HTTPException
+ import io
+
+ conversation = APIRouter()
+ pipe = Pipeline()
+
+ @conversation.get("/record")
+ async def record_and_transcribe():
+     transcribed_text = pipe.speech_to_text_()
+     return {"transcription": transcribed_text}
+
+ @conversation.get("/text_response")
+ async def get_text_response():
+     response = await pipe.text_to_text_()
+     return {"response": response}
+
+ @conversation.get("/speech_response")
+ async def get_speech_response():
+     try:
+         response_ = await pipe.text_to_speech_()
+         if not response_:
+             raise HTTPException(status_code=400, detail="Failed to generate audio.")
+         # The ElevenLabs client returns MP3 audio by default, so serve it as audio/mpeg
+         return StreamingResponse(io.BytesIO(response_), media_type="audio/mpeg")
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=str(e))
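A minimal client sketch for these routes, assuming the API is served by app.py on localhost:7860 with the /conv prefix; the output file name reply.mp3 is illustrative only:

    import requests

    BASE = "http://localhost:7860/conv"  # prefix and port come from app.py

    # Record from the microphone and return the transcription
    print(requests.get(f"{BASE}/record").json())

    # Record again and return the LLM text reply
    print(requests.get(f"{BASE}/text_response").json())

    # Record again, synthesize the reply, and save the streamed audio
    audio = requests.get(f"{BASE}/speech_response")
    with open("reply.mp3", "wb") as f:
        f.write(audio.content)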
src/pipeline/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-10
+ """
src/pipeline/pipeline.py ADDED
@@ -0,0 +1,39 @@
+ from src.services.main import SpeechToText
+ from src.services.texttotext import ConversationHandler
+ from src.services.texttospeech import TextToSpeech
+ from elevenlabs import play
+
+ class Pipeline:
+     def __init__(self):
+         self.speech_to_text = SpeechToText()
+         self.text_to_text = ConversationHandler()
+         # Named text_to_speech so the attribute does not shadow the text_to_speech_ method below
+         self.text_to_speech = TextToSpeech()
+
+     def speech_to_text_(self):
+         transcribed_text = self.speech_to_text.record_and_transcribe()
+         return transcribed_text
+
+     async def text_to_text_(self):
+         response = await self.text_to_text.give_response(self.speech_to_text_())
+         return response
+
+     async def text_to_speech_(self):
+         # Await the text response first, then synthesize it to audio
+         text_response = await self.text_to_text_()
+         response = await self.text_to_speech.synthesize(text_response)
+         return response
+
+
+ if __name__ == "__main__":
+     import asyncio
+
+     async def main():
+         p = Pipeline()
+         op = await p.text_to_text_()
+         print(op)
+
+     asyncio.run(main())
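A small end-to-end sketch of driving the full pipeline (record, reply, synthesize) outside FastAPI; it assumes a working microphone and, for play(), an ffmpeg install such as the one added in the Dockerfile:

    import asyncio
    from elevenlabs import play
    from src.pipeline.pipeline import Pipeline

    async def demo():
        p = Pipeline()
        audio = await p.text_to_speech_()  # records speech, gets an LLM reply, synthesizes it
        if audio:
            play(audio)

    asyncio.run(demo())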
src/services/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-10
+ """
src/services/main.py ADDED
@@ -0,0 +1,211 @@
+ # import threading
+ # import pyaudio
+ # import wave
+ # import io
+ # import time
+ # from groq import Groq
+ # from pydub import AudioSegment
+ # import numpy as np
+
+ # class SpeechTotext:
+ #     def __init__(self):
+ #         self.client = Groq()
+ #         self.is_recording = False
+ #         self.frames = []
+ #         self.chunk = 1024
+ #         self.format = pyaudio.paInt16
+ #         self.channels = 1
+ #         self.rate = 44100
+ #         self.p = pyaudio.PyAudio()
+
+ #         # Silence detection parameters
+ #         self.silence_threshold = -35.0  # Adjusted threshold to be more lenient
+ #         self.silence_duration = 3.0  # seconds
+ #         self.buffer_duration = 0.1  # seconds for each audio chunk analysis
+ #         self.silent_chunks = 0
+ #         self.chunks_per_second = int(1 / self.buffer_duration)
+
+ #     def detect_silence(self, audio_segment):
+ #         """Check if the audio chunk is silent using pydub"""
+ #         return audio_segment.dBFS < self.silence_threshold
+
+ #     def record_audio(self):
+ #         stream = self.p.open(
+ #             format=self.format,
+ #             channels=self.channels,
+ #             rate=self.rate,
+ #             input=True,
+ #             frames_per_buffer=self.chunk
+ #         )
+
+ #         self.frames = []
+ #         buffer_samples = int(self.buffer_duration * self.rate)
+
+ #         while self.is_recording:
+ #             # Read enough chunks to fill our buffer duration
+ #             buffer_data = b''
+ #             chunks_needed = max(1, int(buffer_samples / self.chunk))
+ #             for _ in range(chunks_needed):
+ #                 data = stream.read(self.chunk)
+ #                 buffer_data += data
+ #                 self.frames.append(data)
+
+ #             # Convert the buffer to a pydub AudioSegment
+ #             audio_buffer = AudioSegment(
+ #                 data=buffer_data,
+ #                 sample_width=self.p.get_sample_size(self.format),
+ #                 frame_rate=self.rate,
+ #                 channels=self.channels
+ #             )
+
+ #             # Check for silence
+ #             if self.detect_silence(audio_buffer):
+ #                 self.silent_chunks += 1
+ #                 if self.silent_chunks >= self.silence_duration * self.chunks_per_second:
+ #                     print(f"Silence detected for {self.silence_duration} seconds, stopping recording...")
+ #                     self.is_recording = False
+ #                     break
+ #             else:
+ #                 self.silent_chunks = 0  # Reset silent chunk counter when sound is detected
+
+ #         stream.stop_stream()
+ #         stream.close()
+
+ #     def start_recording(self):
+ #         """Start recording audio"""
+ #         self.is_recording = True
+ #         self.silent_chunks = 0
+ #         threading.Thread(target=self.record_audio).start()
+
+ #     def stop_recording(self):
+ #         """Stop recording audio and transcribe"""
+ #         self.is_recording = False
+ #         print("Recording stopped")
+
+ #         # Save the recorded audio to a BytesIO object
+ #         wav_buffer = io.BytesIO()
+ #         with wave.open(wav_buffer, 'wb') as wf:
+ #             wf.setnchannels(self.channels)
+ #             wf.setsampwidth(self.p.get_sample_size(self.format))
+ #             wf.setframerate(self.rate)
+ #             wf.writeframes(b''.join(self.frames))
+
+ #         # Rewind the buffer and transcribe
+ #         wav_buffer.seek(0)
+ #         try:
+ #             transcription = self.client.audio.transcriptions.create(
+ #                 file=("audio.wav", wav_buffer),
+ #                 model="whisper-large-v3-turbo"
+ #             )
+ #             print(f"Transcript: {transcription.text}")
+ #         except Exception as e:
+ #             print(f"Error while transcribing audio: {str(e)}")
+ #         finally:
+ #             wav_buffer.close()
+
+ #     def cleanup(self):
+ #         """Cleanup PyAudio"""
+ #         self.p.terminate()
+
+ # if __name__ == "__main__":
+ #     recorder = SpeechTotext()
+ #     try:
+ #         print("Starting recording... (will stop after 3 seconds of silence)")
+ #         recorder.start_recording()
+
+ #         # Wait for recording to finish
+ #         while recorder.is_recording:
+ #             time.sleep(0.1)
+
+ #         recorder.stop_recording()
+ #     finally:
+ #         recorder.cleanup()
+
+
+ # The commented-out class above uses pydub-based silence detection; it is kept in case it is needed in future versions.
+ import pyaudio
+ import wave
+ import io
+ from array import array
+ from groq import Groq
+
+
+ class SpeechToText:
+     def __init__(self):
+         self.client = Groq()
+         self.chunk = 4096
+         self.format = pyaudio.paInt16
+         self.channels = 1
+         self.rate = 16000
+         self.silence_threshold = 1000
+         self.silence_duration = 3.0
+         self.frames_per_chunk = self.chunk / self.rate  # seconds of audio per chunk read
+         self.chunks_for_silence = int(self.silence_duration / self.frames_per_chunk)
+
+     def record_and_transcribe(self):
+         """Records audio until 3 seconds of silence and returns the transcription."""
+
+         # Create a new PyAudio instance for each request; a single instance created in
+         # the constructor could not record more than once.
+         p = pyaudio.PyAudio()
+         stream = p.open(
+             format=self.format,
+             channels=self.channels,
+             rate=self.rate,
+             input=True,
+             frames_per_buffer=self.chunk
+         )
+
+         frames = []
+         silent_chunk_counter = 0
+         print("Recording started... (will stop after 3 seconds of silence)")
+
+         while True:
+             try:
+                 data = stream.read(self.chunk, exception_on_overflow=False)
+                 frames.append(data)
+
+                 # Detect silence
+                 audio_data = array('h', data)
+                 if max(abs(x) for x in audio_data) < self.silence_threshold:
+                     silent_chunk_counter += 1
+                     if silent_chunk_counter >= self.chunks_for_silence:
+                         print(f"Detected {self.silence_duration} seconds of silence, stopping...")
+                         break
+                 else:
+                     silent_chunk_counter = 0
+
+             except IOError as e:
+                 print(f"Error recording: {e}")
+                 break
+
+         stream.stop_stream()
+         stream.close()
+         p.terminate()  # Ensure PyAudio is completely closed
+
+         wav_buffer = io.BytesIO()
+         try:
+             with wave.open(wav_buffer, 'wb') as wf:
+                 wf.setnchannels(self.channels)
+                 wf.setsampwidth(p.get_sample_size(self.format))
+                 wf.setframerate(self.rate)
+                 wf.writeframes(b''.join(frames))
+
+             wav_buffer.seek(0)
+             transcription = self.client.audio.transcriptions.create(
+                 file=("audio.wav", wav_buffer),
+                 model="whisper-large-v3-turbo"
+             )
+             return transcription.text
+
+         except Exception as e:
+             print(f"Error transcribing: {e}")
+             return str(e)
+         finally:
+             wav_buffer.close()
+
+
+ if __name__ == "__main__":
+     recorder = SpeechToText()
+     transcribed_text = recorder.record_and_transcribe()
+     if transcribed_text:
+         print(f"Transcription: {transcribed_text}")
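For reference, the arithmetic behind the silence cutoff used above, with the values from SpeechToText.__init__:

    chunk = 4096            # frames per stream.read()
    rate = 16000            # samples per second
    silence_duration = 3.0  # seconds of quiet that end the recording

    seconds_per_chunk = chunk / rate                                # 0.256 s of audio per read
    chunks_for_silence = int(silence_duration / seconds_per_chunk)  # 11 consecutive quiet reads
    print(seconds_per_chunk, chunks_for_silence)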
src/services/prompts.py ADDED
@@ -0,0 +1,46 @@
+ ASSISTANT_PROMPT = """
+ You are about to play a calling agent as part of the company fagoonai,
+ where you'll be matched with a human in a calling conversation.
+ Your task is to roleplay as the following character, convincing the
+ human that you are a real human.
+ Your name is fagagent.
+
+ # Roleplay Context
+
+ ## fagagent Bio
+
+ As FagoonAssistant, you are a calling agent at Fagoon.ai.
+ You specialize in engaging conversations, handling customer inquiries, and providing AI-powered support.
+ You have a knack for understanding user needs and delivering clear, helpful responses.
+ With a background in AI and communication, you ensure seamless interactions, making technology feel more human and accessible.
+
+ ## fagagent Personality
+
+ - You're authentic and genuine in your interactions
+ - You communicate casually and naturally, like in a real call.
+ - You're humble and comfortable acknowledging when you don't know something
+ - You have a distinctive, quirky sense of humor that makes conversations engaging
+
+ ## User Background
+
+ Here's what you know about the user from previous conversations:
+ {chat_history}
+
+ In addition to the roleplay context, you must ALWAYS follow the rules below:
+
+ # Rules
+
+ - You will never mention that you are an AI or a virtual assistant.
+ - You will never mention that you are an LLM model; you are a calling agent.
+ - You will never say that you can't send voice messages. You can send them.
+ - If the user asks you about something you don't know about your backstory or personality, figure it out.
+ - Never mention your current activity unless the user asks you about it.
+ - The length of your responses shouldn't exceed 50 words.
+ - You will combine shorter and longer responses to make the conversation more natural.
+ - Provide plain text responses without any formatting indicators or meta-commentary.
+
+ # User Input
+ Here is the user query:
+ {user_query}
+ """
+ # {memory_context}
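A quick sketch of how the two placeholders in ASSISTANT_PROMPT are filled through the same ChatPromptTemplate used in texttotext.py; the sample query is illustrative:

    from langchain_core.prompts import ChatPromptTemplate
    from src.services.prompts import ASSISTANT_PROMPT

    prompt = ChatPromptTemplate.from_messages([("system", ASSISTANT_PROMPT)])

    # Both {chat_history} and {user_query} must be supplied when formatting
    messages = prompt.format_messages(
        chat_history="",  # empty history on the first turn
        user_query="Hi, who am I speaking with?",
    )
    print(messages[0].content[:200])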
src/services/texttospeech.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ import io
+ from typing import Optional
+
+ from elevenlabs import ElevenLabs, Voice, VoiceSettings
+ from elevenlabs import play
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ class TextToSpeech:
+     REQUIRED_ENV_VARS = ["ELEVENLABS_API_KEY", "ELEVENLABS_VOICE_ID"]
+
+     def __init__(self):
+         """Validate the required environment variables and defer client creation."""
+         self._validate_env_vars()
+         self._client: Optional[ElevenLabs] = None
+
+     def _validate_env_vars(self) -> None:
+         """Validate that all required environment variables are set."""
+         missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]
+         if missing_vars:
+             raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
+
+     @property
+     def client(self) -> ElevenLabs:
+         """Get or lazily create the ElevenLabs client instance."""
+         if self._client is None:
+             self._client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+         return self._client
+
+     async def synthesize(self, text: str) -> Optional[bytes]:
+         """Convert text to speech and return the audio bytes, or None on failure."""
+         if not text.strip():
+             raise ValueError("Input text cannot be empty")
+         if len(text) > 5000:
+             raise ValueError("Input text cannot exceed 5000 characters")
+         try:
+             audio_generator = self.client.generate(
+                 text=text,
+                 voice=Voice(
+                     voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+                     settings=VoiceSettings(stability=0.5, similarity_boost=0.5),
+                 ),
+                 model=os.getenv("TTS_MODEL_NAME"),
+             )
+             audio_bytes = b"".join(audio_generator)
+             return audio_bytes
+         except Exception as e:
+             print(f"Error synthesizing text: {str(e)}")
+             return None
+
+
+ if __name__ == "__main__":
+     import asyncio
+
+     ts = TextToSpeech()
+
+     async def main():
+         audio_buffer = await ts.synthesize("help the poor")
+         play(audio_buffer)
+
+     asyncio.run(main())
src/services/texttotext.py ADDED
@@ -0,0 +1,72 @@
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate
+ from langchain_core.messages import HumanMessage, AIMessage
+ from langchain_groq import ChatGroq
+ from typing import List
+ import os
+ from src.services.prompts import ASSISTANT_PROMPT
+ from langchain.memory import ConversationSummaryMemory
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
+
+
+ class ConversationHandler:
+     def __init__(self, model_name="llama-3.3-70b-versatile", temperature=0.7):
+         self.chat_model = ChatGroq(
+             model_name=model_name,
+             temperature=temperature
+         )
+         self.prompt = ChatPromptTemplate.from_messages([
+             ("system", ASSISTANT_PROMPT)
+         ])
+         self.memory = ConversationSummaryMemory(
+             llm=self.chat_model,
+             max_token_limit=2000,
+             return_messages=True,
+             memory_key="chat_history"
+         )
+
+     async def give_response(self, user_input):
+         chain = self.prompt | self.chat_model
+         memory_variables = self.memory.load_memory_variables({})
+         response = await chain.ainvoke(
+             {
+                 "user_query": user_input,
+                 "chat_history": memory_variables["chat_history"]
+             }
+         )
+         self.memory.save_context(
+             {"input": user_input},
+             {"output": response.content}
+         )
+         return response.content
+
+     async def summarize_conversation(self) -> str:
+         memory_variables = self.memory.load_memory_variables({})
+         return self.memory.predict_new_summary(
+             messages=memory_variables["chat_history"],
+             existing_summary=""
+         )
+
+     async def clear_memory(self):
+         # ConversationSummaryMemory.clear() is synchronous
+         self.memory.clear()
+
+
+ if __name__ == "__main__":
+     import asyncio
+
+     async def main():
+         handler = ConversationHandler()
+
+         # Start a conversation; give_response returns a plain string
+         response1 = await handler.give_response("What is machine learning? By the way, my name is Ravi and my home town is Ilam.")
+         print("Response 1:", response1)
+         # Continue the conversation
+         response2 = await handler.give_response("Can you give me an example of machine learning mathematics?")
+         print("Response 2:", response2)
+         response3 = await handler.give_response("Can you give me more examples? And also tell me which country I am from.")
+         print("Response 3:", response3)
+         summary = await handler.summarize_conversation()
+         print("\nConversation Summary:", summary)
+
+     asyncio.run(main())
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-10
+ """