Updated

- .env +4 -0
- .gitignore +40 -0
- Dockerfile +22 -0
- README.md +1 -0
- apo.py +18 -0
- app.py +19 -0
- requirements.txt +12 -0
- research.ipynb +310 -0
- src/__init__.py +4 -0
- src/api/__init__.py +4 -0
- src/api/api.py +29 -0
- src/pipeline/__init__.py +4 -0
- src/pipeline/pipeline.py +39 -0
- src/services/__init__.py +4 -0
- src/services/main.py +211 -0
- src/services/prompts.py +46 -0
- src/services/texttospeech.py +58 -0
- src/services/texttotext.py +72 -0
- src/utils/__init__.py +4 -0
.env
ADDED
@@ -0,0 +1,4 @@
+GROQ_API_KEY="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"
+ELEVENLABS_API_KEY="sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2"
+ELEVENLABS_VOICE_ID="9BWtsMINqrJLrRacOk9x"
+TTS_MODEL_NAME="eleven_multilingual_v2"
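These variables are read through python-dotenv. A minimal sketch of how the services consume them (load_dotenv and os.getenv are the calls texttospeech.py and texttotext.py below actually use):

from dotenv import load_dotenv
import os

load_dotenv()  # copies the key=value pairs from .env into the process environment
voice_id = os.getenv("ELEVENLABS_VOICE_ID")
tts_model = os.getenv("TTS_MODEL_NAME")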
.gitignore
ADDED
@@ -0,0 +1,40 @@
+__pycache__/
+*.py[cod]
+*$py.class
+
+
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+
+__pypackages__/
+
+celerybeat-schedule
+celerybeat.pid
+
+
+.idea/
+*.iml
+*.iws
+/out/
+
+.jupyter/
+.labwork/
+secrets
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+# Use a base image (e.g., Python)
+FROM python:3.9-slim
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Install ffmpeg and other dependencies
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy your application files to the container
+COPY . .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Expose the required port (if applicable)
+EXPOSE 7860
+
+# Set the default command to run the application
+CMD ["python", "app.py"]
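With this Dockerfile, building and running would look like `docker build -t zodbert .` followed by `docker run -p 7860:7860 zodbert` (the `zodbert` tag is illustrative). One caveat: requirements.txt pulls in PyAudio, which compiles against the system PortAudio headers, so on this slim base the `apt-get install` line would likely also need `portaudio19-dev` and a C compiler; and since SpeechToText records from a microphone, the container would additionally need access to an audio device, which containers do not get by default.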
README.md
ADDED
@@ -0,0 +1 @@
+# ZODBERT
apo.py
ADDED
@@ -0,0 +1,18 @@
+import streamlit as st
+import requests
+
+# app.py mounts the router under the /conv prefix on port 7860, so the record
+# endpoint lives at /conv/record (the original /audios/record on port 8000 matched no route)
+API_URL = "http://localhost:7860/conv/record"
+
+st.title("Speech To Text Testing")
+st.write("Click 'Start Recording' to capture and transcribe speech.")
+
+if st.button("Start Recording"):
+    with st.spinner("Recording... Speak now."):
+        response = requests.get(API_URL)
+        if response.status_code == 200:
+            st.success("Recording complete!")
+            st.write("### Transcription:")
+            st.write(response.json()["transcription"])
+        else:
+            st.error("Error recording audio")
app.py
ADDED
@@ -0,0 +1,19 @@
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from src.api.api import conversation
+from pathlib import Path
+
+app = FastAPI()
+app.include_router(conversation, prefix="/conv")
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(
+        "app:app",  # this module is app.py, so the import string is "app:app" (the original "main:app" would fail to load)
+        host="0.0.0.0",
+        port=7860,
+    )
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+elevenlabs
+fastapi
+groq
+langchain
+langchain_core
+langchain_groq
+PyAudio
+python-dotenv
+Requests
+streamlit
+uvicorn
+langchain_community
research.ipynb
ADDED
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1\n",
+      "2\n",
+      "3\n"
+     ]
+    }
+   ],
+   "source": [
+    "import queue\n",
+    "\n",
+    "# Create a FIFO queue\n",
+    "q = queue.Queue()\n",
+    "\n",
+    "# Put items in the queue\n",
+    "q.put(1)\n",
+    "q.put(2)\n",
+    "q.put(3)\n",
+    "\n",
+    "# Get items from the queue\n",
+    "print(q.get()) # prints: 1\n",
+    "print(q.get()) # prints: 2\n",
+    "print(q.get()) # prints: 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Alice\n",
+      "Bob\n",
+      "Charlie\n"
+     ]
+    }
+   ],
+   "source": [
+    "import queue\n",
+    "\n",
+    "# Create a line\n",
+    "line = queue.Queue()\n",
+    "\n",
+    "# People join the line\n",
+    "line.put(\"Alice\") # Alice joins first\n",
+    "line.put(\"Bob\") # Bob joins second\n",
+    "line.put(\"Charlie\") # Charlie joins last\n",
+    "\n",
+    "# People leave the line\n",
+    "print(line.get()) # Alice leaves first\n",
+    "print(line.get()) # Bob leaves second\n",
+    "print(line.get()) # Charlie leaves last"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ReturnPagedConfigs(page_number=0, page_size=10, total_pages=1, configs_page=[ReturnConfig(id='3bad47d1-bd76-48ed-b596-6c37a89b5570', version=0, evi_version='2', version_description='', name='2/17/2025, 03:24:50 PM', created_on=1739785191727, modified_on=1739785191727, prompt=ReturnPrompt(id='1d3c2d75-07fe-424b-97b7-04a8675a7318', version=0, version_type='FIXED', version_description='', name='prompt-phlhWtIJDokAdrbFERKnj', created_on=1739785190915, modified_on=1739785190915, text=''), voice=ReturnVoice(provider='HUME_AI', name=None, custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[])])"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from hume.client import HumeClient\n",
+    "import os \n",
+    "client = HumeClient(\n",
+    "    api_key=os.getenv(\"HUME_API_KEY\"), # Defaults to HUME_API_KEY\n",
+    ")\n",
+    "client.empathic_voice.configs.list_configs()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Error: 404 \n",
+      " {\"code\":404, \"message\":\"Invalid URI Path\"}\n",
+      " \n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Function to connect to the EVI and send text for TTS\n",
+    "def send_text_for_tts(text):\n",
+    "    url = 'https://api.hume.ai/v0/evi/chat'\n",
+    "    \n",
+    "    headers = {\n",
+    "        'X-Hume-Api-Key': API_KEY,\n",
+    "        'Content-Type': 'application/json'\n",
+    "    }\n",
+    "    \n",
+    "    payload = {\n",
+    "        'configId': CONFIG_ID,\n",
+    "        'text': text\n",
+    "    }\n",
+    "    \n",
+    "    response = requests.post(url, headers=headers, data=json.dumps(payload))\n",
+    "    \n",
+    "    if response.status_code == 200:\n",
+    "        print('Response:', response.json())\n",
+    "    else:\n",
+    "        print('Error:', response.status_code, response.text)\n",
+    "\n",
+    "# Example usage\n",
+    "send_text_for_tts(\"Hello, this is a text to speech conversion using Hume.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name 'HumeStreamClient' from 'hume' (/home/puzan/miniconda3/envs/conversai/lib/python3.12/site-packages/hume/__init__.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhume\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m HumeStreamClient\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhume\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ProsodyConfig\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mrequests\u001b[39;00m\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'HumeStreamClient' from 'hume' (/home/puzan/miniconda3/envs/conversai/lib/python3.12/site-packages/hume/__init__.py)"
+     ]
+    }
+   ],
+   "source": [
+    "from hume import HumeStreamClient\n",
+    "from hume.models.config import ProsodyConfig\n",
+    "import requests\n",
+    "\n",
+    "API_KEY = \"your_hume_api_key\"\n",
+    "\n",
+    "def text_to_audio(text):\n",
+    "    client = HumeStreamClient(API_KEY)\n",
+    "    config = ProsodyConfig()\n",
+    "    \n",
+    "    print(\"Generating audio...\")\n",
+    "    response = client.submit([text], [config])\n",
+    "    \n",
+    "    for job in response:\n",
+    "        audio_url = job[\"results\"][\"prosody\"][\"audio_url\"]\n",
+    "        print(f\"Audio generated! You can download it here: {audio_url}\")\n",
+    "        return audio_url\n",
+    "\n",
+    "def download_audio(audio_url, output_file=\"output_audio.wav\"):\n",
+    "    print(\"Downloading audio...\")\n",
+    "    response = requests.get(audio_url)\n",
+    "    \n",
+    "    if response.status_code == 200:\n",
+    "        with open(output_file, \"wb\") as f:\n",
+    "            f.write(response.content)\n",
+    "        print(f\"Audio saved as {output_file}\")\n",
+    "    else:\n",
+    "        print(f\"Failed to download audio. Status code: {response.status_code}\")\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    text = \"Hello, this is a test of Hume's text-to-audio functionality.\"\n",
+    "    audio_url = text_to_audio(text)\n",
+    "    \n",
+    "    if audio_url:\n",
+    "        download_audio(audio_url)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ReturnPagedConfigs(page_number=0, page_size=10, total_pages=1, configs_page=[ReturnConfig(id='9207e0c5-c75b-4b11-bbae-f5914614c652', version=0, evi_version='2', version_description='', name='2/19/2025, 03:27:23 PM', created_on=1739958144472, modified_on=1739958144472, prompt=ReturnPrompt(id='ba17413a-8251-4f39-8719-2d37bca6377c', version=0, version_type='FIXED', version_description='', name='prompt-LdpKTJOFO5tziv2EdiR6y', created_on=1739958143755, modified_on=1739958143755, text=''), voice=ReturnVoice(provider='HUME_AI', name='AURA', custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[]), ReturnConfig(id='3bad47d1-bd76-48ed-b596-6c37a89b5570', version=0, evi_version='2', version_description='', name='2/17/2025, 03:24:50 PM', created_on=1739785191727, modified_on=1739785191727, prompt=ReturnPrompt(id='1d3c2d75-07fe-424b-97b7-04a8675a7318', version=0, version_type='FIXED', version_description='', name='prompt-phlhWtIJDokAdrbFERKnj', created_on=1739785190915, modified_on=1739785190915, text=''), voice=ReturnVoice(provider='HUME_AI', name=None, custom_voice=None), language_model=None, ellm_model=ReturnEllmModel(allow_short_responses=True), tools=[], builtin_tools=[], event_messages=ReturnEventMessageSpecs(on_new_chat=ReturnEventMessageSpec(enabled=False, text=None), on_inactivity_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_max_duration_timeout=ReturnEventMessageSpec(enabled=False, text=None), on_resume_chat={'enabled': False, 'text': None}, on_disconnect_resume_chat={'enabled': False, 'text': None}), timeouts=ReturnTimeoutSpecs(inactivity=ReturnTimeoutSpec(enabled=True, duration_secs=120), max_duration=ReturnTimeoutSpec(enabled=True, duration_secs=1800)), webhooks=[])])"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from hume.client import HumeClient \n",
+    "# authenticate the synchronous client \n",
+    "client = HumeClient(api_key=API_KEY) \n",
+    "# list your configs \n",
+    "client.empathic_voice.configs.list_configs()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2\n"
+     ]
+    },
+    {
+     "ename": "ApiError",
+     "evalue": "status_code: 401, body: {'detail': {'status': 'detected_unusual_activity', 'message': 'Unusual activity detected. Free Tier usage disabled. If you are using a proxy/VPN you might need to purchase a Paid Plan to not trigger our abuse detectors. Free Tier only works if users do not abuse it, for example by creating multiple free accounts. If we notice that many people try to abuse it, we will need to reconsider Free Tier altogether. \\nPlease play fair and purchase any Paid Subscription to continue.'}}",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mApiError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[5], line 18\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(API_KEY)\n\u001b[1;32m 10\u001b[0m audio \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mtext_to_speech\u001b[38;5;241m.\u001b[39mconvert(\n\u001b[1;32m 11\u001b[0m \n\u001b[1;32m 12\u001b[0m text\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe first move is what sets everything in motion.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m output_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3_44100_128\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m )\n\u001b[0;32m---> 18\u001b[0m \u001b[43mplay\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/miniconda3/envs/conversai/lib/python3.12/site-packages/elevenlabs/play.py:19\u001b[0m, in \u001b[0;36mplay\u001b[0;34m(audio, notebook, use_ffmpeg)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mplay\u001b[39m(\n\u001b[1;32m 14\u001b[0m audio: Union[\u001b[38;5;28mbytes\u001b[39m, Iterator[\u001b[38;5;28mbytes\u001b[39m]], \n\u001b[1;32m 15\u001b[0m notebook: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \n\u001b[1;32m 16\u001b[0m use_ffmpeg: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 17\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(audio, Iterator):\n\u001b[0;32m---> 19\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[38;5;124;43mb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m notebook:\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m~/miniconda3/envs/conversai/lib/python3.12/site-packages/elevenlabs/text_to_speech/client.py:201\u001b[0m, in \u001b[0;36mTextToSpeechClient.convert\u001b[0;34m(self, voice_id, text, enable_logging, optimize_streaming_latency, output_format, model_id, language_code, voice_settings, pronunciation_dictionary_locators, seed, previous_text, next_text, previous_request_ids, next_request_ids, use_pvc_as_ivc, apply_text_normalization, request_options)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError:\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ApiError(status_code\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mstatus_code, body\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mtext)\n\u001b[0;32m--> 201\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ApiError(status_code\u001b[38;5;241m=\u001b[39m_response\u001b[38;5;241m.\u001b[39mstatus_code, body\u001b[38;5;241m=\u001b[39m_response_json)\n",
+      "\u001b[0;31mApiError\u001b[0m: status_code: 401, body: {'detail': {'status': 'detected_unusual_activity', 'message': 'Unusual activity detected. Free Tier usage disabled. If you are using a proxy/VPN you might need to purchase a Paid Plan to not trigger our abuse detectors. Free Tier only works if users do not abuse it, for example by creating multiple free accounts. If we notice that many people try to abuse it, we will need to reconsider Free Tier altogether. \\nPlease play fair and purchase any Paid Subscription to continue.'}}"
+     ]
+    }
+   ],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "from elevenlabs.client import ElevenLabs\n",
+    "from elevenlabs import play\n",
+    "import os\n",
+    "\n",
+    "load_dotenv()\n",
+    "API_KEY=\"sk_73eea9733ef8c80ebc700cd9f4432bc127406d116f7b61c2\"\n",
+    "client = ElevenLabs(api_key=API_KEY)\n",
+    "print(API_KEY)\n",
+    "audio = client.text_to_speech.convert(\n",
+    "\n",
+    "    text=\"The first move is what sets everything in motion.\",\n",
+    "    voice_id=\"9BWtsMINqrJLrRacOk9x\",\n",
+    "    model_id=\"eleven_multilingual_v2\",\n",
+    "    output_format=\"mp3_44100_128\",\n",
+    ")\n",
+    "\n",
+    "play(audio)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sk_61687bfc187e7f403082805966a33bd4f147554a22427578\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(os.getenv('ELEVENLABS_API_KEY'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conversai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
src/__init__.py
ADDED
@@ -0,0 +1,4 @@
+"""
+Created By: ishwor subedi
+Date: 2024-08-10
+"""
src/api/__init__.py
ADDED
@@ -0,0 +1,4 @@
+"""
+Created By: ishwor subedi
+Date: 2024-08-10
+"""
src/api/api.py
ADDED
@@ -0,0 +1,29 @@
+from fastapi import FastAPI
+import asyncio
+from src.pipeline.pipeline import Pipeline
+from fastapi.routing import APIRouter
+from fastapi.responses import StreamingResponse
+from fastapi.exceptions import HTTPException
+import io
+
+conversation = APIRouter()
+
+pipe = Pipeline()
+
+
+@conversation.get("/record")
+async def record_and_transcribe():
+    transcribed_text = pipe.speech_to_text_()
+    return {"transcription": transcribed_text}
+
+
+@conversation.get("/text_response")
+async def get_text_response():
+    response = await pipe.text_to_text_()
+    return {"response": response}
+
+
+@conversation.get("/speech_response")
+async def get_speech_response():
+    try:
+        response_ = await pipe.text_to_speech_()
+        if not response_:
+            raise HTTPException(status_code=400, detail="Failed to generate audio.")
+        return StreamingResponse(io.BytesIO(response_), media_type="audio/wav")
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
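Combined with app.py, these routes are served under the /conv prefix on port 7860. A minimal client sketch, assuming the server is running locally (paths and port come from this commit; the output filename is illustrative):

import requests

BASE = "http://localhost:7860/conv"

# record on the server's microphone and get the raw transcription
print(requests.get(f"{BASE}/record").json()["transcription"])

# record again and get the LLM's text reply
print(requests.get(f"{BASE}/text_response").json()["response"])

# record again and get the reply synthesized as audio bytes
audio = requests.get(f"{BASE}/speech_response")
if audio.status_code == 200:
    with open("reply_audio.wav", "wb") as f:  # filename is illustrative
        f.write(audio.content)

Note that each endpoint triggers a fresh recording: Pipeline.text_to_text_() calls speech_to_text_() itself rather than reusing an earlier transcription.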
src/pipeline/__init__.py
ADDED
@@ -0,0 +1,4 @@
+"""
+Created By: ishwor subedi
+Date: 2024-08-10
+"""
src/pipeline/pipeline.py
ADDED
@@ -0,0 +1,39 @@
+from src.services.main import SpeechToText  # absolute imports so these resolve when the app is run from the repo root
+from src.services.texttotext import ConversationHandler
+from src.services.texttospeech import TextToSpeech
+from elevenlabs import play
+
+
+class Pipeline:
+    def __init__(self):
+        self.speech_to_text = SpeechToText()
+        self.text_to_text = ConversationHandler()
+        self.text_to_speech = TextToSpeech()  # renamed from text_to_speech_ so it no longer shadows the method below
+
+    def speech_to_text_(self):
+        transcribed_text = self.speech_to_text.record_and_transcribe()
+        return transcribed_text
+
+    async def text_to_text_(self):
+        response = await self.text_to_text.give_response(self.speech_to_text_())
+        return response
+
+    async def text_to_speech_(self):
+        text = await self.text_to_text_()  # await the coroutine first; passing it unawaited to synthesize would fail
+        response = await self.text_to_speech.synthesize(text)
+        return response
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    async def main():
+        p = Pipeline()
+        op = await p.text_to_text_()
+        print(op)
+
+    asyncio.run(main())
src/services/__init__.py
ADDED
@@ -0,0 +1,4 @@
+"""
+Created By: ishwor subedi
+Date: 2024-08-10
+"""
src/services/main.py
ADDED
@@ -0,0 +1,211 @@
+# import threading
+# import pyaudio
+# import wave
+# import io
+# import time
+# from groq import Groq
+# from pydub import AudioSegment
+# import numpy as np
+
+# class SpeechTotext:
+#     def __init__(self):
+#         self.client = Groq()
+#         self.is_recording = False
+#         self.frames = []
+#         self.chunk = 1024
+#         self.format = pyaudio.paInt16
+#         self.channels = 1
+#         self.rate = 44100
+#         self.p = pyaudio.PyAudio()
+
+#         # Silence detection parameters
+#         self.silence_threshold = -35.0  # Adjusted threshold to be more lenient
+#         self.silence_duration = 3.0  # seconds
+#         self.buffer_duration = 0.1  # seconds for each audio chunk analysis
+#         self.silent_chunks = 0
+#         self.chunks_per_second = int(1 / self.buffer_duration)
+
+#     def detect_silence(self, audio_segment):
+#         """Check if the audio chunk is silent using pydub"""
+#         return audio_segment.dBFS < self.silence_threshold
+
+#     def record_audio(self):
+#         stream = self.p.open(
+#             format=self.format,
+#             channels=self.channels,
+#             rate=self.rate,
+#             input=True,
+#             frames_per_buffer=self.chunk
+#         )
+
+#         self.frames = []
+#         buffer_samples = int(self.buffer_duration * self.rate)
+
+#         while self.is_recording:
+#             # Read enough chunks to fill our buffer duration
+#             buffer_data = b''
+#             chunks_needed = max(1, int(buffer_samples / self.chunk))
+#             for _ in range(chunks_needed):
+#                 data = stream.read(self.chunk)
+#                 buffer_data += data
+#                 self.frames.append(data)
+
+#             # Convert the buffer to pydub AudioSegment
+#             audio_buffer = AudioSegment(
+#                 data=buffer_data,
+#                 sample_width=self.p.get_sample_size(self.format),
+#                 frame_rate=self.rate,
+#                 channels=self.channels
+#             )
+
+#             # Check for silence
+#             if self.detect_silence(audio_buffer):
+#                 self.silent_chunks += 1
+#                 if self.silent_chunks >= self.silence_duration * self.chunks_per_second:
+#                     print(f"Silence detected for {self.silence_duration} seconds, stopping recording...")
+#                     self.is_recording = False
+#                     break
+#             else:
+#                 self.silent_chunks = 0  # Reset silent chunk counter when sound is detected
+
+#         stream.stop_stream()
+#         stream.close()
+
+#     def start_recording(self):
+#         """Start recording audio"""
+#         self.is_recording = True
+#         self.silent_chunks = 0
+#         threading.Thread(target=self.record_audio).start()
+
+#     def stop_recording(self):
+#         """Stop recording audio and transcribe"""
+#         self.is_recording = False
+#         print("Recording stopped")
+
+#         # Save the recorded audio to a BytesIO object
+#         wav_buffer = io.BytesIO()
+#         with wave.open(wav_buffer, 'wb') as wf:
+#             wf.setnchannels(self.channels)
+#             wf.setsampwidth(self.p.get_sample_size(self.format))
+#             wf.setframerate(self.rate)
+#             wf.writeframes(b''.join(self.frames))
+
+#         # Rewind the buffer and transcribe
+#         wav_buffer.seek(0)
+#         try:
+#             transcription = self.client.audio.transcriptions.create(
+#                 file=("audio.wav", wav_buffer),
+#                 model="whisper-large-v3-turbo"
+#             )
+#             print(f"Transcript: {transcription.text}")
+#         except Exception as e:
+#             print(f"Error while transcribing audio: {str(e)}")
+#         finally:
+#             wav_buffer.close()
+
+#     def cleanup(self):
+#         """Cleanup PyAudio"""
+#         self.p.terminate()
+
+# if __name__ == "__main__":
+#     recorder = SpeechTotext()
+#     try:
+#         print("Starting recording... (will stop after 3 seconds of silence)")
+#         recorder.start_recording()
+
+#         # Wait for recording to finish
+#         while recorder.is_recording:
+#             time.sleep(0.1)
+
+#         recorder.stop_recording()
+#     finally:
+#         recorder.cleanup()
+
+
+# The version above uses pydub to detect silence; kept in case it is needed in future versions
+import pyaudio
+import wave
+import io
+from array import array
+from groq import Groq
+
+
+class SpeechToText:
+    def __init__(self):
+        self.client = Groq()
+        self.chunk = 4096
+        self.format = pyaudio.paInt16
+        self.channels = 1
+        self.rate = 16000
+        self.silence_threshold = 1000
+        self.silence_duration = 3.0
+        self.frames_per_chunk = self.chunk / self.rate
+        self.chunks_for_silence = int(self.silence_duration / self.frames_per_chunk)
+
+    def record_and_transcribe(self):
+        """Records audio until 3 seconds of silence and returns the transcription."""
+
+        p = pyaudio.PyAudio()  # a new PyAudio instance per request (when created once in the constructor, recording could not run more than once)
+        stream = p.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            input=True,
+            frames_per_buffer=self.chunk
+        )
+
+        frames = []
+        silent_chunk_counter = 0
+        print("Recording started... (will stop after 3 seconds of silence)")
+
+        while True:
+            try:
+                data = stream.read(self.chunk, exception_on_overflow=False)
+                frames.append(data)
+
+                # Detect silence
+                audio_data = array('h', data)
+                if max(abs(x) for x in audio_data) < self.silence_threshold:
+                    silent_chunk_counter += 1
+                    if silent_chunk_counter >= self.chunks_for_silence:
+                        print(f"Detected {self.silence_duration} seconds of silence, stopping...")
+                        break
+                else:
+                    silent_chunk_counter = 0
+
+            except IOError as e:
+                print(f"Error recording: {e}")
+                break
+        stream.stop_stream()
+        stream.close()
+        p.terminate()  # Ensure PyAudio is completely closed
+        wav_buffer = io.BytesIO()
+        try:
+            with wave.open(wav_buffer, 'wb') as wf:
+                wf.setnchannels(self.channels)
+                wf.setsampwidth(p.get_sample_size(self.format))
+                wf.setframerate(self.rate)
+                wf.writeframes(b''.join(frames))
+
+            wav_buffer.seek(0)
+            transcription = self.client.audio.transcriptions.create(
+                file=("audio.wav", wav_buffer),
+                model="whisper-large-v3-turbo"
+            )
+            return transcription.text
+
+        except Exception as e:
+            print(f"Error transcribing: {e}")
+            return str(e)
+        finally:
+            wav_buffer.close()
+
+
+if __name__ == "__main__":
+    recorder = SpeechToText()
+    transcribed_text = recorder.record_and_transcribe()
+    if transcribed_text:
+        print(f"Transcription: {transcribed_text}")
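The silence cutoff in SpeechToText follows directly from the chunk arithmetic: each stream.read() covers chunk / rate seconds of audio, and the loop stops after chunks_for_silence consecutive quiet chunks. Plugging in the values from this file:

chunk, rate = 4096, 16000
seconds_per_chunk = chunk / rate                     # 0.256 s of audio per read
chunks_for_silence = int(3.0 / seconds_per_chunk)    # 11 consecutive quiet chunks
print(chunks_for_silence * seconds_per_chunk)        # 2.816 s

Because of the int() truncation, recording actually stops after roughly 2.8 seconds of sustained silence rather than a full 3.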
src/services/prompts.py
ADDED
@@ -0,0 +1,46 @@
+ASSISTANT_PROMPT = """
+You are about to play a Calling agent as part of company fagoonai,
+where you'll be matched with a human in a calling conversation.
+Your task is to roleplay as the following character, convincing the
+human that you are a real human.
+Your name is fagagent.
+
+# Roleplay Context
+
+## fagagent Bio
+
+As FagoonAssistant, you are a calling agent at Fagoon.ai.
+You specialize in engaging conversations, handling customer inquiries, and providing AI-powered support.
+You have a knack for understanding user needs and delivering clear, helpful responses.
+With a background in AI and communication, you ensure seamless interactions, making technology feel more human and accessible.
+
+## fagagent Personality
+
+- You're authentic and genuine in your interactions
+- You communicate casually and naturally, like in a real call.
+- You're humble and comfortable acknowledging when you don't know something
+- You have a distinctive, quirky sense of humor that makes conversations engaging
+
+## User Background
+
+Here's what you know about the user from previous conversations:
+{chat_history}
+
+In addition to the roleplay context, you have to follow, ALWAYS, the following rules:
+
+# Rules
+
+- You will never mention that you are an AI or a virtual assistant.
+- You will never mention that you are an LLM; you are a calling agent.
+- You will never say that you can't send voice messages. You can send them.
+- If the user asks you about something you don't know about your Backstory or Personality, figure it out.
+- Never mention your current activity unless the user asks you about it.
+- The length of your responses shouldn't exceed 50 words.
+- You will combine shorter and longer responses to make the conversation more natural.
+- Provide plain text responses without any formatting indicators or meta-commentary
+
+# User Input
+Here is the user query:
+{user_query}
+"""
+# {memory_context}
src/services/texttospeech.py
ADDED
@@ -0,0 +1,58 @@
+import os
+from elevenlabs import ElevenLabs, Voice, VoiceSettings
+from typing import Optional
+from elevenlabs import play
+from dotenv import load_dotenv
+
+import io
+
+load_dotenv()
+# print(os.getenv("ELEVENLABS_API_KEY"))  # debug only; avoid printing secrets at import time
+
+
+class TextToSpeech:
+    REQUIRED_ENV_VARS = ["ELEVENLABS_API_KEY", "ELEVENLABS_VOICE_ID"]
+
+    def __init__(self):
+        """Initialize"""
+        self._validate_env_vars()
+        self._client: Optional[ElevenLabs] = None
+
+    def _validate_env_vars(self) -> None:
+        """Validate that all the required environment variables are set"""
+        missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]
+        if missing_vars:
+            raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
+
+    @property
+    def client(self) -> Optional[ElevenLabs]:
+        """Get or create a client instance"""
+        if self._client is None:
+            self._client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+        return self._client
+
+    async def synthesize(self, text: str) -> bytes:
+        """Convert text to speech"""
+        if not text.strip():
+            raise ValueError("Input text cannot be empty")
+        if len(text) > 5000:
+            raise ValueError("Input text cannot exceed 5000 characters")
+        try:
+            audio_generator = self.client.generate(
+                text=text,
+                voice=Voice(
+                    voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+                    settings=VoiceSettings(stability=0.5, similarity_boost=0.5),
+                ),
+                model=os.getenv("TTS_MODEL_NAME"),
+            )
+            audio_bytes = b"".join(audio_generator)
+            return audio_bytes
+        except Exception as e:
+            print(f"Error synthesizing text: {str(e)}")
+            return None
+
+
+if __name__ == "__main__":
+    ts = TextToSpeech()
+    import asyncio
+
+    async def main():
+        audio_buffer = await ts.synthesize("help the poor")
+        play(audio_buffer)
+
+    asyncio.run(main())
src/services/texttotext.py
ADDED
@@ -0,0 +1,72 @@
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_groq import ChatGroq
+from typing import List
+import os
+from src.services.prompts import ASSISTANT_PROMPT  # absolute import so this also resolves when loaded via app.py from the repo root
+from langchain.memory import ConversationSummaryMemory
+from dotenv import load_dotenv
+
+load_dotenv()
+os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
+
+
+class ConversationHandler:
+    def __init__(self, model_name="llama-3.3-70b-versatile", temperature=0.7):
+        self.chat_model = ChatGroq(
+            model_name=model_name,
+            temperature=temperature
+        )
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", ASSISTANT_PROMPT)])
+        self.memory = ConversationSummaryMemory(
+            llm=self.chat_model,
+            max_token_limit=2000,
+            return_messages=True,
+            memory_key="chat_history"
+        )
+
+    async def give_response(self, user_input):
+        chain = self.prompt | self.chat_model
+        memory_variables = self.memory.load_memory_variables({})
+        response = await chain.ainvoke(
+            {
+                "user_query": user_input,
+                "chat_history": memory_variables["chat_history"]
+            }
+        )
+        self.memory.save_context(
+            {"input": user_input},
+            {"output": response.content}
+        )
+        return response.content
+
+    async def summarize_conversation(self) -> str:
+        memory_variables = self.memory.load_memory_variables({})
+        return self.memory.predict_new_summary(
+            messages=memory_variables["chat_history"],
+            existing_summary=""
+        )
+
+    async def clear_memory(self):
+        self.memory.clear()  # memory.clear() is synchronous; awaiting it would raise a TypeError
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    async def main():
+        handler = ConversationHandler()
+
+        # Start a conversation
+        response1 = await handler.give_response("What is machine learning? By the way, my name is ravi and my home town is ilam.")
+        print("Response 1:", response1)  # give_response already returns the string content, so no .content here
+        # Continue the conversation
+        response2 = await handler.give_response("Can you give me an example of machine learning mathematics?")
+        print("Response 2:", response2)
+        response3 = await handler.give_response("Can you tell me more examples? And also tell me which country I am from.")
+        print("Response 3:", response3)
+        summary = await handler.summarize_conversation()
+        print("\nConversation Summary:", summary)
+
+    asyncio.run(main())
src/utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
+"""
+Created By: ishwor subedi
+Date: 2024-08-10
+"""