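"""Console voice assistant: streams microphone audio to Deepgram for live
speech-to-text, feeds each finished sentence to a Groq-hosted LLM through
LangChain, and plays the reply back with Deepgram text-to-speech via ffplay."""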
import asyncio
import os
import shutil
import subprocess
import time

import requests
from dotenv import load_dotenv

from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_groq import ChatGroq

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
)
# Load API keys (e.g. GROQ_API_KEY, DEEPGRAM_API_KEY) from a local .env file
load_dotenv()


class LanguageModelProcessor:
    def __init__(self, groq_api_key):
        self.llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768", groq_api_key=groq_api_key)
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Load the system prompt from a file
        with open('system_prompt.txt', 'r') as file:
            system_prompt = file.read().strip()

        self.prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            HumanMessagePromptTemplate.from_template("{text}")
        ])

        self.conversation = LLMChain(
            llm=self.llm,
            prompt=self.prompt,
            memory=self.memory
        )
    def process(self, text):
        self.memory.chat_memory.add_user_message(text)  # Add user message to memory

        start_time = time.time()

        # Go get the response from the LLM
        response = self.conversation.invoke({"text": text})
        end_time = time.time()

        self.memory.chat_memory.add_ai_message(response['text'])  # Add AI response to memory

        elapsed_time = int((end_time - start_time) * 1000)
        print(f"LLM ({elapsed_time}ms): {response['text']}")
        return response['text']
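
# Usage sketch (illustrative, not part of the runtime flow; assumes GROQ_API_KEY
# is set in .env and system_prompt.txt exists next to this script):
#   llm = LanguageModelProcessor(os.getenv("GROQ_API_KEY"))
#   print(llm.process("Hello!"))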
class TextToSpeech:
    def __init__(self, deepgram_api_key):
        # Set your Deepgram API key and desired voice model
        self.DG_API_KEY = deepgram_api_key
        self.MODEL_NAME = "aura-helios-en"  # Example model name, change as needed

    @staticmethod
    def is_installed(lib_name: str) -> bool:
        lib = shutil.which(lib_name)
        return lib is not None
    def speak(self, text):
        if not self.is_installed("ffplay"):
            raise ValueError("ffplay not found, necessary to stream audio.")

        DEEPGRAM_URL = f"https://api.deepgram.com/v1/speak?model={self.MODEL_NAME}&performance=some&encoding=linear16&sample_rate=24000"
        headers = {
            "Authorization": f"Token {self.DG_API_KEY}",
            "Content-Type": "application/json"
        }
        payload = {
            "text": text
        }

        # Pipe the raw linear16 audio straight into ffplay for playback
        player_command = ["ffplay", "-autoexit", "-", "-nodisp"]
        player_process = subprocess.Popen(
            player_command,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        start_time = time.time()  # Record the time before sending the request
        first_byte_time = None  # Time when the first audio byte is received

        with requests.post(DEEPGRAM_URL, stream=True, headers=headers, json=payload) as r:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    if first_byte_time is None:  # This is the first chunk received
                        first_byte_time = time.time()
                        ttfb = int((first_byte_time - start_time) * 1000)  # Time to first byte
                        print(f"TTS Time to First Byte (TTFB): {ttfb}ms\n")
                    player_process.stdin.write(chunk)
                    player_process.stdin.flush()

        if player_process.stdin:
            player_process.stdin.close()
        player_process.wait()
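
# Usage sketch (illustrative; assumes DEEPGRAM_API_KEY is set in .env and
# ffplay, which ships with FFmpeg, is on the PATH):
#   tts = TextToSpeech(os.getenv("DEEPGRAM_API_KEY"))
#   tts.speak("Hello there!")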
class TranscriptCollector:
    def __init__(self):
        self.reset()

    def reset(self):
        self.transcript_parts = []

    def add_part(self, part):
        self.transcript_parts.append(part)

    def get_full_transcript(self):
        return ' '.join(self.transcript_parts)


transcript_collector = TranscriptCollector()
async def get_transcript(callback):
    transcription_complete = asyncio.Event()  # Event to signal transcription completion

    try:
        # Example of setting up a client config. Logging values: WARNING, VERBOSE, DEBUG, SPAM
        config = DeepgramClientOptions(options={"keepalive": "true"})
        # An empty key string lets the SDK fall back to the DEEPGRAM_API_KEY environment variable
        deepgram: DeepgramClient = DeepgramClient("", config)

        dg_connection = deepgram.listen.asynclive.v("1")
        print("Listening...")

        async def on_message(self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript

            if not result.speech_final:
                transcript_collector.add_part(sentence)
            else:
                # This is the final part of the current sentence
                transcript_collector.add_part(sentence)
                full_sentence = transcript_collector.get_full_transcript()
                # Check if the full_sentence is not empty before printing
                if len(full_sentence.strip()) > 0:
                    full_sentence = full_sentence.strip()
                    print(f"Human: {full_sentence}")
                    callback(full_sentence)  # Call the callback with the full_sentence
                    transcript_collector.reset()
                    transcription_complete.set()  # Signal to stop transcription and exit

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)

        options = LiveOptions(
            model="nova-2",
            punctuate=True,
            language="en-US",
            encoding="linear16",
            channels=1,
            sample_rate=16000,
            endpointing=300,
            smart_format=True,
        )

        await dg_connection.start(options)

        # Open a microphone stream on the default input device
        microphone = Microphone(dg_connection.send)
        microphone.start()

        await transcription_complete.wait()  # Wait for the transcription to complete instead of looping indefinitely

        # Wait for the microphone to close
        microphone.finish()

        # Indicate that we've finished
        await dg_connection.finish()

    except Exception as e:
        print(f"Could not open socket: {e}")
        return
class ConversationManager:
    def __init__(self):
        self.transcription_response = ""
        # Assumes GROQ_API_KEY is present in the environment (loaded via .env above)
        self.llm = LanguageModelProcessor(os.getenv("GROQ_API_KEY"))

    async def main(self):
        def handle_full_sentence(full_sentence):
            self.transcription_response = full_sentence

        # Loop indefinitely until "goodbye" is detected
        while True:
            await get_transcript(handle_full_sentence)

            # Check for "goodbye" to exit the loop
            if "goodbye" in self.transcription_response.lower():
                break

            llm_response = self.llm.process(self.transcription_response)

            # Assumes DEEPGRAM_API_KEY is present in the environment
            tts = TextToSpeech(os.getenv("DEEPGRAM_API_KEY"))
            tts.speak(llm_response)

            # Reset transcription_response for the next loop iteration
            self.transcription_response = ""
if __name__ == "__main__":
manager = ConversationManager()
asyncio.run(manager.main()) |
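
# Prerequisites (a sketch of what this script expects; adjust names to your setup):
#   - a .env file with GROQ_API_KEY and DEEPGRAM_API_KEY
#   - a system_prompt.txt file next to this script
#   - ffplay on the PATH (part of FFmpeg)
#   - pip install deepgram-sdk langchain langchain-groq python-dotenv requests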