# SmartQuery / app.py
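# Run with: `chainlit run app.py`. Requires a .env providing OPENAI_API_KEY,
# ELEVENLABS_API_KEY, and ELEVENLABS_VOICE_ID.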
from io import BytesIO
import os
import chainlit as cl
import httpx
from dotenv import load_dotenv
from langchain.schema.runnable.config import RunnableConfig
from sql_agent import SQLAgent
from openai import AsyncOpenAI
from chainlit.element import Audio
# Load the .env file
load_dotenv()
# ElevenLabs credentials for text-to-speech (transcription itself uses OpenAI Whisper below)
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
ELEVENLABS_VOICE_ID = os.environ.get("ELEVENLABS_VOICE_ID")
if not ELEVENLABS_API_KEY or not ELEVENLABS_VOICE_ID:
raise ValueError("ELEVENLABS_API_KEY and ELEVENLABS_VOICE_ID must be set")
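# AsyncOpenAI reads OPENAI_API_KEY from the environment (loaded from .env above).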
client = AsyncOpenAI()
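# Transcribe recorded audio with OpenAI Whisper. `audio_file` is a
# (filename, bytes, mime_type) tuple, a form the OpenAI SDK accepts for uploads.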
@cl.step(type="tool")
async def speech_to_text(audio_file):
response = await client.audio.transcriptions.create(
model="whisper-1", file=audio_file
)
return response.text
@cl.step(type="tool")
async def generate_text_answer(transcription, images):
model = "gpt-4-turbo"
messages = [{"role": "user", "content": transcription}]
response = await client.chat.completions.create(
messages=messages, model=model, temperature=0.3
)
return response.choices[0].message.content
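# Synthesize speech from text via the ElevenLabs HTTP API and return a
# (filename, audio_bytes) pair for attaching as a Chainlit Audio element.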
@cl.step(type="tool")
async def text_to_speech(text: str, mime_type: str):
CHUNK_SIZE = 1024
url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"
headers = {
"Accept": mime_type,
"Content-Type": "application/json",
"xi-api-key": ELEVENLABS_API_KEY
}
data = {
"text": text,
"model_id": "eleven_monolingual_v1",
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.5
}
}
    # Named http_client so it does not shadow the global AsyncOpenAI client.
    async with httpx.AsyncClient(timeout=25.0) as http_client:
        response = await http_client.post(url, json=data, headers=headers)
        response.raise_for_status()  # Surface 4xx/5xx errors immediately
        buffer = BytesIO()
        buffer.name = f"output_audio.{mime_type.split('/')[1]}"
        async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)
        buffer.seek(0)
        return buffer.name, buffer.read()
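# SQLAgent is assumed to be a ready-to-use LangChain runnable exported by
# sql_agent (it is called with .ainvoke below); instantiate it here instead
# if sql_agent exports a class.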
@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("agent", SQLAgent)
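# Text messages skip transcription and go straight to the agent.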
@cl.on_message
async def on_message(message: cl.Message):
await process_message(message.content)
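# Accumulate streamed microphone chunks into an in-memory buffer.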
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
if chunk.isStart:
buffer = BytesIO()
# This is required for whisper to recognize the file type
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
# Initialize the session for a new audio stream
cl.user_session.set("audio_buffer", buffer)
cl.user_session.set("audio_mime_type", chunk.mimeType)
cl.user_session.get("audio_buffer").write(chunk.data)
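# When recording ends: echo the user's audio back, transcribe it, then answer
# with both text and synthesized speech.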
@cl.on_audio_end
async def on_audio_end(elements: list[Audio]):
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
audio_buffer.seek(0)
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
input_audio_el = Audio(
mime=audio_mime_type, content=audio_file, name=audio_buffer.name
)
await cl.Message(
author="You",
type="user_message",
content="",
elements=[input_audio_el, *elements]
).send()
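    # Placeholder message that process_message fills with the spoken answer.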
answer_message = await cl.Message(content="").send()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
await process_message(transcription, answer_message, audio_mime_type)
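# Shared handler for text and voice turns: run the SQL agent and reply in
# kind (plain text, or text plus audio when a mime type is given).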
async def process_message(content: str, answer_message=None, mime_type=None):
    agent = cl.user_session.get("agent")
    cb = cl.AsyncLangchainCallbackHandler(stream_final_answer=True)
    config = RunnableConfig(callbacks=[cb])
    async with cl.Step(name="SmartQuery Agent", root=True) as step:
        step.input = content
        result = await agent.ainvoke(content, config=config)
        final_answer = result.get("output", "No answer returned")
        await step.stream_token(final_answer)
        if mime_type:
            # Voice flow: fill the placeholder message with the answer text
            # and an auto-playing audio element.
            output_name, output_audio = await text_to_speech(final_answer, mime_type)
            output_audio_el = Audio(
                name=output_name,
                auto_play=True,
                mime=mime_type,
                content=output_audio,
            )
            answer_message.content = final_answer
            answer_message.elements = [output_audio_el]
            await answer_message.update()
        else:
            await cl.Message(content=final_answer).send()