# SmartQuery / app.py
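# Run with: `chainlit run app.py`. Requires a .env providing OPENAI_API_KEY,
# ELEVENLABS_API_KEY, and ELEVENLABS_VOICE_ID.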
from io import BytesIO
import os
import chainlit as cl
import httpx
from dotenv import load_dotenv
from langchain.schema.runnable.config import RunnableConfig
from sql_agent import SQLAgent
from openai import AsyncOpenAI
from chainlit.element import Audio
# Load the .env file
load_dotenv()
# ElevenLabs credentials for text-to-speech (transcription itself uses OpenAI Whisper below)
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
ELEVENLABS_VOICE_ID = os.environ.get("ELEVENLABS_VOICE_ID")
if not ELEVENLABS_API_KEY or not ELEVENLABS_VOICE_ID:
raise ValueError("ELEVENLABS_API_KEY and ELEVENLABS_VOICE_ID must be set")
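# AsyncOpenAI reads OPENAI_API_KEY from the environment (loaded from .env above).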
client = AsyncOpenAI()
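# Transcribe recorded audio with OpenAI Whisper. `audio_file` is a
# (filename, bytes, mime_type) tuple, a form the OpenAI SDK accepts for uploads.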
@cl.step(type="tool")
async def speech_to_text(audio_file):
response = await client.audio.transcriptions.create(
model="whisper-1", file=audio_file
)
return response.text
@cl.step(type="tool")
async def generate_text_answer(transcription, images):
model = "gpt-4-turbo"
messages = [{"role": "user", "content": transcription}]
response = await client.chat.completions.create(
messages=messages, model=model, temperature=0.3
)
return response.choices[0].message.content
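# Synthesize speech from text via the ElevenLabs HTTP API and return a
# (filename, audio_bytes) pair for attaching as a Chainlit Audio element.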
@cl.step(type="tool")
async def text_to_speech(text: str, mime_type: str):
CHUNK_SIZE = 1024
url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"
headers = {
"Accept": mime_type,
"Content-Type": "application/json",
"xi-api-key": ELEVENLABS_API_KEY
}
data = {
"text": text,
"model_id": "eleven_monolingual_v1",
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.5
}
}
    # Named http_client so it does not shadow the global AsyncOpenAI client.
    async with httpx.AsyncClient(timeout=25.0) as http_client:
        response = await http_client.post(url, json=data, headers=headers)
        response.raise_for_status()  # Surface 4xx/5xx errors immediately
        buffer = BytesIO()
        buffer.name = f"output_audio.{mime_type.split('/')[1]}"
        async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)
        buffer.seek(0)
        return buffer.name, buffer.read()
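# SQLAgent is assumed to be a ready-to-use LangChain runnable exported by
# sql_agent (it is called with .ainvoke below); instantiate it here instead
# if sql_agent exports a class.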
@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("agent", SQLAgent)
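# Text messages skip transcription and go straight to the agent.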
@cl.on_message
async def on_message(message: cl.Message):
await process_message(message.content)
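# Accumulate streamed microphone chunks into an in-memory buffer.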
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
if chunk.isStart:
buffer = BytesIO()
# This is required for whisper to recognize the file type
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
# Initialize the session for a new audio stream
cl.user_session.set("audio_buffer", buffer)
cl.user_session.set("audio_mime_type", chunk.mimeType)
cl.user_session.get("audio_buffer").write(chunk.data)
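# When recording ends: echo the user's audio back, transcribe it, then answer
# with both text and synthesized speech.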
@cl.on_audio_end
async def on_audio_end(elements: list[Audio]):
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
audio_buffer.seek(0)
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
input_audio_el = Audio(
mime=audio_mime_type, content=audio_file, name=audio_buffer.name
)
await cl.Message(
author="You",
type="user_message",
content="",
elements=[input_audio_el, *elements]
).send()
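    # Placeholder message that process_message fills with the spoken answer.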
answer_message = await cl.Message(content="").send()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
await process_message(transcription, answer_message, audio_mime_type)
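# Shared handler for text and voice turns: run the SQL agent and reply in
# kind (plain text, or text plus audio when a mime type is given).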
async def process_message(content: str, answer_message=None, mime_type=None):
    agent = cl.user_session.get("agent")
    cb = cl.AsyncLangchainCallbackHandler(stream_final_answer=True)
    config = RunnableConfig(callbacks=[cb])
    async with cl.Step(name="SmartQuery Agent", root=True) as step:
        step.input = content
        result = await agent.ainvoke(content, config=config)
        final_answer = result.get("output", "No answer returned")
        await step.stream_token(final_answer)
        if mime_type:
            # Voice flow: fill the placeholder message with the answer text
            # and an auto-playing audio element.
            output_name, output_audio = await text_to_speech(final_answer, mime_type)
            output_audio_el = Audio(
                name=output_name,
                auto_play=True,
                mime=mime_type,
                content=output_audio,
            )
            answer_message.content = final_answer
            answer_message.elements = [output_audio_el]
            await answer_message.update()
        else:
            await cl.Message(content=final_answer).send()