Spaces:

yasserrmd
/

ggwave

Sleeping

App Files Files Community

ggwave / app.py

yasserrmd

Update app.py

dad5570 verified 3 months ago

raw

history blame

5.17 kB

	from fastapi import FastAPI, UploadFile, File, Response, Request
	from fastapi.staticfiles import StaticFiles
	import ggwave
	import scipy.io.wavfile as wav
	import numpy as np
	import os
	from pydantic import BaseModel
	from groq import Groq
	import io
	import wave

	# FastAPI application; everything in the local "static" directory is
	# exposed under the /static URL prefix.
	app = FastAPI()
	app.mount("/static", StaticFiles(directory="static"), name="static")

	# Module-level ggwave instance (the /stt/ endpoint decodes with it).
	instance = ggwave.init()

	# Groq API client; the key is taken from the GROQ_API_KEY environment
	# variable (None is passed through when the variable is unset).
	client = Groq(api_key=os.getenv("GROQ_API_KEY"))

	class TextInput(BaseModel):
	    """JSON request body accepted by the /tts/ endpoint."""

	    # The text that will be encoded into audio.
	    text: str

	@app.get("/")
	async def serve_homepage():
	    """Serve the chat interface HTML.

	    Returns:
	        A ``text/html`` response whose body is the content of
	        ``static/index.html``.

	    Raises:
	        OSError: If ``static/index.html`` cannot be opened or read.
	    """
	    # Read the file as bytes instead of text: text mode decodes with the
	    # platform's default encoding, which can mangle (or fail on) non-ASCII
	    # characters before the response re-encodes the string. Passing the
	    # raw bytes through avoids that locale-dependent round trip.
	    with open("static/index.html", "rb") as f:
	        return Response(content=f.read(), media_type="text/html")

	@app.post("/stt/")
	async def speech_to_text(file: UploadFile = File(...)):
	    """Convert WAV audio file to text using ggwave.

	    Args:
	        file: Uploaded WAV file. The sample scaling below assumes 16-bit
	            PCM, the same assumption the /chat/ endpoint makes.

	    Returns:
	        ``{"text": <payload>}`` where the payload is whatever
	        ``ggwave.decode`` returned for the audio (``None`` when no message
	        was detected).

	    Raises:
	        ValueError: Propagated from ``scipy.io.wavfile.read`` when the
	            upload is not a readable WAV file.
	    """
	    # Parse the upload in memory. Writing to a fixed "temp.wav" path let
	    # concurrent requests overwrite or delete each other's data (the await
	    # while the file was open is a suspension point) and left the file
	    # behind whenever parsing raised.
	    _sample_rate, recorded_waveform = wav.read(io.BytesIO(await file.read()))

	    # Casting to uint8 kept only the low 8 bits of every sample, which
	    # destroys the signal. Feed ggwave normalized float32 samples instead,
	    # i.e. the same conversion the /chat/ endpoint applies.
	    waveform_bytes = (recorded_waveform.astype(np.float32) / 32767.0).tobytes()
	    decoded_message = ggwave.decode(instance, waveform_bytes)

	    return {"text": decoded_message}

	@app.post("/tts/")
	def text_to_speech(input_text: TextInput):
	    """Encode the submitted text with ggwave and reply with it as WAV audio.

	    Args:
	        input_text: Request body carrying the text to encode.

	    Returns:
	        An ``audio/wav`` response: mono, 16-bit PCM, 48000 Hz.
	    """
	    # ggwave.encode yields raw bytes; reinterpret them as float32 samples
	    # and scale them into the int16 range for PCM output.
	    float_samples = np.frombuffer(
	        ggwave.encode(input_text.text, protocolId=1, volume=100),
	        dtype=np.float32,
	    )
	    pcm_frames = np.int16(float_samples * 32767).tobytes()

	    # Assemble the WAV container in memory rather than on disk.
	    wav_out = io.BytesIO()
	    with wave.open(wav_out, "wb") as writer:
	        writer.setnchannels(1)      # mono
	        writer.setsampwidth(2)      # 16-bit PCM -> 2 bytes per sample
	        writer.setframerate(48000)  # sample rate in Hz
	        writer.writeframes(pcm_frames)

	    return Response(content=wav_out.getvalue(), media_type="audio/wav")

	def _header_safe(value: str) -> str:
	    """Return *value* in a form that can be sent as an HTTP header value."""
	    # Header values must be a single line and latin-1 encodable. Values that
	    # already satisfy both constraints come back unchanged; line breaks
	    # become spaces and unencodable characters become "?".
	    single_line = value.replace("\r", " ").replace("\n", " ")
	    return single_line.encode("latin-1", "replace").decode("latin-1")


	def _ggwave_wav_bytes(text: str) -> bytes:
	    """Encode *text* with ggwave and wrap it as mono 16-bit 48000 Hz WAV."""
	    encoded_waveform = ggwave.encode(text, protocolId=1, volume=100)

	    # Reinterpret the encoder output as float32 samples and scale them to
	    # the int16 range for PCM output.
	    waveform_float32 = np.frombuffer(encoded_waveform, dtype=np.float32)
	    waveform_int16 = np.int16(waveform_float32 * 32767)

	    # Build the WAV container in memory instead of on disk.
	    wav_buffer = io.BytesIO()
	    with wave.open(wav_buffer, "wb") as wf:
	        wf.setnchannels(1)      # Mono audio
	        wf.setsampwidth(2)      # 2 bytes per sample (16-bit PCM)
	        wf.setframerate(48000)  # Sample rate
	        wf.writeframes(waveform_int16.tobytes())
	    return wav_buffer.getvalue()


	@app.post("/chat/")
	async def chat_with_llm(file: UploadFile = File(...)):
	    """Process input WAV, send text to LLM, and return generated response as WAV.

	    Args:
	        file: Uploaded WAV file carrying a ggwave-encoded message. The
	            sample scaling assumes 16-bit PCM.

	    Returns:
	        * 200 ``audio/wav``: the LLM reply encoded with ggwave. The decoded
	          user message and the reply text are also exposed through the
	          ``X-User-Message`` and ``X-LLM-Response`` headers (line breaks
	          replaced by spaces, non-latin-1 characters replaced by "?").
	        * 400 ``text/plain``: no message could be decoded from the audio.
	        * 500 ``text/plain``: any other failure while processing.
	    """
	    # Read the file content into memory without saving to disk
	    file_content = await file.read()

	    # Per-request ggwave instance; released in the finally block below so
	    # it is not leaked on any exit path.
	    instance = ggwave.init()
	    try:
	        _sample_rate, recorded_waveform = wav.read(io.BytesIO(file_content))
	        samples = recorded_waveform.astype(np.float32) / 32767.0
	        raw_message = ggwave.decode(instance, samples.tobytes())

	        if raw_message is None:
	            return Response(
	                content="No message detected in audio",
	                media_type="text/plain",
	                status_code=400,
	            )

	        # Decode the payload once and reuse the text everywhere below.
	        user_message = raw_message.decode("utf-8")
	        print("user_message: " + user_message)

	        # Send to LLM
	        chat_completion = client.chat.completions.create(
	            messages=[
	                {"role": "system", "content": "you are a helpful assistant. answer always in one sentence"},
	                {"role": "user", "content": user_message},
	            ],
	            model="llama-3.3-70b-versatile",
	        )

	        llm_response = chat_completion.choices[0].message.content
	        print(llm_response)

	        # Convert response to audio
	        wav_bytes = _ggwave_wav_bytes(llm_response)

	        return Response(
	            content=wav_bytes,
	            media_type="audio/wav",
	            headers={
	                # Free-form text is not guaranteed to be a legal header
	                # value; sanitize it so building the response cannot fail.
	                "X-User-Message": _header_safe(user_message),
	                "X-LLM-Response": _header_safe(llm_response),
	            },
	        )

	    except Exception as e:
	        # Top-level boundary for this endpoint: report the failure to the
	        # client instead of letting it propagate.
	        print(f"Error processing audio: {str(e)}")
	        return Response(
	            content=f"Error processing audio: {str(e)}",
	            media_type="text/plain",
	            status_code=500,
	        )
	    finally:
	        ggwave.free(instance)