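"""FastAPI app that bridges ggwave audio messages and a Groq-hosted LLM.

Endpoints:
  /      -- serves the static chat UI
  /stt/  -- decodes a ggwave-encoded WAV upload into text
  /tts/  -- encodes text into a ggwave WAV
  /chat/ -- decodes a WAV, queries the LLM, and returns the reply as WAV
"""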
from fastapi import FastAPI, UploadFile, File, Response
from fastapi.staticfiles import StaticFiles
import ggwave
import scipy.io.wavfile as wav
import numpy as np
import os
from pydantic import BaseModel
from groq import Groq
import io
import wave

app = FastAPI()

# Serve static files
app.mount("/static", StaticFiles(directory="static"), name="static")

# Initialize a ggwave instance (used by decode; encode is module-level)
instance = ggwave.init()

# Initialize the Groq client (expects GROQ_API_KEY in the environment)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


class TextInput(BaseModel):
    text: str

@app.get("/")
async def serve_homepage():
    """Serve the chat interface HTML."""
    with open("static/index.html", "r") as f:
        return Response(content=f.read(), media_type="text/html")

@app.post("/stt/")
async def speech_to_text(file: UploadFile = File(...)):
    """Convert a WAV audio upload to text using ggwave."""
    with open("temp.wav", "wb") as audio_file:
        audio_file.write(await file.read())

    # Load WAV file
    fs, recorded_waveform = wav.read("temp.wav")
    os.remove("temp.wav")

    # ggwave.decode expects raw float32 samples; assuming 16-bit PCM input,
    # normalize the int16 data to [-1.0, 1.0] before converting to bytes
    waveform_float32 = recorded_waveform.astype(np.float32) / 32768.0
    decoded_message = ggwave.decode(instance, waveform_float32.tobytes())

    # decode() returns bytes on success, or None if nothing was found
    text = decoded_message.decode("utf-8") if decoded_message else None
    return {"text": text}
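
# Example request (assumes the app is served locally on port 8000 via uvicorn):
#   curl -X POST -F "file=@encoded_message.wav" http://localhost:8000/stt/
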
@app.post("/tts/")
def text_to_speech(input_text: TextInput):
    """Convert text to a WAV audio file using ggwave and return it as the response."""
    # ggwave.encode takes the payload directly (no instance argument) and
    # returns raw float32 samples at 48 kHz
    encoded_waveform = ggwave.encode(input_text.text, protocolId=1, volume=100)

    # Convert byte data into a float32 array
    waveform_float32 = np.frombuffer(encoded_waveform, dtype=np.float32)

    # Scale the float32 samples into the int16 range
    waveform_int16 = np.int16(waveform_float32 * 32767)

    # Write a 16-bit mono WAV into an in-memory buffer instead of a file
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)      # Mono audio
        wf.setsampwidth(2)      # 2 bytes per sample (16-bit PCM)
        wf.setframerate(48000)  # ggwave's output sample rate
        wf.writeframes(waveform_int16.tobytes())
    buffer.seek(0)

    return Response(content=buffer.getvalue(), media_type="audio/wav")
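
# Example request (again assuming a local server on port 8000):
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"text": "hello ggwave"}' http://localhost:8000/tts/ --output reply.wav
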
@app.post("/chat/")
async def chat_with_llm(file: UploadFile = File(...)):
    """Decode an input WAV, send the text to the LLM, and return the reply as WAV."""
    with open("input_chat.wav", "wb") as audio_file:
        audio_file.write(await file.read())

    # Load WAV file
    fs, recorded_waveform = wav.read("input_chat.wav")
    os.remove("input_chat.wav")

    # Normalize int16 PCM to float32 in [-1.0, 1.0] before decoding
    waveform_float32 = recorded_waveform.astype(np.float32) / 32768.0
    decoded = ggwave.decode(instance, waveform_float32.tobytes())
    user_message = decoded.decode("utf-8") if decoded else ""

    # Send to LLM
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": user_message}],
        model="llama-3.3-70b-versatile",
    )
    llm_response = chat_completion.choices[0].message.content

    # Encode the reply with ggwave and write it as 16-bit PCM at 48 kHz,
    # matching the /tts/ endpoint
    response_waveform = ggwave.encode(llm_response, protocolId=1, volume=100)
    waveform_int16 = np.int16(np.frombuffer(response_waveform, dtype=np.float32) * 32767)

    buffer = io.BytesIO()
    wav.write(buffer, 48000, waveform_int16)
    buffer.seek(0)

    # Note: HTTP header values must be latin-1 safe; long or non-ASCII LLM
    # replies may not survive in these headers
    return Response(content=buffer.getvalue(), media_type="audio/wav", headers={
        "X-User-Message": user_message,
        "X-LLM-Response": llm_response,
    })
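
# Optional local entry point -- an assumption for local testing; a hosted
# deployment (e.g. Hugging Face Spaces) typically starts the app with its
# own uvicorn command instead.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)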