from fastapi import FastAPI, UploadFile, File, Response, Request
from fastapi.staticfiles import StaticFiles
import ggwave
import scipy.io.wavfile as wav
import numpy as np
import os
import io
import wave
from pydantic import BaseModel
from groq import Groq

app = FastAPI()

# Serve the chat UI's static assets.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Single ggwave encoder/decoder instance shared by all endpoints.
instance = ggwave.init()

# Groq LLM client; GROQ_API_KEY must be set in the environment.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# ggwave produces/consumes audio at this fixed sample rate (Hz).
GGWAVE_SAMPLE_RATE = 48000


class TextInput(BaseModel):
    # Request body for /tts/: the text to encode as audio.
    text: str


def _decode_wav_bytes(data: bytes):
    """Decode a ggwave payload from raw WAV file bytes.

    Reads the WAV from memory (no fixed-name temp file, so concurrent
    requests cannot clobber each other and nothing leaks on error),
    normalizes int16 PCM to float32 in [-1, 1] as ggwave.decode expects
    (matching the conversion /chat/ already used), and returns the
    decoded payload as bytes, or None when no message is found.
    """
    _fs, samples = wav.read(io.BytesIO(data))
    if samples.dtype != np.float32:
        # int16 PCM must be rescaled to float32 before decoding.
        samples = samples.astype(np.float32) / 32767.0
    return ggwave.decode(instance, samples.tobytes())


def _text_to_wav_bytes(text: str) -> bytes:
    """Encode *text* with ggwave and return a complete WAV file as bytes.

    Shared by /tts/ and /chat/ (the original duplicated this verbatim).
    """
    encoded_waveform = ggwave.encode(text, protocolId=1, volume=100)
    # ggwave emits raw float32 samples; convert to 16-bit PCM for WAV.
    waveform_float32 = np.frombuffer(encoded_waveform, dtype=np.float32)
    waveform_int16 = np.int16(waveform_float32 * 32767)
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)                    # mono audio
        wf.setsampwidth(2)                    # 2 bytes/sample (16-bit PCM)
        wf.setframerate(GGWAVE_SAMPLE_RATE)
        wf.writeframes(waveform_int16.tobytes())
    return buffer.getvalue()


@app.get("/")
async def serve_homepage():
    """Serve the chat interface HTML."""
    with open("static/index.html", "r") as f:
        return Response(content=f.read(), media_type="text/html")


@app.post("/stt/")
async def speech_to_text(file: UploadFile = File(...)):
    """Convert an uploaded ggwave WAV file to text.

    Fixed: the original passed uint8-cast samples to ggwave.decode,
    which cannot yield valid float32 audio; decoding now uses the same
    int16 -> float32 normalization as /chat/.
    """
    decoded = _decode_wav_bytes(await file.read())
    # ggwave.decode returns None when no payload is present.
    return {"text": decoded.decode("utf-8") if decoded is not None else None}


@app.post("/tts/")
def text_to_speech(input_text: TextInput):
    """Convert text to a WAV audio file using ggwave and return as response."""
    return Response(
        content=_text_to_wav_bytes(input_text.text),
        media_type="audio/wav",
    )


@app.post("/chat/")
async def chat_with_llm(file: UploadFile = File(...)):
    """Process input WAV, send text to LLM, and return generated response as WAV."""
    decoded = _decode_wav_bytes(await file.read())
    if decoded is None:
        # Original crashed here (AttributeError on None.decode); fail clearly.
        return Response(
            content="Could not decode a ggwave message from the audio.",
            status_code=422,
            media_type="text/plain",
        )
    user_message = decoded.decode("utf-8")
    print("user_message" + user_message)

    # Forward the decoded message to the LLM (system prompt kept verbatim).
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "you are a helpful assistant. answer alway in one sentence",
            },
            {"role": "user", "content": user_message},
        ],
        model="llama-3.3-70b-versatile",
    )
    llm_response = chat_completion.choices[0].message.content
    print(llm_response)

    # NOTE(review): HTTP header values must be latin-1 encodable; non-ASCII
    # text in these headers can raise at send time — consider moving them
    # into the response body or percent-encoding. Behavior kept as-is.
    return Response(
        content=_text_to_wav_bytes(llm_response),
        media_type="audio/wav",
        headers={
            "X-User-Message": user_message,
            "X-LLM-Response": llm_response,
        },
    )