from fastapi import FastAPI, UploadFile, File, Response, Request
from fastapi.staticfiles import StaticFiles
import ggwave
import scipy.io.wavfile as wav
import numpy as np
import os
from pydantic import BaseModel
from groq import Groq
import io
import wave

app = FastAPI()

# Serve the chat UI assets; expects a static/ directory (containing index.html) next to this file.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Shared ggwave instance used to decode incoming audio payloads.
instance = ggwave.init()

# Groq client; reads the API key from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

class TextInput(BaseModel):
    text: str


@app.get("/")
async def serve_homepage():
    """Serve the chat interface HTML."""
    with open("static/index.html", "r") as f:
        return Response(content=f.read(), media_type="text/html")
@app.post("/stt/") |
|
async def speech_to_text(file: UploadFile = File(...)): |
|
"""Convert WAV audio file to text using ggwave.""" |
|
with open("temp.wav", "wb") as audio_file: |
|
audio_file.write(await file.read()) |
|
|
|
|
|
fs, recorded_waveform = wav.read("temp.wav") |
|
os.remove("temp.wav") |
|
|
|
|
|
waveform_bytes = recorded_waveform.astype(np.uint8).tobytes() |
|
decoded_message = ggwave.decode(instance, waveform_bytes) |
|
|
|
return {"text": decoded_message} |
|
|
|
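
# Example call to /stt/ (a sketch; assumes the server runs on localhost:8000 and that
# recording.wav is a hypothetical ggwave-encoded recording):
#
#   curl -F "file=@recording.wav" http://localhost:8000/stt/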
@app.post("/tts/") |
|
def text_to_speech(input_text: TextInput): |
|
"""Convert text to a WAV audio file using ggwave and return as response.""" |
|
encoded_waveform = ggwave.encode(instance, input_text.text.encode('utf-8'), protocolId=1, volume=100) |
|
|
|
|
|
waveform_float32 = np.frombuffer(encoded_waveform, dtype=np.float32) |
|
|
|
|
|
waveform_int16 = np.int16(waveform_float32 * 32767) |
|
|
|
|
|
buffer = io.BytesIO() |
|
with wave.open(buffer, "wb") as wf: |
|
wf.setnchannels(1) |
|
wf.setsampwidth(2) |
|
wf.setframerate(48000) |
|
wf.writeframes(waveform_int16.tobytes()) |
|
|
|
buffer.seek(0) |
|
return Response(content=buffer.getvalue(), media_type="audio/wav") |
|
|
|
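
# Example call to /tts/ (a sketch; assumes the server runs on localhost:8000):
#
#   curl -X POST http://localhost:8000/tts/ \
#        -H "Content-Type: application/json" \
#        -d '{"text": "hello"}' \
#        --output hello.wav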
@app.post("/chat/") |
|
async def chat_with_llm(file: UploadFile = File(...)): |
|
"""Process input WAV, send text to LLM, and return generated response as WAV.""" |
|
with open("input_chat.wav", "wb") as audio_file: |
|
audio_file.write(await file.read()) |
|
|
|
|
|
fs, recorded_waveform = wav.read("input_chat.wav") |
|
os.remove("input_chat.wav") |
|
|
|
|
|
waveform_bytes = recorded_waveform.astype(np.uint8).tobytes() |
|
user_message = ggwave.decode(instance, waveform_bytes) |
|
|
|
|
|
chat_completion = client.chat.completions.create( |
|
messages=[{"role": "user", "content": user_message}], |
|
model="llama-3.3-70b-versatile", |
|
) |
|
llm_response = chat_completion.choices[0].message.content |
|
|
|
|
|
response_waveform = ggwave.encode(instance, llm_response) |
|
buffer = io.BytesIO() |
|
wav.write(buffer, 44100, np.frombuffer(response_waveform, dtype=np.uint8)) |
|
buffer.seek(0) |
|
|
|
return Response(content=buffer.getvalue(), media_type="audio/wav", headers={ |
|
"X-User-Message": user_message, |
|
"X-LLM-Response": llm_response |
|
}) |
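
# To run the app locally (a sketch; assumes this file is saved as main.py):
#
#   uvicorn main:app --reload
#
# Example call to /chat/ (question.wav is a hypothetical ggwave-encoded question;
# the reply audio is written to reply.wav):
#
#   curl -F "file=@question.wav" http://localhost:8000/chat/ --output reply.wav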