"""FastAPI service: chat completion through Groq, spoken back with edge-tts.

Routes:
    GET  /           health-check style greeting.
    POST /chat       run a chat completion and synthesize the reply to audio.
    GET  /file/      download an audio file produced by /chat.
    GET  /file-vtt/  download the subtitle file produced by /chat.

Configuration:
    GROQ_API_KEY     environment variable holding the Groq API key.
"""
import asyncio
import contextlib
import os
import tempfile
from pathlib import Path
from typing import List, Tuple

import edge_tts
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from groq import Groq
from huggingface_hub import InferenceClient  # noqa: F401  (unused here; kept in case other code relies on it)
from pydantic import BaseModel

app = FastAPI()

# SECURITY: the API key used to be hard-coded in this file. Any key that was
# ever committed must be treated as leaked and rotated. The key is now taken
# from the environment only.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Voice passed to edge-tts for every /chat reply.
DEFAULT_VOICE = "en-GB-MaisieNeural - en-GB (Female)"

# Every generated audio/subtitle file lives in this one directory so that the
# download routes can refuse to serve anything else. It sits under the system
# temp dir with a fixed name so that several worker processes share it.
OUTPUT_DIR = Path(tempfile.gettempdir()).resolve() / "tts_outputs"
OUTPUT_DIR.mkdir(mode=0o700, parents=True, exist_ok=True)


async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with edge-tts and write audio plus subtitles to disk.

    Args:
        text: The text to speak.
        voice: Voice label; only the part before the first " - " is passed
            to edge-tts as the voice name.
        rate: Integer speaking-rate offset, formatted as a signed percentage.
        pitch: Integer pitch offset, formatted as a signed Hz value.

    Returns:
        A ``(audio_path, subtitles_path, warning)`` tuple. ``warning`` is
        always ``None``. Both files are created inside ``OUTPUT_DIR`` and are
        NOT deleted by this function.

    Raises:
        Whatever edge-tts or the file system raises; files created so far are
        removed before the exception propagates.
    """
    voice_short_name = voice.split(" - ")[0]
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )
    submaker = edge_tts.SubMaker()

    audio_path = None
    subs_path = None
    try:
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".mp3", dir=OUTPUT_DIR
        ) as audio_file:
            audio_path = audio_file.name
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    submaker.create_sub(
                        (chunk["offset"], chunk["duration"]), chunk["text"]
                    )

        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".txt",
            mode="w+",
            encoding="utf-8",
            dir=OUTPUT_DIR,
        ) as subs_file:
            subs_path = subs_file.name
            subs_file.write(submaker.generate_subs())
    except BaseException:
        # delete=False means nothing removes these files for us; do not leave
        # a truncated file behind when synthesis fails or is cancelled.
        for leftover in (audio_path, subs_path):
            if leftover is not None:
                with contextlib.suppress(OSError):
                    os.unlink(leftover)
        raise

    return audio_path, subs_path, None


def tts_interface(text, voice, rate, pitch):
    """Blocking wrapper around :func:`text_to_speech`.

    Uses ``asyncio.run``, so it must be called from a thread that has no
    running event loop.

    Returns:
        The ``(audio_path, subtitles_path, warning)`` tuple produced by
        :func:`text_to_speech`.
    """
    audio, vtt, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    return audio, vtt, warning


@app.get("/")
def greet_json():
    """Return a fixed greeting payload."""
    return {"Hello": "World!"}


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /chat``."""

    # The new user message to answer.
    message: str
    # Earlier turns as (user_text, assistant_text) pairs; empty entries are skipped.
    history: List[Tuple[str, str]] = []
    # System prompt placed first in the conversation.
    system_message: str
    # Sampling parameters forwarded to the chat-completion call.
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95


def _resolve_generated_file(path: str, suffix: str) -> Path:
    """Return *path* as a resolved ``Path`` if it is a file this service made.

    The path is client-supplied, so it is accepted only when it resolves to a
    regular file located directly inside ``OUTPUT_DIR`` and carrying *suffix*.

    Raises:
        HTTPException: 404 for anything else. The same response is used for
            "missing" and "not allowed" so the route cannot be used to probe
            the file system.
    """
    not_found = HTTPException(status_code=404, detail="File not found")
    try:
        candidate = Path(path).resolve()
    except (OSError, ValueError, RuntimeError):
        # e.g. embedded NUL byte or a symlink loop
        raise not_found from None
    if (
        candidate.parent != OUTPUT_DIR
        or candidate.suffix != suffix
        or not candidate.is_file()
    ):
        raise not_found
    return candidate


@app.get("/file/")
def file(path: str):
    """Serve an audio file previously returned by ``/chat``.

    Args:
        path: The ``audio`` value from a ``/chat`` response.

    Raises:
        HTTPException: 404 if *path* is not a generated ``.mp3`` file.
    """
    audio_file = _resolve_generated_file(path, ".mp3")
    return FileResponse(audio_file, media_type="audio/mpeg", filename="audio.mp3")


@app.get("/file-vtt/")
def fileVtt(path: str):
    """Serve a subtitle file previously returned by ``/chat``.

    Args:
        path: The ``vtt`` value from a ``/chat`` response.

    Raises:
        HTTPException: 404 if *path* is not a generated ``.txt`` file.
    """
    subtitle_file = _resolve_generated_file(path, ".txt")
    return FileResponse(subtitle_file)


@app.post("/chat")
def chat(request: ChatRequest):
    """Answer a chat message and synthesize the answer to speech.

    Builds the conversation from the system message, the supplied history and
    the new user message, requests a completion from Groq, strips ``**``
    markers from the reply and converts it to audio with subtitles.

    Returns:
        A dict with ``text`` (the cleaned reply), ``audio`` (path of the
        audio file) and ``vtt`` (path of the subtitle file).

    Raises:
        HTTPException: 500 if the completion or the synthesis fails, or if
            no audio file exists afterwards.
    """
    messages = [{"role": "system", "content": request.system_message}]
    for user_turn, assistant_turn in request.history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": request.message})

    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            max_tokens=request.max_tokens,
            stream=False,
            stop=None,
            temperature=request.temperature,
            top_p=request.top_p,
        )
        # Clean the reply once and use the same text for speech and response.
        reply_text = response.choices[0].message.content.replace("**", "")
        audio_path, vtt_path, _warning = tts_interface(
            reply_text, DEFAULT_VOICE, 0, 0
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Previously a missing file fell through and the route answered 200 with
    # a null body; report it as the server error it is.
    if not os.path.exists(audio_path):
        raise HTTPException(
            status_code=500, detail="Speech synthesis produced no audio file"
        )

    return {
        "text": reply_text,
        "audio": audio_path,
        "vtt": vtt_path,
    }