File size: 3,337 Bytes
917878c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Tuple
from huggingface_hub import InferenceClient
import edge_tts
import tempfile
import asyncio
import os
from fastapi.responses import FileResponse
from groq import Groq

app = FastAPI()

# Groq chat-completions client used by the /chat route.
# The API key is taken from the GROQ_API_KEY environment variable so the
# credential never lives in source control. The key that used to be
# hard-coded here must be considered leaked and should be revoked.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with edge-tts into temporary audio and subtitle files.

    Args:
        text: The text to speak.
        voice: Voice label; only the part before the first ``" - "`` is
            passed to edge-tts as the voice name.
        rate: Integer rate offset, sent as a signed percentage (``+0%``).
        pitch: Integer pitch offset, sent as a signed Hz value (``+0Hz``).

    Returns:
        A tuple ``(audio_path, subtitle_path, None)``. Both files are created
        with ``delete=False``, so they persist after this call and the caller
        is responsible for removing them. The third element is always
        ``None``.

    Raises:
        Whatever edge-tts or the file system raises; any temp file created
        before the failure is removed first so no partial output is left.
    """
    voice_short_name = voice.split(" - ")[0]
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    submaker = edge_tts.SubMaker()

    tmp_path = None
    tmp_vtt_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as audio_file:
            tmp_path = audio_file.name

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    # Word timings feed the subtitle track written below.
                    submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])

        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode='w+', encoding='utf-8') as subs_file:
            tmp_vtt_path = subs_file.name
            subs_file.write(submaker.generate_subs())
    except BaseException:
        # BaseException so that task cancellation also cleans up; the files
        # are already closed here because the `with` blocks have exited.
        for leftover in (tmp_path, tmp_vtt_path):
            if leftover is not None:
                try:
                    os.remove(leftover)
                except OSError:
                    pass
        raise

    return tmp_path, tmp_vtt_path, None

def tts_interface(text, voice, rate, pitch):
    """Blocking wrapper around :func:`text_to_speech`.

    Drives the coroutine to completion with ``asyncio.run`` and hands back
    its ``(audio, vtt, warning)`` triple unchanged.
    """
    synthesis = text_to_speech(text, voice, rate, pitch)
    audio_path, subtitle_path, notice = asyncio.run(synthesis)
    return (audio_path, subtitle_path, notice)

@app.get("/")
def greet_json():
    """Answer the root route with a fixed JSON greeting."""
    payload = {"Hello": "World!"}
    return payload


# Define a model for the incoming request
class ChatRequest(BaseModel):
    # Request body accepted by POST /chat.

    # Latest user message; appended last to the conversation sent to the model.
    message: str
    # Earlier turns as (user_text, assistant_text) pairs; an empty element of
    # a pair is skipped when the conversation is built.
    history: List[Tuple[str, str]] = []
    # Content of the leading "system" message (required, no default).
    system_message: str
    # Forwarded unchanged to the completion call as max_tokens.
    max_tokens: int = 512
    # Forwarded unchanged to the completion call as temperature.
    temperature: float = 0.7
    # Forwarded unchanged to the completion call as top_p.
    top_p: float = 0.95


def _resolve_temp_file(path: str, suffix: str) -> str:
    """Return the real path of *path* if it is a served temp file, else raise 404.

    The path arrives from the query string and is therefore untrusted. It is
    accepted only when, after symlink resolution, it is an existing regular
    file inside the system temp directory whose name ends with *suffix*.
    """
    temp_root = os.path.normcase(os.path.realpath(tempfile.gettempdir()))
    candidate = os.path.normcase(os.path.realpath(path))
    try:
        inside_temp = os.path.commonpath([candidate, temp_root]) == temp_root
    except ValueError:
        # Raised e.g. for paths on different drives: definitely not inside.
        inside_temp = False
    if not (inside_temp and candidate.endswith(suffix) and os.path.isfile(candidate)):
        raise HTTPException(status_code=404, detail="File not found")
    return candidate


@app.get("/file/")
def file(path: str):
    """Download a generated ``.mp3`` temp file as ``audio.mp3``.

    Args:
        path: Audio path as returned in the ``audio`` field of ``/chat``.

    Raises:
        HTTPException: 404 when *path* is not an existing ``.mp3`` file
            inside the temp directory.
    """
    safe_path = _resolve_temp_file(path, ".mp3")
    return FileResponse(safe_path, media_type="audio/mpeg", filename="audio.mp3")

@app.get("/file-vtt/")
def fileVtt(path: str):
    """Download a generated subtitle temp file.

    Args:
        path: Subtitle path as returned in the ``vtt`` field of ``/chat``.

    Raises:
        HTTPException: 404 when *path* is not an existing ``.txt`` file
            inside the temp directory.
    """
    safe_path = _resolve_temp_file(path, ".txt")
    return FileResponse(safe_path)

# Define a route to handle POST requests
@app.post("/chat")
def chat(request: ChatRequest):
    """Generate a chat reply and a spoken version of it.

    Builds the conversation (system message, prior turns, new user message),
    requests a completion, strips Markdown bold markers (``**``) from the
    reply and synthesizes it to speech.

    Args:
        request: Validated request body.

    Returns:
        A dict with ``text`` (the cleaned reply), ``audio`` (server-side path
        of the generated audio file) and ``vtt`` (server-side path of the
        subtitle file).

    Raises:
        HTTPException: 500 when the completion or speech synthesis fails, or
            when synthesis finishes without leaving an audio file on disk.
    """
    messages = [{"role": "system", "content": request.system_message}]

    for user_turn, assistant_turn in request.history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": request.message})

    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            max_tokens=request.max_tokens,
            stream=False,
            stop=None,
            temperature=request.temperature,
            top_p=request.top_p,
        )

        # Computed once so the spoken text and the returned text are identical.
        reply_text = response.choices[0].message.content.replace('**', '')
        audio_path, vtt_path, _ = tts_interface(
            reply_text, 'en-GB-MaisieNeural - en-GB (Female)', 0, 0
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Checked outside the try so this HTTPException is not re-wrapped, and so
    # a missing file is reported instead of silently returning null with 200.
    if not os.path.exists(audio_path):
        raise HTTPException(
            status_code=500,
            detail="Speech synthesis did not produce an audio file",
        )

    return {
        "text": reply_text,
        "audio": audio_path,
        "vtt": vtt_path,
    }