from collections.abc import AsyncGenerator
from pathlib import Path
import platform

import gradio as gr
import httpx
from httpx_sse import aconnect_sse
from openai import AsyncOpenAI

from faster_whisper_server.config import Config, Task
from faster_whisper_server.hf_utils import PiperModel

TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
TRANSLATION_ENDPOINT = "/v1/audio/translations"
TIMEOUT_SECONDS = 180
TIMEOUT = httpx.Timeout(timeout=TIMEOUT_SECONDS)

# NOTE: `gr.Request` seems to be passed in as the last positional (not keyword) argument


def base_url_from_gradio_req(request: gr.Request) -> str:
    # NOTE: `request.request.url` seems to always have a path of "/gradio_api/queue/join"
    assert request.request is not None
    return f"{request.request.url.scheme}://{request.request.url.netloc}"


def http_client_from_gradio_req(request: gr.Request, config: Config) -> httpx.AsyncClient:
    base_url = base_url_from_gradio_req(request)
    return httpx.AsyncClient(
        base_url=base_url,
        timeout=TIMEOUT,
        headers={"Authorization": f"Bearer {config.api_key}"} if config.api_key else None,
    )


def openai_client_from_gradio_req(request: gr.Request, config: Config) -> AsyncOpenAI:
    base_url = base_url_from_gradio_req(request)
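    # The OpenAI SDK rejects an empty API key, so fall back to a placeholder when auth is disabled.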
    return AsyncOpenAI(base_url=f"{base_url}/v1", api_key=config.api_key if config.api_key else "cant-be-empty")


def create_gradio_demo(config: Config) -> gr.Blocks:  # noqa: C901, PLR0915
    async def whisper_handler(
        file_path: str, model: str, task: Task, temperature: float, stream: bool, request: gr.Request
    ) -> AsyncGenerator[str, None]:
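        # Streams partial transcriptions into the textbox when `stream` is enabled;
        # otherwise yields the full result once the request completes.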
        http_client = http_client_from_gradio_req(request, config)
        if task == Task.TRANSCRIBE:
            endpoint = TRANSCRIPTION_ENDPOINT
        elif task == Task.TRANSLATE:
            endpoint = TRANSLATION_ENDPOINT
        else:
            raise ValueError(f"Unsupported task: {task}")

        if stream:
            previous_transcription = ""
            async for transcription in streaming_audio_task(http_client, file_path, endpoint, temperature, model):
                previous_transcription += transcription
                yield previous_transcription
        else:
            yield await audio_task(http_client, file_path, endpoint, temperature, model)

    async def audio_task(
        http_client: httpx.AsyncClient, file_path: str, endpoint: str, temperature: float, model: str
    ) -> str:
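        # Upload the audio file as multipart form data; `response_format: text` makes
        # the server return the transcription as plain text rather than JSON.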
        with Path(file_path).open("rb") as file:  # noqa: ASYNC230
            response = await http_client.post(
                endpoint,
                files={"file": file},
                data={
                    "model": model,
                    "response_format": "text",
                    "temperature": temperature,
                },
            )

        response.raise_for_status()
        return response.text

    async def streaming_audio_task(
        http_client: httpx.AsyncClient, file_path: str, endpoint: str, temperature: float, model: str
    ) -> AsyncGenerator[str, None]:
        with Path(file_path).open("rb") as file:  # noqa: ASYNC230
            kwargs = {
                "files": {"file": file},
                "data": {
                    "response_format": "text",
                    "temperature": temperature,
                    "model": model,
                    "stream": True,
                },
            }
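            # The server emits partial transcriptions as server-sent events; each
            # event's `data` field carries the next text chunk.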
            async with aconnect_sse(http_client, "POST", endpoint, **kwargs) as event_source:
                async for event in event_source.aiter_sse():
                    yield event.data

    async def update_whisper_model_dropdown(request: gr.Request) -> gr.Dropdown:
        openai_client = openai_client_from_gradio_req(request, config)
        models = (await openai_client.models.list()).data
        model_names: list[str] = [model.id for model in models]
        assert config.whisper.model in model_names
        recommended_models = [model for model in model_names if model.startswith("Systran")]
        other_models = [model for model in model_names if model not in recommended_models]
        model_names = recommended_models + other_models
        return gr.Dropdown(
            choices=model_names,
            label="Model",
            value=config.whisper.model,
        )

    async def update_piper_voices_dropdown(request: gr.Request) -> gr.Dropdown:
        http_client = http_client_from_gradio_req(request, config)
        res = (await http_client.get("/v1/audio/speech/voices")).raise_for_status()
        piper_models = [PiperModel.model_validate(x) for x in res.json()]
        return gr.Dropdown(choices=[model.voice for model in piper_models], label="Voice", value=DEFAULT_VOICE)

    async def handle_audio_speech(
        text: str, voice: str, response_format: str, speed: float, sample_rate: int | None, request: gr.Request
    ) -> Path:
        openai_client = openai_client_from_gradio_req(request, config)
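        # `extra_body` forwards `sample_rate`, a server-specific parameter not in the OpenAI spec.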
        res = await openai_client.audio.speech.create(
            input=text,
            model="piper",
            voice=voice,  # pyright: ignore[reportArgumentType]
            response_format=response_format,  # pyright: ignore[reportArgumentType]
            speed=speed,
            extra_body={"sample_rate": sample_rate},
        )
        audio_bytes = res.response.read()
        file_path = Path(f"audio.{response_format}")
        with file_path.open("wb") as file:  # noqa: ASYNC230
            file.write(audio_bytes)
        return file_path

    with gr.Blocks(title="faster-whisper-server Playground") as demo:
        gr.Markdown(
            "### Consider supporting the project by starring the [repository on GitHub](https://github.com/fedirz/faster-whisper-server)."
        )
        with gr.Tab(label="Transcribe/Translate"):
            audio = gr.Audio(type="filepath")
            model_dropdown = gr.Dropdown(
                choices=[config.whisper.model],
                label="Model",
                value=config.whisper.model,
            )
            task_dropdown = gr.Dropdown(
                choices=[task.value for task in Task],
                label="Task",
                value=Task.TRANSCRIBE,
            )
            temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0)
            stream_checkbox = gr.Checkbox(label="Stream", value=True)
            button = gr.Button("Generate")

            output = gr.Textbox()

            # NOTE: the inputs order must match the `whisper_handler` signature
            button.click(
                whisper_handler, [audio, model_dropdown, task_dropdown, temperature_slider, stream_checkbox], output
            )

        with gr.Tab(label="Speech Generation"):
            if platform.machine() == "x86_64":
                from faster_whisper_server.routers.speech import (
                    DEFAULT_VOICE,
                    MAX_SAMPLE_RATE,
                    MIN_SAMPLE_RATE,
                    SUPPORTED_RESPONSE_FORMATS,
                )

                text = gr.Textbox(label="Input Text")
                voice_dropdown = gr.Dropdown(
                    choices=["en_US-amy-medium"],
                    label="Voice",
                    value="en_US-amy-medium",
                    info="""
The last part of the voice name is the quality (x_low, low, medium, high).
Each quality has a different default sample rate:
- x_low: 16000 Hz
- low: 16000 Hz
- medium: 22050 Hz
- high: 22050 Hz
""",
                )
                response_format_dropdown = gr.Dropdown(
                    choices=SUPPORTED_RESPONSE_FORMATS,
                    label="Response Format",
                    value="wav",
                )
                speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
                sample_rate_slider = gr.Number(
                    minimum=MIN_SAMPLE_RATE,
                    maximum=MAX_SAMPLE_RATE,
                    label="Desired Sample Rate",
                    info="""
Setting this will resample the generated audio to the desired sample rate.
You may want to set this if you are going to use voices of different qualities but want to keep the same sample rate.
Default: None (No resampling)
""",
                    value=lambda: None,
                )
                button = gr.Button("Generate Speech")
                output = gr.Audio(type="filepath")
                button.click(
                    handle_audio_speech,
                    [text, voice_dropdown, response_format_dropdown, speed_slider, sample_rate_slider],
                    output,
                )
                demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
            else:
                gr.Textbox("Speech generation is only supported on x86_64 machines.")

        demo.load(update_whisper_model_dropdown, inputs=None, outputs=model_dropdown)
    return demo
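

# Usage sketch (not part of the original module; whether `Config()` can be
# constructed with no arguments is an assumption):
#
#     from faster_whisper_server.config import Config
#
#     demo = create_gradio_demo(Config())
#     demo.launch()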