import spaces
import torch
import gradio as gr
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from kokoro import KPipeline

# Optional: install FlashAttention for faster attention on supported GPUs.
# Note: merge in the current environment, otherwise pip loses PATH.
# import os, subprocess
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is only safe on GPU; fall back to float32 on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MODEL_NAME = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
    # attn_implementation="flash_attention_2",  # enable if flash-attn is installed
)
model.to(device)


# 🇺🇸 'a' => American English
# 🇬🇧 'b' => British English
# 🇫🇷 'f' => French fr-fr
tts_pipeline = KPipeline(lang_code='a', device=device) # <= make sure lang_code matches voice
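# Kokoro voice names encode their language in the prefix: 'af_heart' is an
# American English ('a') voice, matching lang_code='a' above.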

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

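# chunk_length_s=10 enables chunked long-form decoding inside the pipeline.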
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)

@spaces.GPU
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
        sr, y = new_chunk
        y[y != y] = 0  # NaN != NaN, so this zeroes any NaN samples
        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Normalize to [-1, 1]; guard against all-zero (silent) chunks
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # Accumulate audio so each pass re-transcribes the full utterance
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        latency = time.time() - start_time

        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return stream, str(e), "Error"
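
# Note: re-decoding the full accumulated stream keeps partial transcripts
# stable, but cost grows with utterance length; the time_limit=30 on the
# stream event below bounds how much audio one session can accumulate.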

def clear():
    return ""

def clear_state():
    return None

@spaces.GPU
def tts(target_text):
    # Kokoro splits the text on newlines and yields one audio segment per piece
    generator = tts_pipeline(
        target_text, voice='af_heart', # <= change voice here
        speed=1, split_pattern=r'\n+'
    )
    audios = []
    for gs, ps, audio in generator:  # (graphemes, phonemes, audio)
        audios.append(audio.cpu().numpy())
    if not audios:  # empty input text produces no segments
        return (24000, np.zeros(1, dtype=np.float32))
    return (24000, np.concatenate(audios))  # Kokoro outputs 24 kHz audio
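
# Hypothetical standalone use, outside Gradio (soundfile is not imported above):
#   sr, wav = tts("Hello there.\nThis is a second segment.")
#   import soundfile as sf; sf.write("out.wav", wav, sr)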

with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first token takes about 5 seconds; after that, transcription streams continuously.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
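        # state carries the accumulated waveform across streaming callbacks;
        # stream_every=2 sends ~2 s chunks, time_limit=30 caps each session.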
        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
    
    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)

    generate_btn = gr.Button("Synthesize", variant="primary")

    audio_output = gr.Audio(label="Synthesized Audio")

    generate_btn.click(
        tts,
        inputs=[
            gen_text_input,
        ],
        outputs=[audio_output],
    )

with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["vc chat"])

demo.launch()