import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from kokoro import KPipeline
import soundfile as sf
import subprocess
# Optionally install FlashAttention 2 at startup (left disabled here):
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16
MODEL_NAME = "openai/whisper-large-v3-turbo"
# attn_implementation="flash_attention_2"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True,
)
model.to(device)

# 🇺🇸 'a' => American English
# 🇬🇧 'b' => British English
# 🇫🇷 'f' => French fr-fr
tts_pipeline = KPipeline(lang_code='a', device=device)  # <= make sure lang_code matches voice

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)

@spaces.GPU
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
        sr, y = new_chunk
        # Replace NaN samples with silence
        y[y != y] = 0
        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)
        y = y.astype(np.float32)
        # Normalize, guarding against an all-zero (silent) chunk
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak
        # Append the new chunk to the running audio buffer and re-transcribe the whole buffer
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y
        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        end_time = time.time()
        latency = end_time - start_time
        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return stream, str(e), "Error"

def clear():
    return ""


def clear_state():
    return None

@spaces.GPU
def tts(target_text):
    generator = tts_pipeline(
        target_text, voice='af_heart',  # <= change voice here
        speed=1, split_pattern=r'\n+'
    )
    # Concatenate all generated segments; Kokoro outputs 24 kHz audio
    audios = []
    for gs, ps, audio in generator:
        audios.append(audio.cpu().numpy())
    return (24000, np.concatenate(audios))

with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first chunk takes about 5 seconds to appear; after that, transcription streams with much lower latency.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

        gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
        generate_btn = gr.Button("Synthesize", variant="primary")
        audio_output = gr.Audio(label="Synthesized Audio")
        generate_btn.click(
            tts,
            inputs=[gen_text_input],
            outputs=[audio_output],
        )

with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["vc chat"])

demo.launch()