srinivasbilla committed
Commit 4eb5f46 · verified · 1 Parent(s): 92d008a

Create app.py

Files changed (1)
  1. app.py +124 -0
app.py ADDED
@@ -0,0 +1,124 @@
+
+ import spaces
+ import torch
+ import gradio as gr
+ import tempfile
+ import os
+ import uuid
+ import scipy.io.wavfile
+ import time
+ import numpy as np
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
+ from kokoro import KPipeline
+ from IPython.display import display, Audio
+ import soundfile as sf
+ import subprocess
+
+ # Install flash-attn at startup (prebuilt wheel only; skip the CUDA build)
+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16
+ MODEL_NAME = "openai/whisper-tiny"
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model.to(device)
+
+
+ # 🇺🇸 'a' => American English
+ # 🇬🇧 'b' => British English
+ # 🇫🇷 'f' => French fr-fr
+ tts_pipeline = KPipeline(lang_code='a', device=device)  # <= make sure lang_code matches voice
+
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
+ tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
+
+ pipe = pipeline(
+     task="automatic-speech-recognition",
+     model=model,
+     tokenizer=tokenizer,
+     feature_extractor=processor.feature_extractor,
+     chunk_length_s=10,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+ @spaces.GPU
+ def stream_transcribe(stream, new_chunk):
+     start_time = time.time()
+     try:
+         sr, y = new_chunk
+         # Replace NaN samples with silence
+         y[y != y] = 0
+         # Convert to mono if stereo
+         if y.ndim > 1:
+             y = y.mean(axis=1)
+
+         # Normalize to [-1, 1]; guard against an all-silent chunk to avoid dividing by zero
+         y = y.astype(np.float32)
+         peak = np.max(np.abs(y))
+         if peak > 0:
+             y /= peak
+
+         # Accumulate audio so each inference sees the full utterance so far
+         if stream is not None:
+             stream = np.concatenate([stream, y])
+         else:
+             stream = y
+
+         transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+         latency = time.time() - start_time
+
+         return stream, transcription, f"{latency:.2f}"
+     except Exception as e:
+         print(f"Error during transcription: {e}")
+         return stream, str(e), "Error"
+
+ def clear():
+     return ""
+
+ def clear_state():
+     return None
+
+ @spaces.GPU
+ def tts(target_text):
+     generator = tts_pipeline(
+         target_text, voice='af_heart',  # <= change voice here
+         speed=1, split_pattern=r'\n+'
+     )
+     # Kokoro yields one audio segment per split; concatenate into a single 24 kHz clip
+     audios = []
+     for i, (gs, ps, audio) in enumerate(generator):
+         audios.append(audio.cpu().numpy())
+     return (24000, np.concatenate(audios))
+
+ with gr.Blocks() as microphone:
+     with gr.Column():
+         gr.Markdown(f"# Realtime Whisper Transcription and Kokoro TTS\nTranscribe audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\nNote: the first token takes about 5 seconds; after that, transcription streams smoothly.")
+         with gr.Row():
+             input_audio_microphone = gr.Audio(streaming=True)
+             output = gr.Textbox(label="Transcription", value="")
+             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
+         with gr.Row():
+             clear_button = gr.Button("Clear Output")
+         state = gr.State()
+         input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
+         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
+
+         gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
+
+         generate_btn = gr.Button("Synthesize", variant="primary")
+
+         audio_output = gr.Audio(label="Synthesized Audio")
+
+         generate_btn.click(
+             tts,
+             inputs=[
+                 gen_text_input,
+             ],
+             outputs=[audio_output],
+         )
+
+ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
+     gr.TabbedInterface([microphone], ["vc chat"])
+
+ demo.launch()
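
A quick way to smoke-test the two pipelines outside the Gradio UI, as a minimal sketch rather than part of this commit: it assumes app.py's dependencies are installed, that the snippet runs in the same interpreter (for example appended to app.py just before demo.launch()), and that a hypothetical local file sample.wav exists.

# Hedged sketch: reuses the `pipe` ASR pipeline and `tts` helper defined above;
# "sample.wav" and "tts_out.wav" are hypothetical file names for illustration.
y, sr = sf.read("sample.wav", dtype="float32")   # load a short test clip
if y.ndim > 1:
    y = y.mean(axis=1)                           # downmix stereo to mono, as stream_transcribe does
print(pipe({"sampling_rate": sr, "raw": y})["text"])

sr_out, wav = tts("Hello from Kokoro.")          # tts() returns (24000, np.ndarray)
sf.write("tts_out.wav", wav, sr_out)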