Pijush2023 committed (verified)
Commit dd793c3 · 1 parent: e9e48e2

Update app.py

Files changed (1):
  1. app.py (+66 -0)
app.py CHANGED

@@ -93,6 +93,55 @@
 # audio_input.stream(transcribe_function, inputs=[audio_input, state], outputs=[state, output_text], api_name="SAMLOne_real_time")
 
 # demo.launch(show_error=True)
+# import gradio as gr
+# import numpy as np
+# import torch
+# from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+
+# model_id = 'openai/whisper-large-v3'
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+# processor = AutoProcessor.from_pretrained(model_id)
+
+# pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=False)
+
+# def transcribe_function(new_chunk, state):
+#     try:
+#         sr, y = new_chunk
+#     except TypeError:
+#         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+#         return state, "", None
+
+#     y = y.astype(np.float32) / np.max(np.abs(y))
+
+#     if state is not None:
+#         state = np.concatenate([state, y])
+#     else:
+#         state = y
+
+#     result = pipe_asr({"array": state, "sampling_rate": sr}, return_timestamps=False)
+
+#     full_text = result.get("text", "")
+
+#     return state, full_text
+
+# with gr.Blocks() as demo:
+#     gr.Markdown("# Voice to Text Transcription")
+
+#     state = gr.State(None)
+
+#     with gr.Row():
+#         with gr.Column():
+#             audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', label="Microphone Input")
+#         with gr.Column():
+#             output_text = gr.Textbox(label="Transcription")
+
+#     audio_input.stream(transcribe_function, inputs=[audio_input, state], outputs=[state, output_text], api_name="SAMLOne_real_time")
+
+# demo.launch(show_error=True)
+
+
 import gradio as gr
 import numpy as np
 import torch
 
@@ -106,6 +155,11 @@
 
 pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=False)
 
+def ensure_mono(y):
+    if len(y.shape) > 1 and y.shape[1] > 1:
+        y = np.mean(y, axis=1)
+    return y
+
 def transcribe_function(new_chunk, state):
     try:
         sr, y = new_chunk
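Note on the hunk above: ensure_mono downmixes a stereo (n, 2) array to mono by averaging channels, since the Whisper pipeline expects a 1-D array. A minimal sketch of that pre-processing step in isolation (the synthetic chunk is illustrative, and the zero-peak guard is a suggestion, not part of the commit):

import numpy as np

def ensure_mono(y):
    # Average the channels when the array is 2-D stereo: (n, 2) -> (n,)
    if len(y.shape) > 1 and y.shape[1] > 1:
        y = np.mean(y, axis=1)
    return y

# Synthetic stereo chunk in int16, as Gradio's type='numpy' delivers audio
stereo = np.array([[100, 300], [-200, 200], [0, 0], [32767, 32767]], dtype=np.int16)
mono = ensure_mono(stereo)                # shape (4,), values [200., 0., 0., 32767.]

# The commit then peak-normalizes; np.max(np.abs(y)) would be 0 for an
# all-silent chunk, so a guard like this (not in the commit) avoids a
# divide-by-zero warning:
peak = np.max(np.abs(mono))
mono = mono.astype(np.float32) / peak if peak > 0 else mono.astype(np.float32)
print(mono.max())                         # 1.0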
 
@@ -113,6 +167,7 @@
         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
         return state, "", None
 
+    y = ensure_mono(y)
     y = y.astype(np.float32) / np.max(np.abs(y))
 
     if state is not None:
 
@@ -126,6 +181,13 @@
 
     return state, full_text
 
+def upload_transcribe(file):
+    sr, y = file
+    y = ensure_mono(y)
+    y = y.astype(np.float32) / np.max(np.abs(y))
+    result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
+    return result.get("text", "")
+
 with gr.Blocks() as demo:
     gr.Markdown("# Voice to Text Transcription")
 
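With type='numpy', Gradio passes the uploaded clip to upload_transcribe as a (sample_rate, data) tuple. A rough sketch of exercising the new callback directly, with pipe_asr replaced by a stub since the real pipeline needs the Whisper weights (the stub and the synthetic clip are illustrative):

import numpy as np

# Stand-in for the commit's Whisper pipeline; returns a canned result
# so the callback's data handling can be checked without a model.
def pipe_asr(inputs, return_timestamps=False):
    assert inputs["array"].ndim == 1           # pipeline expects mono
    return {"text": "stub transcription"}

def ensure_mono(y):
    if len(y.shape) > 1 and y.shape[1] > 1:
        y = np.mean(y, axis=1)
    return y

def upload_transcribe(file):
    sr, y = file                               # Gradio numpy audio: (rate, samples)
    y = ensure_mono(y)
    y = y.astype(np.float32) / np.max(np.abs(y))
    result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
    return result.get("text", "")

# One second of fake 16 kHz stereo audio in place of a real upload
fake_upload = (16000, np.random.randint(-32768, 32767, size=(16000, 2), dtype=np.int16))
print(upload_transcribe(fake_upload))          # "stub transcription"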
@@ -134,9 +196,13 @@
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', label="Microphone Input")
+            audio_upload = gr.Audio(sources="upload", type='numpy', label="Upload Audio File")
         with gr.Column():
             output_text = gr.Textbox(label="Transcription")
+            upload_text = gr.Textbox(label="Uploaded Audio Transcription")
 
     audio_input.stream(transcribe_function, inputs=[audio_input, state], outputs=[state, output_text], api_name="SAMLOne_real_time")
+    audio_upload.change(upload_transcribe, inputs=audio_upload, outputs=upload_text)
 
 demo.launch(show_error=True)
+
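For the streaming path above, .stream() invokes transcribe_function with each new (sr, chunk) tuple plus the current gr.State value, and the function re-transcribes the whole concatenated buffer every call. A small sketch of that accumulation contract with the pipeline stubbed out (chunk sizes are arbitrary; the mono downmix is elided):

import numpy as np

def pipe_asr(inputs, return_timestamps=False):
    # Report buffer length instead of text to make the growth visible
    return {"text": f"{len(inputs['array'])} samples so far"}

def transcribe_function(new_chunk, state):
    try:
        sr, y = new_chunk
    except TypeError:
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return state, "", None

    y = y.astype(np.float32) / np.max(np.abs(y))

    # State holds every sample heard so far; each call re-runs ASR on all of it
    if state is not None:
        state = np.concatenate([state, y])
    else:
        state = y

    result = pipe_asr({"array": state, "sampling_rate": sr}, return_timestamps=False)
    return state, result.get("text", "")

state = None                                   # mirrors gr.State(None)
for chunk in [np.ones(8000, dtype=np.int16), np.ones(4000, dtype=np.int16)]:
    state, text = transcribe_function((16000, chunk), state)
    print(text)                                # 8000, then 12000 samples so far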