Update app.py

app.py CHANGED
@@ -92,7 +92,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     chunk_size = chunk_length_s * sample_rate
     num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
 
-    # Initialize
+    # Initialize empty list to store transcribed text from each chunk
     full_text = []
 
     for i in range(num_chunks):
@@ -100,15 +100,20 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         end = min((i + 1) * chunk_size, waveform.shape[1])
         chunk_waveform = waveform[:, start:end]
 
-        #
+        # Downmix multi-channel audio to mono
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
 
-
+        if processor.tokenizer.pad_token is None or processor.tokenizer.pad_token_id == processor.tokenizer.eos_token_id:
+            processor.tokenizer.add_special_tokens({'pad_token': '<PAD>'})
+            pad_token_id = processor.tokenizer.convert_tokens_to_ids('<PAD>')
+
+            model.config.pad_token_id = pad_token_id  # update model configuration with new pad token ID
+
         # Tokenize the input batch with the processor
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, padding="max_length", return_tensors="pt", task="transcribe", device=device)
 
-        # ASR model inference on
+        # ASR model inference on chunk
        with torch.no_grad():
             generated_ids = model.generate(
                 input_features=inputs.input_features.to(device),
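For context, here is a minimal, self-contained sketch of how the pad-token guard added in this commit and the chunked inference loop fit together. The checkpoint name `openai/whisper-small`, the resampling step, and the `sample.wav` usage line are illustrative assumptions, not the Space's actual code. The guard matters because Whisper ships with `pad_token` equal to `eos_token` (`<|endoftext|>`), so code that needs a distinct pad id can misbehave without it.

```python
import torch
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai/whisper-small"  # assumed checkpoint; the Space may use another
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

# Mirror the commit's guard: give the tokenizer a dedicated pad token when it
# is missing or collides with EOS (Whisper reuses <|endoftext|> for both).
tok = processor.tokenizer
if tok.pad_token is None or tok.pad_token_id == tok.eos_token_id:
    tok.add_special_tokens({"pad_token": "<PAD>"})
    model.config.pad_token_id = tok.convert_tokens_to_ids("<PAD>")

def transcribe_audio(audio_file, chunk_length_s=30):
    waveform, sample_rate = torchaudio.load(audio_file)

    # Whisper's feature extractor expects 16 kHz input
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000

    chunk_size = chunk_length_s * sample_rate
    num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)

    full_text = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, waveform.shape[1])
        chunk_waveform = waveform[:, start:end]

        # Downmix multi-channel audio to mono
        if chunk_waveform.shape[0] > 1:
            chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)

        # Extract log-mel features; each chunk is padded/truncated to 30 s internally
        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")

        # ASR model inference on chunk
        with torch.no_grad():
            generated_ids = model.generate(input_features=inputs.input_features.to(device))
        full_text.append(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())

    return " ".join(full_text)

print(transcribe_audio("sample.wav"))
```

Unlike the hunk above, this sketch hoists the pad-token guard out of the per-chunk loop: the check is idempotent either way, but running it once avoids mutating the tokenizer on every iteration.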