Spaces:
Sleeping
Sleeping
jtlonsako
committed on
Commit
·
635f416
1
Parent(s):
ab34adc
added a batch_size input and allow for multiple outputs, also edited the details screen
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import time
|
|
8 |
import gc
|
9 |
import gradio as gr
|
10 |
import librosa
|
11 |
-
from transformers import Wav2Vec2ForCTC,
|
12 |
from huggingface_hub import hf_hub_download
|
13 |
from torchaudio.models.decoder import ctc_decoder
|
14 |
from numba import cuda
|
@@ -47,7 +47,7 @@ beam_search_decoder = ctc_decoder(
|
|
47 |
tokens=token_file,
|
48 |
lm=lm_file,
|
49 |
nbest=1,
|
50 |
-
beam_size=
|
51 |
beam_size_token=50,
|
52 |
lm_weight=float(decoding_config["lmweight"]),
|
53 |
word_score=float(decoding_config["wordscore"]),
|
@@ -67,7 +67,7 @@ def preprocessAudio(audioFile):
|
|
67 |
os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
|
68 |
|
69 |
#Transcribe!!!
|
70 |
-
def Transcribe(file):
|
71 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
72 |
start_time = time.time()
|
73 |
model.load_adapter("amh")
|
@@ -75,7 +75,6 @@ def Transcribe(file):
|
|
75 |
|
76 |
preprocessAudio(file)
|
77 |
block_size = 30
|
78 |
-
batch_size = 8 # or whatever number you choose
|
79 |
|
80 |
transcripts = []
|
81 |
speech_segments = []
|
@@ -94,9 +93,7 @@ def Transcribe(file):
|
|
94 |
encoding_start = 0
|
95 |
encoding_end = 0
|
96 |
sbv_file = open("subtitle.sbv", "w")
|
97 |
-
|
98 |
-
# Define batch size
|
99 |
-
batch_size = 11
|
100 |
|
101 |
# Create an empty list to hold batches
|
102 |
batch = []
|
@@ -122,7 +119,6 @@ def Transcribe(file):
|
|
122 |
# Transcribe each segment in the batch
|
123 |
for i in range(batch_size):
|
124 |
transcription = " ".join(beam_search_result[i][0].words).strip()
|
125 |
-
print(transcription)
|
126 |
transcripts.append(transcription)
|
127 |
|
128 |
encoding_end = encoding_start + block_size
|
@@ -176,15 +172,19 @@ def Transcribe(file):
|
|
176 |
|
177 |
# Join all transcripts into a single transcript
|
178 |
transcript = ' '.join(transcripts)
|
|
|
179 |
sbv_file.close()
|
|
|
180 |
|
181 |
end_time = time.time()
|
182 |
print(f"The script ran for {end_time - start_time} seconds.")
|
183 |
-
return("./subtitle.sbv")
|
184 |
|
185 |
-
demo = gr.Interface(fn=Transcribe, inputs=gr.File(label="Upload an audio file of Amharic content"),
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
189 |
demo.launch()
|
190 |
|
|
|
8 |
import gc
|
9 |
import gradio as gr
|
10 |
import librosa
|
11 |
+
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
12 |
from huggingface_hub import hf_hub_download
|
13 |
from torchaudio.models.decoder import ctc_decoder
|
14 |
from numba import cuda
|
|
|
47 |
tokens=token_file,
|
48 |
lm=lm_file,
|
49 |
nbest=1,
|
50 |
+
beam_size=400,
|
51 |
beam_size_token=50,
|
52 |
lm_weight=float(decoding_config["lmweight"]),
|
53 |
word_score=float(decoding_config["wordscore"]),
|
|
|
67 |
os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
|
68 |
|
69 |
#Transcribe!!!
|
70 |
+
def Transcribe(file, batch_size):
|
71 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
72 |
start_time = time.time()
|
73 |
model.load_adapter("amh")
|
|
|
75 |
|
76 |
preprocessAudio(file)
|
77 |
block_size = 30
|
|
|
78 |
|
79 |
transcripts = []
|
80 |
speech_segments = []
|
|
|
93 |
encoding_start = 0
|
94 |
encoding_end = 0
|
95 |
sbv_file = open("subtitle.sbv", "w")
|
96 |
+
transcription_file = open("transcription.txt", "w")
|
|
|
|
|
97 |
|
98 |
# Create an empty list to hold batches
|
99 |
batch = []
|
|
|
119 |
# Transcribe each segment in the batch
|
120 |
for i in range(batch_size):
|
121 |
transcription = " ".join(beam_search_result[i][0].words).strip()
|
|
|
122 |
transcripts.append(transcription)
|
123 |
|
124 |
encoding_end = encoding_start + block_size
|
|
|
172 |
|
173 |
# Join all transcripts into a single transcript
|
174 |
transcript = ' '.join(transcripts)
|
175 |
+
transcription_file.write(f"{transcript}")
|
176 |
sbv_file.close()
|
177 |
+
transcription_file.close()
|
178 |
|
179 |
end_time = time.time()
|
180 |
print(f"The script ran for {end_time - start_time} seconds.")
|
181 |
+
return(["./subtitle.sbv", "./transcription.txt"])
|
182 |
|
183 |
+
demo = gr.Interface(fn=Transcribe, inputs=[gr.File(label="Upload an audio file of Amharic content"), gr.Slider(0, 25, value=4, step=1, label="batch size", info="Approximately .5GB per batch")],
|
184 |
+
outputs=gr.File(label="Download .sbv transcription", file_count="multiple"),
|
185 |
+
title="Amharic Audio Transcription",
|
186 |
+
description="This application uses Meta MMS and an Amharic kenLM model to transcribe Amharic Audio files of arbitrary length into .sbv and .txt files. Upload an Amharic audio file and get your transcription! \n(Note: Transcription quality is quite low, you should review and edit transcriptions before making them publicly available)"
|
187 |
+
)
|
188 |
+
|
189 |
demo.launch()
|
190 |
|