Update app.py
app.py CHANGED
@@ -24,17 +24,80 @@ status_placeholder.info("Loading Whisper model from Hugging Face...")
 def load_whisper_model():
     """
     Load the Whisper model and processor from Hugging Face.
-    Change 'openai/whisper-
+    Change 'openai/whisper-small' to another variant if needed.
     """
-    model_name = "openai/whisper-
+    model_name = "openai/whisper-small" # You can change to "tiny", "base", "medium", or "large" based on resources.
     processor = WhisperProcessor.from_pretrained(model_name)
     model = WhisperForConditionalGeneration.from_pretrained(model_name)
     return processor, model
 
 processor, model = load_whisper_model()
-
 status_placeholder.info("Whisper model loaded successfully!")
 
+# Comprehensive dictionary of languages supported by Whisper (most common ones)
+LANGUAGES = {
+    "en": "English",
+    "zh": "Chinese",
+    "de": "German",
+    "es": "Spanish",
+    "ru": "Russian",
+    "ko": "Korean",
+    "fr": "French",
+    "ja": "Japanese",
+    "pt": "Portuguese",
+    "tr": "Turkish",
+    "pl": "Polish",
+    "ca": "Catalan",
+    "nl": "Dutch",
+    "ar": "Arabic",
+    "sv": "Swedish",
+    "it": "Italian",
+    "id": "Indonesian",
+    "hi": "Hindi",
+    "fi": "Finnish",
+    "vi": "Vietnamese",
+    "fa": "Persian",
+    "mr": "Marathi",
+    "uk": "Ukrainian",
+    "el": "Greek",
+    "ms": "Malay",
+    "cs": "Czech",
+    "ro": "Romanian",
+    "da": "Danish",
+    "hu": "Hungarian",
+    "ta": "Tamil",
+    "no": "Norwegian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "hr": "Croatian",
+    "bg": "Bulgarian",
+    "lt": "Lithuanian",
+    "la": "Latin",
+    "mi": "Maori",
+    "ml": "Malayalam",
+    "cy": "Welsh",
+    "sk": "Slovak",
+    "te": "Telugu",
+    "ka": "Georgian",
+    "sl": "Slovenian",
+    "kn": "Kannada",
+    "et": "Estonian",
+    "mk": "Macedonian",
+    "br": "Breton",
+    "eu": "Basque",
+    "is": "Icelandic",
+    "hy": "Armenian",
+    "af": "Afrikaans"
+}
+
+# Create a sorted list of language names for the selectbox
+language_names = sorted(LANGUAGES.values())
+default_language = "English" # Default language
+
+selected_lang_name = st.selectbox("Select transcription language", language_names, index=language_names.index(default_language))
+# Find the language code by reverse lookup in LANGUAGES
+selected_language = [code for code, name in LANGUAGES.items() if name == selected_lang_name][0]
+
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
     Split an audio file into chunks using silence detection.
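
Note: load_whisper_model() is called at module level, so every Streamlit rerun (including each interaction with the new language selectbox) executes it again. A minimal sketch of one way to avoid the reload, assuming Streamlit 1.18+ where st.cache_resource is available; this decorator is not part of the commit itself:

    import streamlit as st
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    @st.cache_resource  # hypothetical addition: cache the loaded model across script reruns
    def load_whisper_model(model_name: str = "openai/whisper-small"):
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        return processor, model

    processor, model = load_whisper_model()
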
@@ -50,42 +113,29 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
     status_placeholder.info(f"Audio split into {len(chunks)} chunks.")
     return chunks
 
-
-# """
-# Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
-# This uses librosa to load and resample the audio as required.
-# """
-# # Load audio with librosa at 16kHz (as required by Whisper)
-# speech, sr = librosa.load(audio_file, sr=16000)
-# input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-# predicted_ids = model.generate(input_features)
-# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-# return transcription
-
-def transcribe(audio_file):
+def transcribe(audio_file, language):
     """
     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
     This uses librosa to load and resample the audio as required.
-    The transcription is forced to
+    The transcription is forced to the specified language.
 
     Args:
         audio_file (str): Path to the audio file.
+        language (str): Language code (e.g., "en", "es").
 
     Returns:
-        str: Transcribed text
+        str: Transcribed text.
     """
     # Load audio with librosa at 16kHz (as required by Whisper)
     speech, sr = librosa.load(audio_file, sr=16000)
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    # Force the transcription output to
-    forced_ids = processor.get_decoder_prompt_ids(language=
+    # Force the transcription output to the chosen language:
+    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
-
-
-def transcribe_chunk(chunk, index, min_length_ms=100):
+def transcribe_chunk(chunk, index, language, min_length_ms=100):
     """
     Transcribe an individual audio chunk.
     """
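
Note: get_decoder_prompt_ids(language=..., task="transcribe") builds the language and task prompt tokens so Whisper transcribes in the chosen language instead of auto-detecting it. A standalone sketch of the same call outside Streamlit, assuming "sample.wav" as a placeholder path; on recent transformers releases generate() also accepts language/task directly, which supersedes the older forced_decoder_ids argument:

    import librosa

    # Load and resample to the 16 kHz rate Whisper expects
    speech, _ = librosa.load("sample.wav", sr=16000)  # "sample.wav" is a placeholder path
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt").input_features

    # Approach used in this commit: build forced decoder prompt ids
    forced_ids = processor.get_decoder_prompt_ids(language="es", task="transcribe")
    ids = model.generate(inputs, forced_decoder_ids=forced_ids)

    # Newer-transformers alternative (version-dependent, not what the commit uses):
    # ids = model.generate(inputs, language="es", task="transcribe")

    print(processor.batch_decode(ids, skip_special_tokens=True)[0])
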
@@ -96,13 +146,13 @@ def transcribe_chunk(chunk, index, min_length_ms=100):
     with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
         chunk.export(temp_audio_file.name, format="wav")
         temp_audio_file_path = temp_audio_file.name
-    status_placeholder.info(f"Transcribing chunk {index}...")
-    transcription = transcribe(temp_audio_file_path)
+    status_placeholder.info(f"Transcribing chunk {index} in {selected_lang_name}...")
+    transcription = transcribe(temp_audio_file_path, language)
     os.remove(temp_audio_file_path)
     st.write(f"Transcription for chunk {index}: {transcription}")
     return (index, transcription)
 
-def process_audio_chunks(audio_chunks):
+def process_audio_chunks(audio_chunks, language):
     """
     Process and transcribe each audio chunk in sequence.
     Reports the total time taken.
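
Note: transcribe_chunk round-trips each pydub chunk through a NamedTemporaryFile on disk. Since both pydub's export() and librosa.load() accept file-like objects, an in-memory buffer would also work; a sketch of that alternative, which is not part of this commit:

    import io

    def transcribe_chunk_in_memory(chunk, language):
        buf = io.BytesIO()
        chunk.export(buf, format="wav")  # pydub writes the WAV data into the buffer
        buf.seek(0)
        return transcribe(buf, language)  # librosa.load() accepts file-like objects
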
@@ -111,7 +161,7 @@ def process_audio_chunks(audio_chunks):
     min_length_ms = 100 # minimum duration for processing
     start_transcription = time.time()
     for i, chunk in enumerate(audio_chunks):
-        index, text = transcribe_chunk(chunk, i, min_length_ms)
+        index, text = transcribe_chunk(chunk, i, language, min_length_ms)
         transcriptions.append((index, text))
     transcriptions.sort(key=lambda x: x[0])
     total_time = time.time() - start_transcription
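
Note: chunks are transcribed sequentially and sorted by index before being combined; the diff does not show the recombination step itself. A hypothetical merge, just to illustrate what the sorted (index, text) tuples feed into:

    # Hypothetical: join the ordered chunk texts into the final transcript
    full_text = " ".join(text for _, text in transcriptions)
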
@@ -152,7 +202,7 @@ if uploaded_file is not None and st.session_state.transcription is None:
     processing_start = time.time()
     with st.spinner('Processing audio...'):
         audio_chunks = split_audio_on_silence(temp_audio_file)
-        transcription = process_audio_chunks(audio_chunks)
+        transcription = process_audio_chunks(audio_chunks, selected_language)
     if transcription:
         st.session_state.transcription = transcription
         st.success('Transcription complete!')