Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
import librosa | |
import torch | |
from pydub import AudioSegment | |
from pydub.silence import split_on_silence | |
from dotenv import load_dotenv | |
from tempfile import NamedTemporaryFile | |
import math | |
from docx import Document | |
import time | |
from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
# Pull optional settings from a local .env file into the process environment.
load_dotenv()

# One reusable slot on the page; every progress/status message overwrites it.
status_placeholder = st.empty()
status_placeholder.info("Loading Whisper model from Hugging Face...")
@st.cache_resource(show_spinner=False)
def load_whisper_model(model_name="openai/whisper-small"):
    """
    Load the Whisper processor and model from Hugging Face.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached as a shared resource: the weights are loaded once
    per server process (per distinct ``model_name``) instead of on every
    rerun.

    Args:
        model_name (str): Hugging Face checkpoint id. Other sizes can be
            selected by passing e.g. "openai/whisper-tiny",
            "openai/whisper-base", "openai/whisper-medium" or
            "openai/whisper-large", depending on available resources.

    Returns:
        tuple: ``(processor, model)`` for the requested checkpoint.
    """
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    return processor, model
# Module-level handles shared by the transcription helpers below.
processor, model = load_whisper_model()
status_placeholder.info("Whisper model loaded successfully!")
# Language code -> display name for the languages offered in the UI.
# This is a selection of the most common languages, not an exhaustive list;
# the code is what gets passed on as the transcription language.
LANGUAGES = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French",
    "ja": "Japanese",
    "pt": "Portuguese",
    "tr": "Turkish",
    "pl": "Polish",
    "ca": "Catalan",
    "nl": "Dutch",
    "ar": "Arabic",
    "sv": "Swedish",
    "it": "Italian",
    "id": "Indonesian",
    "hi": "Hindi",
    "fi": "Finnish",
    "vi": "Vietnamese",
    "fa": "Persian",
    "mr": "Marathi",
    "uk": "Ukrainian",
    "el": "Greek",
    "ms": "Malay",
    "cs": "Czech",
    "ro": "Romanian",
    "da": "Danish",
    "hu": "Hungarian",
    "ta": "Tamil",
    "no": "Norwegian",
    "th": "Thai",
    "ur": "Urdu",
    "hr": "Croatian",
    "bg": "Bulgarian",
    "lt": "Lithuanian",
    "la": "Latin",
    "mi": "Maori",
    "ml": "Malayalam",
    "cy": "Welsh",
    "sk": "Slovak",
    "te": "Telugu",
    "ka": "Georgian",
    "sl": "Slovenian",
    "kn": "Kannada",
    "et": "Estonian",
    "mk": "Macedonian",
    "br": "Breton",
    "eu": "Basque",
    "is": "Icelandic",
    "hy": "Armenian",
    "af": "Afrikaans",
}
# Alphabetical display names for the language picker; English is preselected.
language_names = sorted(LANGUAGES.values())
default_language = "English"
selected_lang_name = st.selectbox(
    "Select transcription language",
    language_names,
    index=language_names.index(default_language),
)

# Translate the chosen display name back into its language code.
matching_codes = [code for code in LANGUAGES if LANGUAGES[code] == selected_lang_name]
selected_language = matching_codes[0]
def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    """
    Cut an audio file into chunks at its silent stretches.

    The file is decoded with pydub and handed to ``split_on_silence``; the
    three tuning arguments are forwarded to it unchanged. Progress is
    reported through the shared status placeholder.

    Args:
        audio_file_path (str): Path of the audio file to split.
        min_silence_len: Forwarded to pydub ``split_on_silence``.
        silence_thresh: Forwarded to pydub ``split_on_silence``.
        keep_silence: Forwarded to pydub ``split_on_silence``.

    Returns:
        The chunks produced by ``split_on_silence``.
    """
    status_placeholder.info("Splitting audio on silence...")
    silence_options = {
        "min_silence_len": min_silence_len,
        "silence_thresh": silence_thresh,
        "keep_silence": keep_silence,
    }
    segments = split_on_silence(AudioSegment.from_file(audio_file_path), **silence_options)
    status_placeholder.info(f"Audio split into {len(segments)} chunks.")
    return segments
def transcribe(audio_file, language):
    """
    Transcribe an audio file with the locally loaded Whisper model.

    The audio is loaded and resampled to 16 kHz with librosa, and decoding is
    forced to the requested language. Whisper's feature extractor works on a
    fixed 30-second window and truncates anything longer, so longer audio is
    decoded in consecutive windows of at most 30 seconds and the partial
    results are joined with single spaces. Audio that fits in one window
    yields exactly the text of a single decode.

    Args:
        audio_file (str): Path to the audio file.
        language (str): Language code (e.g., "en", "es").

    Returns:
        str: Transcribed text ("" when the file contains no samples).
    """
    import math  # local import keeps this block self-contained

    sampling_rate = 16000
    max_window_samples = 30 * sampling_rate

    speech, _ = librosa.load(audio_file, sr=sampling_rate)
    total_samples = len(speech)
    if total_samples == 0:
        return ""

    # Use equally sized windows rather than "30 s + remainder" so the last
    # window is never a tiny sliver of audio.
    window_count = math.ceil(total_samples / max_window_samples)
    window_samples = math.ceil(total_samples / window_count)

    # Force the transcription output to the chosen language.
    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")

    pieces = []
    for start in range(0, total_samples, window_samples):
        segment = speech[start:start + window_samples]
        input_features = processor(
            segment, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
        pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

    return " ".join(pieces)
def transcribe_chunk(chunk, index, language, min_length_ms=100):
    """
    Transcribe an individual audio chunk.

    The chunk is exported to a temporary WAV file, transcribed, and the
    temporary file is removed again even if export or transcription fails.

    Args:
        chunk: Audio chunk; must support ``len()`` and
            ``export(path, format="wav")``.
        index (int): Position of the chunk, used in messages and returned.
        language (str): Language code passed on to ``transcribe``.
        min_length_ms (int): Chunks whose ``len()`` is below this value are
            skipped with a warning.

    Returns:
        tuple: ``(index, transcription)``; the text is "" for skipped chunks.
    """
    if len(chunk) < min_length_ms:
        st.warning(f"Chunk {index} is too short to be processed.")
        return (index, "")

    # Only reserve a unique path here and close the handle right away:
    # writing to a file that is still open elsewhere fails on Windows.
    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name

    # Report the language actually requested instead of relying on a global.
    language_label = LANGUAGES.get(language, language)
    try:
        # export() returns the output file handle; close it so the file can
        # be deleted afterwards.
        chunk.export(temp_audio_file_path, format="wav").close()
        status_placeholder.info(f"Transcribing chunk {index} in {language_label}...")
        transcription = transcribe(temp_audio_file_path, language)
    finally:
        if os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)

    st.write(f"Transcription for chunk {index}: {transcription}")
    return (index, transcription)
def process_audio_chunks(audio_chunks, language):
    """
    Transcribe each audio chunk in sequence and combine the results.

    Chunks are processed in order, so the combined text follows the order of
    ``audio_chunks``. Empty results (skipped or silent chunks) are left out
    and surrounding whitespace is trimmed before joining, so the function
    returns "" - not a string of spaces - when nothing was transcribed.
    The total time taken is reported through the status placeholder.

    Args:
        audio_chunks: Iterable of chunks accepted by ``transcribe_chunk``.
        language (str): Language code passed on to ``transcribe_chunk``.

    Returns:
        str: Non-empty chunk transcriptions joined by single spaces.
    """
    min_length_ms = 100  # minimum duration for processing
    start_transcription = time.time()

    texts = []
    for i, chunk in enumerate(audio_chunks):
        _, text = transcribe_chunk(chunk, i, language, min_length_ms)
        cleaned = text.strip()
        if cleaned:
            texts.append(cleaned)

    total_time = time.time() - start_transcription
    status_placeholder.info(f"All chunks transcribed in {total_time:.2f} seconds.")
    return " ".join(texts)
def save_transcription_to_docx(transcription, audio_file_path):
    """
    Write the transcription into a Word document.

    The document is named after the audio file (directory and extension
    removed) with "_full_transcription.docx" appended, and is saved under
    that relative name. Progress is reported through the status placeholder.

    Args:
        transcription (str): Text stored as a single paragraph.
        audio_file_path (str): Name or path of the source audio file.

    Returns:
        str: File name of the saved document.
    """
    file_name = os.path.basename(audio_file_path)
    stem, _extension = os.path.splitext(file_name)
    output_file_name = stem + "_full_transcription.docx"

    status_placeholder.info("Saving transcription to DOCX...")
    document = Document()
    document.add_paragraph(transcription)
    document.save(output_file_name)
    status_placeholder.info("Transcription saved as DOCX.")
    return output_file_name
st.title("Audio Transcription with Whisper (Local via Hugging Face)")

# Allow uploading of audio or video files.
uploaded_file = st.file_uploader(
    "Upload an audio or video file",
    type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"],
)

# Results are kept in session state so they survive Streamlit's reruns.
if 'transcription' not in st.session_state:
    st.session_state.transcription = None
if 'output_docx_file' not in st.session_state:
    st.session_state.output_docx_file = None
if 'transcription_source' not in st.session_state:
    st.session_state.transcription_source = None

if uploaded_file is not None:
    # A request is identified by file name, file size and language. When any
    # of them changes, the stored result is stale and must be recomputed;
    # otherwise a second upload or a language change would be ignored.
    current_source = (
        uploaded_file.name,
        uploaded_file.getbuffer().nbytes,
        selected_language,
    )
    if st.session_state.transcription_source != current_source:
        st.session_state.transcription = None
        st.session_state.output_docx_file = None
        # Drop the text area's stored widget value so it cannot keep showing
        # the previous transcription.
        st.session_state.pop("transcription_area_final", None)

        st.audio(uploaded_file)

        # Save the upload under a unique temporary name (keeping the
        # extension) so simultaneous sessions cannot overwrite each other.
        file_suffix = os.path.splitext(uploaded_file.name)[1]
        with NamedTemporaryFile(delete=False, suffix=file_suffix) as temp_upload:
            temp_upload.write(uploaded_file.getbuffer())
            temp_audio_file = temp_upload.name

        processing_start = time.time()
        try:
            with st.spinner('Processing audio...'):
                audio_chunks = split_audio_on_silence(temp_audio_file)
                transcription = process_audio_chunks(audio_chunks, selected_language)
        finally:
            # Remove the temporary copy even when processing fails.
            if os.path.exists(temp_audio_file):
                os.remove(temp_audio_file)

        # Remember what was processed so reruns do not repeat the work.
        st.session_state.transcription_source = current_source

        if transcription:
            st.session_state.transcription = transcription
            st.success('Transcription complete!')
            output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
            st.session_state.output_docx_file = output_docx_file
            processing_duration = time.time() - processing_start
            status_placeholder.info(f"Total processing time: {processing_duration:.2f} seconds.")
        else:
            st.warning("No speech could be transcribed from this file.")

if st.session_state.transcription:
    st.text_area("Transcription", st.session_state.transcription, key="transcription_area_final")
    docx_path = st.session_state.output_docx_file
    # The document lives on local disk and may be gone (e.g. after a
    # restart); only offer the download when it is still there.
    if docx_path and os.path.exists(docx_path):
        with open(docx_path, "rb") as docx_file:
            st.download_button(
                label="Download Transcription (.docx)",
                data=docx_file,
                file_name=docx_path,
                mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            )