# speech-to-text / app.py
import streamlit as st
from transformers import pipeline
import pysrt
import os
# Variables (for potential future API integration)
# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security
st.title("Speech-to-Text with Transformers")
with st.expander("README"):
st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).")
# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])
# Model selection
# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium"),
)
# Should we translate to English?
translate = st.checkbox("Would you like a translation to English?")
# Free tier or API token option (more relevant if you were to use an external API like AssemblyAI or OpenAI's API)
# For local model inference on Hugging Face Spaces, "free tier" typically refers to the space's compute limits.
st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
@st.cache_resource
def load_whisper_pipeline(model_name):
"""
Loads the Hugging Face Whisper ASR pipeline.
Uses st.cache_resource to avoid reloading the model on every rerun.
"""
st.info(f"Loading {model_name} model... This may take a moment.")
return pipeline("automatic-speech-recognition", model=model_name)
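
# The resource note above also applies to compute placement: on a GPU-enabled Space the
# pipeline could be put on the GPU explicitly. A minimal sketch, not wired into this app
# and assuming torch is installed alongside transformers:
#
#   import torch
#   device = 0 if torch.cuda.is_available() else -1  # 0 = first CUDA device, -1 = CPU
#   asr = pipeline("automatic-speech-recognition", model=model_name, device=device)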

def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")
        if translate_to_english:
            # When task is 'translate', Whisper models directly translate to English
            prediction = asr_pipeline(audio_file_path, generate_kwargs={"task": "translate"})
        else:
            prediction = asr_pipeline(audio_file_path)
        transcribed_text = prediction["text"]
        st.subheader("Transcription Output:")
        st.write(transcribed_text)

        # Generate SRT content (simplified for demonstration): the transformers pipeline
        # returns a single text string by default, so the whole transcription goes into one
        # caption with a rough end time. For precise, per-segment timings you would request
        # timestamps from the pipeline (or use a library with more granular control, such as
        # stable-whisper) and create one entry per segment; see the illustrative helper
        # after this function.
        srt_content = pysrt.SubRipFile()
        item = pysrt.SubRipItem(
            index=1,
            start=pysrt.SubRipTime(0, 0, 0, 0),
            # Rough end-time estimate: roughly 10 characters of text per second of speech.
            end=pysrt.SubRipTime(0, 0, int(len(transcribed_text) / 10), 0),
            text=transcribed_text,
        )
        srt_content.append(item)

        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding='utf-8')
        st.success("Transcription successful! Download subtitle file?")
        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
        os.remove(srt_file_path)
    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        # Optionally, provide more specific error handling based on the exception type.
        st.info("Common issues: file format not supported, model loading failed, or audio too long for available memory.")
if uploaded_file is not None:
    # Save the uploaded file to a temporary location for the transformers pipeline.
    # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
    with open("temp_audio_file", "wb") as f:
        f.write(uploaded_file.getbuffer())
    audio_file_path = "temp_audio_file"

    transcribe_with_transformers(audio_file_path, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)
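
# Note: as mentioned above, the pipeline also accepts in-memory input such as bytes, so the
# temporary file could in principle be skipped. A hedged sketch of that variant, not used here:
#
#   prediction = load_whisper_pipeline(model_size)(uploaded_file.getvalue())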