# vericudebuget's picture
# Update app.py
# ca365ff verified
# requirements.txt
# app.py
import streamlit as st
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import tempfile
import os
from moviepy.editor import VideoFileClip
import datetime
def _format_srt_timestamp(seconds):
    """Render a duration in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Work in integer milliseconds so rounding happens exactly once and
    # negative inputs cannot produce a malformed timestamp.
    total_ms = max(0, round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def create_srt(chunks):
    """Build the contents of an SRT subtitle file from transcription chunks.

    Args:
        chunks: Iterable of dicts, each with a ``'timestamp'`` pair
            ``(start, end)`` in seconds and a ``'text'`` string. Either
            timestamp may be ``None``.

    Returns:
        The SRT document as a string: one numbered cue per chunk (numbering
        starts at 1), each followed by a blank line. An empty input yields
        an empty string.
    """
    cues = []
    previous_end = 0.0
    for index, chunk in enumerate(chunks, start=1):
        start, end = chunk['timestamp']
        # A missing start falls back to the end of the previous cue; a
        # missing end collapses the cue to its start instead of crashing.
        if start is None:
            start = previous_end
        if end is None:
            end = start
        previous_end = end
        cues.append(
            f"{index}\n"
            f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
            f"{chunk['text'].strip()}\n\n"
        )
    return "".join(cues)
def extract_audio(video_path):
    """Write the audio track of a video to a temporary MP3 file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.mp3`` file. The caller is responsible
        for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    fd, temp_audio_path = tempfile.mkstemp(suffix='.mp3')
    # mkstemp returns an open OS-level descriptor; close it right away so it
    # is not leaked and the path can be reopened for writing.
    os.close(fd)
    try:
        with VideoFileClip(video_path) as video:
            audio = video.audio
            if audio is None:
                raise ValueError("The video file does not contain an audio track.")
            audio.write_audiofile(temp_audio_path)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        raise
    return temp_audio_path
def setup_model():
    """Create the speech-recognition pipeline used for transcription.

    Loads the ``openai/whisper-tiny`` checkpoint in float32, moves it to the
    CPU, and wraps it together with its processor's tokenizer and feature
    extractor in an ``automatic-speech-recognition`` pipeline.

    Returns:
        The configured pipeline object.
    """
    checkpoint = "openai/whisper-tiny"
    target_device = "cpu"
    dtype = torch.float32

    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
        checkpoint,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    whisper.to(target_device)

    # The processor bundles the tokenizer and the feature extractor.
    whisper_processor = AutoProcessor.from_pretrained(checkpoint)

    return pipeline(
        "automatic-speech-recognition",
        model=whisper,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        torch_dtype=dtype,
        device=target_device,
    )
def main():
    """Run the Streamlit page: upload a file, transcribe it, offer an SRT.

    The model pipeline is built once and kept in ``st.session_state``. The
    uploaded file is written to a temporary directory, its audio is
    extracted first when the upload is a video, and the transcription text
    plus an SRT download button are rendered. Temporary files are removed
    even when extraction or transcription fails.
    """
    st.title("Audio/Video Transcription App")

    # Initialize session state for model
    if 'pipe' not in st.session_state:
        with st.spinner("Loading model... This might take a few minutes."):
            st.session_state.pipe = setup_model()

    uploaded_file = st.file_uploader(
        "Upload an audio or video file",
        type=['mp3', 'wav', 'mp4', 'avi', 'mov'],
    )
    if uploaded_file is None:
        return

    is_video = (uploaded_file.type or '').startswith('video')
    extracted_audio_path = None

    with st.spinner("Processing file..."):
        # TemporaryDirectory removes the directory and the saved upload on
        # exit, including when an exception propagates.
        with tempfile.TemporaryDirectory() as temp_dir:
            # basename() keeps a client-supplied name with path separators
            # from escaping the temporary directory.
            safe_name = os.path.basename(uploaded_file.name) or "upload"
            temp_path = os.path.join(temp_dir, safe_name)
            with open(temp_path, 'wb') as f:
                f.write(uploaded_file.getvalue())

            try:
                # Extract audio if it's a video file
                if is_video:
                    extracted_audio_path = extract_audio(temp_path)
                    audio_path = extracted_audio_path
                else:
                    audio_path = temp_path

                # return_timestamps is passed as a pipeline call argument
                # so the result carries the "chunks" list used for the SRT.
                result = st.session_state.pipe(
                    audio_path,
                    return_timestamps=True,
                    chunk_length_s=30,
                    batch_size=8,
                )
            finally:
                # The extracted audio lives outside temp_dir, so it needs
                # its own cleanup.
                if extracted_audio_path is not None and os.path.exists(extracted_audio_path):
                    os.remove(extracted_audio_path)

    # Display results
    st.subheader("Transcription:")
    st.write(result["text"])

    # Create and offer SRT download
    srt_content = create_srt(result.get("chunks") or [])
    st.download_button(
        label="Download SRT file",
        data=srt_content,
        file_name="transcription.srt",
        mime="text/plain",
    )
# Entry point: render the page when this file is executed as a script.
if __name__ == "__main__":
    main()