Spaces:

vericudebuget
/

subtitle-generator

Runtime error

App Files Files Community

vericudebuget committed on Oct 4, 2024

Commit

ca365ff

verified ·

1 Parent(s): b4bfd19

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -67

app.py CHANGED Viewed

@@ -1,27 +1,50 @@
 import streamlit as st
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from pydub import AudioSegment
 import tempfile
-import torch
 import os
-# Set the device to CPU only
-device = "cpu"
-torch_dtype = torch.float32
-# Initialize session state
-if 'transcription_text' not in st.session_state:
-    st.session_state.transcription_text = None
-if 'srt_content' not in st.session_state:
-    st.session_state.srt_content = None
-@st.cache_resource
-def load_model():
     model_id = "openai/whisper-tiny"
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=False, use_safetensors=True
-    ).to(device)
     processor = AutoProcessor.from_pretrained(model_id)
     pipe = pipeline(
         "automatic-speech-recognition",
         model=model,
@@ -30,65 +53,64 @@ def load_model():
         torch_dtype=torch_dtype,
         device=device,
     )
     return pipe
-def format_srt_time(seconds):
-    hours, remainder = divmod(seconds, 3600)
-    minutes, seconds = divmod(remainder, 60)
-    milliseconds = int((seconds % 1) * 1000)
-    seconds = int(seconds)
-    return f"{int(hours):02}:{int(minutes):02}:{seconds:02},{milliseconds:03}"
-st.title("Audio/Video Transcription App")
-# Load model
-pipe = load_model()
-# File upload
-uploaded_file = st.file_uploader("Upload an audio or video file", type=["mp3", "wav", "mp4", "m4a"])
-if uploaded_file is not None:
-    with st.spinner("Processing audio..."):
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            # If it's a video, extract audio
-            if uploaded_file.name.endswith(("mp4", "m4a")):
-                audio = AudioSegment.from_file(uploaded_file)
-                audio.export(temp_audio.name, format="wav")
             else:
-                audio = AudioSegment.from_file(uploaded_file)
-                audio.export(temp_audio.name, format="wav")
-            # Run the transcription
-            transcription_result = pipe(temp_audio.name, return_timestamps="word")
-            # Extract text and timestamps
-            st.session_state.transcription_text = transcription_result['text']
-            transcription_chunks = transcription_result['chunks']
-            # Generate SRT content
-            srt_content = ""
-            for i, chunk in enumerate(transcription_chunks, start=1):
-                start_time = chunk["timestamp"][0]
-                end_time = chunk["timestamp"][1]
-                text = chunk["text"]
-                srt_content += f"{i}\n"
-                srt_content += f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
-                srt_content += f"{text}\n\n"
-            st.session_state.srt_content = srt_content
-# Display transcription
-if st.session_state.transcription_text:
-    st.subheader("Transcription")
-    st.write(st.session_state.transcription_text)
-    # Provide download for SRT file
-    if st.session_state.srt_content:
-        st.subheader("Download SRT File")
-        st.download_button(
-            label="Download SRT",
-            data=st.session_state.srt_content,
-            file_name="transcription.srt",
-            mime="text/plain"
-        )

+# requirements.txt
+# app.py
 import streamlit as st
+import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import tempfile
 import os
+from moviepy.editor import VideoFileClip
+import datetime
def _format_srt_timestamp(seconds):
    """Render a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Work in whole milliseconds so float noise cannot leak into the output;
    # negative offsets are clamped to zero because SRT has no way to express them.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def create_srt(chunks):
    """Build the contents of an SRT subtitle file from timestamped chunks.

    Args:
        chunks: Iterable of mappings. Each one provides ``'timestamp'``, a
            ``(start, end)`` pair in seconds, and ``'text'``, the cue text.

    Returns:
        A string with one numbered SRT cue per chunk (numbering starts at 1),
        each cue followed by a blank line. An empty input yields ``""``.
    """
    cues = []
    previous_end = 0.0
    for index, chunk in enumerate(chunks, start=1):
        timestamp = chunk['timestamp']
        start, end = timestamp[0], timestamp[1]
        # A boundary may be missing (None) -- presumably when the recogniser
        # could not close the last segment; TODO confirm against the pipeline.
        # Fall back to the nearest known time instead of crashing.
        if start is None:
            start = previous_end
        if end is None:
            end = start
        previous_end = end
        cues.append(
            f"{index}\n"
            f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
            f"{chunk['text']}\n\n"
        )
    return "".join(cues)
def extract_audio(video_path):
    """Write the audio track of a video to a temporary MP3 file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.mp3`` file. The caller is responsible
        for deleting it once it is no longer needed.

    Raises:
        ValueError: If the video has no audio track.
    """
    # mkstemp hands back an already-open descriptor; only the path is needed
    # (the audio is written by path below), so close it to avoid leaking it.
    fd, temp_audio_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    try:
        with VideoFileClip(video_path) as video:
            audio = video.audio
            if audio is None:
                raise ValueError(f"No audio track found in {video_path!r}")
            audio.write_audiofile(temp_audio_path)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        raise
    return temp_audio_path
+def setup_model():
+    device = "cpu"
+    torch_dtype = torch.float32
     model_id = "openai/whisper-tiny"
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
+    )
+    model.to(device)
     processor = AutoProcessor.from_pretrained(model_id)
     pipe = pipeline(
         "automatic-speech-recognition",
         model=model,
         torch_dtype=torch_dtype,
         device=device,
     )
     return pipe
def main():
    """Run the Streamlit page: upload a file, transcribe it, offer an SRT download.

    The speech-recognition pipeline is built once per session and kept in
    ``st.session_state.pipe``. Uploaded files are written to a temporary
    directory that is removed again even when transcription fails.
    """
    st.title("Audio/Video Transcription App")
    # Initialize session state for model
    if 'pipe' not in st.session_state:
        with st.spinner("Loading model... This might take a few minutes."):
            st.session_state.pipe = setup_model()
    uploaded_file = st.file_uploader("Upload an audio or video file", type=['mp3', 'wav', 'mp4', 'avi', 'mov'])
    if uploaded_file is None:
        return

    is_video = uploaded_file.type.startswith('video')
    with st.spinner("Processing file..."):
        # TemporaryDirectory removes the saved upload on every exit path.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Keep only the final path component so the upload's name cannot
            # point outside the temporary directory.
            temp_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
            with open(temp_path, 'wb') as f:
                f.write(uploaded_file.getvalue())
            # Extract audio if it's a video file
            audio_path = extract_audio(temp_path) if is_video else temp_path
            try:
                # return_timestamps is passed as a pipeline call argument (not
                # inside generate_kwargs) so that post-processing is asked to
                # include the "chunks" list that create_srt consumes.
                result = st.session_state.pipe(
                    audio_path,
                    return_timestamps=True,
                    chunk_length_s=30,
                    batch_size=8,
                )
            finally:
                # The extracted audio lives outside temp_dir; remove it here.
                if is_video and os.path.exists(audio_path):
                    os.remove(audio_path)
        # Display results
        st.subheader("Transcription:")
        st.write(result["text"])
        # Create and offer SRT download
        srt_content = create_srt(result["chunks"])
        st.download_button(
            label="Download SRT file",
            data=srt_content,
            file_name="transcription.srt",
            mime="text/plain"
        )


if __name__ == "__main__":
    main()