Spaces: Runtime error
yuki-2025 committed · abfcb93
1 Parent(s): b8f6b6e
commit1
- app.py +155 -0
- audio_utils.py +120 -0
- packages.txt +1 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,155 @@
import streamlit as st
import tempfile
import os
from io import StringIO
import torch
import logging
import datetime
import time
import psutil
from audio_utils import transcribe_audio, set_logger

# Route log records into a Streamlit placeholder so progress shows on the page
class StreamlitHandler(logging.Handler):
    def __init__(self, placeholder):
        super().__init__()
        self.placeholder = placeholder
        self.log_output = StringIO()

    def emit(self, record):
        log_entry = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {self.format(record)}"
        self.log_output.write(log_entry + '\n')
        self.placeholder.code(self.log_output.getvalue())

logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def get_gpu_info():
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        return f"GPU: {gpu.name}, Total Memory: {gpu.total_memory / 1e9:.2f} GB"
    return "GPU: Not available"

def get_cpu_info():
    cpu_info = psutil.cpu_freq()
    cpu_count = psutil.cpu_count(logical=False)
    cpu_logical_count = psutil.cpu_count(logical=True)
    return f"CPU: {cpu_count} physical cores, {cpu_logical_count} logical cores, Max Frequency: {cpu_info.max:.2f} MHz"

def main():
    st.set_page_config(page_title="Video Subtitle Generator", page_icon="🎬")
    st.markdown("""
        <style>
        .stButton > button {
            background-color: #4CAF50;
            color: white;
            font-size: 16px;
            padding: 10px 20px;
            border-radius: 5px;
            border: none;
        }
        </style>
    """, unsafe_allow_html=True)

    st.title("Video Subtitle Generator")
    st.markdown("Generate subtitles from an audio/video file, using OpenAI's Whisper model.")

    # Input section
    st.header("Input")

    with st.form(key='subtitle_form'):
        # Create two columns with a 2:1 ratio
        col1, col2 = st.columns([2, 1])

        # Wider column (2/3 width): file upload
        with col1:
            uploaded_file = st.file_uploader("Upload video/audio file", type=["mp3", "wav", "mp4"])

        # Narrower column (1/3 width): model and language selection
        with col2:
            model_name = st.selectbox("Choose Whisper model", [
                "openai/whisper-base",
                "openai/whisper-tiny",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large"
            ])
            language = st.selectbox("Choose language", ["en", "fr", "de", "es", "it", "ja", "ko", "pt", "ru", "zh"])

        # Submit button for the form
        submit_button = st.form_submit_button(label='Generate Subtitles')

    st.subheader("Logs")
    # Placeholder that the StreamlitHandler writes into
    logs_placeholder = st.empty()

    # Attach the Streamlit handler and share the logger with audio_utils
    streamlit_handler = StreamlitHandler(logs_placeholder)
    logger.addHandler(streamlit_handler)
    set_logger(logger)

    # Handle form submission
    if submit_button:
        if uploaded_file is not None:
            start_time = time.time()
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            with st.spinner("Processing..."):
                logger.info("Starting transcription process...")
                full_text, srt_content = transcribe_audio(model_name, tmp_file_path, language=language)

            if full_text and srt_content:
                # Output section
                st.header("Output")

                st.subheader("Full Transcription")
                st.text_area("Full transcription", value=full_text, height=200,
                             label_visibility="collapsed")

                st.subheader("Language")
                st.write(language)

                st.subheader("Subtitles (SRT format)")
                st.text_area("Subtitles", value=srt_content, height=200,
                             label_visibility="collapsed")

                logger.info("Processing completed successfully")

                # Download button for the subtitles. st.download_button cannot
                # live inside an st.form, so it is offered directly here.
                st.download_button(
                    label="Download Subtitles",
                    data=srt_content,
                    file_name="subtitles.srt",
                    mime="text/plain"
                )

                end_time = time.time()
                total_time = end_time - start_time
                logger.info(f"Total processing time: {total_time:.2f} seconds")

            else:
                logger.error("Transcription failed. No text or subtitle was generated.")
                st.error("Transcription failed. No text or subtitle was generated.")

            os.unlink(tmp_file_path)
        else:
            logger.warning("No file uploaded")
            st.error("Please upload an audio/video file")

if __name__ == "__main__":
    main()
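The app drives everything through transcribe_audio from audio_utils.py (added below). A minimal headless sketch of the same path, assuming a local ffmpeg-readable file named sample.wav (a hypothetical placeholder) and the small whisper-tiny checkpoint from the app's model list:

    from audio_utils import transcribe_audio

    full_text, srt_content = transcribe_audio(
        "openai/whisper-tiny", "sample.wav", language="en"
    )
    if srt_content:
        # Persist the subtitles, as the app's download button would
        with open("subtitles.srt", "w", encoding="utf-8") as f:
            f.write(srt_content)
        print(full_text)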
audio_utils.py
ADDED
@@ -0,0 +1,120 @@
import torch
import datetime
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer
from pydub import AudioSegment
import logging

# Set the path to FFmpeg and FFprobe
ffmpeg_path = "/usr/bin/ffmpeg"
ffprobe_path = "/usr/bin/ffprobe"

# Point pydub at those binaries
AudioSegment.converter = ffmpeg_path
AudioSegment.ffmpeg = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

# Initialize logger; the Streamlit app swaps in its own via set_logger()
logger = logging.getLogger(__name__)

def set_logger(new_logger):
    global logger
    logger = new_logger

def format_timestamp(milliseconds):
    """Convert milliseconds to the SRT timestamp format HH:MM:SS,mmm."""
    milliseconds = int(milliseconds)
    delta = datetime.timedelta(milliseconds=milliseconds)
    hours, remainder = divmod(int(delta.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds % 1000:03d}"

def generate_srt(transcriptions, chunk_length_ms, subtitle_duration_ms=5000):
    """Generate SRT content from transcribed chunks with a fixed subtitle duration."""
    srt_output = ""
    srt_index = 1
    for i, chunk_text in enumerate(transcriptions):
        chunk_start_time = i * chunk_length_ms
        chunk_end_time = (i + 1) * chunk_length_ms

        # Split chunk text into words
        words = chunk_text.split()

        # Spread the chunk's words evenly across its subtitles
        num_subtitles = max(1, int(chunk_length_ms / subtitle_duration_ms))
        words_per_subtitle = max(1, len(words) // num_subtitles)

        for j in range(0, len(words), words_per_subtitle):
            subtitle_words = words[j:j+words_per_subtitle]
            subtitle_text = " ".join(subtitle_words)

            start_time = chunk_start_time + (j // words_per_subtitle) * subtitle_duration_ms
            end_time = min(start_time + subtitle_duration_ms, chunk_end_time)

            srt_output += f"{srt_index}\n"
            srt_output += f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            srt_output += f"{subtitle_text}\n\n"

            srt_index += 1

    return srt_output

def transcribe_audio(model_name, audio_path, language='en', chunk_length_ms=30000):
    try:
        # Check if CUDA is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # Load model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        processor = WhisperProcessor.from_pretrained(model_name)
        tokenizer = WhisperTokenizer.from_pretrained(model_name)

        # Load audio and resample to 16 kHz mono, which Whisper expects
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # Collect per-chunk transcriptions
        chunk_transcriptions = []

        # Process audio in chunks
        for i in range(0, len(audio), chunk_length_ms):
            chunk = audio[i:i+chunk_length_ms]

            # Convert chunk to a float32 numpy array
            chunk_array = np.array(chunk.get_array_of_samples()).astype(np.float32)

            # Normalize to [-1, 1], guarding against all-silent chunks
            peak = np.max(np.abs(chunk_array))
            if peak > 0:
                chunk_array = chunk_array / peak

            # Extract input features for the chunk
            input_features = processor(chunk_array, sampling_rate=16000, return_tensors="pt").input_features
            input_features = input_features.to(device)

            # Generate token ids, forcing the target language and task
            forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

            # Decode token ids to text
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
            chunk_text = transcription[0].strip()
            chunk_transcriptions.append(chunk_text)

            # Log each chunk's transcription as it arrives
            logger.info(f"Chunk {i // chunk_length_ms + 1} transcription: {chunk_text}")

        # Combine all chunk transcriptions
        full_text = " ".join(chunk_transcriptions)

        # Generate SRT content with 5-second subtitles
        srt_content = generate_srt(chunk_transcriptions, chunk_length_ms, subtitle_duration_ms=5000)

        return full_text, srt_content

    except Exception as e:
        logger.error(f"An error occurred during transcription: {str(e)}")
        return None, None
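For a feel of the subtitle timing math: with the defaults (chunk_length_ms=30000, subtitle_duration_ms=5000), each 30-second chunk yields up to six 5-second cues, with the chunk's words divided evenly among them. A small sketch using toy data, no model or audio needed:

    from audio_utils import format_timestamp, generate_srt

    print(format_timestamp(61500))  # -> 00:01:01,500

    chunks = ["this is a quick test of the srt generator with twelve words"]
    print(generate_srt(chunks, chunk_length_ms=30000, subtitle_duration_ms=5000))
    # Twelve words over six cues -> two words per cue; the first cue is:
    # 1
    # 00:00:00,000 --> 00:00:05,000
    # this is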
packages.txt
ADDED
@@ -0,0 +1 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
torchvision
numpy
streamlit
transformers
pydub
ffmpeg-python
psutil