yuki-2025 committed on
Commit
abfcb93
·
1 Parent(s): b8f6b6e
Files changed (4) hide show
  1. app.py +155 -0
  2. audio_utils.py +120 -0
  3. packages.txt +1 -0
  4. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import os
4
+ from io import StringIO
5
+ import torch
6
+ import numpy as np
7
+ import logging
8
+ import datetime
9
+ import time
10
+ import psutil
11
+ from audio_utils import format_timestamp, generate_srt, transcribe_audio, set_logger
12
+
13
+ # Configure logging
14
class StreamlitHandler(logging.Handler):
    """Logging handler that mirrors every record into a Streamlit placeholder.

    All formatted records are accumulated in an in-memory buffer and the whole
    buffer is re-rendered into ``placeholder`` (via its ``code`` method) each
    time a new record arrives, so the page shows a growing log.

    Args:
        placeholder: Object exposing ``code(text)``; in the app this is the
            element returned by ``st.empty()``.
    """

    def __init__(self, placeholder):
        super().__init__()
        self.placeholder = placeholder
        # Accumulates the full log text shown in the placeholder.
        self.log_output = StringIO()

    def emit(self, record):
        """Append the timestamped record to the buffer and refresh the placeholder."""
        try:
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            log_entry = f"{timestamp} - {self.format(record)}"
            self.log_output.write(log_entry + '\n')
            self.placeholder.code(self.log_output.getvalue())
        except Exception:
            # A broken UI element must never crash the code that merely logged
            # something; defer to the standard logging error policy instead.
            self.handleError(record)
24
+
25
# Configure the root logger: INFO level, lines formatted as "<LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
# Module-level logger; main() attaches a StreamlitHandler to it and passes it to audio_utils.
logger = logging.getLogger(__name__)
27
+
28
+
29
def get_gpu_info():
    """Return a one-line description of CUDA device 0, or a 'not available' notice.

    Returns:
        str: ``"GPU: <name>, Total Memory: <x.xx> GB"`` when CUDA is usable,
        otherwise ``"GPU: Not available"``.
    """
    if not torch.cuda.is_available():
        return "GPU: Not available"

    props = torch.cuda.get_device_properties(0)
    total_gb = props.total_memory / 1e9  # bytes -> decimal gigabytes
    return f"GPU: {props.name}, Total Memory: {total_gb:.2f} GB"
34
+
35
def get_cpu_info():
    """Return a one-line description of the host CPU.

    ``psutil.cpu_freq()`` and ``psutil.cpu_count()`` may return ``None`` when
    the platform cannot report the value (common in containers and VMs), so
    each field falls back to ``"unknown"`` instead of raising.

    Returns:
        str: ``"CPU: <n> physical cores, <m> logical cores, Max Frequency: <f> MHz"``,
        with ``unknown`` substituted for any value psutil could not determine.
    """
    cpu_freq = psutil.cpu_freq()
    physical = psutil.cpu_count(logical=False)
    logical = psutil.cpu_count(logical=True)

    physical_text = "unknown" if physical is None else physical
    logical_text = "unknown" if logical is None else logical
    freq_text = "unknown" if cpu_freq is None else f"{cpu_freq.max:.2f} MHz"

    return (
        f"CPU: {physical_text} physical cores, {logical_text} logical cores, "
        f"Max Frequency: {freq_text}"
    )
40
+
41
def main():
    """Render the subtitle-generator page and handle one Streamlit script run.

    Flow:
      1. Draw the input form (file upload, Whisper model, language).
      2. Attach a fresh ``StreamlitHandler`` to the module logger so log
         records appear in the "Logs" section, and share the logger with
         ``audio_utils`` via ``set_logger``.
      3. On submit, write the upload to a temporary file, transcribe it and
         store the result in ``st.session_state["subtitle_result"]``.
      4. Render the stored result (transcription, language, SRT text) together
         with a download button. Keeping the result in session state lets it
         survive the rerun that Streamlit triggers when the download button
         is clicked.
    """
    st.set_page_config(page_title="Video Subtitle Generator", page_icon="🎬")
    st.markdown("""
        <style>

        .stButton > button {
            background-color: #4CAF50;
            color: white;
            font-size: 16px;
            padding: 10px 20px;
            border-radius: 5px;
            border: none;
        }
        </style>
        """, unsafe_allow_html=True)

    st.title("Video Subtitle Generator")
    st.markdown("Generate subtitles from an audio/video file, using OpenAI's Whisper model.")

    # Input section
    st.header("Input")

    with st.form(key='subtitle_form'):
        # Two columns with a 2:1 width ratio.
        col1, col2 = st.columns([2, 1])

        # Wider column (2/3 width): the file to transcribe.
        with col1:
            uploaded_file = st.file_uploader("Upload video/audio file", type=["mp3", "wav", "mp4"])

        # Narrower column (1/3 width): transcription options.
        with col2:
            model_name = st.selectbox("Choose Whisper model", [
                "openai/whisper-base",
                "openai/whisper-tiny",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large"
            ])
            language = st.selectbox("Choose language", ["en", "fr", "de", "es", "it", "ja", "ko", "pt", "ru", "zh"])

        submit_button = st.form_submit_button(label='Generate Subtitles')

    st.subheader("Logs")
    # Placeholder that the log handler keeps overwriting with the full log text.
    logs_placeholder = st.empty()

    # The logger object outlives individual script reruns, so handlers from
    # earlier runs would pile up and duplicate every line. The handler class is
    # re-created on each rerun, hence the comparison by class name rather than
    # isinstance().
    stale_handlers = [h for h in logger.handlers if type(h).__name__ == "StreamlitHandler"]
    for stale in stale_handlers:
        logger.removeHandler(stale)
    streamlit_handler = StreamlitHandler(logs_placeholder)
    logger.addHandler(streamlit_handler)
    set_logger(logger)

    # Handle form submission
    if submit_button:
        if uploaded_file is None:
            logger.warning("No file uploaded")
            st.error("Please upload an audio/video file")
        else:
            start_time = time.time()
            suffix = os.path.splitext(uploaded_file.name)[1]
            # delete=False: the file must still exist after the handle is
            # closed so the transcription code can reopen it by path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            try:
                with st.spinner("Processing..."):
                    logger.info("Starting transcription process...")
                    full_text, srt_content = transcribe_audio(model_name, tmp_file_path, language=language)
            finally:
                # Always remove the temporary copy, even if transcription raises.
                os.unlink(tmp_file_path)

            if full_text and srt_content:
                st.session_state["subtitle_result"] = {
                    "full_text": full_text,
                    "srt_content": srt_content,
                    "language": language,
                }
                logger.info("Processing completed successfully")
                total_time = time.time() - start_time
                logger.info(f"Total processing time: {total_time:.2f} seconds")
            else:
                # Do not keep showing a previous result next to a failure message.
                st.session_state.pop("subtitle_result", None)
                logger.error("Transcription failed. No text or subtitle was generated.")
                st.error("Transcription failed. No text or subtitle was generated.")

    # Output section: rendered from session state so it persists across reruns.
    result = st.session_state.get("subtitle_result")
    if result:
        st.header("Output")

        st.subheader("Full Transcription")
        st.text_area(
            "Full Transcription",
            value=result["full_text"],
            height=200,
            label_visibility="collapsed",
        )

        st.subheader("Detected Language")
        st.write(result["language"])

        st.subheader("Subtitles (SRT format)")
        st.text_area(
            "Subtitles (SRT format)",
            value=result["srt_content"],
            height=200,
            label_visibility="collapsed",
        )

        # st.download_button is not permitted inside st.form, so the output is
        # rendered as plain page elements rather than a second form.
        st.download_button(
            label="Download Subtitles",
            data=result["srt_content"],
            file_name="subtitles.srt",
            mime="text/plain"
        )
153
+
154
# Build the page only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
audio_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import datetime
3
+ import numpy as np
4
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer
5
+ from pydub import AudioSegment
6
+ import logging
7
+
8
# Set the path to FFmpeg and FFprobe.
# These are hard-coded absolute paths.
# NOTE(review): presumably where the `ffmpeg` system package listed in
# packages.txt installs its binaries — confirm on the deployment host.
ffmpeg_path = "/usr/bin/ffmpeg"
ffprobe_path = "/usr/bin/ffprobe"

# Set the paths for pydub by overriding class-level attributes on AudioSegment.
# NOTE(review): verify that pydub actually consults `AudioSegment.ffprobe`;
# `converter` is the attribute assumed to drive decoding — TODO confirm.
AudioSegment.converter = ffmpeg_path
AudioSegment.ffmpeg = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

# Initialize logger (module default; set_logger() below can replace it at runtime).
logger = logging.getLogger(__name__)
19
+
20
def set_logger(new_logger):
    """Swap the module-level ``logger`` for ``new_logger``.

    Lets the caller route this module's log records through its own logger
    (and therefore its own handlers).
    """
    global logger
    logger = new_logger
23
+
24
def format_timestamp(milliseconds):
    """Convert a millisecond offset to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        milliseconds: Offset from the start of the media, as int or float.
            Fractional values are rounded to the nearest millisecond and
            negative values are clamped to zero.

    Returns:
        str: Timestamp such as ``"01:02:03,004"``. The hour field is not
        wrapped at 24, so offsets of a day or more stay monotonic.
    """
    total_ms = max(0, int(round(milliseconds)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
30
+
31
def generate_srt(transcriptions, chunk_length_ms, subtitle_duration_ms=5000):
    """Generate SRT content from transcribed chunks with a fixed cue duration.

    Each chunk's time span is divided into slots of ``subtitle_duration_ms``;
    the chunk's words are spread over at most that many cues, so every cue
    starts and ends inside its own chunk. No word-level timing is available,
    so cue times are an even approximation.

    Args:
        transcriptions: Sequence of strings, one per audio chunk, in order.
        chunk_length_ms: Duration of each audio chunk in milliseconds.
        subtitle_duration_ms: Target duration of a single cue in milliseconds.

    Returns:
        str: The SRT document (empty string if there are no words at all).
    """
    entries = []
    srt_index = 1
    # At least one slot, even if a chunk is shorter than one cue.
    num_subtitles = max(1, int(chunk_length_ms / subtitle_duration_ms))

    for i, chunk_text in enumerate(transcriptions):
        chunk_start_time = i * chunk_length_ms
        chunk_end_time = (i + 1) * chunk_length_ms

        words = chunk_text.split()

        # Ceiling division: guarantees the number of cues never exceeds the
        # number of slots, so no cue can start at or beyond the chunk end.
        words_per_subtitle = max(1, -(-len(words) // num_subtitles))

        for slot, j in enumerate(range(0, len(words), words_per_subtitle)):
            subtitle_text = " ".join(words[j:j + words_per_subtitle])

            start_time = chunk_start_time + slot * subtitle_duration_ms
            end_time = min(start_time + subtitle_duration_ms, chunk_end_time)

            entries.append(
                f"{srt_index}\n"
                f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
                f"{subtitle_text}\n\n"
            )
            srt_index += 1

    return "".join(entries)
60
+
61
def transcribe_audio(model_name, audio_path, language='en', chunk_length_ms=30000):
    """Transcribe an audio/video file with a Whisper model and build SRT subtitles.

    The audio is decoded with pydub, converted to 16 kHz mono, cut into
    fixed-length chunks and each chunk is transcribed independently.

    Args:
        model_name: Model identifier passed to ``from_pretrained``
            (e.g. ``"openai/whisper-base"``).
        audio_path: Path of the media file to decode.
        language: Language code passed to the decoder prompt.
        chunk_length_ms: Length of each transcription chunk in milliseconds.

    Returns:
        tuple: ``(full_text, srt_content)`` on success, where ``full_text`` is
        the chunk transcriptions joined by spaces; ``(None, None)`` if any
        exception occurs (the error is logged, not raised).
    """
    try:
        # Check if CUDA is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # Load model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        processor = WhisperProcessor.from_pretrained(model_name)
        tokenizer = WhisperTokenizer.from_pretrained(model_name)

        # The decoder prompt depends only on language/task, so build it once
        # instead of once per chunk.
        forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")

        # Load audio
        audio = AudioSegment.from_file(audio_path)

        # Resample to 16000 Hz and downmix to mono. pydub returns multi-channel
        # samples interleaved in one flat array, which would otherwise be fed
        # to the model as if it were a single, twice-as-long mono signal.
        audio = audio.set_frame_rate(16000).set_channels(1)

        chunk_transcriptions = []

        # Process audio in chunks (pydub slices and len() are in milliseconds).
        for chunk_number, offset_ms in enumerate(range(0, len(audio), chunk_length_ms), start=1):
            chunk = audio[offset_ms:offset_ms + chunk_length_ms]

            # Convert chunk to numpy array
            chunk_array = np.array(chunk.get_array_of_samples()).astype(np.float32)

            # Peak-normalize; skip the division for a silent (all-zero) or
            # empty chunk, which would otherwise yield NaNs.
            peak = np.max(np.abs(chunk_array)) if chunk_array.size else 0.0
            if peak > 0:
                chunk_array = chunk_array / peak

            # Process audio chunk
            input_features = processor(chunk_array, sampling_rate=16000, return_tensors="pt").input_features
            input_features = input_features.to(device)

            # Generate token ids
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

            # Decode token ids to text
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
            chunk_text = transcription[0].strip()
            chunk_transcriptions.append(chunk_text)

            # Report progress through the shared logger so it reaches the
            # handlers the caller installed, not just stdout.
            logger.info(f"Chunk {chunk_number} transcription: {chunk_text}")

        # Combine all chunk transcriptions
        full_text = " ".join(chunk_transcriptions)

        # Generate SRT content with 5-second subtitles
        srt_content = generate_srt(chunk_transcriptions, chunk_length_ms, subtitle_duration_ms=5000)

        return full_text, srt_content

    except Exception as e:
        # Best-effort contract: callers check for (None, None). Log with the
        # traceback so failures remain diagnosable.
        logger.exception(f"An error occurred during transcription: {str(e)}")
        return None, None
120
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ numpy
4
+ streamlit
5
+ transformers
6
+ pydub
7
+ ffmpeg-python
8
+ psutil