Spaces: Runtime error
yuki-2025 committed · abfcb93
1 Parent(s): b8f6b6e
commit1
- app.py +155 -0
- audio_utils.py +120 -0
- packages.txt +1 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,155 @@
import streamlit as st
import tempfile
import os
from io import StringIO
import torch
import logging
import datetime
import time
import psutil
from audio_utils import transcribe_audio, set_logger

# Route log records into a Streamlit placeholder so progress shows on the page
class StreamlitHandler(logging.Handler):
    def __init__(self, placeholder):
        super().__init__()
        self.placeholder = placeholder
        self.log_output = StringIO()

    def emit(self, record):
        log_entry = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {self.format(record)}"
        self.log_output.write(log_entry + '\n')
        self.placeholder.code(self.log_output.getvalue())

logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def get_gpu_info():
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        return f"GPU: {gpu.name}, Total Memory: {gpu.total_memory / 1e9:.2f} GB"
    return "GPU: Not available"

def get_cpu_info():
    cpu_info = psutil.cpu_freq()
    cpu_count = psutil.cpu_count(logical=False)
    cpu_logical_count = psutil.cpu_count(logical=True)
    return f"CPU: {cpu_count} physical cores, {cpu_logical_count} logical cores, Max Frequency: {cpu_info.max:.2f} MHz"

def main():
    st.set_page_config(page_title="Video Subtitle Generator", page_icon="🎬")
    st.markdown("""
        <style>
        .stButton > button {
            background-color: #4CAF50;
            color: white;
            font-size: 16px;
            padding: 10px 20px;
            border-radius: 5px;
            border: none;
        }
        </style>
    """, unsafe_allow_html=True)

    st.title("Video Subtitle Generator")
    st.markdown("Generate subtitles from an audio/video file, using OpenAI's Whisper model.")

    # Input section
    st.header("Input")

    with st.form(key='subtitle_form'):
        # Create two columns with a 2:1 ratio
        col1, col2 = st.columns([2, 1])

        # Wider column (2/3 width): file upload
        with col1:
            uploaded_file = st.file_uploader("Upload video/audio file", type=["mp3", "wav", "mp4"])

        # Narrower column (1/3 width): model and language selection
        with col2:
            model_name = st.selectbox("Choose Whisper model", [
                "openai/whisper-base",
                "openai/whisper-tiny",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large"
            ])
            language = st.selectbox("Choose language", ["en", "fr", "de", "es", "it", "ja", "ko", "pt", "ru", "zh"])

        # Submit button for the form
        submit_button = st.form_submit_button(label='Generate Subtitles')

    st.subheader("Logs")
    # Placeholder that the StreamlitHandler writes into
    logs_placeholder = st.empty()

    # Attach the Streamlit handler and share the logger with audio_utils
    streamlit_handler = StreamlitHandler(logs_placeholder)
    logger.addHandler(streamlit_handler)
    set_logger(logger)

    # Handle form submission
    if submit_button:
        if uploaded_file is not None:
            start_time = time.time()
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            with st.spinner("Processing..."):
                logger.info("Starting transcription process...")
                full_text, srt_content = transcribe_audio(model_name, tmp_file_path, language=language)

            if full_text and srt_content:
                # Output section
                st.header("Output")

                st.subheader("Full Transcription")
                st.text_area("Full transcription", value=full_text, height=200,
                             label_visibility="collapsed")

                st.subheader("Language")
                st.write(language)

                st.subheader("Subtitles (SRT format)")
                st.text_area("Subtitles", value=srt_content, height=200,
                             label_visibility="collapsed")

                logger.info("Processing completed successfully")

                # Download button for the subtitles. st.download_button cannot
                # live inside an st.form, so it is offered directly here.
                st.download_button(
                    label="Download Subtitles",
                    data=srt_content,
                    file_name="subtitles.srt",
                    mime="text/plain"
                )

                end_time = time.time()
                total_time = end_time - start_time
                logger.info(f"Total processing time: {total_time:.2f} seconds")

            else:
                logger.error("Transcription failed. No text or subtitle was generated.")
                st.error("Transcription failed. No text or subtitle was generated.")

            os.unlink(tmp_file_path)
        else:
            logger.warning("No file uploaded")
            st.error("Please upload an audio/video file")

if __name__ == "__main__":
    main()
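The app drives everything through transcribe_audio from audio_utils.py (added below). A minimal headless sketch of the same path, assuming a local ffmpeg-readable file named sample.wav (a hypothetical placeholder) and the small whisper-tiny checkpoint from the app's model list:

    from audio_utils import transcribe_audio

    full_text, srt_content = transcribe_audio(
        "openai/whisper-tiny", "sample.wav", language="en"
    )
    if srt_content:
        # Persist the subtitles, as the app's download button would
        with open("subtitles.srt", "w", encoding="utf-8") as f:
            f.write(srt_content)
        print(full_text)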
audio_utils.py
ADDED
@@ -0,0 +1,120 @@
import torch
import datetime
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer
from pydub import AudioSegment
import logging

# Set the path to FFmpeg and FFprobe
ffmpeg_path = "/usr/bin/ffmpeg"
ffprobe_path = "/usr/bin/ffprobe"

# Point pydub at those binaries
AudioSegment.converter = ffmpeg_path
AudioSegment.ffmpeg = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

# Initialize logger; the Streamlit app swaps in its own via set_logger()
logger = logging.getLogger(__name__)

def set_logger(new_logger):
    global logger
    logger = new_logger

def format_timestamp(milliseconds):
    """Convert milliseconds to the SRT timestamp format HH:MM:SS,mmm."""
    milliseconds = int(milliseconds)
    delta = datetime.timedelta(milliseconds=milliseconds)
    hours, remainder = divmod(int(delta.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds % 1000:03d}"

def generate_srt(transcriptions, chunk_length_ms, subtitle_duration_ms=5000):
    """Generate SRT content from transcribed chunks with a fixed subtitle duration."""
    srt_output = ""
    srt_index = 1
    for i, chunk_text in enumerate(transcriptions):
        chunk_start_time = i * chunk_length_ms
        chunk_end_time = (i + 1) * chunk_length_ms

        # Split chunk text into words
        words = chunk_text.split()

        # Spread the chunk's words evenly across its subtitles
        num_subtitles = max(1, int(chunk_length_ms / subtitle_duration_ms))
        words_per_subtitle = max(1, len(words) // num_subtitles)

        for j in range(0, len(words), words_per_subtitle):
            subtitle_words = words[j:j+words_per_subtitle]
            subtitle_text = " ".join(subtitle_words)

            start_time = chunk_start_time + (j // words_per_subtitle) * subtitle_duration_ms
            end_time = min(start_time + subtitle_duration_ms, chunk_end_time)

            srt_output += f"{srt_index}\n"
            srt_output += f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            srt_output += f"{subtitle_text}\n\n"

            srt_index += 1

    return srt_output

def transcribe_audio(model_name, audio_path, language='en', chunk_length_ms=30000):
    try:
        # Check if CUDA is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # Load model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        processor = WhisperProcessor.from_pretrained(model_name)
        tokenizer = WhisperTokenizer.from_pretrained(model_name)

        # Load audio and resample to 16 kHz mono, which Whisper expects
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # Collect per-chunk transcriptions
        chunk_transcriptions = []

        # Process audio in chunks
        for i in range(0, len(audio), chunk_length_ms):
            chunk = audio[i:i+chunk_length_ms]

            # Convert chunk to a float32 numpy array
            chunk_array = np.array(chunk.get_array_of_samples()).astype(np.float32)

            # Normalize to [-1, 1], guarding against all-silent chunks
            peak = np.max(np.abs(chunk_array))
            if peak > 0:
                chunk_array = chunk_array / peak

            # Extract input features for the chunk
            input_features = processor(chunk_array, sampling_rate=16000, return_tensors="pt").input_features
            input_features = input_features.to(device)

            # Generate token ids, forcing the target language and task
            forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

            # Decode token ids to text
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
            chunk_text = transcription[0].strip()
            chunk_transcriptions.append(chunk_text)

            # Log each chunk's transcription as it arrives
            logger.info(f"Chunk {i // chunk_length_ms + 1} transcription: {chunk_text}")

        # Combine all chunk transcriptions
        full_text = " ".join(chunk_transcriptions)

        # Generate SRT content with 5-second subtitles
        srt_content = generate_srt(chunk_transcriptions, chunk_length_ms, subtitle_duration_ms=5000)

        return full_text, srt_content

    except Exception as e:
        logger.error(f"An error occurred during transcription: {str(e)}")
        return None, None
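For a feel of the subtitle timing math: with the defaults (chunk_length_ms=30000, subtitle_duration_ms=5000), each 30-second chunk yields up to six 5-second cues, with the chunk's words divided evenly among them. A small sketch using toy data, no model or audio needed:

    from audio_utils import format_timestamp, generate_srt

    print(format_timestamp(61500))  # -> 00:01:01,500

    chunks = ["this is a quick test of the srt generator with twelve words"]
    print(generate_srt(chunks, chunk_length_ms=30000, subtitle_duration_ms=5000))
    # Twelve words over six cues -> two words per cue; the first cue is:
    # 1
    # 00:00:00,000 --> 00:00:05,000
    # this is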
packages.txt
ADDED
@@ -0,0 +1 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
torchvision
numpy
streamlit
transformers
pydub
ffmpeg-python
psutil