Spaces: Sleeping
cptsubtext committed on
Commit · 7d6c6c5 · 1 Parent(s): 3a0ff0a
fix mel error
app.py
CHANGED
@@ -5,9 +5,6 @@ import pysrt
 import os
 import io
 
-# Variables (for potential future API integration)
-# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security
-
 st.title("Speech-to-Text with Transformers")
 
 with st.expander("README"):
@@ -27,19 +24,19 @@ model_size = st.selectbox(
 # Should we translate to English?
 translate = st.checkbox("Would you like a translation to English?")
 
-#
-
-st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
-# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+# Information about resource usage on Hugging Face Spaces
+st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")
 
 @st.cache_resource
 def load_whisper_pipeline(model_name):
     """
     Loads the Hugging Face Whisper ASR pipeline.
     Uses st.cache_resource to avoid reloading the model on every rerun.
+    We explicitly tell the pipeline to return timestamps for long-form audio.
     """
     st.info(f"Loading {model_name} model... This may take a moment.")
-
+    # Set return_timestamps=True to handle audio longer than 30 seconds
+    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
 
 def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
     """
@@ -49,29 +46,61 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
         asr_pipeline = load_whisper_pipeline(model_name)
 
         st.info("Transcribing audio... Please wait.")
+
+        # Configure generation arguments for translation if requested
+        generate_kwargs = {}
         if translate_to_english:
-
-
-
-
+            generate_kwargs["task"] = "translate"
+
+        # Pass the audio file path and any generation arguments to the pipeline
+        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
 
         transcribed_text = prediction["text"]
-        st.subheader("Transcription Output:")
+        st.subheader("Full Transcription Output:")
         st.write(transcribed_text)
 
-        # Generate SRT content (simplified for demonstration)
-        # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
-        # or use a library that offers more granular control like stable-whisper provides.
-        # For simplicity, this example just puts the whole transcription into one caption.
-        # A real-world scenario would segment the audio and get timestamps for each segment.
         srt_content = pysrt.SubRipFile()
-
-        #
-
-
-
-
-
+
+        # The 'chunks' key will be present if return_timestamps=True was set
+        if "chunks" in prediction:
+            for i, chunk in enumerate(prediction["chunks"]):
+                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
+                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0  # Default if end is None
+
+                # Helper function to convert seconds to pysrt.SubRipTime
+                def seconds_to_srt_time(total_seconds):
+                    hours = int(total_seconds / 3600)
+                    minutes = int((total_seconds % 3600) / 60)
+                    seconds = int(total_seconds % 60)
+                    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
+                    return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)
+
+                item = pysrt.SubRipItem(
+                    index=i + 1,
+                    start=seconds_to_srt_time(start_time_seconds),
+                    end=seconds_to_srt_time(end_time_seconds),
+                    text=chunk["text"]
+                )
+                srt_content.append(item)
+        else:
+            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
+            # Fallback: Create a single subtitle entry if chunks are not available
+            # This is less ideal but ensures some output even if timestamps are missing
+            audio_duration_seconds = 0
+            try:
+                audio = AudioSegment.from_file(audio_file_path)
+                audio_duration_seconds = audio.duration_seconds
+            except Exception:
+                # Estimate duration if pydub fails
+                audio_duration_seconds = len(transcribed_text) * 0.1  # Very rough estimate
+
+            item = pysrt.SubRipItem(
+                index=1,
+                start=pysrt.SubRipTime(0, 0, 0, 0),
+                end=pysrt.SubRipTime(0, 0, int(audio_duration_seconds), 0),
+                text=transcribed_text
+            )
+            srt_content.append(item)
 
 
         srt_file_path = "audio.srt"
@@ -84,17 +113,18 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
 
     except Exception as e:
         st.error(f"Error during transcription: {str(e)}")
-
-        st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")
+        st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.")
 
 
 if uploaded_file is not None:
     # Save uploaded file to a temporary location for transformers pipeline
     # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
-
+    # It's crucial to give the file a proper extension for pydub to identify format
+    temp_file_name = "temp_audio_file." + uploaded_file.type.split('/')[-1]
+    with open(temp_file_name, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
-    audio_file_path =
+    audio_file_path = temp_file_name
 
     transcribe_with_transformers(audio_file_path, model_size, translate)
 
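For context on the fix: passing return_timestamps=True makes the transformers ASR pipeline decode long-form audio in timestamped chunks instead of failing on inputs whose log-mel features exceed Whisper's 30-second window, which is presumably the "mel" error the commit title refers to. The sketch below is not part of app.py; it only illustrates the output shape the new SRT code relies on, with the model name and audio file as placeholder assumptions.

from transformers import pipeline

# Sketch only: "openai/whisper-tiny" and "sample.wav" are placeholders, not values from this Space.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True,  # enables chunked long-form decoding for audio longer than 30 s
)

# Request translation to English the same way the updated app does.
prediction = asr("sample.wav", generate_kwargs={"task": "translate"})

print(prediction["text"])            # full transcript as one string
for chunk in prediction["chunks"]:   # per-segment results used to build the SRT entries
    start, end = chunk["timestamp"]  # floats in seconds; end can be None for the final chunk
    print(start, end, chunk["text"])

Each chunk's timestamp pair is what seconds_to_srt_time converts into pysrt.SubRipTime values in the diff above.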