cptsubtext committed
Commit 394dcf7 · Parent(s): 66a8b57
update with transformers

Files changed:
- app.py (+88 -62)
- requirements.txt (+0 -131)
app.py
CHANGED
@@ -1,77 +1,103 @@
 import streamlit as st
-from
-from stable_whisper import load_hf_whisper
+from transformers import pipeline
 from pydub import AudioSegment
-import webvtt
 import pysrt
-import requests
 import os
+import io

-# Variables
-#valid_api_token = st.secrets
+# Variables (for potential future API integration)
+# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security

-st.title("Speech-to-Text")
+st.title("Speech-to-Text with Transformers")

 with st.expander("README"):
-    st.write("This
+    st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).")

 # Upload audio file
-uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"
+uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

-#
-
-
+# Model selection
+# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
+# "tiny", "base", "small", "medium" are common Whisper sizes.
+model_size = st.selectbox(
+    "Model Size (select a smaller model for faster inference or limited resources)",
+    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
+)

-# Should we translate to
-translate = st.checkbox("Would you like a translation to
+# Should we translate to English?
+translate = st.checkbox("Would you like a translation to English?")
+
+# Free tier or API token option (more relevant if you were to use an external API like AssemblyAI or OpenAI's API)
+# For local model inference on Hugging Face Spaces, "free tier" typically refers to the space's compute limits.
+st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
+# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+
+@st.cache_resource
+def load_whisper_pipeline(model_name):
+    """
+    Loads the Hugging Face Whisper ASR pipeline.
+    Uses st.cache_resource to avoid reloading the model on every rerun.
+    """
+    st.info(f"Loading {model_name} model... This may take a moment.")
+    return pipeline("automatic-speech-recognition", model=model_name)
+
+def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
+    """
+    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT.
+    """
+    try:
+        asr_pipeline = load_whisper_pipeline(model_name)
+
+        st.info("Transcribing audio... Please wait.")
+        if translate_to_english:
+            # When task is 'translate', Whisper models directly translate to English
+            prediction = asr_pipeline(audio_file_path, generate_kwargs={"task": "translate"})
+        else:
+            prediction = asr_pipeline(audio_file_path)
+
+        transcribed_text = prediction["text"]
+        st.subheader("Transcription Output:")
+        st.write(transcribed_text)
+
+        # Generate SRT content (simplified for demonstration)
+        # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
+        # or use a library that offers more granular control like stable-whisper provides.
+        # For simplicity, this example just puts the whole transcription into one caption.
+        # A real-world scenario would segment the audio and get timestamps for each segment.
+        srt_content = pysrt.SubRipFile()
+        # Create a single subtitle entry for the entire transcription for demonstration.
+        # In a real application, you'd want to segment the audio and create multiple entries with timestamps.
+        # The transformers pipeline returns a single text string by default.
+        # To get segment-level timestamps, you might need to configure the pipeline
+        # or use the underlying model directly.
+        item = pysrt.SubRipItem(index=1, start=pysrt.SubRipTime(0, 0, 0, 0), end=pysrt.SubRipTime(0, 0, int(len(transcribed_text)/10), 0), text=transcribed_text)
+        srt_content.append(item)
+
+
+        srt_file_path = "audio.srt"
+        srt_content.save(srt_file_path, encoding='utf-8')
+
+        st.success("Transcription successful! Download subtitle file?")
+        with open(srt_file_path, "rb") as f:
+            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
+        os.remove(srt_file_path)
+
+    except Exception as e:
+        st.error(f"Error during transcription: {str(e)}")
+        # Optionally, provide more specific error handling based on the exception type
+        st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")

-# Model selection
-model_size = st.selectbox("Model Size", ("tiny", "base", "small", "medium"))
-
-def transcribe_to_subtitle(audio_bytes, model_name):
-    """Transcribe audio to subtitle using OpenAI Whisper"""
-    # Load model based on selection
-    model = load_model(model_name)
-    #speedmodel = load_hf_whisper(model_name)
-
-    # Check how long the audio is free tier
-    # newAudio = AudioSegment.from_wav("audiofiles/download.wav")
-    #if use_free_tier and len(audio_bytes) > 0.048 * 2 * 60 * 1024:
-    #    st.error(len(audio_bytes))
-    #    st.error("Free tier only supports audio files under 2 minutes")
-    #    return
-
-    # Transcribe audio
-    try:
-        if translate:
-            result = model.transcribe(audio_bytes, verbose=True, task = 'translate')
-            result.to_srt_vtt('audio.srt')
-        else:
-            result = model.transcribe(audio_bytes, verbose=True)
-            result.to_srt_vtt('audio.srt')
-    except Exception as e:
-        return {"error": f"Error during transcription: {str(e)}"}
-
-    captions = pysrt.open("audio.srt")
-    for caption in captions:
-        print(caption.start)
-        print(caption.text)
-        print(caption.end)
-        print()
-
-    output = captions.text
-    st.markdown(output, unsafe_allow_html=True)
-
-    # Download option
-    st.success("Transcription successful! Download subtitle file?")
-    with open("audio.srt", "rb") as f:
-        st.download_button("Download Subtitle in WebVtt Format", f, "audio.srt")
-    os.remove("audio.srt") # Remove temporary file

 if uploaded_file is not None:
-
-
-
-
-
-
+    # Save uploaded file to a temporary location for transformers pipeline
+    # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
+    with open("temp_audio_file", "wb") as f:
+        f.write(uploaded_file.getbuffer())
+
+    audio_file_path = "temp_audio_file"
+
+    transcribe_with_transformers(audio_file_path, model_size, translate)
+
+    # Clean up the temporary file
+    if os.path.exists(audio_file_path):
+        os.remove(audio_file_path)
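Note on subtitle timing: the comments in the new transcribe_with_transformers acknowledge that the whole transcription is written as a single SRT caption with a rough end time. If per-segment timings are wanted, the transformers ASR pipeline can be asked for chunk-level timestamps at call time via return_timestamps=True; each returned chunk then carries a (start, end) pair in seconds. A minimal sketch of how the single-caption block could be replaced — the helper names below (chunks_to_srt, _srt_time) are illustrative, not part of the commit:

import pysrt

def _srt_time(seconds):
    # Convert float seconds to a pysrt.SubRipTime with in-range fields.
    ms = int(round(seconds * 1000))
    return pysrt.SubRipTime(hours=ms // 3600000, minutes=(ms // 60000) % 60,
                            seconds=(ms // 1000) % 60, milliseconds=ms % 1000)

def chunks_to_srt(chunks, srt_path="audio.srt"):
    # chunks come from: prediction = asr_pipeline(audio_file_path, return_timestamps=True)
    # and look like {"text": "...", "timestamp": (start_s, end_s)}.
    subs = pysrt.SubRipFile()
    for i, chunk in enumerate(chunks, start=1):
        start_s, end_s = chunk["timestamp"]
        if end_s is None:          # Whisper sometimes leaves the final end timestamp open
            end_s = start_s + 5.0  # arbitrary fallback duration (assumption)
        subs.append(pysrt.SubRipItem(index=i, start=_srt_time(start_s),
                                     end=_srt_time(end_s), text=chunk["text"].strip()))
    subs.save(srt_path, encoding="utf-8")
    return srt_path

For long recordings, the same pipeline call also accepts chunk_length_s (e.g. chunk_length_s=30) to enable chunked inference, which pairs naturally with the timestamped output.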
requirements.txt
CHANGED
@@ -1,131 +0,0 @@
-aiohttp==3.9.5
-aiosignal==1.3.1
-alembic==1.13.1
-altair==5.4.1
-antlr4-python3-runtime==4.9.3
-asteroid-filterbanks==0.4.0
-attrs==24.2.0
-audioread==3.0.1
-blinker==1.8.2
-Brotli==1.1.0
-cachetools==5.5.0
-certifi==2024.8.30
-cffi==1.16.0
-chardet==5.2.0
-charset-normalizer==3.4.0
-click==8.1.7
-colorlog==6.8.2
-contourpy==1.2.1
-cycler==0.12.1
-decorator==5.1.1
-docopt==0.6.2
-einops==0.8.0
-filelock==3.14.0
-fonttools==4.51.0
-frozenlist==1.4.1
-fsspec==2024.3.1
-gitdb==4.0.11
-GitPython==3.1.43
-greenlet==3.0.3
-huggingface-hub==0.22.2
-HyperPyYAML==1.2.2
-idna==3.10
-Jinja2==3.1.4
-joblib==1.4.0
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-julius==0.2.7
-kiwisolver==1.4.5
-lazy_loader==0.4
-librosa==0.10.1
-lightning==2.2.3
-lightning-utilities==0.11.2
-llvmlite==0.42.0
-Mako==1.3.3
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-matplotlib==3.8.4
-mdurl==0.1.2
-more-itertools==10.2.0
-mpmath==1.3.0
-msgpack==1.0.8
-multidict==6.0.5
-mutagen==1.47.0
-narwhals==1.10.0
-networkx==3.3
-numba==0.59.1
-numpy==1.26.4
-omegaconf==2.3.0
-openai-whisper==20240930
-optuna==3.6.1
-packaging==24.1
-pandas==2.2.3
-pillow==10.4.0
-platformdirs==4.2.1
-pooch==1.8.1
-primePy==1.3
-protobuf==5.28.3
-pyannote.audio==3.1.1
-pyannote.core==5.0.0
-pyannote.database==5.1.0
-pyannote.metrics==3.2.1
-pyannote.pipeline==3.0.1
-pyarrow==17.0.0
-pycparser==2.22
-pycryptodomex==3.20.0
-pydeck==0.9.1
-pydub==0.25.1
-Pygments==2.18.0
-pyparsing==3.1.2
-pysrt==1.1.2
-python-dateutil==2.9.0.post0
-pytorch-lightning==2.2.3
-pytorch-metric-learning==2.5.0
-pytz==2024.2
-PyYAML==6.0.1
-referencing==0.35.1
-regex==2024.4.28
-requests==2.32.3
-rich==13.9.3
-rpds-py==0.20.0
-ruamel.yaml==0.18.6
-ruamel.yaml.clib==0.2.8
-scikit-learn==1.4.2
-scipy==1.13.0
-semver==3.0.2
-sentencepiece==0.2.0
-setuptools==69.5.1
-shellingham==1.5.4
-six==1.16.0
-smmap==5.0.1
-sortedcontainers==2.4.0
-soundfile==0.12.1
-soxr==0.3.7
-speechbrain==1.0.0
-SQLAlchemy==2.0.29
-stable-ts==2.19.0
-streamlit==1.44.1
-sympy==1.12
-tabulate==0.9.0
-tenacity==9.0.0
-tensorboardX==2.6.2.2
-threadpoolctl==3.5.0
-tiktoken==0.6.0
-toml==0.10.2
-toolz==0.12.1
-torch==2.2.2
-torch-audiomentations==0.11.1
-torch-pitch-shift==1.2.4
-torchaudio==2.2.2
-torchmetrics==1.3.2
-tornado==6.4.1
-tqdm==4.66.2
-typer==0.12.3
-typing_extensions==4.12.2
-tzdata==2024.2
-urllib3==2.2.3
-watchdog==4.0.0
-websockets==12.0
-webvtt-py==0.4.6
-yarl==1.9.4
-yt-dlp==2025.1.26
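One last note on the resource comments in app.py: the diff warns that larger models may need a GPU. If the Space has one, the pipeline factory in load_whisper_pipeline accepts a device argument; a hedged variant (the fallback logic here is an illustration, not in the commit):

import torch
from transformers import pipeline

def load_whisper_pipeline(model_name):
    # device=-1 runs on CPU; device=0 targets the first CUDA GPU.
    device = 0 if torch.cuda.is_available() else -1
    return pipeline("automatic-speech-recognition", model=model_name, device=device)

With @st.cache_resource on top, as in the committed code, the model is still loaded once per process rather than on every Streamlit rerun.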