cptsubtext committed
Commit 394dcf7 · 1 Parent(s): 66a8b57

update with transformers

Files changed (2)
  1. app.py +88 -62
  2. requirements.txt +0 -131
app.py CHANGED
@@ -1,77 +1,103 @@
  import streamlit as st
- from stable_whisper import load_model
- from stable_whisper import load_hf_whisper
  from pydub import AudioSegment
- import webvtt
  import pysrt
- import requests
  import os

- # Variables
- #valid_api_token = st.secrets["API_TOKEN"]

- st.title("Speech-to-Text")

  with st.expander("README"):
-     st.write("This little tool accepts and audiofile. After choosing the model a WebVTT file will be generated. The content of the WebVTT will be shown and a user can choose to download it. This can be used as Subtitle file e.g. in Davinci Resolve Import Subtitles" )

  # Upload audio file
- uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav", "mov"])

- # Free tier or API token option
- use_free_tier = st.checkbox("Free Tier (Max 2 minutes)")
- api_token = st.text_input("API Token (Unlimited)")

- # Should we translate to english?
- translate = st.checkbox("Would you like a translation to english?")

- # Model selection
- model_size = st.selectbox("Model Size", ("tiny", "base", "small", "medium"))
-
- def transcribe_to_subtitle(audio_bytes, model_name):
-     """Transcribe audio to subtitle using OpenAI Whisper"""
-     # Load model based on selection
-     model = load_model(model_name)
-     #speedmodel = load_hf_whisper(model_name)
-
-     # Check how long the audio is free tier
-     # newAudio = AudioSegment.from_wav("audiofiles/download.wav")
-     #if use_free_tier and len(audio_bytes) > 0.048 * 2 * 60 * 1024:
-     # st.error(len(audio_bytes))
-     # st.error("Free tier only supports audio files under 2 minutes")
-     # return
-
-     # Transcribe audio
-     try:
-         if translate:
-             result = model.transcribe(audio_bytes, verbose=True, task = 'translate')
-             result.to_srt_vtt('audio.srt')
-         else:
-             result = model.transcribe(audio_bytes, verbose=True)
-             result.to_srt_vtt('audio.srt')
-     except Exception as e:
-         return {"error": f"Error during transcription: {str(e)}"}
-
-     captions = pysrt.open("audio.srt")
-     for caption in captions:
-         print(caption.start)
-         print(caption.text)
-         print(caption.end)
-         print()
-
-     output = captions.text
-     st.markdown(output, unsafe_allow_html=True)
-
-     # Download option
-     st.success("Transcription successful! Download subtitle file?")
-     with open("audio.srt", "rb") as f:
-         st.download_button("Download Subtitle in WebVtt Format", f, "audio.srt")
-     os.remove("audio.srt") # Remove temporary file

  if uploaded_file is not None:
-     audio_bytes = uploaded_file.read()
-     # Check for API token if free tier is not selected
-     if not use_free_tier and not api_token:
-         st.error("API token required for non-free tier usage")
-     else:
-         transcribe_to_subtitle(audio_bytes, model_size)

  import streamlit as st
+ from transformers import pipeline
  from pydub import AudioSegment
  import pysrt
  import os
+ import io

+ # Variables (for potential future API integration)
+ # valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security

+ st.title("Speech-to-Text with Transformers")

  with st.expander("README"):
+     st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).")

  # Upload audio file
+ uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

+ # Model selection
+ # Note: For Hugging Face Spaces, larger models might require more resources (GPU).
+ # "tiny", "base", "small", "medium" are common Whisper sizes.
+ model_size = st.selectbox(
+     "Model Size (select a smaller model for faster inference or limited resources)",
+     ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
+ )

+ # Should we translate to English?
+ translate = st.checkbox("Would you like a translation to English?")
+
+ # Free tier or API token option (more relevant if you were to use an external API like AssemblyAI or OpenAI's API)
+ # For local model inference on Hugging Face Spaces, "free tier" typically refers to the space's compute limits.
+ st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
+ # api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+
+ @st.cache_resource
+ def load_whisper_pipeline(model_name):
+     """
+     Loads the Hugging Face Whisper ASR pipeline.
+     Uses st.cache_resource to avoid reloading the model on every rerun.
+     """
+     st.info(f"Loading {model_name} model... This may take a moment.")
+     return pipeline("automatic-speech-recognition", model=model_name)
+
+ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
+     """
+     Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT.
+     """
+     try:
+         asr_pipeline = load_whisper_pipeline(model_name)
+
+         st.info("Transcribing audio... Please wait.")
+         if translate_to_english:
+             # When task is 'translate', Whisper models directly translate to English
+             prediction = asr_pipeline(audio_file_path, generate_kwargs={"task": "translate"})
+         else:
+             prediction = asr_pipeline(audio_file_path)
+
+         transcribed_text = prediction["text"]
+         st.subheader("Transcription Output:")
+         st.write(transcribed_text)
+
+         # Generate SRT content (simplified for demonstration)
+         # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
+         # or use a library that offers more granular control like stable-whisper provides.
+         # For simplicity, this example just puts the whole transcription into one caption.
+         # A real-world scenario would segment the audio and get timestamps for each segment.
+         srt_content = pysrt.SubRipFile()
+         # Create a single subtitle entry for the entire transcription for demonstration.
+         # In a real application, you'd want to segment the audio and create multiple entries with timestamps.
+         # The transformers pipeline returns a single text string by default.
+         # To get segment-level timestamps, you might need to configure the pipeline
+         # or use the underlying model directly.
+         item = pysrt.SubRipItem(index=1, start=pysrt.SubRipTime(0, 0, 0, 0), end=pysrt.SubRipTime(0, 0, int(len(transcribed_text)/10), 0), text=transcribed_text)
+         srt_content.append(item)
+
+         srt_file_path = "audio.srt"
+         srt_content.save(srt_file_path, encoding='utf-8')
+
+         st.success("Transcription successful! Download subtitle file?")
+         with open(srt_file_path, "rb") as f:
+             st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
+         os.remove(srt_file_path)
+
+     except Exception as e:
+         st.error(f"Error during transcription: {str(e)}")
+         # Optionally, provide more specific error handling based on the exception type
+         st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")

  if uploaded_file is not None:
+     # Save uploaded file to a temporary location for transformers pipeline
+     # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
+     with open("temp_audio_file", "wb") as f:
+         f.write(uploaded_file.getbuffer())
+
+     audio_file_path = "temp_audio_file"
+
+     transcribe_with_transformers(audio_file_path, model_size, translate)
+
+     # Clean up the temporary file
+     if os.path.exists(audio_file_path):
+         os.remove(audio_file_path)
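
The comments in transcribe_with_transformers note that the whole transcription lands in a single caption because segment timestamps are not extracted. For reference, a minimal sketch of per-segment SRT generation with the same stack, assuming the transformers ASR pipeline is invoked with return_timestamps=True (supported for Whisper models); the helpers seconds_to_subriptime and chunks_to_srt are illustrative names, not part of this commit:

import pysrt
from transformers import pipeline

def seconds_to_subriptime(seconds):
    # Convert a float number of seconds into a pysrt.SubRipTime.
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return pysrt.SubRipTime(hours, minutes, secs, ms)

def chunks_to_srt(audio_file_path, model_name="openai/whisper-tiny", srt_path="audio.srt"):
    asr = pipeline("automatic-speech-recognition", model=model_name)
    # return_timestamps=True makes the pipeline return a "chunks" list of
    # {"timestamp": (start, end), "text": ...} entries; chunk_length_s enables
    # chunked decoding for audio longer than Whisper's 30-second window.
    prediction = asr(audio_file_path, return_timestamps=True, chunk_length_s=30)
    srt = pysrt.SubRipFile()
    for i, chunk in enumerate(prediction["chunks"], start=1):
        start, end = chunk["timestamp"]
        if end is None:
            end = start + 5.0  # assumption: the final chunk may lack an end time
        srt.append(pysrt.SubRipItem(
            index=i,
            start=seconds_to_subriptime(start),
            end=seconds_to_subriptime(end),
            text=chunk["text"].strip(),
        ))
    srt.save(srt_path, encoding="utf-8")

This would replace the length-based end-time heuristic (int(len(transcribed_text)/10)) with timestamps reported by the model itself.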
requirements.txt CHANGED
@@ -1,131 +0,0 @@
- aiohttp==3.9.5
- aiosignal==1.3.1
- alembic==1.13.1
- altair==5.4.1
- antlr4-python3-runtime==4.9.3
- asteroid-filterbanks==0.4.0
- attrs==24.2.0
- audioread==3.0.1
- blinker==1.8.2
- Brotli==1.1.0
- cachetools==5.5.0
- certifi==2024.8.30
- cffi==1.16.0
- chardet==5.2.0
- charset-normalizer==3.4.0
- click==8.1.7
- colorlog==6.8.2
- contourpy==1.2.1
- cycler==0.12.1
- decorator==5.1.1
- docopt==0.6.2
- einops==0.8.0
- filelock==3.14.0
- fonttools==4.51.0
- frozenlist==1.4.1
- fsspec==2024.3.1
- gitdb==4.0.11
- GitPython==3.1.43
- greenlet==3.0.3
- huggingface-hub==0.22.2
- HyperPyYAML==1.2.2
- idna==3.10
- Jinja2==3.1.4
- joblib==1.4.0
- jsonschema==4.23.0
- jsonschema-specifications==2024.10.1
- julius==0.2.7
- kiwisolver==1.4.5
- lazy_loader==0.4
- librosa==0.10.1
- lightning==2.2.3
- lightning-utilities==0.11.2
- llvmlite==0.42.0
- Mako==1.3.3
- markdown-it-py==3.0.0
- MarkupSafe==3.0.2
- matplotlib==3.8.4
- mdurl==0.1.2
- more-itertools==10.2.0
- mpmath==1.3.0
- msgpack==1.0.8
- multidict==6.0.5
- mutagen==1.47.0
- narwhals==1.10.0
- networkx==3.3
- numba==0.59.1
- numpy==1.26.4
- omegaconf==2.3.0
- openai-whisper==20240930
- optuna==3.6.1
- packaging==24.1
- pandas==2.2.3
- pillow==10.4.0
- platformdirs==4.2.1
- pooch==1.8.1
- primePy==1.3
- protobuf==5.28.3
- pyannote.audio==3.1.1
- pyannote.core==5.0.0
- pyannote.database==5.1.0
- pyannote.metrics==3.2.1
- pyannote.pipeline==3.0.1
- pyarrow==17.0.0
- pycparser==2.22
- pycryptodomex==3.20.0
- pydeck==0.9.1
- pydub==0.25.1
- Pygments==2.18.0
- pyparsing==3.1.2
- pysrt==1.1.2
- python-dateutil==2.9.0.post0
- pytorch-lightning==2.2.3
- pytorch-metric-learning==2.5.0
- pytz==2024.2
- PyYAML==6.0.1
- referencing==0.35.1
- regex==2024.4.28
- requests==2.32.3
- rich==13.9.3
- rpds-py==0.20.0
- ruamel.yaml==0.18.6
- ruamel.yaml.clib==0.2.8
- scikit-learn==1.4.2
- scipy==1.13.0
- semver==3.0.2
- sentencepiece==0.2.0
- setuptools==69.5.1
- shellingham==1.5.4
- six==1.16.0
- smmap==5.0.1
- sortedcontainers==2.4.0
- soundfile==0.12.1
- soxr==0.3.7
- speechbrain==1.0.0
- SQLAlchemy==2.0.29
- stable-ts==2.19.0
- streamlit==1.44.1
- sympy==1.12
- tabulate==0.9.0
- tenacity==9.0.0
- tensorboardX==2.6.2.2
- threadpoolctl==3.5.0
- tiktoken==0.6.0
- toml==0.10.2
- toolz==0.12.1
- torch==2.2.2
- torch-audiomentations==0.11.1
- torch-pitch-shift==1.2.4
- torchaudio==2.2.2
- torchmetrics==1.3.2
- tornado==6.4.1
- tqdm==4.66.2
- typer==0.12.3
- typing_extensions==4.12.2
- tzdata==2024.2
- urllib3==2.2.3
- watchdog==4.0.0
- websockets==12.0
- webvtt-py==0.4.6
- yarl==1.9.4
- yt-dlp==2025.1.26
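
With requirements.txt deleted outright, the Space falls back to whatever the runtime image provides. A hedged sketch of a replacement derived from the imports in the new app.py (deliberately unpinned; torch as the backend for the transformers pipeline is an assumption, and pydub additionally needs ffmpeg at the system level, e.g. via packages.txt on Spaces):

streamlit
transformers
torch
pydub
pysrt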