cptsubtext committed
Commit 394dcf7 · Parent(s): 66a8b57
update with transformers

Files changed:
- app.py (+88 -62)
- requirements.txt (+0 -131)
app.py
CHANGED
@@ -1,77 +1,103 @@
 import streamlit as st
-from
-from stable_whisper import load_hf_whisper
+from transformers import pipeline
 from pydub import AudioSegment
-import webvtt
 import pysrt
-import requests
 import os
+import io

-# Variables
-#valid_api_token = st.secrets
+# Variables (for potential future API integration)
+# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security

-st.title("Speech-to-Text")
+st.title("Speech-to-Text with Transformers")

 with st.expander("README"):
-    st.write("This
+    st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).")

 # Upload audio file
-uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"
+uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

-#
-
-
+# Model selection
+# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
+# "tiny", "base", "small", "medium" are common Whisper sizes.
+model_size = st.selectbox(
+    "Model Size (select a smaller model for faster inference or limited resources)",
+    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
+)

-# Should we translate to
-translate = st.checkbox("Would you like a translation to
+# Should we translate to English?
+translate = st.checkbox("Would you like a translation to English?")
+
+# Free tier or API token option (more relevant if you were to use an external API like AssemblyAI or OpenAI's API)
+# For local model inference on Hugging Face Spaces, "free tier" typically refers to the space's compute limits.
+st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
+# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+
+@st.cache_resource
+def load_whisper_pipeline(model_name):
+    """
+    Loads the Hugging Face Whisper ASR pipeline.
+    Uses st.cache_resource to avoid reloading the model on every rerun.
+    """
+    st.info(f"Loading {model_name} model... This may take a moment.")
+    return pipeline("automatic-speech-recognition", model=model_name)
+
+def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
+    """
+    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT.
+    """
+    try:
+        asr_pipeline = load_whisper_pipeline(model_name)
+
+        st.info("Transcribing audio... Please wait.")
+        if translate_to_english:
+            # When task is 'translate', Whisper models directly translate to English
+            prediction = asr_pipeline(audio_file_path, generate_kwargs={"task": "translate"})
+        else:
+            prediction = asr_pipeline(audio_file_path)
+
+        transcribed_text = prediction["text"]
+        st.subheader("Transcription Output:")
+        st.write(transcribed_text)
+
+        # Generate SRT content (simplified for demonstration)
+        # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
+        # or use a library that offers more granular control like stable-whisper provides.
+        # For simplicity, this example just puts the whole transcription into one caption.
+        # A real-world scenario would segment the audio and get timestamps for each segment.
+        srt_content = pysrt.SubRipFile()
+        # Create a single subtitle entry for the entire transcription for demonstration.
+        # In a real application, you'd want to segment the audio and create multiple entries with timestamps.
+        # The transformers pipeline returns a single text string by default.
+        # To get segment-level timestamps, you might need to configure the pipeline
+        # or use the underlying model directly.
+        item = pysrt.SubRipItem(index=1, start=pysrt.SubRipTime(0, 0, 0, 0), end=pysrt.SubRipTime(0, 0, int(len(transcribed_text)/10), 0), text=transcribed_text)
+        srt_content.append(item)
+
+
+        srt_file_path = "audio.srt"
+        srt_content.save(srt_file_path, encoding='utf-8')
+
+        st.success("Transcription successful! Download subtitle file?")
+        with open(srt_file_path, "rb") as f:
+            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
+        os.remove(srt_file_path)
+
+    except Exception as e:
+        st.error(f"Error during transcription: {str(e)}")
+        # Optionally, provide more specific error handling based on the exception type
+        st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")

-# Model selection
-model_size = st.selectbox("Model Size", ("tiny", "base", "small", "medium"))
-
-def transcribe_to_subtitle(audio_bytes, model_name):
-    """Transcribe audio to subtitle using OpenAI Whisper"""
-    # Load model based on selection
-    model = load_model(model_name)
-    #speedmodel = load_hf_whisper(model_name)
-
-    # Check how long the audio is free tier
-    # newAudio = AudioSegment.from_wav("audiofiles/download.wav")
-    #if use_free_tier and len(audio_bytes) > 0.048 * 2 * 60 * 1024:
-    #    st.error(len(audio_bytes))
-    #    st.error("Free tier only supports audio files under 2 minutes")
-    #    return
-
-    # Transcribe audio
-    try:
-        if translate:
-            result = model.transcribe(audio_bytes, verbose=True, task = 'translate')
-            result.to_srt_vtt('audio.srt')
-        else:
-            result = model.transcribe(audio_bytes, verbose=True)
-            result.to_srt_vtt('audio.srt')
-    except Exception as e:
-        return {"error": f"Error during transcription: {str(e)}"}
-
-    captions = pysrt.open("audio.srt")
-    for caption in captions:
-        print(caption.start)
-        print(caption.text)
-        print(caption.end)
-        print()
-
-    output = captions.text
-    st.markdown(output, unsafe_allow_html=True)
-
-    # Download option
-    st.success("Transcription successful! Download subtitle file?")
-    with open("audio.srt", "rb") as f:
-        st.download_button("Download Subtitle in WebVtt Format", f, "audio.srt")
-    os.remove("audio.srt") # Remove temporary file

 if uploaded_file is not None:
-
-
-
-
-
-
+    # Save uploaded file to a temporary location for transformers pipeline
+    # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
+    with open("temp_audio_file", "wb") as f:
+        f.write(uploaded_file.getbuffer())
+
+    audio_file_path = "temp_audio_file"
+
+    transcribe_with_transformers(audio_file_path, model_size, translate)
+
+    # Clean up the temporary file
+    if os.path.exists(audio_file_path):
+        os.remove(audio_file_path)
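Note on subtitle timing: the comments in the new transcribe_with_transformers acknowledge that the whole transcription is written as a single SRT caption with a rough end time. If per-segment timings are wanted, the transformers ASR pipeline can be asked for chunk-level timestamps at call time via return_timestamps=True; each returned chunk then carries a (start, end) pair in seconds. A minimal sketch of how the single-caption block could be replaced — the helper names below (chunks_to_srt, _srt_time) are illustrative, not part of the commit:

import pysrt

def _srt_time(seconds):
    # Convert float seconds to a pysrt.SubRipTime with in-range fields.
    ms = int(round(seconds * 1000))
    return pysrt.SubRipTime(hours=ms // 3600000, minutes=(ms // 60000) % 60,
                            seconds=(ms // 1000) % 60, milliseconds=ms % 1000)

def chunks_to_srt(chunks, srt_path="audio.srt"):
    # chunks come from: prediction = asr_pipeline(audio_file_path, return_timestamps=True)
    # and look like {"text": "...", "timestamp": (start_s, end_s)}.
    subs = pysrt.SubRipFile()
    for i, chunk in enumerate(chunks, start=1):
        start_s, end_s = chunk["timestamp"]
        if end_s is None:          # Whisper sometimes leaves the final end timestamp open
            end_s = start_s + 5.0  # arbitrary fallback duration (assumption)
        subs.append(pysrt.SubRipItem(index=i, start=_srt_time(start_s),
                                     end=_srt_time(end_s), text=chunk["text"].strip()))
    subs.save(srt_path, encoding="utf-8")
    return srt_path

For long recordings, the same pipeline call also accepts chunk_length_s (e.g. chunk_length_s=30) to enable chunked inference, which pairs naturally with the timestamped output.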
requirements.txt
CHANGED
@@ -1,131 +0,0 @@
-aiohttp==3.9.5
-aiosignal==1.3.1
-alembic==1.13.1
-altair==5.4.1
-antlr4-python3-runtime==4.9.3
-asteroid-filterbanks==0.4.0
-attrs==24.2.0
-audioread==3.0.1
-blinker==1.8.2
-Brotli==1.1.0
-cachetools==5.5.0
-certifi==2024.8.30
-cffi==1.16.0
-chardet==5.2.0
-charset-normalizer==3.4.0
-click==8.1.7
-colorlog==6.8.2
-contourpy==1.2.1
-cycler==0.12.1
-decorator==5.1.1
-docopt==0.6.2
-einops==0.8.0
-filelock==3.14.0
-fonttools==4.51.0
-frozenlist==1.4.1
-fsspec==2024.3.1
-gitdb==4.0.11
-GitPython==3.1.43
-greenlet==3.0.3
-huggingface-hub==0.22.2
-HyperPyYAML==1.2.2
-idna==3.10
-Jinja2==3.1.4
-joblib==1.4.0
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-julius==0.2.7
-kiwisolver==1.4.5
-lazy_loader==0.4
-librosa==0.10.1
-lightning==2.2.3
-lightning-utilities==0.11.2
-llvmlite==0.42.0
-Mako==1.3.3
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-matplotlib==3.8.4
-mdurl==0.1.2
-more-itertools==10.2.0
-mpmath==1.3.0
-msgpack==1.0.8
-multidict==6.0.5
-mutagen==1.47.0
-narwhals==1.10.0
-networkx==3.3
-numba==0.59.1
-numpy==1.26.4
-omegaconf==2.3.0
-openai-whisper==20240930
-optuna==3.6.1
-packaging==24.1
-pandas==2.2.3
-pillow==10.4.0
-platformdirs==4.2.1
-pooch==1.8.1
-primePy==1.3
-protobuf==5.28.3
-pyannote.audio==3.1.1
-pyannote.core==5.0.0
-pyannote.database==5.1.0
-pyannote.metrics==3.2.1
-pyannote.pipeline==3.0.1
-pyarrow==17.0.0
-pycparser==2.22
-pycryptodomex==3.20.0
-pydeck==0.9.1
-pydub==0.25.1
-Pygments==2.18.0
-pyparsing==3.1.2
-pysrt==1.1.2
-python-dateutil==2.9.0.post0
-pytorch-lightning==2.2.3
-pytorch-metric-learning==2.5.0
-pytz==2024.2
-PyYAML==6.0.1
-referencing==0.35.1
-regex==2024.4.28
-requests==2.32.3
-rich==13.9.3
-rpds-py==0.20.0
-ruamel.yaml==0.18.6
-ruamel.yaml.clib==0.2.8
-scikit-learn==1.4.2
-scipy==1.13.0
-semver==3.0.2
-sentencepiece==0.2.0
-setuptools==69.5.1
-shellingham==1.5.4
-six==1.16.0
-smmap==5.0.1
-sortedcontainers==2.4.0
-soundfile==0.12.1
-soxr==0.3.7
-speechbrain==1.0.0
-SQLAlchemy==2.0.29
-stable-ts==2.19.0
-streamlit==1.44.1
-sympy==1.12
-tabulate==0.9.0
-tenacity==9.0.0
-tensorboardX==2.6.2.2
-threadpoolctl==3.5.0
-tiktoken==0.6.0
-toml==0.10.2
-toolz==0.12.1
-torch==2.2.2
-torch-audiomentations==0.11.1
-torch-pitch-shift==1.2.4
-torchaudio==2.2.2
-torchmetrics==1.3.2
-tornado==6.4.1
-tqdm==4.66.2
-typer==0.12.3
-typing_extensions==4.12.2
-tzdata==2024.2
-urllib3==2.2.3
-watchdog==4.0.0
-websockets==12.0
-webvtt-py==0.4.6
-yarl==1.9.4
-yt-dlp==2025.1.26
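One last note on the resource comments in app.py: the diff warns that larger models may need a GPU. If the Space has one, the pipeline factory in load_whisper_pipeline accepts a device argument; a hedged variant (the fallback logic here is an illustration, not in the commit):

import torch
from transformers import pipeline

def load_whisper_pipeline(model_name):
    # device=-1 runs on CPU; device=0 targets the first CUDA GPU.
    device = 0 if torch.cuda.is_available() else -1
    return pipeline("automatic-speech-recognition", model=model_name, device=device)

With @st.cache_resource on top, as in the committed code, the model is still loaded once per process rather than on every Streamlit rerun.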