Spaces: Sleeping
cptsubtext committed on
Commit · 7d6c6c5 · 1 Parent(s): 3a0ff0a
fix mel error
app.py
CHANGED
@@ -5,9 +5,6 @@ import pysrt
 import os
 import io
 
-# Variables (for potential future API integration)
-# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security
-
 st.title("Speech-to-Text with Transformers")
 
 with st.expander("README"):
@@ -27,19 +24,19 @@ model_size = st.selectbox(
 # Should we translate to English?
 translate = st.checkbox("Would you like a translation to English?")
 
-#
-
-st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
-# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+# Information about resource usage on Hugging Face Spaces
+st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")
 
 @st.cache_resource
 def load_whisper_pipeline(model_name):
     """
     Loads the Hugging Face Whisper ASR pipeline.
     Uses st.cache_resource to avoid reloading the model on every rerun.
+    We explicitly tell the pipeline to return timestamps for long-form audio.
     """
     st.info(f"Loading {model_name} model... This may take a moment.")
-
+    # Set return_timestamps=True to handle audio longer than 30 seconds
+    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
 
 def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
     """
@@ -49,29 +46,61 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
         asr_pipeline = load_whisper_pipeline(model_name)
 
         st.info("Transcribing audio... Please wait.")
+
+        # Configure generation arguments for translation if requested
+        generate_kwargs = {}
         if translate_to_english:
-
-
-
-
+            generate_kwargs["task"] = "translate"
+
+        # Pass the audio file path and any generation arguments to the pipeline
+        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
 
         transcribed_text = prediction["text"]
-        st.subheader("Transcription Output:")
+        st.subheader("Full Transcription Output:")
         st.write(transcribed_text)
 
-        # Generate SRT content (simplified for demonstration)
-        # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
-        # or use a library that offers more granular control like stable-whisper provides.
-        # For simplicity, this example just puts the whole transcription into one caption.
-        # A real-world scenario would segment the audio and get timestamps for each segment.
         srt_content = pysrt.SubRipFile()
-
-        #
-
-
-
-
-
+
+        # The 'chunks' key will be present if return_timestamps=True was set
+        if "chunks" in prediction:
+            for i, chunk in enumerate(prediction["chunks"]):
+                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
+                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0  # Default if end is None
+
+                # Helper function to convert seconds to pysrt.SubRipTime
+                def seconds_to_srt_time(total_seconds):
+                    hours = int(total_seconds / 3600)
+                    minutes = int((total_seconds % 3600) / 60)
+                    seconds = int(total_seconds % 60)
+                    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
+                    return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)
+
+                item = pysrt.SubRipItem(
+                    index=i + 1,
+                    start=seconds_to_srt_time(start_time_seconds),
+                    end=seconds_to_srt_time(end_time_seconds),
+                    text=chunk["text"]
+                )
+                srt_content.append(item)
+        else:
+            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
+            # Fallback: Create a single subtitle entry if chunks are not available
+            # This is less ideal but ensures some output even if timestamps are missing
+            audio_duration_seconds = 0
+            try:
+                audio = AudioSegment.from_file(audio_file_path)
+                audio_duration_seconds = audio.duration_seconds
+            except Exception:
+                # Estimate duration if pydub fails
+                audio_duration_seconds = len(transcribed_text) * 0.1  # Very rough estimate
+
+            item = pysrt.SubRipItem(
+                index=1,
+                start=pysrt.SubRipTime(0, 0, 0, 0),
+                end=pysrt.SubRipTime(0, 0, int(audio_duration_seconds), 0),
+                text=transcribed_text
+            )
+            srt_content.append(item)
 
 
         srt_file_path = "audio.srt"
@@ -84,17 +113,18 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
 
     except Exception as e:
         st.error(f"Error during transcription: {str(e)}")
-
-        st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")
+        st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.")
 
 
 if uploaded_file is not None:
     # Save uploaded file to a temporary location for transformers pipeline
     # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
-
+    # It's crucial to give the file a proper extension for pydub to identify format
+    temp_file_name = "temp_audio_file." + uploaded_file.type.split('/')[-1]
+    with open(temp_file_name, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
-    audio_file_path =
+    audio_file_path = temp_file_name
 
     transcribe_with_transformers(audio_file_path, model_size, translate)
 
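For context on the fix: passing return_timestamps=True makes the transformers ASR pipeline decode long-form audio in timestamped chunks instead of failing on inputs whose log-mel features exceed Whisper's 30-second window, which is presumably the "mel" error the commit title refers to. The sketch below is not part of app.py; it only illustrates the output shape the new SRT code relies on, with the model name and audio file as placeholder assumptions.

from transformers import pipeline

# Sketch only: "openai/whisper-tiny" and "sample.wav" are placeholders, not values from this Space.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True,  # enables chunked long-form decoding for audio longer than 30 s
)

# Request translation to English the same way the updated app does.
prediction = asr("sample.wav", generate_kwargs={"task": "translate"})

print(prediction["text"])            # full transcript as one string
for chunk in prediction["chunks"]:   # per-segment results used to build the SRT entries
    start, end = chunk["timestamp"]  # floats in seconds; end can be None for the final chunk
    print(start, end, chunk["text"])

Each chunk's timestamp pair is what seconds_to_srt_time converts into pysrt.SubRipTime values in the diff above.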