cptsubtext committed
Commit 7d6c6c5 · 1 Parent(s): 3a0ff0a

fix mel error
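
Before this change, transcribing uploads longer than about 30 seconds failed: Whisper checkpoints in transformers reject inputs of more than 3000 mel input features unless the pipeline is asked to predict timestamps, which is presumably the "mel error" of the commit title. Building the pipeline with return_timestamps=True enables long-form decoding and, as a side benefit, yields the segment-level timestamps the app now uses to write a real SRT file. A minimal sketch of the failure and the fix (the checkpoint and file names are illustrative, not part of this commit):

from transformers import pipeline

# Without timestamps, Whisper checkpoints reject long-form input
# (more than 3000 mel frames, i.e. roughly 30 seconds of audio).
asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
try:
    asr("long_audio.mp3")  # hypothetical file longer than 30 seconds
except Exception as err:
    print(err)  # the message points at the mel input feature limit

# The fix adopted below: predict timestamps to enable long-form generation.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True,
)
result = asr("long_audio.mp3")  # succeeds and also exposes result["chunks"]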

Files changed (1)
  1. app.py +59 -29
app.py CHANGED
@@ -5,9 +5,6 @@ import pysrt
 import os
 import io
 
-# Variables (for potential future API integration)
-# valid_api_token = st.secrets.get("API_TOKEN") # Using st.secrets for better security
-
 st.title("Speech-to-Text with Transformers")
 
 with st.expander("README"):
@@ -27,19 +24,19 @@ model_size = st.selectbox(
 # Should we translate to English?
 translate = st.checkbox("Would you like a translation to English?")
 
-# Free tier or API token option (more relevant if you were to use an external API like AssemblyAI or OpenAI's API)
-# For local model inference on Hugging Face Spaces, "free tier" typically refers to the space's compute limits.
-st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. There's no explicit 'free tier' checkbox in this context for model size, but larger models will consume more resources and time.")
-# api_token = st.text_input("API Token (Optional, for external APIs like OpenAI's if not using local models)")
+# Information about resource usage on Hugging Face Spaces
+st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")
 
 @st.cache_resource
 def load_whisper_pipeline(model_name):
     """
     Loads the Hugging Face Whisper ASR pipeline.
     Uses st.cache_resource to avoid reloading the model on every rerun.
+    We explicitly tell the pipeline to return timestamps for long-form audio.
     """
     st.info(f"Loading {model_name} model... This may take a moment.")
-    return pipeline("automatic-speech-recognition", model=model_name)
+    # Set return_timestamps=True to handle audio longer than 30 seconds
+    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
 
 def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
     """
@@ -49,29 +46,61 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
         asr_pipeline = load_whisper_pipeline(model_name)
 
         st.info("Transcribing audio... Please wait.")
+
+        # Configure generation arguments for translation if requested
+        generate_kwargs = {}
         if translate_to_english:
-            # When task is 'translate', Whisper models directly translate to English
-            prediction = asr_pipeline(audio_file_path, generate_kwargs={"task": "translate"})
-        else:
-            prediction = asr_pipeline(audio_file_path)
+            generate_kwargs["task"] = "translate"
+
+        # Pass the audio file path and any generation arguments to the pipeline
+        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
 
         transcribed_text = prediction["text"]
-        st.subheader("Transcription Output:")
+        st.subheader("Full Transcription Output:")
         st.write(transcribed_text)
 
-        # Generate SRT content (simplified for demonstration)
-        # For more precise timings, you'd need to process word-level timestamps if available from the pipeline
-        # or use a library that offers more granular control like stable-whisper provides.
-        # For simplicity, this example just puts the whole transcription into one caption.
-        # A real-world scenario would segment the audio and get timestamps for each segment.
         srt_content = pysrt.SubRipFile()
-        # Create a single subtitle entry for the entire transcription for demonstration.
-        # In a real application, you'd want to segment the audio and create multiple entries with timestamps.
-        # The transformers pipeline returns a single text string by default.
-        # To get segment-level timestamps, you might need to configure the pipeline
-        # or use the underlying model directly.
-        item = pysrt.SubRipItem(index=1, start=pysrt.SubRipTime(0, 0, 0, 0), end=pysrt.SubRipTime(0, 0, int(len(transcribed_text)/10), 0), text=transcribed_text)
-        srt_content.append(item)
+
+        # The 'chunks' key will be present if return_timestamps=True was set
+        if "chunks" in prediction:
+            for i, chunk in enumerate(prediction["chunks"]):
+                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
+                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0  # Default if end is None
+
+                # Helper function to convert seconds to pysrt.SubRipTime
+                def seconds_to_srt_time(total_seconds):
+                    hours = int(total_seconds / 3600)
+                    minutes = int((total_seconds % 3600) / 60)
+                    seconds = int(total_seconds % 60)
+                    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
+                    return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)
+
+                item = pysrt.SubRipItem(
+                    index=i + 1,
+                    start=seconds_to_srt_time(start_time_seconds),
+                    end=seconds_to_srt_time(end_time_seconds),
+                    text=chunk["text"]
+                )
+                srt_content.append(item)
+        else:
+            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
+            # Fallback: Create a single subtitle entry if chunks are not available
+            # This is less ideal but ensures some output even if timestamps are missing
+            audio_duration_seconds = 0
+            try:
+                audio = AudioSegment.from_file(audio_file_path)
+                audio_duration_seconds = audio.duration_seconds
+            except Exception:
+                # Estimate duration if pydub fails
+                audio_duration_seconds = len(transcribed_text) * 0.1  # Very rough estimate
+
+            item = pysrt.SubRipItem(
+                index=1,
+                start=pysrt.SubRipTime(0, 0, 0, 0),
+                end=pysrt.SubRipTime(0, 0, int(audio_duration_seconds), 0),
+                text=transcribed_text
+            )
+            srt_content.append(item)
 
 
         srt_file_path = "audio.srt"
@@ -84,17 +113,18 @@ def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
 
     except Exception as e:
         st.error(f"Error during transcription: {str(e)}")
-        # Optionally, provide more specific error handling based on the exception type
-        st.info("Common issues: File format not supported, model loading failed, or audio too long for available memory.")
+        st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.")
 
 
 if uploaded_file is not None:
     # Save uploaded file to a temporary location for transformers pipeline
     # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
-    with open("temp_audio_file", "wb") as f:
+    # It's crucial to give the file a proper extension for pydub to identify format
+    temp_file_name = "temp_audio_file." + uploaded_file.type.split('/')[-1]
+    with open(temp_file_name, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
-    audio_file_path = "temp_audio_file"
+    audio_file_path = temp_file_name
 
     transcribe_with_transformers(audio_file_path, model_size, translate)
 
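
For reference, a self-contained sketch of the new chunks-to-SRT path, with the conversion helper hoisted out of the loop (the committed version redefines it on every iteration, which works but is redundant). Checkpoint and file names are again illustrative stand-ins:

import pysrt
from transformers import pipeline

def seconds_to_srt_time(total_seconds):
    # Same arithmetic as the commit: split float seconds into h/m/s/ms.
    hours = int(total_seconds / 3600)
    minutes = int((total_seconds % 3600) / 60)
    seconds = int(total_seconds % 60)
    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
    return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",  # illustrative checkpoint
    return_timestamps=True,
)
prediction = asr("sample.mp3")  # illustrative file name

subs = pysrt.SubRipFile()
for i, chunk in enumerate(prediction.get("chunks", [])):
    start, end = chunk["timestamp"]  # floats in seconds; end may be None
    start = start if start is not None else 0.0
    end = end if end is not None else start + 1.0
    subs.append(pysrt.SubRipItem(
        index=i + 1,
        start=seconds_to_srt_time(start),
        end=seconds_to_srt_time(end),
        text=chunk["text"].strip(),
    ))
subs.save("audio.srt")

Two nuances worth noting. The fallback branch calls AudioSegment.from_file, so the app needs from pydub import AudioSegment among the imports not shown in this diff. And the temporary file's extension is derived from the MIME type, so an MP3 upload (audio/mpeg) is saved as temp_audio_file.mpeg rather than .mp3; ffmpeg-backed decoders generally sniff the container rather than trusting the extension, so this usually still loads.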