Deepakkori45 committed
Commit eeadc49 · verified · 1 Parent(s): 469f0e5

Update app.py

Files changed (1): app.py +78 -28
app.py CHANGED
@@ -24,17 +24,80 @@ status_placeholder.info("Loading Whisper model from Hugging Face...")
 def load_whisper_model():
     """
     Load the Whisper model and processor from Hugging Face.
-    Change 'openai/whisper-base' to another variant if needed.
+    Change 'openai/whisper-small' to another variant if needed.
     """
-    model_name = "openai/whisper-Small"
+    model_name = "openai/whisper-small"  # You can change to "tiny", "base", "medium", or "large" based on resources.
     processor = WhisperProcessor.from_pretrained(model_name)
     model = WhisperForConditionalGeneration.from_pretrained(model_name)
     return processor, model
 
 processor, model = load_whisper_model()
-
 status_placeholder.info("Whisper model loaded successfully!")
 
+# Comprehensive dictionary of languages supported by Whisper (most common ones)
+LANGUAGES = {
+    "en": "English",
+    "zh": "Chinese",
+    "de": "German",
+    "es": "Spanish",
+    "ru": "Russian",
+    "ko": "Korean",
+    "fr": "French",
+    "ja": "Japanese",
+    "pt": "Portuguese",
+    "tr": "Turkish",
+    "pl": "Polish",
+    "ca": "Catalan",
+    "nl": "Dutch",
+    "ar": "Arabic",
+    "sv": "Swedish",
+    "it": "Italian",
+    "id": "Indonesian",
+    "hi": "Hindi",
+    "fi": "Finnish",
+    "vi": "Vietnamese",
+    "fa": "Persian",
+    "mr": "Marathi",
+    "uk": "Ukrainian",
+    "el": "Greek",
+    "ms": "Malay",
+    "cs": "Czech",
+    "ro": "Romanian",
+    "da": "Danish",
+    "hu": "Hungarian",
+    "ta": "Tamil",
+    "no": "Norwegian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "hr": "Croatian",
+    "bg": "Bulgarian",
+    "lt": "Lithuanian",
+    "la": "Latin",
+    "mi": "Maori",
+    "ml": "Malayalam",
+    "cy": "Welsh",
+    "sk": "Slovak",
+    "te": "Telugu",
+    "ka": "Georgian",
+    "sl": "Slovenian",
+    "kn": "Kannada",
+    "et": "Estonian",
+    "mk": "Macedonian",
+    "br": "Breton",
+    "eu": "Basque",
+    "is": "Icelandic",
+    "hy": "Armenian",
+    "af": "Afrikaans"
+}
+
+# Create a sorted list of language names for the selectbox
+language_names = sorted(LANGUAGES.values())
+default_language = "English"  # Default language
+
+selected_lang_name = st.selectbox("Select transcription language", language_names, index=language_names.index(default_language))
+# Find the language code by reverse lookup in LANGUAGES
+selected_language = [code for code, name in LANGUAGES.items() if name == selected_lang_name][0]
+
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
     Split an audio file into chunks using silence detection.
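Note: the diff context elides the body of split_audio_on_silence. A minimal sketch of what such a helper typically looks like with pydub's split_on_silence, under the signature shown above (an illustration, not the commit's actual body):

    from pydub import AudioSegment
    from pydub.silence import split_on_silence

    def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
        # Cut wherever silence lasts at least min_silence_len ms and stays below
        # silence_thresh dBFS; keep_silence ms of padding is left on each chunk.
        audio = AudioSegment.from_file(audio_file_path)
        return split_on_silence(
            audio,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
            keep_silence=keep_silence,
        )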
@@ -50,42 +113,29 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
     status_placeholder.info(f"Audio split into {len(chunks)} chunks.")
     return chunks
 
-# def transcribe(audio_file):
-#     """
-#     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
-#     This uses librosa to load and resample the audio as required.
-#     """
-#     # Load audio with librosa at 16kHz (as required by Whisper)
-#     speech, sr = librosa.load(audio_file, sr=16000)
-#     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-#     predicted_ids = model.generate(input_features)
-#     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-#     return transcription
-
-def transcribe(audio_file):
+def transcribe(audio_file, language):
     """
     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
     This uses librosa to load and resample the audio as required.
-    The transcription is forced to be in English.
+    The transcription is forced to the specified language.
 
     Args:
         audio_file (str): Path to the audio file.
+        language (str): Language code (e.g., "en", "es").
 
     Returns:
-        str: Transcribed text in English.
+        str: Transcribed text.
     """
     # Load audio with librosa at 16kHz (as required by Whisper)
     speech, sr = librosa.load(audio_file, sr=16000)
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    # Force the transcription output to be in English:
-    forced_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
+    # Force the transcription output to the chosen language:
+    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
-
-
-def transcribe_chunk(chunk, index, min_length_ms=100):
+def transcribe_chunk(chunk, index, language, min_length_ms=100):
     """
     Transcribe an individual audio chunk.
     """
@@ -96,13 +146,13 @@ def transcribe_chunk(chunk, index, min_length_ms=100):
     with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
         chunk.export(temp_audio_file.name, format="wav")
         temp_audio_file_path = temp_audio_file.name
-    status_placeholder.info(f"Transcribing chunk {index}...")
-    transcription = transcribe(temp_audio_file_path)
+    status_placeholder.info(f"Transcribing chunk {index} in {selected_lang_name}...")
+    transcription = transcribe(temp_audio_file_path, language)
     os.remove(temp_audio_file_path)
     st.write(f"Transcription for chunk {index}: {transcription}")
     return (index, transcription)
 
-def process_audio_chunks(audio_chunks):
+def process_audio_chunks(audio_chunks, language):
     """
     Process and transcribe each audio chunk in sequence.
     Reports the total time taken.
@@ -111,7 +161,7 @@ def process_audio_chunks(audio_chunks, language):
     min_length_ms = 100  # minimum duration for processing
     start_transcription = time.time()
     for i, chunk in enumerate(audio_chunks):
-        index, text = transcribe_chunk(chunk, i, min_length_ms)
+        index, text = transcribe_chunk(chunk, i, language, min_length_ms)
         transcriptions.append((index, text))
     transcriptions.sort(key=lambda x: x[0])
     total_time = time.time() - start_transcription
@@ -152,7 +202,7 @@ if uploaded_file is not None and st.session_state.transcription is None:
     processing_start = time.time()
     with st.spinner('Processing audio...'):
         audio_chunks = split_audio_on_silence(temp_audio_file)
-        transcription = process_audio_chunks(audio_chunks)
+        transcription = process_audio_chunks(audio_chunks, selected_language)
     if transcription:
         st.session_state.transcription = transcription
         st.success('Transcription complete!')
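One small note on the name-to-code reverse lookup added in this commit: inverting LANGUAGES once avoids scanning the dictionary on every selection. A minimal equivalent sketch (the NAME_TO_CODE name is illustrative, not in the commit):

    # Build the inverse mapping once at startup; the names in LANGUAGES are unique.
    NAME_TO_CODE = {name: code for code, name in LANGUAGES.items()}
    selected_language = NAME_TO_CODE[selected_lang_name]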
 