camparchimedes committed (verified)
Commit cf8326e · Parent(s): 25beb4b

Update app.py

Files changed (1): app.py (+179 -43)
app.py CHANGED
@@ -18,27 +18,110 @@ Description: webapp, transkribering (norsk), NbAiLab/nb-whisper-large, oppsummer
 
 import time
 import os
 import warnings
 from pydub import AudioSegment
 import torch
 import torchaudio
 import torchaudio.transforms as transforms
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-from huggingface_hub import model_info
 import spacy
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import pandas as pd
-import numpy as np
-import re
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
 from fpdf import FPDF
 from PIL import Image
 
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
 # Convert m4a audio to wav format
 def convert_to_wav(audio_file):
@@ -46,43 +129,61 @@ def convert_to_wav(audio_file):
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
-
-# Initialize device for torch
-device = 0 if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float32
-
-# Load tokenizer and model
 processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
 model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
 
 
-# Model script does not support JIT compilation
-#model = model.to(device)
-#model = torch.jit.script(model)
 
-# Generation kwargs
 generate_kwargs = {
     "num_beams": 5,
-    "task": "transcribe",
     "language": "no",
-    "forced_decoder_ids": None
 }
 
-# Transcribe
 def transcribe_audio(audio_file, chunk_length_s=30):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
-
-    # Load the audio waveform using torchaudio
     waveform, sample_rate = torchaudio.load(audio_file)
 
     # Convert to mono if the audio has more than one channel
     if waveform.shape[0] > 1:
         waveform = torch.mean(waveform, dim=0, keepdim=True)
 
-    # Resample audio to 16000 Hz if it's not already
     if sample_rate != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
@@ -103,28 +204,30 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         # Check chunk waveform is properly shaped
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
 
-        if processor.tokenizer.pad_token is None or processor.tokenizer.pad_token_id == processor.tokenizer.eos_token_id:
-            processor.tokenizer.add_special_tokens({'pad_token': '<PAD>'})
-            pad_token_id = processor.tokenizer.convert_tokens_to_ids('<PAD>')
-
-        model.config.pad_token_id = pad_token_id  # update model configuration with new pad token ID
-
-        # Tokenize the input batch with the processor
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, padding="max_length", return_tensors="pt", task="transcribe", device=device)
 
-        # ASR model inference on chunk
-        with torch.no_grad():
-            generated_ids = model.generate(
-                input_features=inputs.input_features.to(device),
-                **generate_kwargs
-            )
 
-        # Decode the generated IDs to text
         chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         full_text.append(chunk_text)
-
-
     # Combine the transcribed text from all chunks
     text = " ".join(full_text)
 
@@ -132,7 +235,6 @@ def transcribe_audio(audio_file, chunk_length_s=30):
 
     # Audio duration (in seconds)
     audio_duration = waveform.shape[1] / sample_rate
-
    # Real-time Factor (RTF)
    rtf = output_time / audio_duration
 
@@ -146,10 +248,44 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         "It is the ratio of transcription time to the duration of the audio.\n\n"
         "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
     )
 
     return text, result
-
-
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
@@ -176,7 +312,7 @@ def summarize_text(text):
     inputs = inputs.to(device)
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
 # Builds similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
@@ -187,7 +323,7 @@ def build_similarity_matrix(sentences, stop_words):
                 similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 
-# "Graph-based summarization" =====>
 def graph_based_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
@@ -262,7 +398,7 @@ iface = gr.Blocks()
 
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <img src=""https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93; ">
    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktæysett no.1</h1>
    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
 </div>
 
 
 import time
 import os
+import re
 import warnings
 from pydub import AudioSegment
+
+import pandas as pd
+import numpy as np
 import torch
 import torchaudio
 import torchaudio.transforms as transforms
+
+
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers.generation.configuration_utils import GenerationConfig  # GenerationConfig lives under transformers.generation
+
 import spacy
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
 import gradio as gr
 from fpdf import FPDF
 from PIL import Image
+# from huggingface_hub import model_info
 
+####################################################################################################
 # Suppress warnings
 warnings.filterwarnings("ignore")
+"""
+def generate(
+    self,
+    input_features: Optional[torch.Tensor] = None,                        # <====================== ACTIVE
+    generation_config: Optional[GenerationConfig] = None,                 # <====================== could be ACTIVE (ed.)*
+    logits_processor: Optional[LogitsProcessorList] = None,
+    stopping_criteria: Optional[StoppingCriteriaList] = None,
+    prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+    synced_gpus: bool = False,
+    return_timestamps: Optional[bool] = None,
+    task: Optional[str] = None,
+    language: Optional[Union[str, List[str]]] = None,                     # <====================== ACTIVE
+    is_multilingual: Optional[bool] = None,
+    prompt_ids: Optional[torch.Tensor] = None,
+    prompt_condition_type: Optional[str] = None,  # first-segment, all-segments
+    condition_on_prev_tokens: Optional[bool] = None,
+    temperature: Optional[Union[float, Tuple[float, ...]]] = None,
+    compression_ratio_threshold: Optional[float] = None,
+    logprob_threshold: Optional[float] = None,
+    no_speech_threshold: Optional[float] = None,
+    num_segment_frames: Optional[int] = None,
+    attention_mask: Optional[torch.Tensor] = None,                        # <====================== NOT ACTIVE by DEFAULT
+    time_precision: float = 0.02,
+    return_token_timestamps: Optional[bool] = None,
+    return_segments: bool = False,
+    return_dict_in_generate: Optional[bool] = None,
+    **kwargs,                                                             # <====================== ACTIVE
+):
+"""
+"""
+generation_config (`~generation.GenerationConfig`, *optional*):
+    The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+    passed to generate matching the attributes of `generation_config` will override them. If
+    `generation_config` is not provided, the default will be used, which has the following loading
+    priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+    configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+    default values, whose documentation should be checked to parameterize generation.
+
+From v4.39 the forced decoder ids are always None in favour of decoder input ids:
+    generation_config.forced_decoder_ids = None
+"""
+
+"""
+Example:
+
+- *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate.
+
+```python
+>>> import torch
+>>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+>>> from datasets import load_dataset, Audio
+
+>>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+>>> model.cuda()  # doctest: +IGNORE_RESULT
+
+>>> # load audios > 30 seconds
+>>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
+>>> # resample to 16kHz
+>>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
+>>> # take first 8 audios and retrieve array
+>>> audio = ds[:8]["audio"]
+>>> audio = [x["array"] for x in audio]
+
+>>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
+>>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000)
+>>> inputs = inputs.to("cuda", torch.float32)
+
+>>> # transcribe audio to ids
+>>> generated_ids = model.generate(**inputs)
+
+>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> transcription[0]
+" Folks, if you watch the show, you know, I spent a lot of time (..)"
+```
+"""
 
 # Convert m4a audio to wav format
 def convert_to_wav(audio_file):
 
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
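# (ed.) Usage sketch for convert_to_wav(); "opptak.m4a" is a hypothetical file name.
"""
wav_path = convert_to_wav("opptak.m4a")             # decodes the m4a and writes/returns "temp.wav"
duration_ms = len(AudioSegment.from_wav(wav_path))  # pydub reports length in milliseconds
"""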
+####################################################################################################
+#
+#
+#
+#
+#---------------------------------------------------------------------------------------------------
 processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
 model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
+device = "cuda" if torch.cuda.is_available() else "cpu"  # summarize_text() below also uses this name
+model.to(device)
 
 
+# Reference excerpt from WhisperForConditionalGeneration.generate, kept as a string so it never executes:
+"""
+# 0. deprecate old inputs
+if "inputs" in kwargs:
+    input_features = kwargs.pop("inputs")
+    warnings.warn(
+        "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
+        FutureWarning,
+    )
+
+# 1. prepare generation config
+generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
+
+# 2. set global generate variables
+#input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
+#num_segment_frames = input_stride * self.config.max_source_positions
+#batch_size, total_input_frames = self._retrieve_total_input_frames(
+#    input_features=input_features, kwargs=kwargs  #input_stride=input_stride,
+#)
+"""
 
 generate_kwargs = {
     "num_beams": 5,
     "language": "no",
+    "task": "transcribe",
+    "forced_decoder_ids": None  # ALT. generation_config.forced_decoder_ids = None
 }
 
 
169
  def transcribe_audio(audio_file, chunk_length_s=30):
170
+ #---------------------------------------------------------------------------------------------------------------------------------------------
171
+ #
172
+ #
173
+ #
174
+ #
175
+ #############################################################################################################################################3
176
  if audio_file.endswith(".m4a"):
177
  audio_file = convert_to_wav(audio_file)
178
 
179
  start_time = time.time()
180
+ # Load waveform using torchaudio
 
181
  waveform, sample_rate = torchaudio.load(audio_file)
182
 
183
  # Convert to mono if the audio has more than one channel
184
  if waveform.shape[0] > 1:
185
  waveform = torch.mean(waveform, dim=0, keepdim=True)
186
 
 
187
  if sample_rate != 16000:
188
  resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
189
  waveform = resampler(waveform)
 
204
  # Check chunk waveform is properly shaped
205
  if chunk_waveform.shape[0] > 1:
206
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
207
+ #############################################################################################################################################3
208
+ #
209
+ #
210
+ #
211
+ #
212
+ #---------------------------------------------------------------------------------------------------------------------------------------------
213
 
214
+ # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
215
+ inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True)
216
+ inputs = inputs.to("cuda", torch.float32)
 
 
 
 
 
217
 
218
+ input_features = inputs.input_features
219
+ # transcribe audio to ids
220
+ generated_ids = model.generate(inputs=input_features,**generate_kwargs)
 
 
 
221
 
222
+ # transcription
223
  chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
224
+ #---------------------------------------------------------------------------------------------------------------------------------------------
225
+ #
226
+ #
227
+ #
228
+ #
229
+ #############################################################################################################################################3
230
  full_text.append(chunk_text)
 
 
231
  # Combine the transcribed text from all chunks
232
  text = " ".join(full_text)
233
 
 
235
 
236
  # Audio duration (in seconds)
237
  audio_duration = waveform.shape[1] / sample_rate
 
238
  # Real-time Factor (RTF)
239
  rtf = output_time / audio_duration
240
 
 
248
  "It is the ratio of transcription time to the duration of the audio.\n\n"
249
  "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
250
  )
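    # (ed.) Worked example of the figure reported above: a 60-second clip transcribed in
    # 30 seconds gives RTF = 30 / 60 = 0.5, i.e. twice as fast as real time.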
+#####################################################################################################
+#
+#
+#
+#
+#---------------------------------------------------------------------------------------------------
 
     return text, result
+#---------------------------------------------------------------------------------------------------
+#
+#
+#
+#
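# (ed.) End-to-end usage sketch for transcribe_audio(); the file name is hypothetical.
"""
text, result = transcribe_audio("intervju.m4a", chunk_length_s=30)
print(result)   # transcription time, audio duration and RTF
print(text)     # full Norwegian transcript
"""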
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
 
     inputs = inputs.to(device)
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
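# (ed.) summarization_tokenizer and summarization_model are loaded elsewhere in app.py; a sketch of
# the setup summarize_text() assumes, with a stand-in checkpoint name.
"""
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")                  # stand-in name
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)   # stand-in name
summary = summarize_text(clean_text(text))
"""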
+# requires updating the pre-trained model weights to match
 # Builds similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
 
             similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
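# (ed.) The graph above weights each sentence pair by its shared-word count. A sketch of how such a
# graph can rank sentences; the exact ranking inside graph_based_summary is not shown in this hunk.
"""
scores = nx.pagerank(similarity_matrix, weight="weight")
top = sorted(scores, key=scores.get, reverse=True)[:3]
summary = " ".join(sentences[i] for i in sorted(top))
"""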
 
+# "Graph-based summarization" =====>
 def graph_based_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
 
 
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+   <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93;">
    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktæysett no.1</h1>
    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
 </div>