camparchimedes committed on
Commit 351252d · verified · 1 Parent(s): 783b278

Update app.py

Files changed (1)
  1. app.py +47 -38
app.py CHANGED
@@ -1,23 +1,30 @@
"""

- This application processes audio files, transcribes them using a pretrained model (Whisper), and provides multiple summarization options for the transcribed text. The application also includes a PDF generation feature and is built with Gradio for the user interface.
-
- Web app, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
"""

import time
import os
import warnings
from pydub import AudioSegment
import torch
from transformers import pipeline
from huggingface_hub import model_info
import spacy
- #import nltk
- #from nltk.tokenize import word_tokenize
- #from nltk.corpus import stopwords
- # from nltk.sem.logic import *
- from nltk.tokenize import PunktTokenizer
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
@@ -32,10 +39,6 @@ from PIL import Image
# Suppress warnings
warnings.filterwarnings("ignore")

- #nltk.download('punkt', quiet=True)
- #nltk.download('stopwords', quiet=True)
- #word_tokenize = PunktTokenizer()
-
# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
@@ -43,18 +46,15 @@ def convert_to_wav(audio_file):
    audio.export(wav_file, format="wav")
    return wav_file

- # Initialize device for torch
- device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load smoking-big-pipe
- MODEL_NAME = "NbAiLab/nb-whisper-medium"
lang = "no"

- task = "transcribe"
- forced_decoder_ids = None
- # get_decoder_prompt_ids =
-
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
@@ -62,33 +62,45 @@ pipe = pipeline(
    device=device,
)

- #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

-     text = pipe(audio_file)["text"]

-     # Load the audio file using torchaudio
-     #waveform, sample_rate = torchaudio.load(audio_file)

-     # Process the waveform with Whisper's processor
-     #input_features = whisper_processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)

-     # Generate the transcription
-     #output = whisper_model.generate(input_features=input_features)

-     # Decode the output
-     #text = whisper_processor.batch_decode(output, skip_special_tokens=True)[0]

    output_time = time.time() - start_time

-     # Calculate audio duration using the pipeline's internal method
-     audio_duration = pipe.feature_extractor.sampling_rate * len(pipe.feature_extractor(audio_file)["input_features"][0]) / pipe.feature_extractor.sampling_rate

    # Real-time Factor calculation
    rtf = output_time / audio_duration
@@ -99,16 +111,13 @@ def transcribe_audio(audio_file):
        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
        f"Real-time Factor (RTF): {rtf:.2f}\n"
        f"Number of words: {len(text.split())}\n\n"
-         "Real-time Factor (RTF) is a measure often used to evaluate the speed of speech recognition systems. "
        "It is the ratio of transcription time to the duration of the audio.\n\n"
-         "An RTF of less than 1 means the transcription process is faster than real-time."
    )

-     # result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
-
    return text, result

- # 'punkt' is used for tokenizing sentences and stopwords for filtering, currently only for English(?)

# Clean and preprocess text for summarization
def clean_text(text):
@@ -220,7 +229,7 @@ def save_to_pdf(text, summary):

def _return_img_html_embed(img_url):
    HTML_str = (
-         f'<center> <img src="{img_url}" alt="Image" width="100%" height="auto"> </center>'
    )
    return HTML_str
 
 
"""
+ Version: 4th_pruned_optimized_transcription_app.py

+ Description: web app, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
"""

+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
import time
import os
import warnings
from pydub import AudioSegment
import torch
+ import torchaudio
from transformers import pipeline
from huggingface_hub import model_info
import spacy
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
 
# Suppress warnings
warnings.filterwarnings("ignore")

# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")

    audio.export(wav_file, format="wav")
    return wav_file
 
 
 
+ # Define model
+ MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"

+ # Initialize device for torch
device = 0 if torch.cuda.is_available() else "cpu"
+
+ # Define pipeline config
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,

    device=device,
)

+ #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+ # Set eos_token_id and pad_token_id to different values
+ pipe.model.config.eos_token_id = 0
+ pipe.model.config.pad_token_id = 1
+
+ # Note: tokenizer.get_decoder_prompt_ids() returns forced decoder prompt ids,
+ # not a single pad token id, so its result must not be assigned to pad_token_id.
+
+ assert pipe.model.config.eos_token_id != pipe.model.config.pad_token_id, \
+     "eos_token_id and pad_token_id must be different"

+ # Transcribe audio
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

+     # Load using torchaudio
+     waveform, sample_rate = torchaudio.load(audio_file)
+
    start_time = time.time()
 
 
+     # The ASR pipeline expects a raw numpy array and its sampling rate, not a torch.Tensor
+     text = pipe({"raw": waveform.squeeze().numpy(), "sampling_rate": sample_rate})["text"]

    output_time = time.time() - start_time

+     # Calculate audio duration (in seconds)
+     audio_duration = waveform.shape[1] / sample_rate
+
+     # Alternative: audio duration via the pipeline's feature extractor
+     #audio_duration = pipe.feature_extractor.sampling_rate * len(pipe.feature_extractor(audio_file)["input_features"][0]) / pipe.feature_extractor.sampling_rate

    # Real-time Factor calculation
    rtf = output_time / audio_duration
 
        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
        f"Real-time Factor (RTF): {rtf:.2f}\n"
        f"Number of words: {len(text.split())}\n\n"
+         "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
        "It is the ratio of transcription time to the duration of the audio.\n\n"
+         "An RTF of less than 1 means the transcription process is faster than real-time (the expected case)."
    )

    return text, result

# Clean and preprocess text for summarization
def clean_text(text):
 

def _return_img_html_embed(img_url):
    HTML_str = (
+         f'<center><img src="{img_url}" alt="Image" style="width:100%; height:auto;"></center>'
    )
    return HTML_str
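
A note on the conversion helper above: pydub shells out to ffmpeg, so ffmpeg must be on PATH. A minimal self-contained sketch of the helper follows; the wav_file naming line is an assumption, since the committed assignment sits on an unchanged line the hunks do not show.

# Sketch of convert_to_wav; pydub requires ffmpeg on PATH.
from pydub import AudioSegment

def convert_to_wav(audio_file: str) -> str:
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = audio_file.rsplit(".", 1)[0] + ".wav"  # hypothetical naming
    audio.export(wav_file, format="wav")
    return wav_file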
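On the new transcription call: the transformers ASR pipeline accepts a file path, raw bytes, a 1-D numpy array, or a dict carrying the raw array and its sampling rate, so a torch.Tensor from torchaudio.load needs downmixing and conversion first. A minimal end-to-end sketch of the flow this commit targets, assuming a placeholder sample.wav test file and the model name from the diff:

# Minimal sketch, not the committed app: feed a torchaudio waveform to the
# ASR pipeline and compute the Real-time Factor (RTF).
import time

import torch
import torchaudio
from transformers import pipeline

MODEL_NAME = "NbAiLab/nb-whisper-large"
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, device=device)

waveform, sample_rate = torchaudio.load("sample.wav")  # placeholder file; shape (channels, frames)
mono = waveform.mean(dim=0).numpy()  # downmix: the pipeline wants a 1-D array

start = time.time()
text = pipe({"raw": mono, "sampling_rate": sample_rate})["text"]
output_time = time.time() - start

audio_duration = mono.shape[0] / sample_rate  # seconds
rtf = output_time / audio_duration
print(f"RTF: {rtf:.2f}")
print(text)

Worked RTF example: 150 seconds of processing for a 600-second recording gives RTF = 150 / 600 = 0.25, i.e. four times faster than real time, matching the "RTF of less than 1" note in the result string.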
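On the eos/pad override: rather than hard-coding 0 and 1, the ids a checkpoint already ships can be inspected first. A diagnostic sketch, assuming pipe is the pipeline built in the sketch above:

# Inspect the special token ids the checkpoint defines, rather than guessing.
# Assumption: `pipe` is the ASR pipeline constructed in the previous sketch.
cfg = pipe.model.config
gen = pipe.model.generation_config
print("config:     eos =", cfg.eos_token_id, "pad =", cfg.pad_token_id)
print("generation: eos =", gen.eos_token_id, "pad =", gen.pad_token_id)
# Some Whisper checkpoints reuse one id for both eos and pad, which is the
# situation the commit's eos_token_id=0 / pad_token_id=1 override works around.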