Update app.py
app.py CHANGED
@@ -1,23 +1,30 @@
 """
+Version: 4th_pruned_optimized_transcription_app.py
 
-
-
-Webapp, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
+Description: webapp, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
 """
 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 import os
 import warnings
 from pydub import AudioSegment
 import torch
+import torchaudio
 from transformers import pipeline
 from huggingface_hub import model_info
 import spacy
-#import nltk
-#from nltk.tokenize import word_tokenize
-#from nltk.corpus import stopwords
-# from nltk.sem.logic import *
-from nltk.tokenize import PunktTokenizer
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -32,10 +39,6 @@ from PIL import Image
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
-#nltk.download('punkt', quiet=True)
-#nltk.download('stopwords', quiet=True)
-#word_tokenize = PunktTokenizer()
-
 # Convert m4a audio to wav format
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
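The line that builds `wav_file` (old line 42 / new line 45) falls between hunks and never appears in the diff. For context, a minimal self-contained sketch of the conversion helper, with the elided line marked as a hypothetical reconstruction:

import os
from pydub import AudioSegment  # pydub shells out to ffmpeg to decode m4a

def convert_to_wav(audio_file):
    # Decode the m4a container (requires ffmpeg on PATH)
    audio = AudioSegment.from_file(audio_file, format="m4a")
    # Hypothetical reconstruction of the elided line: swap the extension for the output path
    wav_file = os.path.splitext(audio_file)[0] + ".wav"
    audio.export(wav_file, format="wav")
    return wav_file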
@@ -43,18 +46,15 @@ def convert_to_wav(audio_file):
     audio.export(wav_file, format="wav")
     return wav_file
 
-# Initialize device for torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
 
-#
-MODEL_NAME = "NbAiLab/nb-whisper-large"
+# Define model
+MODEL_NAME = "NbAiLab/nb-whisper-large"
 lang = "no"
 
-
-forced_decoder_ids = None
-# get_decoder_prompt_ids =
-
+# Initialize device for torch
 device = 0 if torch.cuda.is_available() else "cpu"
+
+# Define pipeline config
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
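The removed `forced_decoder_ids` scaffolding suggests the goal was to pin the transcription language. In current transformers, the usual way to do that on a Whisper pipeline is through `generate_kwargs` rather than token-id surgery. A minimal sketch, assuming the standard transformers API (the audio path is hypothetical):

import torch
from transformers import pipeline

MODEL_NAME = "NbAiLab/nb-whisper-large"
device = 0 if torch.cuda.is_available() else "cpu"

asr = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
)

# Force Norwegian transcription instead of setting forced_decoder_ids by hand
result = asr("sample.wav", generate_kwargs={"language": "no", "task": "transcribe"})
print(result["text"])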
@@ -62,33 +62,45 @@ pipe = pipeline(
     device=device,
 )
 
-#pipe.model.config.
+#pipe.model.config.pad_token_id = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+# Set eos_token_id and pad_token_id to different values
+pipe.model.config.eos_token_id = 0
+pipe.model.config.pad_token_id = 1
 
+# OR (note: get_decoder_prompt_ids returns forced-decoder (position, token_id)
+# pairs, not a pad token id, so this alternative is left disabled)
+#pipe.model.config.pad_token_id = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
 
+assert pipe.model.config.eos_token_id != pipe.model.config.pad_token_id, \
+    "eos_token_id and pad_token_id must be different"
+
+
+# Transcribe audio
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
+    # Load using torchaudio
+    waveform, sample_rate = torchaudio.load(audio_file)
+
     start_time = time.time()
 
-    text = pipe(audio_file)["text"]
-
-    # Load the audio file using torchaudio
-    #waveform, sample_rate = torchaudio.load(audio_file)
-
-    # Process the waveform with Whisper's processor
-    #input_features = whisper_processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)
-
-    # Generate the transcription
-    #output = whisper_model.generate(input_features=input_features)
-
+    # The ASR pipeline expects mono audio as a numpy array plus its sampling rate
+    text = pipe({"raw": waveform.mean(dim=0).numpy(), "sampling_rate": sample_rate})["text"]
 
     output_time = time.time() - start_time
 
-    #
-    audio_duration =
+    # Calculate audio duration (in seconds)
+    audio_duration = waveform.shape[1] / sample_rate
+
+    # Alternative: derive the duration via the pipeline's feature extractor
+    #audio_duration = pipe.feature_extractor.sampling_rate * len(pipe.feature_extractor(audio_file)["input_features"][0]) / pipe.feature_extractor.sampling_rate
 
     # Real-time Factor calculation
     rtf = output_time / audio_duration
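The `pipe(...)` call above leans on the pipeline's raw-input convention: a dict with the samples and their sampling rate, which the pipeline resamples to the model's expected 16 kHz when needed. A minimal loader sketch under that assumption (the path is hypothetical):

import torchaudio

def load_for_pipe(path):
    # torchaudio returns a (channels, frames) tensor at the file's native rate
    waveform, sample_rate = torchaudio.load(path)
    # Downmix to mono; keep the rate so the pipeline can resample if needed
    mono = waveform.mean(dim=0).numpy()
    return {"raw": mono, "sampling_rate": sample_rate}

# text = pipe(load_for_pipe("meeting.wav"))["text"]  # hypothetical usage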
@@ -99,16 +111,13 @@ def transcribe_audio(audio_file):
         f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
         f"Real-time Factor (RTF): {rtf:.2f}\n"
         f"Number of words: {len(text.split())}\n\n"
-        "Real-time Factor (RTF) is a measure
+        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
         "It is the ratio of transcription time to the duration of the audio.\n\n"
-        "An RTF of less than 1 means the transcription process is faster than real-time."
+        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
     )
 
-    # result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
-
     return text, result
 
-# 'punkt' is used for tokenizing sentences and stopwords for filtering, currently only for English(?)
 
 # Clean and preprocess text for summarization
 def clean_text(text):
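As a concrete check of the reported metric: RTF is transcription time divided by audio duration, so a 10-minute recording transcribed in 90 seconds gives RTF = 90 / 600 = 0.15, i.e. roughly 6.7x faster than real time. A tiny sketch:

def real_time_factor(transcription_seconds, audio_seconds):
    # RTF < 1 means the system transcribes faster than the audio plays back
    return transcription_seconds / audio_seconds

assert abs(real_time_factor(90.0, 600.0) - 0.15) < 1e-9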
@@ -220,7 +229,7 @@ def save_to_pdf(text, summary):
 
 def _return_img_html_embed(img_url):
     HTML_str = (
-        f'<center
+        f'<center><img src="{img_url}" alt="Image" style="width:100%; height:auto;"></center>'
     )
     return HTML_str
 
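A usage sketch for the repaired embed helper (the URL is hypothetical), assuming the app hands the string to an HTML component as Gradio-based Spaces typically do:

html = _return_img_html_embed("https://example.com/waveform.png")  # hypothetical URL
# gr.HTML(value=html)  # if the app renders it through a Gradio HTML block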