camparchimedes committed on
Commit 351252d · verified · 1 Parent(s): 783b278

Update app.py

Files changed (1)
  1. app.py +47 -38
app.py CHANGED
@@ -1,23 +1,30 @@
"""

- This application processes audio files, transcribes them using a pretrained model (Whisper), and provides multiple summarization options for the transcribed text. The application also includes a PDF generation feature and is built with Gradio for the user interface.
-
- Web app, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
"""

import time
import os
import warnings
from pydub import AudioSegment
import torch
from transformers import pipeline
from huggingface_hub import model_info
import spacy
- #import nltk
- #from nltk.tokenize import word_tokenize
- #from nltk.corpus import stopwords
- # from nltk.sem.logic import *
- from nltk.tokenize import PunktTokenizer
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
@@ -32,10 +39,6 @@ from PIL import Image
# Suppress warnings
warnings.filterwarnings("ignore")

- #nltk.download('punkt', quiet=True)
- #nltk.download('stopwords', quiet=True)
- #word_tokenize = PunktTokenizer()
-
# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
@@ -43,18 +46,15 @@ def convert_to_wav(audio_file):
    audio.export(wav_file, format="wav")
    return wav_file

- # Initialize device for torch
- device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load smoking-big-pipe
- MODEL_NAME = "NbAiLab/nb-whisper-medium"
lang = "no"

- task = "transcribe"
- forced_decoder_ids = None
- # get_decoder_prompt_ids =
-
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
@@ -62,33 +62,45 @@ pipe = pipeline(
    device=device,
)

- #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

-     text = pipe(audio_file)["text"]

-     # Load the audio file using torchaudio
-     #waveform, sample_rate = torchaudio.load(audio_file)

-     # Process the waveform with Whisper's processor
-     #input_features = whisper_processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)

-     # Generate the transcription
-     #output = whisper_model.generate(input_features=input_features)

-     # Decode the output
-     #text = whisper_processor.batch_decode(output, skip_special_tokens=True)[0]

    output_time = time.time() - start_time

-     # Calculate audio duration using the pipeline's internal method
-     audio_duration = pipe.feature_extractor.sampling_rate * len(pipe.feature_extractor(audio_file)["input_features"][0]) / pipe.feature_extractor.sampling_rate

    # Real-time Factor calculation
    rtf = output_time / audio_duration
@@ -99,16 +111,13 @@ def transcribe_audio(audio_file):
        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
        f"Real-time Factor (RTF): {rtf:.2f}\n"
        f"Number of words: {len(text.split())}\n\n"
-         "Real-time Factor (RTF) is a measure often used to evaluate the speed of speech recognition systems. "
        "It is the ratio of transcription time to the duration of the audio.\n\n"
-         "An RTF of less than 1 means the transcription process is faster than real-time."
    )

-     # result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
-
    return text, result

- # 'punkt' is used for tokenizing sentences and stopwords for filtering, currently only for English(?)

# Clean and preprocess text for summarization
def clean_text(text):
@@ -220,7 +229,7 @@ def save_to_pdf(text, summary):

def _return_img_html_embed(img_url):
    HTML_str = (
-         f'<center> <img src="{img_url}" alt="Image" width="100%" height="auto"> </center>'
    )
    return HTML_str
 
 
"""
+ Version: 4th_pruned_optimized_transcription_app.py

+ Description: web app, transcription (Norwegian), NbAiLab/nb-whisper-large, summarization, PDF download.
"""

+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
import time
import os
import warnings
from pydub import AudioSegment
import torch
+ import torchaudio
from transformers import pipeline
from huggingface_hub import model_info
import spacy
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
 
# Suppress warnings
warnings.filterwarnings("ignore")

# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")

    audio.export(wav_file, format="wav")
    return wav_file
 
 
 
+ # Define model
+ MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"

+ # Initialize device for torch
device = 0 if torch.cuda.is_available() else "cpu"
+
+ # Define pipeline config
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,

    device=device,
)

+ #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+ # Set eos_token_id and pad_token_id to different values
+ pipe.model.config.eos_token_id = 0
+ pipe.model.config.pad_token_id = 1
+
+ # Note: tokenizer.get_decoder_prompt_ids() returns forced decoder prompt ids,
+ # not a single pad token id, so its result must not be assigned to pad_token_id.
+
+ assert pipe.model.config.eos_token_id != pipe.model.config.pad_token_id, \
+     "eos_token_id and pad_token_id must be different"

+ # Transcribe audio
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

+     # Load using torchaudio
+     waveform, sample_rate = torchaudio.load(audio_file)
+
    start_time = time.time()
 
 
+     # The ASR pipeline expects a raw numpy array and its sampling rate, not a torch.Tensor
+     text = pipe({"raw": waveform.squeeze().numpy(), "sampling_rate": sample_rate})["text"]

    output_time = time.time() - start_time

+     # Calculate audio duration (in seconds)
+     audio_duration = waveform.shape[1] / sample_rate
+
+     # Alternative: audio duration via the pipeline's feature extractor
+     #audio_duration = pipe.feature_extractor.sampling_rate * len(pipe.feature_extractor(audio_file)["input_features"][0]) / pipe.feature_extractor.sampling_rate

    # Real-time Factor calculation
    rtf = output_time / audio_duration
 
        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
        f"Real-time Factor (RTF): {rtf:.2f}\n"
        f"Number of words: {len(text.split())}\n\n"
+         "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
        "It is the ratio of transcription time to the duration of the audio.\n\n"
+         "An RTF of less than 1 means the transcription process is faster than real-time (the expected case)."
    )

    return text, result

# Clean and preprocess text for summarization
def clean_text(text):
 

def _return_img_html_embed(img_url):
    HTML_str = (
+         f'<center><img src="{img_url}" alt="Image" style="width:100%; height:auto;"></center>'
    )
    return HTML_str
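
A note on the conversion helper above: pydub shells out to ffmpeg, so ffmpeg must be on PATH. A minimal self-contained sketch of the helper follows; the wav_file naming line is an assumption, since the committed assignment sits on an unchanged line the hunks do not show.

# Sketch of convert_to_wav; pydub requires ffmpeg on PATH.
from pydub import AudioSegment

def convert_to_wav(audio_file: str) -> str:
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = audio_file.rsplit(".", 1)[0] + ".wav"  # hypothetical naming
    audio.export(wav_file, format="wav")
    return wav_file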
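On the new transcription call: the transformers ASR pipeline accepts a file path, raw bytes, a 1-D numpy array, or a dict carrying the raw array and its sampling rate, so a torch.Tensor from torchaudio.load needs downmixing and conversion first. A minimal end-to-end sketch of the flow this commit targets, assuming a placeholder sample.wav test file and the model name from the diff:

# Minimal sketch, not the committed app: feed a torchaudio waveform to the
# ASR pipeline and compute the Real-time Factor (RTF).
import time

import torch
import torchaudio
from transformers import pipeline

MODEL_NAME = "NbAiLab/nb-whisper-large"
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, device=device)

waveform, sample_rate = torchaudio.load("sample.wav")  # placeholder file; shape (channels, frames)
mono = waveform.mean(dim=0).numpy()  # downmix: the pipeline wants a 1-D array

start = time.time()
text = pipe({"raw": mono, "sampling_rate": sample_rate})["text"]
output_time = time.time() - start

audio_duration = mono.shape[0] / sample_rate  # seconds
rtf = output_time / audio_duration
print(f"RTF: {rtf:.2f}")
print(text)

Worked RTF example: 150 seconds of processing for a 600-second recording gives RTF = 150 / 600 = 0.25, i.e. four times faster than real time, matching the "RTF of less than 1" note in the result string.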
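On the eos/pad override: rather than hard-coding 0 and 1, the ids a checkpoint already ships can be inspected first. A diagnostic sketch, assuming pipe is the pipeline built in the sketch above:

# Inspect the special token ids the checkpoint defines, rather than guessing.
# Assumption: `pipe` is the ASR pipeline constructed in the previous sketch.
cfg = pipe.model.config
gen = pipe.model.generation_config
print("config:     eos =", cfg.eos_token_id, "pad =", cfg.pad_token_id)
print("generation: eos =", gen.eos_token_id, "pad =", gen.pad_token_id)
# Some Whisper checkpoints reuse one id for both eos and pad, which is the
# situation the commit's eos_token_id=0 / pad_token_id=1 override works around.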