camparchimedes committed on
Commit
85002a1
·
verified ·
1 Parent(s): 2e6be6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -16
app.py CHANGED
@@ -15,10 +15,13 @@
15
  #---------------------------------------------------------------------------------------------------------------------------------------------
16
 
17
 
18
- import time
19
  import os
20
  import re
21
  import warnings
 
 
 
 
22
  from pydub import AudioSegment
23
 
24
  import pandas as pd
@@ -36,14 +39,20 @@ from sklearn.metrics.pairwise import cosine_similarity
36
 
37
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
38
 
 
39
  import gradio as gr
40
  from fpdf import FPDF
41
  from PIL import Image
42
 
43
- title = """# Welcome to 🌟Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download
44
- Query Attention, a context window of 16,384 tokens with a sliding window attention of 4,096 tokens, and was trained using the Fill-in-the-Middle objective on 4+ trillion tokens. The model was trained with NVIDIA NeMo™ Framework using the NVIDIA Eos Supercomputer built with NVIDIA DGX H100 systems. You can build with this endpoint using✨StarCoder available here : [bigcode/starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b). You can also use ✨StarCoder by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/starcoder2?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
45
- Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [SciTonic](https://github.com/Tonic-AI/scitonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
46
- """
 
 
 
 
 
47
 
48
  warnings.filterwarnings("ignore")
49
 
@@ -51,32 +60,56 @@ def convert_to_wav(audio_file):
51
  audio = AudioSegment.from_file(audio_file, format="m4a")
52
  wav_file = "temp.wav"
53
  audio.export(wav_file, format="wav")
54
-
55
  return wav_file
56
 
 
 
 
 
 
 
 
57
  #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
58
- asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large")
 
 
 
 
 
 
 
 
59
 
60
  def transcribe_audio(audio_file, batch_size=4):
61
  if audio_file.endswith(".m4a"):
62
  audio_file = convert_to_wav(audio_file)
63
-
64
  start_time = time.time()
65
 
66
- outputs = asr(audio_file, chunk_length_s=28, batch_size=batch_size, return_timestamps=False, generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'}, skip_special_tokens=True)
67
- text = outputs["text", skip_special_tokens=True]
68
 
69
  end_time = time.time()
70
  output_time = end_time - start_time
71
  word_count = len(text.split())
72
 
73
  result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
74
-
75
 
76
  return text.strip(), result
77
- #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
 
 
 
 
 
 
 
78
 
79
 
 
 
80
  # Clean and preprocess text
81
  def clean_text(text):
82
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
@@ -187,16 +220,26 @@ def save_to_pdf(text, summary):
187
 
188
  iface = gr.Blocks()
189
 
 
 
 
 
 
 
 
190
 
191
  with iface:
192
- gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="Image" style="width: 100%; height: auto;"></a>')
193
- gr.Markdown(title)
 
 
 
194
 
195
  with gr.Tabs():
196
  with gr.TabItem("Transcription"):
197
  audio_input = gr.Audio(type="filepath")
198
- text_output = gr.Textbox(label="Transcription")
199
- result_output = gr.Textbox(label="Details")
200
  transcribe_button = gr.Button("Transcribe")
201
 
202
  transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
 
15
  #---------------------------------------------------------------------------------------------------------------------------------------------
16
 
17
 
 
18
  import os
19
  import re
20
  import warnings
21
+ import time
22
+ import datetime
23
+ import subprocess
24
+ from pathlib import Path
25
  from pydub import AudioSegment
26
 
27
  import pandas as pd
 
39
 
40
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
41
 
42
+ from gpuinfo import GPUInfo
43
  import gradio as gr
44
  from fpdf import FPDF
45
  from PIL import Image
46
 
# Markdown heading rendered at the top of the app.
HEADER_INFO = """
# WEB APP ✨| Norwegian WHISPER Model
Switch Work [Transkribering av lydfiler til norsk skrift]
""".strip()

# Direct link to the logo file. The Hub's ".../blob/main/..." URLs return the
# HTML file-viewer page, which cannot be used as an <img> source; the
# ".../resolve/main/..." form serves the raw file.
LOGO = "https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/pic09w9678yhit.png"

# Small centered logo snippet; the <div> is closed so later markup is not
# swallowed by the centered container.
SIDEBAR_INFO = f"""
<div align=center>
<img src="{LOGO}" width=100/>
</div>"""
55
+
56
 
57
  warnings.filterwarnings("ignore")
58
 
 
60
  audio = AudioSegment.from_file(audio_file, format="m4a")
61
  wav_file = "temp.wav"
62
  audio.export(wav_file, format="wav")
 
63
  return wav_file
64
 
65
+ #def convert_to_wav(filepath):
66
+ #_,file_ending = os.path.splitext(f'{filepath}')
67
+ #audio_file = filepath.replace(file_ending, ".wav")
68
+ #print("starting conversion to wav")
69
+ #os.system(f'ffmpeg -i "{filepath}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
70
+ #return audio_file
71
+
72
  #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
73
+
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Speech-recognition pipeline, created once at import time and reused by
# every transcription request.
pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    device=device,
    chunk_length_s=30,
)
82
 
def _format_system_info():
    """Return a best-effort, human-readable summary of RAM and GPU usage.

    Each section is included only when its optional dependency is importable
    and its probe succeeds; an empty string is returned when nothing could be
    collected.
    """
    lines = []

    try:
        import psutil
    except ImportError:
        psutil = None
    if psutil is not None:
        memory = psutil.virtual_memory()
        gib = 1024 * 1024 * 1024
        lines.append(
            f"Memory: {memory.total / gib:.2f}GB, used: {memory.percent}%, "
            f"available: {memory.available / gib:.2f}GB."
        )

    try:
        from gpuinfo import GPUInfo

        gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    except Exception:
        # Diagnostics only: a missing package or GPU tool must never make a
        # finished transcription fail, so any probe error just skips the line.
        pass
    else:
        # Report the first entry of each list, or 0 when the list is empty.
        first_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
        first_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
        lines.append(f"GPU Utilization: {first_utilization}%, GPU Memory: {first_memory}")

    return "\n".join(lines)


def transcribe_audio(audio_file, batch_size=4):
    """Transcribe an audio file with the module-level ``pipe``.

    Args:
        audio_file: Path of the audio file to transcribe. Files ending in
            ``.m4a`` (any letter case) are first converted with
            ``convert_to_wav``. ``None`` or an empty string means no file was
            supplied.
        batch_size: Forwarded to ``pipe`` as its ``batch_size`` argument.

    Returns:
        A ``(text, result)`` tuple: the stripped transcription, and a details
        string with the transcription, elapsed time, word count and, when
        available, memory/GPU usage. With no input file the tuple is
        ``("", "No audio file provided.")``.
    """
    # The button can be clicked before any audio is uploaded; answer with a
    # message instead of failing on ``None.endswith``.
    if not audio_file:
        return "", "No audio file provided."

    if audio_file.lower().endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    outputs = pipe(
        audio_file,
        batch_size=batch_size,
        return_timestamps=False,
        generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"},
    )
    output_time = time.perf_counter() - start_time

    text = outputs["text"].strip()
    word_count = len(text.split())

    result = (
        f"Transcription: {text}\n\n"
        f"Time taken: {output_time:.2f} seconds\n"
        f"Number of words: {word_count}"
    )

    # The system statistics used to sit after the ``return`` and referenced
    # an undefined ``time_diff``, so they never reached the user. The elapsed
    # time is already reported above; only memory/GPU usage is appended.
    system_info = _format_system_info()
    if system_info:
        result = f"{result}\n\n{system_info}"

    return text, result
109
 
110
 
111
+ #:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
112
+
113
  # Clean and preprocess text
114
  def clean_text(text):
115
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
 
220
 
221
  iface = gr.Blocks()
222
 
# HTML splash block: logo, product name and a one-line description of the app.
# The image uses the Hub's ".../resolve/main/..." URL because ".../blob/main/..."
# serves the HTML file-viewer page rather than the image itself.
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93; ">
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktøysett no.1</h1>
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
</div>
"""
230
 
with iface:
    # Page header: logo image, the HTML logo snippet, then the Markdown title.
    # gr.Image receives the URL string itself; the previous ``{LOGO}`` was a
    # set literal wrapping the string, which is not a valid image value.
    gr.Image(LOGO)
    gr.HTML(SIDEBAR_INFO)
    gr.Markdown(HEADER_INFO)

    with gr.Tabs():
        with gr.TabItem("Transcription"):
            # type="filepath" makes Gradio hand the handler a path string.
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Transcription Details")
            transcribe_button = gr.Button("Transcribe")

            # transcribe_audio returns (text, details), matching the two
            # output boxes in order.
            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])