Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -15,10 +15,13 @@
|
|
15 |
#---------------------------------------------------------------------------------------------------------------------------------------------
|
16 |
|
17 |
|
18 |
-
import time
|
19 |
import os
|
20 |
import re
|
21 |
import warnings
|
|
|
|
|
|
|
|
|
22 |
from pydub import AudioSegment
|
23 |
|
24 |
import pandas as pd
|
@@ -36,14 +39,20 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
36 |
|
37 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
38 |
|
|
|
39 |
import gradio as gr
|
40 |
from fpdf import FPDF
|
41 |
from PIL import Image
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
warnings.filterwarnings("ignore")
|
49 |
|
@@ -51,32 +60,56 @@ def convert_to_wav(audio_file):
|
|
51 |
audio = AudioSegment.from_file(audio_file, format="m4a")
|
52 |
wav_file = "temp.wav"
|
53 |
audio.export(wav_file, format="wav")
|
54 |
-
|
55 |
return wav_file
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
def transcribe_audio(audio_file, batch_size=4):
|
61 |
if audio_file.endswith(".m4a"):
|
62 |
audio_file = convert_to_wav(audio_file)
|
63 |
-
|
64 |
start_time = time.time()
|
65 |
|
66 |
-
outputs =
|
67 |
-
text = outputs["text"
|
68 |
|
69 |
end_time = time.time()
|
70 |
output_time = end_time - start_time
|
71 |
word_count = len(text.split())
|
72 |
|
73 |
result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
|
74 |
-
|
75 |
|
76 |
return text.strip(), result
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
|
|
|
|
|
80 |
# Clean and preprocess text
|
81 |
def clean_text(text):
|
82 |
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
|
@@ -187,16 +220,26 @@ def save_to_pdf(text, summary):
|
|
187 |
|
188 |
iface = gr.Blocks()
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
with iface:
|
192 |
-
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="
|
193 |
-
gr.Markdown(
|
|
|
|
|
|
|
194 |
|
195 |
with gr.Tabs():
|
196 |
with gr.TabItem("Transcription"):
|
197 |
audio_input = gr.Audio(type="filepath")
|
198 |
-
text_output = gr.Textbox(label="
|
199 |
-
result_output = gr.Textbox(label="Details")
|
200 |
transcribe_button = gr.Button("Transcribe")
|
201 |
|
202 |
transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
|
|
|
15 |
#---------------------------------------------------------------------------------------------------------------------------------------------
|
16 |
|
17 |
|
|
|
18 |
import os
|
19 |
import re
|
20 |
import warnings
|
21 |
+
import time
|
22 |
+
import datetime
|
23 |
+
import subprocess
|
24 |
+
from pathlib import Path
|
25 |
from pydub import AudioSegment
|
26 |
|
27 |
import pandas as pd
|
|
|
39 |
|
40 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
41 |
|
42 |
+
from gpuinfo import GPUInfo
|
43 |
import gradio as gr
|
44 |
from fpdf import FPDF
|
45 |
from PIL import Image
|
46 |
|
47 |
+
# Markdown heading rendered at the top of the app.
HEADER_INFO = """
# WEB APP ✨| Norwegian WHISPER Model
Switch Work [Transkribering av lydfiler til norsk skrift]
""".strip()

# Direct link to the logo file. The Hub's "/resolve/<revision>/" route serves
# the raw file; "/blob/<revision>/" returns the HTML file-viewer page, which
# an <img> tag or image component cannot display.
LOGO = "https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/pic09w9678yhit.png"

# Centered logo markup for an HTML component. The wrapping <div> is closed
# explicitly so the snippet is well-formed on its own.
SIDEBAR_INFO = f"""
<div align=center>
<img src="{LOGO}" width=100/>
</div>"""
|
55 |
+
|
56 |
|
57 |
warnings.filterwarnings("ignore")
|
58 |
|
|
|
60 |
audio = AudioSegment.from_file(audio_file, format="m4a")
|
61 |
wav_file = "temp.wav"
|
62 |
audio.export(wav_file, format="wav")
|
|
|
63 |
return wav_file
|
64 |
|
65 |
+
#def convert_to_wav(filepath):
|
66 |
+
#_,file_ending = os.path.splitext(f'{filepath}')
|
67 |
+
#audio_file = filepath.replace(file_ending, ".wav")
|
68 |
+
#print("starting conversion to wav")
|
69 |
+
#os.system(f'ffmpeg -i "{filepath}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
|
70 |
+
#return audio_file
|
71 |
+
|
72 |
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
73 |
+
|
74 |
+
# Default to the CPU and switch to the GPU only when CUDA reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Shared speech-recognition pipeline, built once at import time and reused by
# every transcription request. Audio is processed in 30-second chunks.
pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    device=device,
)
|
82 |
|
83 |
def _format_system_info(time_diff):
    """Return a Markdown snippet with memory, timing and GPU figures.

    Args:
        time_diff: Processing time in seconds (a float) to report.

    Returns:
        A multi-line string starting with a newline, one figure per line.
    """
    # NOTE(review): `psutil` is not imported in the part of the file visible
    # here; confirm the import exists before enabling this code path.
    memory = psutil.virtual_memory()
    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    # gpu_usage() results are indexed per device; report the first, or 0 when
    # the list is empty.
    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
    return (
        f"\n*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*"
        f"\n*Processing time: {time_diff:.5} seconds.*"
        f"\n*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}"
    )


def transcribe_audio(audio_file, batch_size=4, *, include_system_info=False):
    """Transcribe an audio file and report timing and word count.

    Args:
        audio_file: Path to the audio file. ``None`` or an empty string (for
            example when the button is clicked before anything is uploaded)
            yields an explanatory message instead of raising.
        batch_size: Batch size forwarded to the speech-recognition pipeline.
        include_system_info: When true, append the memory/GPU report from
            ``_format_system_info`` to the details string. Off by default so
            the default output is unchanged.

    Returns:
        A ``(text, details)`` tuple: the stripped transcription, and a
        summary string with the transcription, elapsed time and word count.
    """
    if not audio_file:
        return "", "No audio file provided."

    # Match the extension case-insensitively so ".M4A" is converted as well.
    if audio_file.lower().endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()
    outputs = pipe(
        audio_file,
        batch_size=batch_size,
        return_timestamps=False,
        generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'},
    )
    text = outputs["text"].strip()
    output_time = time.time() - start_time

    word_count = len(text.split())
    result = f"Transcription: {text}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"

    # The system report previously sat after the `return` and read an
    # undefined `time_diff`; it now receives the measured time explicitly.
    if include_system_info:
        result += "\n" + _format_system_info(output_time)

    return text, result
|
109 |
|
110 |
|
111 |
+
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
112 |
+
|
113 |
# Clean and preprocess text
|
114 |
def clean_text(text):
|
115 |
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
|
|
|
220 |
|
221 |
iface = gr.Blocks()
|
222 |
|
223 |
+
# HTML banner: logo, title and a short description of the app.
# Fixes relative to the previous text: the <img> src attribute had a doubled
# opening quote (src=""https://...), the URL used the Hub's "/blob/" page
# route instead of the raw-file "/resolve/" route, and the heading misspelled
# "Verktøysett".
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93; ">
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktøysett no.1</h1>
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
</div>
"""
|
230 |
|
231 |
with iface:
|
232 |
+
#gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.55; >')
|
233 |
+
#gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
|
234 |
+
# Logo image. Pass the URL string itself: writing {LOGO} builds a one-element
# set, which is not a value the Image component can load.
gr.Image(LOGO)
# Centered logo markup, followed by the Markdown heading.
gr.HTML(SIDEBAR_INFO)
gr.Markdown(HEADER_INFO)
|
237 |
|
238 |
with gr.Tabs():
|
239 |
with gr.TabItem("Transcription"):
|
240 |
audio_input = gr.Audio(type="filepath")
|
241 |
+
text_output = gr.Textbox(label="Text")
|
242 |
+
result_output = gr.Textbox(label="Transcription Details")
|
243 |
transcribe_button = gr.Button("Transcribe")
|
244 |
|
245 |
transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
|