camparchimedes committed
Commit 3320156 · verified · 1 Parent(s): 5d54943

Update app.py

Files changed (1)
  1. app.py +99 -106
app.py CHANGED
@@ -1,6 +1,6 @@
- #!/usr/bin/env python
  ### -----------------------------------------------------------------------
- ### (test_BASE, Revised) version_1.07 ALPHA, app.py
  ### -----------------------------------------------------------------------

  # -------------------------------------------------------------------------
@@ -17,122 +17,135 @@
  # limitations under the License.
  # -------------------------------------------------------------------------

  import os
  import re
  import uuid
  import time
  import psutil
- import pydub
  import subprocess
  from tqdm import tqdm
-
  import tempfile
  from fpdf import FPDF
  from pathlib import Path
-
  import numpy as np
- import soundfile as sf
- import librosa
  import torch
- from transformers import pipeline
-
  from gpuinfo import GPUInfo
-
  import gradio as gr


  ###############################################################################
- # Configuration.
  ###############################################################################

- #if not torch.cuda.is_available():
- #DESCRIPTION += "\n<p>⚠️Running on CPU, This may not work on CPU.</p>"

- #CACHE_EXAMPLES = torch.device('cuda') and os.getenv("CACHE_EXAMPLES", "0") == "1"
- #CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "0") == "1"
- #USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
- #ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"

- device = torch.device('cuda')
- #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


- def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)): # microphone

-     file = file_upload
      start_time = time.time()

-     #--------------____________________________________________--------------"
-     with torch.no_grad():
-         pipe = pipeline("automatic-speech-recognition",
-                         model="NbAiLab/nb-whisper-large",
-                         chunk_length_s=30,
-                         device=device)
-
-         pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")
-
-         text = pipe(file)["text"]
-     #--------------____________________________________________--------------"

      end_time = time.time()
      output_time = end_time - start_time
-
-     # --word count
      word_count = len(text.split())

-     # --memory metrics
-     memory = psutil.virtual_memory()
-
-     # --cpu metric
      cpu_usage = psutil.cpu_percent(interval=1)

-     # --gpu metric
-     gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
-
      # --system info string
      system_info = f"""
  Processing time: {output_time:.2f} seconds.
  Number of words: {word_count}
  CPU Usage: {cpu_usage}%
- GPU Memory: {gpu_memory}%
- GPU Utilization: {gpu_utilization}%
  """

-     return text.strip(), system_info


  ###############################################################################
- # Interface.
  ###############################################################################

  HEADER_INFO = """
- # SWITCHVOX ✨|🇳🇴 *Transkribering av lydfiler til norsk bokmål.*
  """.strip()
- LOGO = "https://cdn-lfs-us-1.huggingface.co/repos/fe/3b/fe3bd7c8beece8b087fddcc2278295e7f56c794c8dcf728189f4af8bddc585e1/24ad06a03a5bc66f3eba361b94e45ad17e46f98b76632f2d17faf8a0b4f9ab6b?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27banner_trans.png%3B+filename%3D%22banner_trans.png%22%3B&response-content-type=image%2Fpng&Expires=1726757282&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNjc1NzI4Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2ZlLzNiL2ZlM2JkN2M4YmVlY2U4YjA4N2ZkZGNjMjI3ODI5NWU3ZjU2Yzc5NGM4ZGNmNzI4MTg5ZjRhZjhiZGRjNTg1ZTEvMjRhZDA2YTAzYTViYzY2ZjNlYmEzNjFiOTRlNDVhZDE3ZTQ2Zjk4Yjc2NjMyZjJkMTdmYWY4YTBiNGY5YWI2Yj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=HB0ypHpwK3dgXHqU5a3oBoR-OlPTV-Zlo-QzpvVD8DOlYvLCIwheHxh6OFUSFiWt1qEhWaelL71O1Rx5EwHG8L6oKbOVEvvrHzZjIJ9RD2YlOlx96EG5ZlaVdAlT0trDwlre-Q8VVey22UAu-H9hX%7EoyLoksIgbWX02%7E5ncmeujYG0KRMVwwB9DCkOY6FxtISGAw2A7qv1FoOdJ6nMxi8ijXDlmRigY9Cr-iuqYOUCBv4oinK-d-LEljUTbWEua1t8BvvlE02yt1TQGd8xz6E-qzWQN%7Es8%7EjNZRGMybpk5FaIl8%7El%7EMmr2Iy%7Erh62180ffBHG5YUgPnpmDKiKA2P-g__&Key-Pair-Id=K24J24Z295AEI9"
- SIDEBAR_INFO = f"""
- <div align="center">
- <img src="{LOGO}" style="width: 100%; height: auto;"/>
- </div>
- """
-
- """
- def save_to_pdf(text, summary):
-     pdf = FPDF()
-     pdf.add_page()
-     pdf.set_font("Arial", size=12)
-
-     if text:
-         pdf.multi_cell(0, 10, "Transkribert Tekst:\n" + text)
-
-     pdf.ln(10)  # Paragraph metric
-
-     if summary:
-         pdf.multi_cell(0, 10, "Summary:\n" + summary)
-
-     pdf_output_path = "transcription_.pdf"
-     pdf.output(pdf_output_path)
-     return pdf_output_path
- """
-

  css = """
  #transcription_output textarea {
@@ -153,56 +166,36 @@ iface = gr.Blocks(css=css)

  with iface:

-     gr.HTML(SIDEBAR_INFO)
      gr.Markdown(HEADER_INFO)

      with gr.Row():
-         gr.Markdown('''
- ##### 🔊 Last opp lydfila [max.lengde: 40min]
- ##### ☕️ Trykk på "Transkriber" knappen og vent på svar
- ##### ⚡️ Går rimelig bra kjapt med Norwegian NB-Whisper Large..
- ##### 😅 Planlegger tilleggs-funksjoner senere
-
-         ''')
-         #microphone = gr.Audio(label="Microphone", sources="microphone", type="filepath")
          upload = gr.Audio(label="Upload audio", sources="upload", type="filepath")
          transcribe_btn = gr.Button("Transkriber")

-     with gr.Row():
          with gr.Column(scale=3):
-             text_output = gr.Textbox(label="Transkribert Tekst", elem_id="transcription_output")
          with gr.Column(scale=1):
              system_info = gr.Textbox(label="Antall sekunder, ord, system data:", elem_id="system_info_box")
-
-     """
-     with gr.Tabs():
-         with gr.TabItem("Download PDF"):
-             pdf_text_only = gr.Button("Last ned pdf med resultat")
-             pdf_output = gr.File(label="/.pdf")

-             pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
-     """
-
      with gr.Row():
          gr.Markdown('''
- <div align="center">
- <a href="https://opensource.com/resources/what-open-source">
- <img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!">
  </a>
- <span style="display:inline-block; width: 20px;"></span>
- <a href="https://opensource.org/licenses/Apache-2.0">
- <img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0">
  </a>
  </div>
          ''')

      transcribe_btn.click(
-         fn=transcribe,
-         inputs=[upload], # microphone
          outputs=[text_output, system_info]
-     )
-
-     #transcribe_btn.click(fn=transcribe, inputs=[microphone, upload], outputs=[text_output, system_info])
-
-

- iface.launch(share=True,debug=True)

  ### -----------------------------------------------------------------------
+ ### Transkriber version_1.00
+ ### app.py
  ### -----------------------------------------------------------------------

  # -------------------------------------------------------------------------
  # limitations under the License.
  # -------------------------------------------------------------------------

+
  import os
  import re
  import uuid
  import time
  import psutil
  import subprocess
  from tqdm import tqdm
  import tempfile
  from fpdf import FPDF
  from pathlib import Path
  import numpy as np
  import torch
+ from transformers import pipeline
  from gpuinfo import GPUInfo
+ from pydub import AudioSegment
+ from IPython.display import Audio
  import gradio as gr
+ import huggingface_hub


  ###############################################################################
+ # Configuration | @version 1.05?
+ # You are an intelligent assistant specializing in interviews with business clients
+ # for in-depth content creation, etc..()
  ###############################################################################

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ ###############################################################################
+ # Function to detect leading silence
+ ###############################################################################

+ def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
+     trim_ms = 0
+     assert chunk_size > 0
+     while sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels and trim_ms < len(sound):
+         trim_ms += chunk_size
+     return trim_ms

+ ###############################################################################
+ # Trim the start of the audio file
+ ###############################################################################

+ def trim_start(filepath):
+     path = Path(filepath)
+     directory = path.parent
+     filename = path.name
+     audio = AudioSegment.from_file(filepath, format="wav")
+     start_trim = milliseconds_until_sound(audio)
+     trimmed = audio[start_trim:]
+     new_filename = directory / f"trimmed_{filename}"
+     trimmed.export(new_filename, format="wav")
+     return trimmed, new_filename

+ ###############################################################################
+ # -- segment the audio into smaller parts (1-minute segments for large files)
+ ###############################################################################
+
+ def segment_audio(trimmed_audio, output_dir_trimmed):
+     one_minute = 1 * 60 * 1000  # 1 minute in milliseconds
+     start_time = 0
+     i = 0
+
+     # -- iterate through the trimmed audio, segment it
+     segmented_files = []
+     while start_time < len(trimmed_audio):
+         segment = trimmed_audio[start_time:start_time + one_minute]
+
+         # -- filename for each segment
+         file_name = f"trimmed_{i:02d}.wav"
+
+         # -- export each segment as a local WAV file
+         file_path = file_name
+         segment.export(file_path, format="wav")
+
+         segmented_files.append(file_path)
+         start_time += one_minute
+         i += 1
+
+     return segmented_files
+
+
+ ###############################################################################
+ # Transcription logic
+ ###############################################################################

+ def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
+     file = file_upload
      start_time = time.time()

+     # -- trim the audio, segment it for processing
+     trimmed_audio, trimmed_filename = trim_start(file)
+     segmented_files = segment_audio(trimmed_audio, "trimmed_audio")
+
+     pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, device=device)
+
+     transcriptions = [pipe(seg_file)["text"] for seg_file in segmented_files]
+     text = ''.join(transcriptions)

      end_time = time.time()
      output_time = end_time - start_time
+
+     # -- word count
      word_count = len(text.split())

+     # -- CPU metric
      cpu_usage = psutil.cpu_percent(interval=1)

      # --system info string
      system_info = f"""
  Processing time: {output_time:.2f} seconds.
  Number of words: {word_count}
  CPU Usage: {cpu_usage}%
  """

+
+     return text, system_info
+

  ###############################################################################
+ # Interface
  ###############################################################################

  HEADER_INFO = """
+ # SWITCHVOX ✨|🇳🇴 *Transkribering av lydfiler til Norsk skrift.*
  """.strip()

  css = """
  #transcription_output textarea {

  with iface:

      gr.Markdown(HEADER_INFO)

      with gr.Row():
          upload = gr.Audio(label="Upload audio", sources="upload", type="filepath")
          transcribe_btn = gr.Button("Transkriber")

+     with gr.Row():
          with gr.Column(scale=3):
+             text_output = gr.Textbox(label="Transkribert Tekst", placeholder="t r a n s c r i p t i o", elem_id="transcription_output")
          with gr.Column(scale=1):
              system_info = gr.Textbox(label="Antall sekunder, ord, system data:", elem_id="system_info_box")

      with gr.Row():
          gr.Markdown('''
+ <div style="text-align:center;">
+ <a href="https://opensource.com/resources/what-open-source" style="display: inline-block;">
+ <img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!" style="vertical-align: middle;">
  </a>
+ <span style="display:inline-block; width: 20px;"></span> <!-- This adds space between the logos -->
+ <a href="https://opensource.org/licenses/Apache-2.0" style="display: inline-block;">
+ <img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0" style="vertical-align: middle;">
  </a>
  </div>
          ''')
+
      transcribe_btn.click(
+         fn=transcribe,
+         inputs=[upload],
          outputs=[text_output, system_info]
+     )

+ iface.launch(debug=True)
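
Below is a rough sketch of the flow this commit introduces (leading-silence trim, 1-minute pydub segments, then a per-segment pass through NB-Whisper), shown outside the Gradio app. The input name sample.wav, the helper name leading_silence_ms, and joining segment texts with a space are illustrative assumptions, not part of the committed app.py; the commit itself joins with an empty string and runs everything from the "Transkriber" button.

```python
import torch
from pydub import AudioSegment
from transformers import pipeline

def leading_silence_ms(sound, threshold_dbfs=-20.0, chunk_ms=10):
    # Same idea as milliseconds_until_sound(): step forward in 10 ms chunks
    # until the loudness of the current chunk rises above the threshold.
    trim = 0
    while trim < len(sound) and sound[trim:trim + chunk_ms].dBFS < threshold_dbfs:
        trim += chunk_ms
    return trim

audio = AudioSegment.from_file("sample.wav", format="wav")   # hypothetical input file
audio = audio[leading_silence_ms(audio):]                    # drop leading silence

one_minute = 60 * 1000  # pydub slices in milliseconds
segments = [audio[i:i + one_minute] for i in range(0, len(audio), one_minute)]

asr = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    device=0 if torch.cuda.is_available() else -1,
)

texts = []
for n, segment in enumerate(segments):
    path = f"trimmed_{n:02d}.wav"
    segment.export(path, format="wav")      # the pipeline reads the file from disk
    texts.append(asr(path)["text"])

# Joining with a space avoids gluing the last word of one segment to the first
# word of the next; the committed code currently uses ''.join(transcriptions).
print(" ".join(texts))
```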