File size: 6,919 Bytes
abc89d1
3320156
 
abc89d1
 
 
351252d
 
 
 
 
 
 
 
 
 
 
abc89d1
 
3320156
d4b107b
cf8326e
b8712f3
85002a1
b8712f3
85002a1
b8712f3
c5571fa
dbca570
b8712f3
 
 
3320156
dbca570
3320156
 
b8712f3
3320156
cf8326e
04f2c63
b8712f3
3320156
 
 
b8712f3
cf8326e
3320156
 
 
 
 
dbca570
3320156
 
 
 
 
 
b8712f3
3320156
 
 
73a1be0
3320156
 
 
 
 
 
 
 
 
 
2b6186f
3320156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da571f
3320156
 
9da571f
f8b8cd5
3320156
 
 
 
 
 
 
 
 
9da571f
 
 
3320156
 
9da571f
 
3320156
9bfe584
9da571f
 
 
9bfe584
 
 
9da571f
 
3320156
 
 
9da571f
b8712f3
3320156
b8712f3
 
 
32e6e2c
b8712f3
2fb8a5f
102fb89
 
1667a9d
fa68d0f
b8712f3
102fb89
 
 
1667a9d
fa68d0f
b8712f3
 
102fb89
 
dee4184
102fb89
abc89d1
6a67784
abc89d1
85002a1
5ca37ae
abc89d1
8fb6f57
aebda00
abc89d1
3320156
1813060
3320156
b8712f3
9bfe584
73a1be0
9d34978
 
3320156
 
 
9d34978
3320156
 
 
9d34978
 
 
3320156
 
b8712f3
3320156
 
b8712f3
3320156
73a1be0
3320156
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
### -----------------------------------------------------------------------
### Transkriber version_1.00
### app.py
### -----------------------------------------------------------------------

# -------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------


import os
import re
import uuid
import time
import psutil
import subprocess
from tqdm import tqdm
import tempfile
from fpdf import FPDF
from pathlib import Path
import numpy as np
import torch
from transformers import pipeline
from gpuinfo import GPUInfo
from pydub import AudioSegment
from IPython.display import Audio
import gradio as gr
import huggingface_hub


###############################################################################
# Configuration | @version 1.05?
# You are an intelligent assistant specializing in interviews with business clients
# for in-depth content creation, etc.
###############################################################################

# Inference device: the first CUDA GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

###############################################################################
# Function to detect leading silence
###############################################################################

def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return the length of the leading silence of ``sound`` in milliseconds.

    The audio is scanned from its start in windows of ``chunk_size``
    milliseconds. Scanning stops at the first window whose ``dBFS`` level is
    not below ``silence_threshold_in_decibels``, or once the end is reached.

    Args:
        sound: Sliceable audio object whose slices expose a ``dBFS``
            attribute (used in this file with a pydub ``AudioSegment``, where
            slice indices and ``len()`` are expressed in milliseconds).
        silence_threshold_in_decibels: Windows quieter than this level are
            treated as silence.
        chunk_size: Width of the scanning window in milliseconds; must be
            positive.

    Returns:
        The offset, a multiple of ``chunk_size``, of the first non-silent
        window. For fully silent audio the value is rounded up to the next
        multiple of ``chunk_size`` and may therefore exceed ``len(sound)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O`` and a non-positive step would never terminate the loop.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    total_ms = len(sound)
    trim_ms = 0
    # Test the bound first so no window is sliced past the end of the audio.
    while (
        trim_ms < total_ms
        and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels
    ):
        trim_ms += chunk_size
    return trim_ms

###############################################################################
# Trim the start of the audio file
###############################################################################

def trim_start(filepath):
    """Remove leading silence from an audio file and save the trimmed copy.

    The file is decoded as WAV, the leading silence measured by
    ``milliseconds_until_sound`` is cut off, and the remainder is written as
    WAV to ``trimmed_<original name>`` in the same directory as the input.

    Args:
        filepath: Path of the audio file to trim. It is read with
            ``format="wav"``.

    Returns:
        A ``(trimmed, new_filename)`` tuple: the trimmed audio segment and
        the ``Path`` of the file it was exported to.
    """
    source_path = Path(filepath)
    audio = AudioSegment.from_file(filepath, format="wav")

    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]

    new_filename = source_path.parent / f"trimmed_{source_path.name}"
    # export() returns the open output file object; close it right away so
    # the handle is not left dangling until garbage collection.
    trimmed.export(new_filename, format="wav").close()
    return trimmed, new_filename

###############################################################################
# -- segment the audio into smaller parts (1-minute segments for large files)
###############################################################################

def segment_audio(trimmed_audio, output_dir_trimmed, segment_length_ms=60_000):
    """Split audio into consecutive fixed-length WAV files.

    Segments are named ``trimmed_00.wav``, ``trimmed_01.wav``, ... and are
    written into ``output_dir_trimmed``, which is created when missing. The
    last segment holds whatever audio remains and may be shorter.

    Args:
        trimmed_audio: Sliceable audio object with an ``export`` method (used
            in this file with a pydub ``AudioSegment``, indexed in
            milliseconds).
        output_dir_trimmed: Directory that receives the segment files. A
            falsy value (``None`` or ``""``) selects the current directory.
        segment_length_ms: Length of each segment in milliseconds. Defaults
            to one minute.

    Returns:
        List of the written segment paths as strings, in playback order.
        Empty when ``trimmed_audio`` has zero length.

    Raises:
        ValueError: If ``segment_length_ms`` is not positive.
    """
    if segment_length_ms <= 0:
        raise ValueError(
            f"segment_length_ms must be positive, got {segment_length_ms!r}"
        )

    output_dir = Path(output_dir_trimmed) if output_dir_trimmed else Path(".")
    output_dir.mkdir(parents=True, exist_ok=True)

    segmented_files = []
    segment_starts = range(0, len(trimmed_audio), segment_length_ms)
    for index, start_ms in enumerate(segment_starts):
        segment = trimmed_audio[start_ms:start_ms + segment_length_ms]

        # Paths are kept as plain strings because they are passed on to
        # consumers that expect a filename string.
        file_path = str(output_dir / f"trimmed_{index:02d}.wav")

        # export() returns the open output file object; close it so one
        # handle per segment is not leaked.
        segment.export(file_path, format="wav").close()
        segmented_files.append(file_path)

    return segmented_files


###############################################################################
# Transcription logic
###############################################################################

# Holds the loaded speech-recognition pipeline so the model is built once per
# process instead of on every transcription request.
_ASR_PIPELINE_CACHE = {}


def _get_asr_pipeline():
    """Return the shared NB-Whisper pipeline, building it on first use."""
    if "asr" not in _ASR_PIPELINE_CACHE:
        _ASR_PIPELINE_CACHE["asr"] = pipeline(
            "automatic-speech-recognition",
            model="NbAiLab/nb-whisper-large",
            chunk_length_s=30,
            device=device,
        )
    return _ASR_PIPELINE_CACHE["asr"]


def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
    """Transcribe an uploaded audio file and report simple run statistics.

    Leading silence is trimmed, the audio is split into segments, each
    segment is run through the NB-Whisper pipeline, and the segment texts are
    concatenated in order.

    Args:
        file_upload: Path of the uploaded audio file, or ``None`` when
            nothing was uploaded.
        progress: Gradio progress tracker created with ``track_tqdm=True``.
            It is not referenced in the body; it is kept in the signature as
            the hook Gradio uses for progress reporting.

    Returns:
        A ``(text, system_info)`` tuple: the full transcription and a
        multi-line summary with processing time, word count and CPU usage.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file_upload is None:
        # Surface a readable message in the UI instead of an opaque failure
        # deeper in the audio-loading code.
        raise gr.Error("No audio file was uploaded. Please upload a file before transcribing.")

    start_time = time.time()

    # -- trim audio, segment it for processing
    trimmed_audio, _trimmed_filename = trim_start(file_upload)
    segmented_files = segment_audio(trimmed_audio, "trimmed_audio")

    pipe = _get_asr_pipeline()

    transcriptions = [pipe(seg_file)["text"] for seg_file in segmented_files]
    text = ''.join(transcriptions)

    end_time = time.time()
    output_time = end_time - start_time

    # --Word count
    word_count = len(text.split())

    # --CPU metric (sampled over one second after transcription has finished)
    cpu_usage = psutil.cpu_percent(interval=1)

    # --system info string
    system_info = f"""
    Processing time: {output_time:.2f} seconds.
    Number of words: {word_count}
    CPU Usage: {cpu_usage}%
    """

    return text, system_info


###############################################################################
# Interface
###############################################################################

# Markdown heading rendered at the top of the page.
HEADER_INFO = (
    "# This space uses the *Norwegian NB-Whisper Large* model by **NbAiLab** "
    "to transcribe long-form microphone or audio inputs in Norwegian of arbitrary length."
)

# Stylesheet for the two output text boxes, matched through their elem_id values.
css = (
    "\n"
    "#transcription_output textarea {\n"
    "    background-color: #000000;  /* black */\n"
    "    color: #00FF00 !important;  /* text color */\n"
    "    font-size: 18px;  /* font size */\n"
    "}\n"
    "\n"
    "#system_info_box textarea {\n"
    "    background-color: #ffe0b3;  /* orange */\n"
    "    color: black !important;  /* text color */\n"
    "    font-size: 16px;  /* font size */\n"
    "    font-weight: bold;  /* bold font */\n"
    "}\n"
)

# Assemble the page: header, upload row, result row, badge row, and the
# button wiring that feeds the upload into transcribe().
with gr.Blocks(css=css) as iface:

    gr.Markdown(HEADER_INFO)

    # -- input row: audio upload next to the trigger button
    with gr.Row():
        upload = gr.Audio(label="Upload audio", sources="upload", type="filepath")
        transcribe_btn = gr.Button("Transkriber")

    # -- output row: wide transcription box, narrow statistics box
    with gr.Row():
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transkribert Tekst",
                placeholder="t r a n s c r i p t i o",
                elem_id="transcription_output",
            )
        with gr.Column(scale=1):
            system_info = gr.Textbox(
                label="Antall sekunder, ord, system data:",
                elem_id="system_info_box",
            )

    # -- footer row: open-source and license badges
    with gr.Row():
        gr.Markdown('''
        <div style="text-align:center;">
        <a href="https://opensource.com/resources/what-open-source" style="display: inline-block;">
            <img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!" style="vertical-align: middle;">
        </a>
        <span style="display:inline-block; width: 20px;"></span> <!-- This adds space between the logos -->
        <a href="https://opensource.org/licenses/Apache-2.0" style="display: inline-block;">
            <img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0" style="vertical-align: middle;">
        </a>
        </div>
        ''')

    # -- clicking the button sends the uploaded file path to transcribe() and
    # -- routes its two return values to the two text boxes
    transcribe_btn.click(fn=transcribe, inputs=[upload], outputs=[text_output, system_info])

# Start the Gradio server as soon as the module runs.
iface.launch(debug=True)