Spaces:
Running
Running
File size: 6,919 Bytes
abc89d1 3320156 abc89d1 351252d abc89d1 3320156 d4b107b cf8326e b8712f3 85002a1 b8712f3 85002a1 b8712f3 c5571fa dbca570 b8712f3 3320156 dbca570 3320156 b8712f3 3320156 cf8326e 04f2c63 b8712f3 3320156 b8712f3 cf8326e 3320156 dbca570 3320156 b8712f3 3320156 73a1be0 3320156 2b6186f 3320156 9da571f 3320156 9da571f f8b8cd5 3320156 9da571f 3320156 9da571f 3320156 9bfe584 9da571f 9bfe584 9da571f 3320156 9da571f b8712f3 3320156 b8712f3 32e6e2c b8712f3 2fb8a5f 102fb89 1667a9d fa68d0f b8712f3 102fb89 1667a9d fa68d0f b8712f3 102fb89 dee4184 102fb89 abc89d1 6a67784 abc89d1 85002a1 5ca37ae abc89d1 8fb6f57 aebda00 abc89d1 3320156 1813060 3320156 b8712f3 9bfe584 73a1be0 9d34978 3320156 9d34978 3320156 9d34978 3320156 b8712f3 3320156 b8712f3 3320156 73a1be0 3320156 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
### -----------------------------------------------------------------------
### Transkriber version_1.00
### app.py
### -----------------------------------------------------------------------
# -------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------
import os
import re
import uuid
import time
import psutil
import subprocess
from tqdm import tqdm
import tempfile
from fpdf import FPDF
from pathlib import Path
import numpy as np
import torch
from transformers import pipeline
from gpuinfo import GPUInfo
from pydub import AudioSegment
from IPython.display import Audio
import gradio as gr
import huggingface_hub
###############################################################################
# Configuration | @version 1.05?
# You are an intelligent assistant specializing in interviews with business clients
# for in-depth content creation, etc.
###############################################################################
# Inference device: the first CUDA GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
###############################################################################
# Function to detect leading silence
###############################################################################
def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return the offset at which ``sound`` stops being silent at its start.

    The audio is scanned from offset 0 in consecutive windows of
    ``chunk_size`` units; scanning stops at the first window whose ``dBFS``
    level is not below the threshold, or once the end of the audio is reached.

    Args:
        sound: Sliceable audio object supporting ``len()`` whose slices expose
            a ``dBFS`` attribute (this file passes a pydub ``AudioSegment``,
            which is indexed in milliseconds).
        silence_threshold_in_decibels: Windows with a level below this value
            are treated as silence.
        chunk_size: Window length; must be a positive number.

    Returns:
        The offset of the first non-silent window, always a multiple of
        ``chunk_size``. If the whole audio is silent this is the first
        multiple of ``chunk_size`` that is >= ``len(sound)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # An explicit raise instead of ``assert``: asserts are stripped under
    # ``python -O`` and a zero chunk size would then loop forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number")

    total_length = len(sound)
    trim_ms = 0
    # Check the bound first so no window is sliced past the end of the audio.
    while (
        trim_ms < total_length
        and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels
    ):
        trim_ms += chunk_size
    return trim_ms
###############################################################################
# Trim the start of the audio file
###############################################################################
def trim_start(filepath):
    """Remove leading silence from an audio file and save the result as WAV.

    Args:
        filepath: Path (string or path-like) of the audio file to read.

    Returns:
        A ``(trimmed, new_filename)`` tuple: the trimmed ``AudioSegment`` and
        the ``Path`` of the WAV file written next to the input file, named
        ``trimmed_<original stem>.wav``.
    """
    path = Path(filepath)

    # Do not force ``format="wav"`` here: uploads are not guaranteed to be WAV,
    # and without an explicit format pydub lets the decoder detect the
    # container from the file itself.
    audio = AudioSegment.from_file(filepath)

    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]

    # The export below is always WAV, so the file name gets a matching ".wav"
    # suffix instead of inheriting the (possibly different) input extension.
    new_filename = path.with_name(f"trimmed_{path.stem}.wav")
    trimmed.export(new_filename, format="wav")
    return trimmed, new_filename
###############################################################################
# -- segment the audio into smaller parts (1-minute segments for large files)
###############################################################################
def segment_audio(trimmed_audio, output_dir_trimmed, segment_length_ms=60_000):
    """Split audio into consecutive fixed-length WAV segments on disk.

    Args:
        trimmed_audio: Sliceable audio object supporting ``len()`` whose
            slices provide ``export(path, format=...)`` (this file passes a
            pydub ``AudioSegment``).
        output_dir_trimmed: Directory the segment files are written to; it is
            created if it does not exist yet.
        segment_length_ms: Length of each segment (default: one minute in
            milliseconds). The final segment may be shorter.

    Returns:
        List of the written file paths as strings, in playback order, named
        ``trimmed_00.wav``, ``trimmed_01.wav``, ... Empty for empty audio.

    Raises:
        ValueError: If ``segment_length_ms`` is not positive.
    """
    if segment_length_ms <= 0:
        raise ValueError("segment_length_ms must be positive")

    # Write into the requested directory rather than the current working
    # directory, so callers control where the segments end up.
    out_dir = Path(output_dir_trimmed)
    out_dir.mkdir(parents=True, exist_ok=True)

    segmented_files = []
    segment_starts = range(0, len(trimmed_audio), segment_length_ms)
    for index, start in enumerate(segment_starts):
        segment = trimmed_audio[start:start + segment_length_ms]
        # Paths are returned as plain strings, as before.
        file_path = str(out_dir / f"trimmed_{index:02d}.wav")
        segment.export(file_path, format="wav")
        segmented_files.append(file_path)
    return segmented_files
###############################################################################
# Transcription logic
###############################################################################
_ASR_MODEL_ID = "NbAiLab/nb-whisper-large"
_asr_pipeline = None


def _get_asr_pipeline():
    """Build the speech-recognition pipeline on first use and reuse it afterwards."""
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=_ASR_MODEL_ID,
            chunk_length_s=30,
            device=device,
        )
    return _asr_pipeline


def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
    """Transcribe an uploaded audio file and report basic run statistics.

    The file has its leading silence trimmed, is split into segments, and
    each segment is passed through the speech-recognition pipeline. The
    per-segment texts are joined into one transcript.

    Args:
        file_upload: Path of the uploaded audio file.
        progress: Gradio progress tracker. It is not called directly; it is
            declared with ``track_tqdm=True`` so the ``tqdm`` loop below can
            be surfaced in the UI.

    Returns:
        A ``(text, system_info)`` tuple: the transcript and a multi-line
        string with processing time, word count and CPU usage.

    Raises:
        gr.Error: If no file was provided.
    """
    if not file_upload:
        # Surface a readable message in the UI instead of a TypeError from Path(None).
        raise gr.Error("Please upload an audio file before transcribing.")

    start_time = time.time()
    trimmed_filename = None
    segmented_files = []
    try:
        # Segments are requested in a per-call scratch directory that is
        # removed automatically when the block exits.
        with tempfile.TemporaryDirectory(prefix="trimmed_audio_") as segment_dir:
            # -- trim audio, segment it for processing
            trimmed_audio, trimmed_filename = trim_start(file_upload)
            segmented_files = segment_audio(trimmed_audio, segment_dir)

            # Reuse one pipeline across calls instead of rebuilding it per click.
            pipe = _get_asr_pipeline()
            transcriptions = [
                pipe(seg_file)["text"].strip()
                for seg_file in tqdm(segmented_files, desc="Transcribing")
            ]
    finally:
        # Best-effort removal of the intermediate files written for this call;
        # a file that is already gone (or cannot be removed) must not mask
        # the transcription result or the original error.
        leftovers = list(segmented_files)
        if trimmed_filename is not None:
            leftovers.append(trimmed_filename)
        for leftover in leftovers:
            try:
                os.remove(leftover)
            except OSError:
                pass

    # Join with a space so words at segment boundaries are never fused.
    text = " ".join(part for part in transcriptions if part)
    end_time = time.time()
    output_time = end_time - start_time

    # --Word count
    word_count = len(text.split())
    # --CPU metric (sampled over one second, after transcription has finished)
    cpu_usage = psutil.cpu_percent(interval=1)
    # --system info string
    system_info = (
        "\n"
        f"Processing time: {output_time:.2f} seconds.\n"
        f"Number of words: {word_count}\n"
        f"CPU Usage: {cpu_usage}%\n"
    )
    return text, system_info
###############################################################################
# Interface
###############################################################################
# Markdown rendered at the top of the page; .strip() removes the newlines that
# wrap the literal so the text starts directly with the heading marker.
HEADER_INFO = """
# This space uses the *Norwegian NB-Whisper Large* model by **NbAiLab** to transcribe long-form microphone or audio inputs in Norwegian of arbitrary length.
""".strip()
# Custom CSS handed to gr.Blocks. The selectors target the textareas of the two
# textboxes created with elem_id "transcription_output" and "system_info_box".
css = """
#transcription_output textarea {
background-color: #000000; /* black */
color: #00FF00 !important; /* text color */
font-size: 18px; /* font size */
}
#system_info_box textarea {
background-color: #ffe0b3; /* orange */
color: black !important; /* text color */
font-size: 16px; /* font size */
font-weight: bold; /* bold font */
}
"""
# Footer badges (open source + licence), written as raw HTML for gr.Markdown.
_FOOTER_BADGES = '''
<div style="text-align:center;">
<a href="https://opensource.com/resources/what-open-source" style="display: inline-block;">
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!" style="vertical-align: middle;">
</a>
<span style="display:inline-block; width: 20px;"></span> <!-- This adds space between the logos -->
<a href="https://opensource.org/licenses/Apache-2.0" style="display: inline-block;">
<img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0" style="vertical-align: middle;">
</a>
</div>
'''

# Page layout: header, upload row, result row (transcript + statistics), footer.
# NOTE(review): the nesting below is reconstructed from statement order --
# confirm the button is meant to sit in the same row as the upload widget.
with gr.Blocks(css=css) as iface:
    gr.Markdown(HEADER_INFO)

    with gr.Row():
        upload = gr.Audio(label="Upload audio", sources="upload", type="filepath")
        transcribe_btn = gr.Button("Transkriber")

    with gr.Row():
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transkribert Tekst",
                placeholder="t r a n s c r i p t i o",
                elem_id="transcription_output",
            )
        with gr.Column(scale=1):
            system_info = gr.Textbox(
                label="Antall sekunder, ord, system data:",
                elem_id="system_info_box",
            )

    with gr.Row():
        gr.Markdown(_FOOTER_BADGES)

    # Clicking the button runs transcribe() on the uploaded file and fills
    # both textboxes with its two return values.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[upload],
        outputs=[text_output, system_info],
    )

iface.launch(debug=True)