import os
import sys

# Make the parent folder importable so the local transcribe package is found.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import logging
import platform
import shutil
import subprocess

import gradio as gr
import torch
import torchaudio
from docx import Document
from dotenv import load_dotenv
from transformers import __version__ as transformers_version

from transcribe.transcribe import transcriber, languages

load_dotenv(override=True)

logging.basicConfig(level=logging.INFO)

HF_AUTH_TOKEN = os.getenv("HF_AUTH_TOKEN")

# Gather hardware and library information shown in the "Device info" tab.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
cuda_version = torch.version.cuda if torch.cuda.is_available() else "N/A"
cudnn_version = torch.backends.cudnn.version() if torch.cuda.is_available() else "N/A"
os_info = f"{platform.system()} {platform.release()} {platform.machine()}"

vram_info = []
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        gpu_properties = torch.cuda.get_device_properties(i)
        vram_info.append(f"**GPU {i}: {gpu_properties.total_memory / 1024**3:.2f} GB**")

pytorch_version = torch.__version__
torchaudio_version = torchaudio.__version__

device_info = f"""Running on: **{device}**

Number of GPUs available: **{num_gpus}**

CUDA version: **{cuda_version}**

CuDNN version: **{cudnn_version}**

PyTorch version: **{pytorch_version}**

Torchaudio version: **{torchaudio_version}**

Transformers version: **{transformers_version}**

Operating system: **{os_info}**

Available VRAM:
\t {', '.join(vram_info) if vram_info else '**N/A**'}
"""

css = """ |
|
#audio_input { |
|
padding-bottom: 50px; |
|
} |
|
""" |
|
|
|
def format_srt_time(timestamp):
    """Formats a timestamp in seconds into the SRT time format HH:MM:SS,mmm."""
    hours, remainder = divmod(timestamp, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}"

def generate_srt_content(chunks):
    """Generates the content for an SRT file based on transcription chunks."""
    srt_content = ""
    for i, chunk in enumerate(chunks, start=1):
        try:
            start, end = chunk["timestamp"]
            start_time = format_srt_time(start)
            end_time = format_srt_time(end)
            text = chunk["text"]
            srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
        except (KeyError, TypeError, ValueError) as exc:
            # Skip chunks without a usable timestamp or text rather than
            # aborting the whole SRT file.
            logging.warning("Couldn't add phrase %d: %s", i, exc)
            continue
    return srt_content.strip()
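
# Illustrative input, matching the chunk keys accessed above:
#   chunks = [{"timestamp": (0.0, 2.5), "text": "Hello world"}]
#   generate_srt_content(chunks)
#   -> "1\n00:00:00,000 --> 00:00:02,500\nHello world"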

def create_black_screen_video(audio_file_path, output_video_path):
    """
    Creates a video with an empty black screen and the original audio from the input audio file.

    Parameters:
    - audio_file_path: Path to the input audio file.
    - output_video_path: Path where the output video will be saved.
    """
    output_dir = os.path.dirname(output_video_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    command = [
        'ffmpeg',
        '-y',                                  # overwrite the output file if it exists
        '-f', 'lavfi',                         # read from ffmpeg's lavfi virtual input
        '-i', 'color=c=black:s=320x240:r=10',  # synthesise a black 320x240 video at 10 fps
        '-i', audio_file_path,
        '-c:v', 'libx264',
        '-tune', 'stillimage',
        '-c:a', 'aac',
        '-b:a', '192k',
        '-shortest',                           # stop when the shortest stream (the audio) ends
        output_video_path
    ]

    subprocess.run(command, check=True)
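
# Illustrative call (file names are hypothetical):
#   create_black_screen_video("./tmp/interview.mp3", "./tmp/interview.mp4")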

def process_folder(files_source, model, language, translate, diarize, diarization_token):
    output_folder_path = "./tmp"
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    for file_path in files_source:
        if file_path.endswith(('.mp3', '.mp4', '.wav')):
            file_name = os.path.basename(file_path)

            # Keep a copy of the source file in the working folder.
            output_audio_filepath = os.path.join(output_folder_path, file_name)
            shutil.copy2(file_path, output_audio_filepath)

            transcription_result = transcriber(file_path, model, language=language, translate=translate, diarize=diarize, input_diarization_token=diarization_token)
            # For now the batch result is only printed to stdout.
            print(transcription_result)
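
# Illustrative direct call (file name is hypothetical); the "Process multiple
# files" tab wires the same arguments in this order:
#   process_folder(["./meeting.mp3"], "openai/whisper-large-v3",
#                  "Automatic detection", False, False, HF_AUTH_TOKEN)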

def inference(audio_filepath, model, language, translate, diarize, input_diarization_token):
    tr = transcriber(audio_filepath, model, language, translate, diarize, input_diarization_token)
    return {textbox: gr.update(value=tr)}

with gr.Blocks(title="Automatic speech recognition (beta)", css=css, analytics_enabled=False) as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Automatic speech recognition

            [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

            ![Python 3.10](https://raw.githubusercontent.com/tools4eu/automatic-speech-recognition/main/badges/python3_10.svg)

            Report issues [here](https://github.com/tools4eu/automatic-speech-recognition/issues)
            """
        )

    with gr.Tab("Upload/record sound"):
        with gr.Column():
            dropdown_model = gr.Dropdown(
                label='Model',
                choices=["openai/whisper-large-v3", "openai/whisper-medium", "openai/whisper-small", "openai/whisper-tiny"],
                value="openai/whisper-large-v3",
                info="""
                Larger models will increase the quality of the transcription, but reduce performance.
                """)
            with gr.Row():
                with gr.Column():
                    upl_input = gr.Audio(type='filepath', elem_id="audio_input")
                    upl_language = gr.Dropdown(
                        label='Language',
                        choices=['Automatic detection'] + sorted(list(languages.keys())),
                        value='Automatic detection',
                        info="""
                        Setting the language to "Automatic detection" will auto-detect the language based on the first 30 seconds.
                        If the language is known upfront, always set it manually.
                        """)

                    with gr.Row():
                        upl_translate = gr.Checkbox(label='Translate to English')

                with gr.Column():
                    with gr.Group():
                        input_diarization_token = gr.Textbox(label='Paste your Hugging Face token here for speaker diarization (or add it as an environment variable)', value=HF_AUTH_TOKEN)
                        check_diarization = gr.Checkbox(label='Speaker diarization')
                        with gr.Accordion("For more details click here...", open=False):
                            gr.Markdown("""
                            An access token can be created [here](https://hf.co/settings/tokens)

                            If not done yet for your account, you need to [accept segmentation terms & conditions](https://huggingface.co/pyannote/segmentation-3.0)

                            If not done yet for your account, you need to [accept diarization terms & conditions](https://huggingface.co/pyannote/speaker-diarization-3.1)
                            """)

        with gr.Row():
            upl_btn = gr.Button("Transcribe")

        with gr.Row(variant='panel'):
            with gr.Column():
                textbox = gr.Textbox(label='Transcription', visible=True)

    with gr.Tab("Process multiple files"):
        files_source = gr.Files(label="Select Audio Files", file_count="multiple")
        with gr.Column():
            dropdown_model_multi = gr.Dropdown(
                label='Model',
                choices=["openai/whisper-large-v3", "openai/whisper-medium", "openai/whisper-small", "openai/whisper-tiny"],
                value="openai/whisper-large-v3",
                info="""
                Larger models will increase the quality of the transcription, but reduce performance.
                """)
            dropdown_lang_multi = gr.Dropdown(
                label='Language',
                choices=['Automatic detection'] + sorted(list(languages.keys())),
                value='Automatic detection',
                info="""
                Setting the language to "Automatic detection" will auto-detect the language based on the first 30 seconds.
                If the language is known upfront, always set it manually.
                """)
            checkbox_trans_multi = gr.Checkbox(label='Translate to English')
        with gr.Column():
            with gr.Group():
                input_diarization_token_multi = gr.Textbox(label='Paste your Hugging Face token here for speaker diarization (or add it as an environment variable)', value=HF_AUTH_TOKEN)
                check_diarization_multi = gr.Checkbox(label='Speaker diarization')
                with gr.Accordion("For more details click here...", open=False):
                    gr.Markdown("""
                    An access token can be created [here](https://hf.co/settings/tokens)

                    If not done yet for your account, you need to [accept segmentation terms & conditions](https://huggingface.co/pyannote/segmentation-3.0)

                    If not done yet for your account, you need to [accept diarization terms & conditions](https://huggingface.co/pyannote/speaker-diarization-3.1)
                    """)
        btn_transcribe_multi = gr.Button("Transcribe")
        textbox_transcribe_multi = gr.Chatbot(label='Transcription', visible=True)

    with gr.Tab("Device info"):
        gr.Markdown(device_info, label="Hardware info & installed packages")

    transcribe_event = upl_btn.click(
        fn=inference,
        inputs=[upl_input, dropdown_model, upl_language, upl_translate, check_diarization, input_diarization_token],
        outputs=[textbox],
        concurrency_limit=1,
    )

    # process_folder expects (files, model, language, translate, diarize, token),
    # so the translate checkbox must come before the diarization checkbox.
    transcribe_files_event = btn_transcribe_multi.click(
        fn=process_folder,
        inputs=[files_source, dropdown_model_multi, dropdown_lang_multi, checkbox_trans_multi, check_diarization_multi, input_diarization_token_multi],
        outputs=[],
        concurrency_limit=1,
    )

demo.queue().launch(server_name="0.0.0.0")