import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer

MAX_SEGMENTS = 10  # maximum number of segment rows rendered in the UI
OUTPUT_DIR = "tmp"  # scratch directory for chunked audio and offset files


def segmentation_interface(choice: str):
    # Toggle visibility of the segmenter-specific controls based on the selected strategy.
    if choice == "Silence Gap":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "Word Overlap":
        return gr.update(visible=False), gr.update(visible=True)


def run(audio_path, model, segmentation_type, silence_duration, ground_truth):
    # Transcribe the uploaded audio, resampled to the transcriber's expected rate.
    transcriber = Wav2Vec2Transcriber(model)
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column(
        "audio", Audio(sampling_rate=transcriber.sampling_rate)
    )
    output_offsets = transcriber.predict(dataset, output_offsets=True)

    if segmentation_type == "Silence Gap":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "Word Overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()

    # Start from a clean output directory on every run.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)

    # Split the audio into chunks based on the predicted word offsets.
    segmenter.chunk_audio_segments(
        audio_path,
        OUTPUT_DIR,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )

    # Collect each segment's transcript (.tsv) and audio (.wav) as visible components.
    outputs, idx = [], 0
    for path in sorted(Path(OUTPUT_DIR).rglob("*")):
        if path.suffix == ".tsv":
            gt = pd.read_csv(
                path, sep="\t", names=["start_offset", "end_offset", "text"]
            )
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))
            idx += 1

    # Hide the remaining unused component slots.
    for _ in range(MAX_SEGMENTS - idx):
        outputs += [gr.Dataframe.update(visible=False), gr.Audio.update(visible=False)]

    return outputs


with gr.Blocks() as demo:
    gr.Markdown(
        f"""