#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Gradio toolbox for simple audio editing tasks.

Features:
* inspect audio metadata;
* convert any format to wav 8000 Hz int16, down-mix multi-channel to mono;
* change speech speed and volume;
* pad (lengthen) audio;
* add reverberation;
* mix speech with noise at a given SNR.
"""
import argparse
import json
from pathlib import Path
import platform
import tempfile
from typing import Tuple
import uuid

import gradio as gr
import numpy as np
from scipy.io import wavfile

from project_settings import environment, project_path
from toolbox.audio_edit.info import get_audio_info, engine_to_function as info_engine_to_function
from toolbox.audio_edit.convert import audio_convert, engine_to_function as cvt_engine_to_function
from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_function as speed_engine_to_function
from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
from toolbox.audio_edit.augment import mix_speech_and_noise
from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function


def get_args():
    """Parse command-line arguments (the directory holding example audio files)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--examples_dir",
        default=(project_path / "data/examples").as_posix(),
        type=str,
    )
    args = parser.parse_args()
    return args


def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
    """
    Persist the ``(sample_rate, signal)`` pair coming from a ``gr.Audio`` widget
    to a uniquely named wav file in the system temp directory.

    :param sample_rate: sample rate in Hz.
    :param signal: audio samples as a numpy array.
    :return: posix path of the written wav file.
    """
    temp_audio_dir = Path(tempfile.gettempdir()) / "input_audio"
    temp_audio_dir.mkdir(parents=True, exist_ok=True)

    filename = temp_audio_dir / f"{uuid.uuid4()}.wav"
    filename = filename.as_posix()
    wavfile.write(
        filename,
        sample_rate,
        signal
    )
    return filename


def when_click_get_audio_info(audio_t: Tuple[int, np.ndarray], engine: str):
    """
    Return audio metadata as pretty-printed JSON.

    :param audio_t: ``(sample_rate, signal)`` from a ``gr.Audio`` widget.
    :param engine: backend name, a key of ``info_engine_to_function``.
    :return: ``(json_text_or_None, message)``.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)

    message = "success"
    try:
        info: dict = get_audio_info(filename, engine)
        result = json.dumps(info, ensure_ascii=False, indent=4)
    except Exception as e:
        result = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return result, message


def when_click_audio_convert(audio_t: Tuple[int, np.ndarray],
                             to_sample_rate: int = 8000,
                             sample_width: int = 2,
                             channels: str = "0",
                             engine: str = "librosa",
                             ) -> Tuple[str, str, str, str]:
    """
    Convert the audio to the requested sample rate / width / channel subset.

    :param audio_t: ``(sample_rate, signal)`` from a ``gr.Audio`` widget.
    :param to_sample_rate: target sample rate in Hz.
    :param sample_width: target sample width in bytes (2 -> int16).
    :param channels: comma-separated channel indices to keep, e.g. ``"0,1"``.
    :param engine: backend name, a key of ``cvt_engine_to_function``.
    :return: ``(output_file, origin_audio_info, output_audio_info, message)``;
             the first three are None on failure.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)

    message = "success"
    try:
        output_file: str = audio_convert(filename,
                                         to_sample_rate=to_sample_rate,
                                         sample_width=sample_width,
                                         channels=channels,
                                         engine=engine,
                                         )
        origin_audio_info: dict = get_audio_info(filename, engine="wave")
        origin_audio_info = json.dumps(origin_audio_info, ensure_ascii=False, indent=4)
        output_audio_info: dict = get_audio_info(output_file, engine="wave")
        output_audio_info = json.dumps(output_audio_info, ensure_ascii=False, indent=4)
    except Exception as e:
        output_file = None
        origin_audio_info = None
        output_audio_info = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, origin_audio_info, output_audio_info, message


def when_click_change_speech_speed(audio_t: Tuple[int, np.ndarray],
                                   speed: float = 1.0,
                                   engine: str = "librosa"):
    """
    Change playback speed of the speech without resampling on the caller side.

    :param audio_t: ``(sample_rate, signal)`` from a ``gr.Audio`` widget.
    :param speed: speed factor; 1.0 means unchanged.
    :param engine: backend name, a key of ``speed_engine_to_function``.
    :return: ``(output_file, origin_audio_info, output_audio_info, message)``.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)

    message = "success"
    try:
        output_file: str = change_speech_speed(filename, speed, engine)
        origin_audio_info: dict = get_audio_info(filename, engine="pydub")
        origin_audio_info = json.dumps(origin_audio_info, ensure_ascii=False, indent=4)
        output_audio_info: dict = get_audio_info(output_file, engine="pydub")
        output_audio_info = json.dumps(output_audio_info, ensure_ascii=False, indent=4)
    except Exception as e:
        output_file = None
        origin_audio_info = None
        output_audio_info = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, origin_audio_info, output_audio_info, message


def when_click_change_volume(audio_t: Tuple[int, np.ndarray],
                             radio: float = 1.0,
                             decibel: float = 0.0,
                             reference: str = None,
                             engine: str = "by_ffmpy_by_db",
                             ):
    """
    Change the audio volume by ratio, by decibels, or to match a reference file.

    :param audio_t: ``(sample_rate, signal)`` from a ``gr.Audio`` widget.
        (was mis-annotated ``str`` — it is a tuple, like every other handler here.)
    :param radio: multiplicative gain, used by the ``*_by_radio`` engines.
    :param decibel: gain in dB, used by the ``*_by_db`` engines.
    :param reference: path of a reference audio, used by ``by_pydub_by_reference``.
    :param engine: backend name, a key of ``volume_engine_to_function``.
    :return: ``(output_file_or_None, message)``.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)

    message = "success"
    try:
        output_file: str = change_volume(filename, radio, decibel, reference, engine)
    except Exception as e:
        output_file = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, message


def when_click_pad_audio(audio_t: Tuple[int, np.ndarray],
                         pad_seconds: int = 10,
                         pad_mode: str = "zero"):
    """
    Lengthen a mono signal by appending ``pad_seconds`` of padding.

    :param audio_t: ``(sample_rate, signal)``; signal must be 1-D (mono).
    :param pad_seconds: how many seconds to append.
    :param pad_mode: ``"zero"`` appends silence; ``"repeat"`` appends a looped
        copy of the signal truncated to the pad length.
    :return: ``((sample_rate, padded_signal), message)``; on failure the
             original signal is returned unchanged.
    """
    sample_rate, signal = audio_t

    message = "success"
    pad_signal = signal
    try:
        if not signal.ndim == 1:
            raise AssertionError("only mono (1-D) signals are supported")
        pad_length = int(pad_seconds * sample_rate)
        if pad_mode == "zero":
            pad = np.zeros(shape=(pad_length,), dtype=signal.dtype)
        elif pad_mode == "repeat":
            signal_length = len(signal)
            if pad_length <= signal_length:
                pad = signal[:pad_length]
            else:
                # tile the signal enough times, then trim to the exact length
                a = pad_length // signal_length
                # np.concatenate, not np.concat: the latter only exists in numpy >= 2.0
                pad = np.concatenate([signal] * int(a + 1), axis=-1)
                pad = pad[:pad_length]
        else:
            raise NotImplementedError
        pad_signal = np.concatenate([signal, pad], axis=-1)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return (sample_rate, pad_signal), message


def when_click_reverb(audio_t: Tuple[int, np.ndarray], kwargs: str, engine: str):
    """
    Apply reverberation to an int16 signal.

    :param audio_t: ``(sample_rate, signal)``; signal is assumed int16-scaled.
    :param kwargs: JSON object with engine-specific reverb parameters.
    :param engine: backend name, a key of ``reverb_engine_to_function``.
    :return: ``((sample_rate, reverberated_int16_or_None), message)``.
    """
    sample_rate, signal = audio_t

    message = "success"
    try:
        # int16 -> float32 in [-1, 1) for the reverb backends
        signal = np.array(signal / (1 << 15), dtype=np.float32)
        kwargs = json.loads(kwargs)
        reverberated_audio = reverb(
            signal=signal,
            sample_rate=sample_rate,
            engine=engine,
            **kwargs,
        )
        # back to int16 scale for the gr.Audio output
        reverberated_audio = np.array(reverberated_audio * (1 << 15), dtype=np.int16)
    except Exception as e:
        reverberated_audio = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return (sample_rate, reverberated_audio), message


def when_click_mix_speech_and_noise(speech_t: Tuple[int, np.ndarray],
                                    noise_t: Tuple[int, np.ndarray],
                                    snr_db: float):
    """
    Mix a speech signal with a noise signal at the requested SNR (dB).

    Both inputs must be int16 and share the same sample rate.

    :return: ``((sample_rate, mixed_int16), message)``; on failure the original
             speech is returned unchanged.
    """
    sample_rate1, speech = speech_t
    sample_rate2, noise = noise_t

    message = "success"
    mix_signal = speech
    try:
        if sample_rate1 != sample_rate2:
            raise AssertionError(f"sr of speech: {sample_rate1}, sr of noise: {sample_rate2}")

        if speech.dtype == np.int16:
            speech = np.array(speech, dtype=np.float32)
            speech /= (1 << 15)
        else:
            raise NotImplementedError
        if noise.dtype == np.int16:
            noise = np.array(noise, dtype=np.float32)
            noise /= (1 << 15)
        else:
            raise NotImplementedError

        mix_signal = mix_speech_and_noise(speech, noise, snr_db)
        mix_signal = np.array(mix_signal * (1 << 15), dtype=np.int16)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"

    # np.int16
    return (sample_rate1, mix_signal), message


audio_convert_examples = [
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        8000, 2, "0", "librosa"
    ]
]

change_volume_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0, None, "by_ffmpy_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        0.3, 0.0, None, "by_ffmpy_by_radio"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0, None, "by_pydub_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, 0.0,
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        "by_pydub_by_reference"
    ]
]

pad_audio_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        10, "zero",
    ],
]

reverb_examples = [
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        '{\n  "room_size": 0.25,\n  "damping": 0.5,\n  "width": 1.0,\n  "dry_level": 0.4,\n  "wet_level": 0.6,\n  "freeze_mode": false\n}',
        "pedalboard",
    ],
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        '{\n  "room_size": [4.0, 6.0],\n  "source_position": [2.5, 4.5],\n  "microphone_array": [\n    [1.5, 1.5],\n    [2.5, 1.5]\n  ],\n  "output_microphone_idx": 0\n}',
        "pyroomacoustics",
    ]
]

mix_speech_and_noise_examples = [
    [
        (project_path / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav").as_posix(),
        (project_path / "data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav").as_posix(),
        -5,
    ],
    [
        (project_path / "data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav").as_posix(),
        (project_path / "data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav").as_posix(),
        0,
    ],
    [
        (project_path / "data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav").as_posix(),
        (project_path / "data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav").as_posix(),
        5,
    ],
    [
        (project_path / "data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav").as_posix(),
        (project_path / "data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav").as_posix(),
        10,
    ]
]


def main():
    """Build the Gradio UI (one tab per tool) and launch the server."""
    args = get_args()

    # examples
    examples_dir = Path(args.examples_dir)

    # choices
    info_choices = list(info_engine_to_function.keys())
    cvt_choices = list(cvt_engine_to_function.keys())
    speed_choices = list(speed_engine_to_function.keys())
    volume_choices = list(volume_engine_to_function.keys())
    reverb_choices = list(reverb_engine_to_function.keys())

    # ui
    with gr.Blocks() as blocks:
        with gr.Tabs():
            with gr.TabItem("info"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        info_audio = gr.Audio(label="audio")
                        info_engine = gr.Dropdown(choices=info_choices, value=info_choices[0], label="engine")
                        info_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        info_output = gr.Text(label="output")
                        info_log = gr.Text(label="log")

                gr.Examples(
                    examples=[
                        [filename.as_posix(), "wave"]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log],
                    fn=when_click_get_audio_info,
                )
                info_button.click(
                    when_click_get_audio_info,
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log]
                )
            with gr.TabItem("convert"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        cvt_audio = gr.Audio(label="audio")
                        with gr.Row():
                            cvt_sample_rate = gr.Dropdown(choices=[8000], value=8000, label="sample_rate")
                            cvt_sample_width = gr.Dropdown(choices=[2], value=2, label="sample_width")
                            cvt_channels = gr.Text(
                                value="0",
                                label="channels",
                                info="The channels to be retained, separated by commas, such as `0,1`"
                            )
                        cvt_engine = gr.Dropdown(choices=cvt_choices, value=cvt_choices[0], label="engine")
                        cvt_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        cvt_output_audio = gr.Audio(label="output_audio")
                        cvt_origin_audio_info = gr.Text(label="origin_audio_info")
                        cvt_output_audio_info = gr.Text(label="output_audio_info")
                        cvt_log = gr.Text(label="log")

                gr.Examples(
                    examples=audio_convert_examples,
                    inputs=[
                        cvt_audio, cvt_sample_rate, cvt_sample_width, cvt_channels, cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio, cvt_origin_audio_info, cvt_output_audio_info, cvt_log
                    ],
                    fn=when_click_audio_convert,
                )
                cvt_button.click(
                    when_click_audio_convert,
                    inputs=[
                        cvt_audio, cvt_sample_rate, cvt_sample_width, cvt_channels, cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio, cvt_origin_audio_info, cvt_output_audio_info, cvt_log
                    ],
                )
            with gr.TabItem("speech_speed"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            speech_speed_speed = gr.Slider(minimum=0.0, maximum=4.0, value=1.0, label="speed")
                            speech_speed_engine = gr.Dropdown(choices=speed_choices, value=speed_choices[0], label="engine")
                        speech_speed_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_output_audio = gr.Audio(label="output_audio")
                        speech_speed_origin_audio_info = gr.Text(label="origin_audio_info")
                        speech_speed_output_audio_info = gr.Text(label="output_audio_info")
                        speech_speed_log = gr.Text(label="log")

                gr.Examples(
                    # each example row must match the length of `inputs`;
                    # the engine value was missing and made Gradio reject the rows
                    examples=[
                        [filename.as_posix(), 0.5, speed_choices[0]]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info,
                        speech_speed_output_audio_info,
                        speech_speed_log,
                    ],
                    fn=when_click_change_speech_speed,
                )
                speech_speed_button.click(
                    when_click_change_speech_speed,
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info,
                        speech_speed_output_audio_info,
                        speech_speed_log,
                    ]
                )
            with gr.TabItem("volume"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        volume_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            with gr.Column():
                                volume_radio = gr.Slider(minimum=0.0, maximum=3.0, value=1.0, step=0.1, label="radio")
                                volume_decibel = gr.Slider(minimum=-30.0, maximum=30.0, value=0.0, step=0.1, label="decibel")
                                volume_engine = gr.Dropdown(choices=volume_choices, value=volume_choices[0], label="engine")
                            with gr.Column():
                                volume_reference = gr.File(label="reference")
                        volume_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        volume_output_audio = gr.Audio(label="output_audio")
                        volume_log = gr.Text(label="log")

                gr.Examples(
                    examples=change_volume_examples,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio, volume_log,
                    ],
                    fn=when_click_change_volume,
                )
                volume_button.click(
                    when_click_change_volume,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio, volume_log,
                    ]
                )
            with gr.TabItem("pad"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        pad_audio = gr.Audio(label="audio")
                        with gr.Row():
                            pad_seconds = gr.Slider(minimum=0, maximum=100, value=20, step=0.1, label="pad_seconds")
                            pad_mode = gr.Dropdown(choices=["zero", "repeat"], value="zero", label="pad_mode")
                        pad_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        pad_output_audio = gr.Audio(label="output_audio")
                        pad_log = gr.Text(label="log")

                gr.Examples(
                    examples=pad_audio_examples,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                    fn=when_click_pad_audio,
                )
                pad_button.click(
                    when_click_pad_audio,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                )
            with gr.TabItem("reverb"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        reverb_audio = gr.Audio(label="audio")
                        reverb_kwargs = gr.Textbox(lines=8, label="kwargs")
                        reverb_engine = gr.Dropdown(choices=reverb_choices, value=reverb_choices[0], label="engine")
                        reverb_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        reverb_output_audio = gr.Audio(label="output_audio")
                        reverb_log = gr.Text(label="log")

                gr.Examples(
                    examples=reverb_examples,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                    fn=when_click_reverb,
                )
                reverb_button.click(
                    when_click_reverb,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                )
            with gr.TabItem("mix"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        mix_speed_audio = gr.Audio(label="speech")
                        mix_noise_audio = gr.Audio(label="noise")
                        with gr.Row():
                            mix_snr_db = gr.Slider(minimum=-10, maximum=20, value=10, step=0.1, label="snr_db")
                        mix_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        mix_output_audio = gr.Audio(label="output_audio")
                        mix_log = gr.Text(label="log")

                gr.Examples(
                    examples=mix_speech_and_noise_examples,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                    fn=when_click_mix_speech_and_noise,
                )
                mix_button.click(
                    when_click_mix_speech_and_noise,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                )

    # http://127.0.0.1:7860/
    blocks.queue().launch(
        # never share a public link (the original ternary returned False on both branches)
        share=False,
        server_name="0.0.0.0",
        server_port=environment.get("port", 7860, dtype=int),
    )
    return


if __name__ == "__main__":
    main()