|
import sys

# Make the locally checked-out pyharp package importable.
sys.path.append('/home/user/audio_ai/diffusers_harp/venv/src')

from pyharp import ModelCard, build_endpoint

from audiotools import AudioSignal
import scipy.io.wavfile
import torch
import gradio as gr
from diffusers import AudioLDM2Pipeline


# HARP model card: metadata shown to the user inside the HARP plugin.
card = ModelCard(
    name='Diffusers AudioLDM2 Generation',
    description='AudioLDM2 text-to-audio generation; operates on the region selected in the track. Not conditioned on the selected audio: it simply replaces the audio in the source track with the generation.',
    author='Team Audio',
    tags=['AudioLDM', 'Diffusers', 'Generation']
)


# Load the AudioLDM2 pipeline, falling back to CPU (and full precision)
# when no GPU is available.
repo_id = "cvssp/audioldm2"
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = AudioLDM2Pipeline.from_pretrained(
    repo_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
pipe = pipe.to(device)
|
def process_fn(input_audio_path, prompt, negative_prompt, seed, num_inference_steps, audio_length_in_s, num_waveforms_per_prompt):
    """
    This function defines the audio processing steps.

    Args:
        input_audio_path (str): the audio filepath to be processed.
        prompt (str): text prompt describing the audio to generate.
        negative_prompt (str): text describing qualities to steer the generation away from.
        seed (int): random seed, for reproducible generation.
        num_inference_steps (int): number of diffusion denoising steps.
        audio_length_in_s (float): duration of the generated audio, in seconds.
        num_waveforms_per_prompt (int): number of candidate waveforms to generate.

        NOTE: These keyword arguments must correspond to, and match the order of,
        the UI elements defined below.

    Returns:
        output_audio_path (str): the filepath of the processed audio.
    """
    # The incoming selection is loaded but not used for conditioning; per the
    # model card, the generation simply replaces it.
    sig = AudioSignal(input_audio_path)
    outfile = "./output.wav"

    # Seed a generator on the same device as the pipeline for reproducibility.
    generator = torch.Generator(device).manual_seed(int(seed))

    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=int(num_inference_steps),
        audio_length_in_s=audio_length_in_s,
        num_waveforms_per_prompt=int(num_waveforms_per_prompt),
        generator=generator,
    ).audios

    # AudioLDM2 generates 16 kHz audio; write the first waveform to disk.
    scipy.io.wavfile.write(outfile, rate=16000, data=audio[0])
    return outfile
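
# Quick standalone sanity check (illustrative values; "input.wav" is a
# placeholder path, not a file shipped with this script):
# print(process_fn("input.wav", "gentle rain on a tin roof", "low quality, noise",
#                  0, 50, 5.0, 1))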
|
with gr.Blocks() as webapp:

    inputs = [
        gr.Audio(
            label="Audio Input",
            type="filepath"
        ),
        gr.Text(
            label="Prompt",
            interactive=True
        ),
        gr.Text(
            label="Negative Prompt",
            interactive=True
        ),
        gr.Slider(
            label="Seed",
            minimum=0,
            maximum=65535,
            value=0,
            step=1
        ),
        gr.Slider(
            minimum=1, maximum=500,
            step=1, value=1,
            label="Inference Steps"
        ),
        gr.Slider(
            minimum=2.5, maximum=10.0,
            step=2.5, value=2.5,
            label="Duration (s)"
        ),
        gr.Slider(
            minimum=1, maximum=10,
            step=1, value=1,
            label="Waveforms Per Prompt"
        ),
    ]
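    # NOTE: after the audio input, these controls are handed to process_fn in
    # order, so they must stay aligned with its keyword arguments.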
|
    output = gr.Audio(label="Audio Output", type="filepath")

    # Wire the inputs, output, processing function, and model card together
    # as a HARP endpoint.
    ctrls_data, ctrls_button, process_button, cancel_button = build_endpoint(inputs, output, process_fn, card)

# share=True prints a public Gradio URL, which HARP can use to reach this endpoint.
webapp.launch(share=True)
|
|