Spaces:

fantaxy
/

tango2

Runtime error

App Files Files Community

seawolf2357 committed on Oct 6, 2024

Commit

382d3da

verified ·

1 Parent(s): 062c717

Create app.py

Browse files

Files changed (1) hide show

app.py +374 -0

app.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import spaces
+import gradio as gr
+import json
+import torch
+import wavio
+from tqdm import tqdm
+from huggingface_hub import snapshot_download
+from models import AudioDiffusion, DDPMScheduler
+from audioldm.audio.stft import TacotronSTFT
+from audioldm.variational_autoencoder import AutoencoderKL
+from pydub import AudioSegment
+from gradio import Markdown
+from diffusers.models.unet_2d_condition import UNet2DConditionModel
+from diffusers import DiffusionPipeline, AudioPipelineOutput
+from transformers import T5EncoderModel, T5Tokenizer, T5TokenizerFast, pipeline
+from typing import Union
+from diffusers.utils.torch_utils import randn_tensor
+from tqdm import tqdm
+from langdetect import detect, DetectorFactory
+# Pin langdetect's seed so detect() gives consistent results for the same
+# prompt from one call to the next.
+DetectorFactory.seed = 0
+class Tango2Pipeline(DiffusionPipeline):
+    """Diffusers-style pipeline that turns a text prompt into audio.
+    Bundles a VAE, a T5 text encoder with its tokenizer, a conditional UNet and
+    a scheduler. `__call__` runs the denoising loop in `inference` and decodes
+    the resulting latents to a waveform through the VAE.
+    """
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: T5EncoderModel,
+        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler
+    ):
+        """Register the five components so they are available as attributes."""
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler
+        )
+    def _encode_prompt(self, prompt):
+        """Tokenize `prompt` and encode it with the text encoder (no guidance).
+        Returns:
+            A tuple `(encoder_hidden_states, boolean_encoder_mask)`; the mask is
+            True wherever the tokenizer's attention mask equals 1.
+        """
+        device = self.text_encoder.device
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+        encoder_hidden_states = self.text_encoder(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+        boolean_encoder_mask = (attention_mask == 1).to(device)
+        return encoder_hidden_states, boolean_encoder_mask
+    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
+        """Encode `prompt` together with empty prompts for classifier-free guidance.
+        Args:
+            prompt: Sequence of prompt strings; `len(prompt)` sets how many
+                empty unconditional prompts are created.
+            num_samples_per_prompt: How many times each embedding row and mask
+                row is repeated along the batch dimension.
+        Returns:
+            A tuple `(prompt_embeds, boolean_prompt_mask)` in which the
+            unconditional half comes first and the conditional half second.
+        """
+        device = self.text_encoder.device
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+        with torch.no_grad():
+            prompt_embeds = self.text_encoder(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+        # get unconditional embeddings for classifier free guidance
+        uncond_tokens = [""] * len(prompt)
+        # pad the empty prompts to the conditional sequence length so both halves can be concatenated
+        max_length = prompt_embeds.shape[1]
+        uncond_batch = self.tokenizer(
+            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
+        )
+        uncond_input_ids = uncond_batch.input_ids.to(device)
+        uncond_attention_mask = uncond_batch.attention_mask.to(device)
+        with torch.no_grad():
+            negative_prompt_embeds = self.text_encoder(
+                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
+            )[0]
+        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+        # For classifier free guidance, we need to do two forward passes.
+        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
+        boolean_prompt_mask = (prompt_mask == 1).to(device)
+        return prompt_embeds, boolean_prompt_mask
+    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
+        """Draw the initial noise latents for the denoising loop.
+        The latent shape is fixed to `(batch_size, num_channels_latents, 256, 16)`
+        and no generator is passed, so the draw uses torch's global RNG state.
+        """
+        shape = (batch_size, num_channels_latents, 256, 16)
+        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * inference_scheduler.init_noise_sigma
+        return latents
+    @torch.no_grad()
+    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
+                  disable_progress=True):
+        """Run the denoising loop and return the final latents.
+        Args:
+            prompt: List of prompt strings. `len(prompt)` is used as the number
+                of prompts, so a bare string would be counted per character.
+            inference_scheduler: Scheduler providing `set_timesteps`,
+                `scale_model_input`, `step`, `order` and `init_noise_sigma`.
+            num_steps: Number of timesteps requested from the scheduler.
+            guidance_scale: Classifier-free guidance weight; guidance is only
+                applied when this is greater than 1.0.
+            num_samples_per_prompt: Number of latents generated per prompt.
+            disable_progress: When True the tqdm progress bar is hidden.
+        Returns:
+            The latents after the last scheduler step.
+        """
+        device = self.text_encoder.device
+        classifier_free_guidance = guidance_scale > 1.0
+        batch_size = len(prompt) * num_samples_per_prompt
+        if classifier_free_guidance:
+            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
+        else:
+            prompt_embeds, boolean_prompt_mask = self._encode_prompt(prompt)
+            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
+        inference_scheduler.set_timesteps(num_steps, device=device)
+        timesteps = inference_scheduler.timesteps
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
+        # timesteps beyond num_steps * order are treated as warmup and do not advance the progress bar
+        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
+        progress_bar = tqdm(range(num_steps), disable=disable_progress)
+        for i, t in enumerate(timesteps):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
+            latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
+            noise_pred = self.unet(
+                latent_model_input, t, encoder_hidden_states=prompt_embeds,
+                encoder_attention_mask=boolean_prompt_mask
+            ).sample
+            # perform guidance
+            if classifier_free_guidance:
+                # first half of the batch is unconditional, second half is text-conditioned
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
+            # advance the progress bar on the last timestep or after each full
+            # scheduler-order step past warmup (this method takes no callback)
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
+                progress_bar.update(1)
+        return latents
+    @torch.no_grad()
+    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
+        """ Generate audio for a single prompt string.
+        The prompt is wrapped in a one-element list for `inference`, the latents
+        are decoded to a mel representation and then to a waveform by the VAE,
+        and the result is returned as `AudioPipelineOutput(audios=wave)`.
+        """
+        # the inner no_grad duplicates the decorator above; it is harmless
+        with torch.no_grad():
+            latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
+            mel = self.vae.decode_first_stage(latents)
+            wave = self.vae.decode_to_waveform(mel)
+        return AudioPipelineOutput(audios=wave)
+# Automatic device detection
+if torch.cuda.is_available():
+    device_type = "cuda"
+    device_selection = "cuda:0"
+else:
+    device_type = "cpu"
+    device_selection = "cpu"
+class Tango:
+    def __init__(self, name="declare-lab/tango2", device=device_selection):
+        path = snapshot_download(repo_id=name)
+        vae_config = json.load(open("{}/vae_config.json".format(path)))
+        stft_config = json.load(open("{}/stft_config.json".format(path)))
+        main_config = json.load(open("{}/main_config.json".format(path)))
+        self.vae = AutoencoderKL(**vae_config).to(device)
+        self.stft = TacotronSTFT(**stft_config).to(device)
+        self.model = AudioDiffusion(**main_config).to(device)
+        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
+        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
+        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)
+        self.vae.load_state_dict(vae_weights)
+        self.stft.load_state_dict(stft_weights)
+        self.model.load_state_dict(main_weights)
+        print ("Successfully loaded checkpoint from:", name)
+        self.vae.eval()
+        self.stft.eval()
+        self.model.eval()
+        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")
+    def chunks(self, lst, n):
+        """ Yield successive n-sized chunks from a list. """
+        for i in range(0, len(lst), n):
+            yield lst[i:i + n]
+    def generate(self, prompt, steps=200, guidance=8, samples=1, disable_progress=True):
+        """ Generate audio for a single prompt string. """
+        with torch.no_grad():
+            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
+            mel = self.vae.decode_first_stage(latents)
+            wave = self.vae.decode_to_waveform(mel)
+        return wave[0]
+    def generate_for_batch(self, prompts, steps=200, guidance=8, samples=1, batch_size=8, disable_progress=True):
+        """ Generate audio for a list of prompt strings. """
+        outputs = []
+        for k in tqdm(range(0, len(prompts), batch_size)):
+            batch = prompts[k: k+batch_size]
+            with torch.no_grad():
+                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
+                mel = self.vae.decode_first_stage(latents)
+                wave = self.vae.decode_to_waveform(mel)
+                outputs += [item for item in wave]
+        if samples == 1:
+            return outputs
+        else:
+            return list(self.chunks(outputs, samples))
+# Initialize TANGO
+tango = Tango(device=device_selection)
+tango.vae.to(device_type)
+tango.stft.to(device_type)
+tango.model.to(device_type)
+pipe = Tango2Pipeline(
+    vae=tango.vae,
+    text_encoder=tango.model.text_encoder,
+    tokenizer=tango.model.tokenizer,
+    unet=tango.model.unet,
+    scheduler=tango.scheduler
+)
+# Initialize Translation Pipeline
+translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+def adjust_audio_length(audio_path, desired_length_sec, output_format):
+    """
+    Adjust the audio to the desired length.
+    If the audio is shorter, pad with silence.
+    If longer, trim the audio.
+    """
+    audio = AudioSegment.from_file(audio_path)
+    desired_length_ms = desired_length_sec * 1000  # Convert to milliseconds
+    if len(audio) < desired_length_ms:
+        # Pad with silence
+        padding = AudioSegment.silent(duration=desired_length_ms - len(audio))
+        audio += padding
+    elif len(audio) > desired_length_ms:
+        # Trim the audio
+        audio = audio[:desired_length_ms]
+    # Export the adjusted audio
+    adjusted_path = f"adjusted.{output_format}"
+    audio.export(adjusted_path, format=output_format)
+    return adjusted_path
+@spaces.GPU(duration=60)
+def gradio_generate(prompt, output_format, steps, guidance, audio_length):
+    """
+    Generate audio based on the prompt, translate if necessary, and adjust its length.
+    """
+    # Detect language
+    try:
+        lang = detect(prompt)
+    except:
+        lang = "unknown"
+    # If the prompt is in Korean, translate to English
+    if lang == "ko":
+        translated = translation_pipeline(prompt)[0]['translation_text']
+        print(f"Translated Prompt: {translated}")
+        prompt_to_use = translated
+    else:
+        prompt_to_use = prompt
+    # Generate audio using the pipeline
+    output_wave = pipe(prompt_to_use, steps, guidance)
+    output_wave = output_wave.audios[0]
+    temp_wav = "temp.wav"
+    wavio.write(temp_wav, output_wave, rate=16000, sampwidth=2)
+    # Adjust audio length
+    adjusted_path = adjust_audio_length(temp_wav, audio_length, output_format)
+    return adjusted_path
+# Gradio input and output components
+input_text = gr.Textbox(lines=2, label="Prompt")
+output_format = gr.Radio(
+    label="Output Format",
+    info="The file you can download",
+    choices=["mp3", "wav"],
+    value="wav"
+)
+audio_length = gr.Slider(
+    minimum=4,
+    maximum=10,
+    step=1,
+    label="Audio Length (seconds)",
+    value=6,
+    interactive=True
+)
+output_audio = gr.Audio(label="Generated Audio", type="filepath")
+denoising_steps = gr.Slider(
+    minimum=100,
+    maximum=200,
+    step=1,
+    label="Steps",
+    value=200,  # Changed from 100 to 200
+    interactive=True
+)
+guidance_scale = gr.Slider(
+    minimum=1,
+    maximum=10,
+    step=0.1,
+    label="Guidance Scale",
+    value=8,  # Changed from 3 to 8
+    interactive=True
+)
+# Gradio interface
+gr_interface = gr.Interface(
+    theme="Nymbo/Nymbo_Theme",
+    fn=gradio_generate,
+    inputs=[input_text, output_format, denoising_steps, guidance_scale, audio_length],
+    outputs=[output_audio],
+    title="T2: Text to SoundFX",
+    allow_flagging=False,
+    examples=[
+        ["조용한 말소리 후 비행기가 멀어지는 소리"],
+        ["사람들이 환호하고 박수치는 소리"],
+        ["강한 바람 소리와 빗소리"],
+        ["Quiet speech and then and airplane flying away"],
+        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
+        ["Ducks quack and water splashes with some animal screeching in the background"],
+        ["Describe the sound of the ocean"],
+        ["A woman and a baby are having a conversation"],
+        ["A man speaks followed by a popping noise and laughter"],
+        ["A cup is filled from a faucet"],
+        ["An audience cheering and clapping"],
+        ["Rolling thunder with lightning strikes"],
+        ["A dog barking and a cat mewing and a racing car passes by"],
+        ["Gentle water stream, birds chirping and sudden gun shot"],
+        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
+        ["A dog barking"],
+        ["A cat meowing"],
+        ["Wooden table tapping sound while water pouring"],
+        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
+        ["two gunshots followed by birds flying away while chirping"],
+        ["Whistling with birds chirping"],
+        ["A person snoring"],
+        ["Motor vehicles are driving with loud engines and a person whistles"],
+        ["People cheering in a stadium while thunder and lightning strikes"],
+        ["A helicopter is in flight"],
+        ["A dog barking and a man talking and a racing car passes by"],
+    ],
+    cache_examples="lazy", # Turn on to cache.
+)
+# Launch Gradio app
+gr_interface.queue(10).launch()