import os

# Make CUDA errors surface at the call site (useful while debugging).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import random

import gradio as gr
import torch
import torchaudio
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, UNet2DConditionModel
from PIL import Image
from tqdm.auto import tqdm
from transformers import (
    CLIPTextModel,
    CLIPTokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

torch_device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(torch_device)
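

# Pipeline overview:
#   1. generate_lyrics  - transcribe the uploaded audio with Whisper (tiny.en).
#   2. generate_summary - condense the transcript with a BART summarization model.
#   3. generate_prompt  - expand the summary into an image prompt with GPT-2.
#   4. generate_image   - render the prompt with Stable Diffusion v1.5.
# predict() chains the four steps and feeds the Gradio interface at the bottom.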
def generate_lyrics(audio_path):
    """Transcribe the uploaded audio file into text with Whisper."""
    model_name = "openai/whisper-tiny.en"
    processor = WhisperProcessor.from_pretrained(model_name)
    asr_model = WhisperForConditionalGeneration.from_pretrained(model_name)
    asr_model.eval()

    # Gradio passes a file path; load it and convert to the 16 kHz mono audio Whisper expects.
    waveform, sampling_rate = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0)  # downmix to mono
    if sampling_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sampling_rate, 16000)

    input_features = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = asr_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
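

# The Hugging Face summarization pipeline returns a list of dicts such as
# [{"summary_text": "..."}]; predict() below unpacks the first entry.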
def generate_summary(lyrics):
    summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
    # Truncate transcripts that exceed the model's maximum input length.
    summary = summarizer(lyrics, truncation=True)
    return summary
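

# GPT-2 continues the seed sentence, so the returned text is the seed prompt plus a
# free-form continuation; the whole string is handed to Stable Diffusion as the image prompt.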
def generate_prompt(summary):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model = model.to(device)

    prompt = f"Create an image that represents the feeling of '{summary}'"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        do_sample=True,
        max_length=100,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS to avoid a warning
    )
    prompt_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return prompt_text
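

# generate_image drives the Stable Diffusion components by hand instead of the high-level
# pipeline: encode the prompt with CLIP, denoise random latents with the UNet under
# classifier-free guidance, then decode the latents to pixels with the VAE.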
def generate_image(
    prompt,
    height=512,
    width=512,
    num_inference_steps=50,
    guidance_scale=7.5,
    generator=None,
    batch_size=1,
):
    """Run a manual Stable Diffusion v1.5 denoising loop and return a list of PIL images."""
    if generator is None:
        # Fall back to a fixed seed when no generator is supplied.
        generator = torch.manual_seed(32)

    # Load the individual Stable Diffusion components; the loop below drives them directly.
    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
    scheduler = DPMSolverMultistepScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

    vae = vae.to(torch_device)
    text_encoder = text_encoder.to(torch_device)
    unet = unet.to(torch_device)

    # Encode the prompt and an empty "unconditional" prompt for classifier-free guidance.
    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    with torch.no_grad():
        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    # Start from random latents and denoise them step by step.
    latents = torch.randn(
        (batch_size, unet.config.in_channels, height // 8, width // 8),
        generator=generator,
    )
    latents = latents.to(torch_device)
    scheduler.set_timesteps(num_inference_steps)
    latents = latents * scheduler.init_noise_sigma

    for t in tqdm(scheduler.timesteps):
        # Duplicate the latents so the unconditional and conditional passes run in one batch.
        latent_model_input = torch.cat([latents] * 2)
        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
        # Classifier-free guidance: push the prediction towards the text-conditioned direction.
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # Decode the latents back into pixel space with the VAE.
    latents = 1 / 0.18215 * latents  # undo the SD latent scaling factor
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(img) for img in images]
    return pil_images
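

# Glue function called by the Gradio interface below: audio file in, first generated cover image out.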
def predict(audio, steps=100, seed=42, guidance_scale=6.0):
    generator = torch.manual_seed(seed)
    lyrics = generate_lyrics(audio)
    summary = generate_summary(lyrics)
    prompt_text = generate_prompt(summary[0]["summary_text"])
    images = generate_image(
        prompt=prompt_text,
        generator=generator,
        num_inference_steps=steps,
        guidance_scale=guidance_scale,
    )
    return images[0]
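

# Quick local test without the UI (hypothetical file path):
#   cover = predict("song.wav", steps=50, seed=123, guidance_scale=7.5)
#   cover.save("cover.png")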
random_seed = random.randint(0, 2147483647)

gr.Interface(
    predict,
    inputs=[
        gr.Audio(source="upload", type="filepath", label="Audio"),
        gr.Slider(1, 100, value=50, step=1, label="Inference Steps"),
        gr.Slider(0, 2147483647, value=random_seed, step=1, label="Seed"),
        gr.Slider(1.0, 20.0, value=6.0, step=0.1, label="Guidance Scale - how much the prompt will influence the results"),
    ],
    outputs=gr.Image(shape=(256, 256), type="pil", elem_id="output_image"),
    css="#output_image{width: 256px}",
    title="Cover Generator (audio-to-image)",
    description="Produces a cover image for the given audio: Whisper transcribes the lyrics, BART summarizes them, GPT-2 turns the summary into an image prompt, and Stable Diffusion renders the cover.",
).launch()