"""Generate an image from a spoken audio sample.

The script loads one sample from a dummy LibriSpeech dataset, hands a Whisper
model and processor to a custom Stable Diffusion pipeline, runs that pipeline
on the sample's raw audio array, and displays the first returned image.
"""

import matplotlib.pyplot as plt
import torch
from datasets import load_dataset
from diffusers import DiffusionPipeline
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
# Request half precision only when running on CUDA. On the CPU fallback,
# float16 weights are unsupported or very slow for many ops, so use float32.
pipeline_dtype = torch.float16 if device == "cuda" else torch.float32

ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)

audio_sample = ds[3]

# Lower-cased "text" field of the sample; not consumed below, kept so the
# dataset's text can be compared by hand with the generated image.
text = audio_sample["text"].lower()
speech_data = audio_sample["audio"]["array"]

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# NOTE(review): `custom_pipeline` points at "audio_to_image_pipeline.py";
# presumably that pipeline uses the speech model/processor to turn the audio
# into a text prompt -- confirm against that file.
diffuser_pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="audio_to_image_pipeline.py",
    speech_model=model,
    speech_processor=processor,
    torch_dtype=pipeline_dtype,
)

diffuser_pipeline.enable_attention_slicing()
diffuser_pipeline = diffuser_pipeline.to(device)

output = diffuser_pipeline(speech_data)

plt.imshow(output.images[0])
# Without an explicit show() a plain script exits without ever rendering the
# figure; in a notebook this call is harmless.
plt.show()