clone3 committed
Commit 8635465 · verified · 1 Parent(s): 1fbebb5

Delete lcm_ov_pipeline.py

Files changed (1)
  1. lcm_ov_pipeline.py +0 -388
lcm_ov_pipeline.py DELETED
@@ -1,388 +0,0 @@
- import inspect
-
- from pathlib import Path
- from tempfile import TemporaryDirectory
- from typing import List, Optional, Tuple, Union, Dict, Any, Callable, OrderedDict
-
- import numpy as np
- import openvino
- import torch
-
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
- from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipeline, OVModelUnet, OVModelVaeDecoder, OVModelTextEncoder, OVModelVaeEncoder, VaeImageProcessor
- from optimum.utils import (
-     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
-     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
-     DIFFUSION_MODEL_UNET_SUBFOLDER,
-     DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
-     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
- )
-
-
- from diffusers import logging
- logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
- class LCMOVModelUnet(OVModelUnet):
-     def __call__(
-         self,
-         sample: np.ndarray,
-         timestep: np.ndarray,
-         encoder_hidden_states: np.ndarray,
-         timestep_cond: Optional[np.ndarray] = None,
-         text_embeds: Optional[np.ndarray] = None,
-         time_ids: Optional[np.ndarray] = None,
-     ):
-         self._compile()
-
-         inputs = {
-             "sample": sample,
-             "timestep": timestep,
-             "encoder_hidden_states": encoder_hidden_states,
-         }
-
-         if timestep_cond is not None:
-             inputs["timestep_cond"] = timestep_cond
-         if text_embeds is not None:
-             inputs["text_embeds"] = text_embeds
-         if time_ids is not None:
-             inputs["time_ids"] = time_ids
-
-         outputs = self.request(inputs, shared_memory=True)
-         return list(outputs.values())
-
- class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
-
-     def __init__(
-         self,
-         vae_decoder: openvino.runtime.Model,
-         text_encoder: openvino.runtime.Model,
-         unet: openvino.runtime.Model,
-         config: Dict[str, Any],
-         tokenizer: "CLIPTokenizer",
-         scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"],
-         feature_extractor: Optional["CLIPFeatureExtractor"] = None,
-         vae_encoder: Optional[openvino.runtime.Model] = None,
-         text_encoder_2: Optional[openvino.runtime.Model] = None,
-         tokenizer_2: Optional["CLIPTokenizer"] = None,
-         device: str = "CPU",
-         dynamic_shapes: bool = True,
-         compile: bool = True,
-         ov_config: Optional[Dict[str, str]] = None,
-         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
-         **kwargs,
-     ):
-         self._internal_dict = config
-         self._device = device.upper()
-         self.is_dynamic = dynamic_shapes
-         self.ov_config = ov_config if ov_config is not None else {}
-         self._model_save_dir = (
-             Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir
-         )
-         self.vae_decoder = OVModelVaeDecoder(vae_decoder, self)
-         self.unet = LCMOVModelUnet(unet, self)
-         self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None
-         self.text_encoder_2 = (
-             OVModelTextEncoder(text_encoder_2, self, model_name=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER)
-             if text_encoder_2 is not None
-             else None
-         )
-         self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None
-
-         if "block_out_channels" in self.vae_decoder.config:
-             self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1)
-         else:
-             self.vae_scale_factor = 8
-
-         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-
-         self.tokenizer = tokenizer
-         self.tokenizer_2 = tokenizer_2
-         self.scheduler = scheduler
-         self.feature_extractor = feature_extractor
-         self.safety_checker = None
-         self.preprocessors = []
-
-         if self.is_dynamic:
-             self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1)
-
-         if compile:
-             self.compile()
-
-         sub_models = {
-             DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder,
-             DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet,
-             DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder,
-             DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder,
-             DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2,
-         }
-         for name in sub_models.keys():
-             self._internal_dict[name] = (
-                 ("optimum", sub_models[name].__class__.__name__) if sub_models[name] is not None else (None, None)
-             )
-
-         self._internal_dict.pop("vae", None)
-
-     def _reshape_unet(
-         self,
-         model: openvino.runtime.Model,
-         batch_size: int = -1,
-         height: int = -1,
-         width: int = -1,
-         num_images_per_prompt: int = -1,
-         tokenizer_max_length: int = -1,
-     ):
-         if batch_size == -1 or num_images_per_prompt == -1:
-             batch_size = -1
-         else:
-             batch_size = batch_size * num_images_per_prompt
-
-         height = height // self.vae_scale_factor if height > 0 else height
-         width = width // self.vae_scale_factor if width > 0 else width
-         shapes = {}
-         for inputs in model.inputs:
-             shapes[inputs] = inputs.get_partial_shape()
-             if inputs.get_any_name() == "timestep":
-                 shapes[inputs][0] = 1
-             elif inputs.get_any_name() == "sample":
-                 in_channels = self.unet.config.get("in_channels", None)
-                 if in_channels is None:
-                     in_channels = shapes[inputs][1]
-                     if in_channels.is_dynamic:
-                         logger.warning(
-                             "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration."
-                         )
-                         self.is_dynamic = True
-
-                 shapes[inputs] = [batch_size, in_channels, height, width]
-             elif inputs.get_any_name() == "timestep_cond":
-                 shapes[inputs] = [batch_size, inputs.get_partial_shape()[1]]
-             elif inputs.get_any_name() == "text_embeds":
-                 shapes[inputs] = [batch_size, self.text_encoder_2.config["projection_dim"]]
-             elif inputs.get_any_name() == "time_ids":
-                 shapes[inputs] = [batch_size, inputs.get_partial_shape()[1]]
-             else:
-                 shapes[inputs][0] = batch_size
-                 shapes[inputs][1] = tokenizer_max_length
-         model.reshape(shapes)
-         return model
-
-     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=np.float32):
-         """
-         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
-         Args:
-             w: np.ndarray: guidance scale values for which to generate embedding vectors
-             embedding_dim: int: dimension of the embeddings to generate
-             dtype: data type of the generated embeddings
-
-         Returns:
-             embedding vectors with shape `(len(w), embedding_dim)`
-         """
-         assert len(w.shape) == 1
-         w = w * 1000.
-
-         half_dim = embedding_dim // 2
-         emb = np.log(np.array(10000.)) / (half_dim - 1)
-         emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb)
-         emb = w.astype(dtype)[:, None] * emb[None, :]
-         emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1)
-         if embedding_dim % 2 == 1:  # zero pad the embedding dimension only
-             emb = np.pad(emb, ((0, 0), (0, 1)))
-         assert emb.shape == (w.shape[0], embedding_dim)
-         return emb
-
-     # Adapted from https://github.com/huggingface/optimum/blob/15b8d1eed4d83c5004d3b60f6b6f13744b358f01/optimum/pipelines/diffusers/pipeline_stable_diffusion.py#L201
-     def __call__(
-         self,
-         prompt: Optional[Union[str, List[str]]] = None,
-         height: Optional[int] = None,
-         width: Optional[int] = None,
-         num_inference_steps: int = 4,
-         original_inference_steps: int = None,
-         guidance_scale: float = 7.5,
-         num_images_per_prompt: int = 1,
-         eta: float = 0.0,
-         generator: Optional[np.random.RandomState] = None,
-         latents: Optional[np.ndarray] = None,
-         prompt_embeds: Optional[np.ndarray] = None,
-         output_type: str = "pil",
-         return_dict: bool = True,
-         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-         callback_steps: int = 1,
-         guidance_rescale: float = 0.0,
-     ):
-         r"""
-         Function invoked when calling the pipeline for generation.
-
-         Args:
-             prompt (`Optional[Union[str, List[str]]]`, defaults to None):
-                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
-                 instead.
-             height (`Optional[int]`, defaults to None):
-                 The height in pixels of the generated image.
-             width (`Optional[int]`, defaults to None):
-                 The width in pixels of the generated image.
-             num_inference_steps (`int`, defaults to 4):
-                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                 expense of slower inference.
-             original_inference_steps (`int`, *optional*):
-                 The original number of inference steps used to generate a linearly-spaced timestep schedule, from which
-                 we will draw `num_inference_steps` evenly spaced timesteps as our final timestep schedule,
-                 following the Skipping-Step method in the paper (see Section 4.3). If not set, this will default to the
-                 scheduler's `original_inference_steps` attribute.
-             guidance_scale (`float`, defaults to 7.5):
-                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
-                 usually at the expense of lower image quality.
-             num_images_per_prompt (`int`, defaults to 1):
-                 The number of images to generate per prompt.
-             eta (`float`, defaults to 0.0):
-                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                 [`schedulers.DDIMScheduler`], will be ignored for others.
-             generator (`Optional[np.random.RandomState]`, defaults to `None`):
-                 A np.random.RandomState to make generation deterministic.
-             latents (`Optional[np.ndarray]`, defaults to `None`):
-                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                 tensor will be generated by sampling using the supplied random `generator`.
-             prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                 provided, text embeddings will be generated from the `prompt` input argument.
-             output_type (`str`, defaults to `"pil"`):
-                 The output format of the generated image. Choose between
-                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-             return_dict (`bool`, defaults to `True`):
-                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                 plain tuple.
-             callback (`Optional[Callable]`, defaults to `None`):
-                 A function that will be called every `callback_steps` steps during inference. The function will be
-                 called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
-             callback_steps (`int`, defaults to 1):
-                 The frequency at which the `callback` function will be called. If not specified, the callback will be
-                 called at every step.
-             guidance_rescale (`float`, defaults to 0.0):
-                 Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                 Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16. of
-                 [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                 Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-         Returns:
-             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-                 [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
-                 When returning a tuple, the first element is a list with the generated images, and the second element is a
-                 list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-                 (nsfw) content, according to the `safety_checker`.
-         """
-         height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-         width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-
-         # check inputs. Raise error if not correct
-         self.check_inputs(
-             prompt, height, width, callback_steps, None, prompt_embeds, None
-         )
-
-         # define call parameters
-         if isinstance(prompt, str):
-             batch_size = 1
-         elif isinstance(prompt, list):
-             batch_size = len(prompt)
-         else:
-             batch_size = prompt_embeds.shape[0]
-
-         if generator is None:
-             generator = np.random
-
-         # Create a torch.Generator seeded from the np.random.RandomState state (some schedulers expect a torch generator)
-         torch_generator = torch.Generator().manual_seed(int(generator.get_state()[1][0]))
-
-         # do_classifier_free_guidance = guidance_scale > 1.0
-
-         # NOTE: when an LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided
-         # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the
-         # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts.
-         prompt_embeds = self._encode_prompt(
-             prompt,
-             num_images_per_prompt,
-             False,
-             negative_prompt=None,
-             prompt_embeds=prompt_embeds,
-             negative_prompt_embeds=None,
-         )
-
-         # set timesteps
-         self.scheduler.set_timesteps(num_inference_steps, "cpu", original_inference_steps=original_inference_steps)
-         timesteps = self.scheduler.timesteps
-
-         latents = self.prepare_latents(
-             batch_size * num_images_per_prompt,
-             self.unet.config.get("in_channels", 4),
-             height,
-             width,
-             prompt_embeds.dtype,
-             generator,
-             latents,
-         )
-
-         # Get the guidance scale embedding (LCM conditions the UNet on w through `timestep_cond`)
-         w = np.tile(guidance_scale - 1, batch_size * num_images_per_prompt)
-         w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.get("time_cond_proj_dim", 256))
-
-         # Adapted from diffusers to extend it for other runtimes than ORT
-         timestep_dtype = self.unet.input_dtype.get("timestep", np.float32)
-
-         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-         # and should be between [0, 1]
-         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-         extra_step_kwargs = {}
-         if accepts_eta:
-             extra_step_kwargs["eta"] = eta
-
-         accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-         if accepts_generator:
-             extra_step_kwargs["generator"] = torch_generator
-
-         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-         for i, t in enumerate(self.progress_bar(timesteps)):
-
-             # predict the noise residual
-             timestep = np.array([t], dtype=timestep_dtype)
-
-             noise_pred = self.unet(sample=latents, timestep=timestep, timestep_cond=w_embedding, encoder_hidden_states=prompt_embeds)[0]
-
-             # compute the previous noisy sample x_t -> x_t-1
-             latents, denoised = self.scheduler.step(
-                 torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs, return_dict=False
-             )
-
-             latents, denoised = latents.numpy(), denoised.numpy()
-
-             # call the callback, if provided
-             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                 if callback is not None and i % callback_steps == 0:
-                     callback(i, t, latents)
-
-         if output_type == "latent":
-             image = latents
-             has_nsfw_concept = None
-         else:
-             denoised /= self.vae_decoder.config.get("scaling_factor", 0.18215)
-             # it seems like there is a strange result when using a half-precision vae decoder if batch size > 1
-             image = np.concatenate(
-                 [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(latents.shape[0])]
-             )
-             image, has_nsfw_concept = self.run_safety_checker(image)
-
-         if has_nsfw_concept is None:
-             do_denormalize = [True] * image.shape[0]
-         else:
-             do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-         if not return_dict:
-             return (image, has_nsfw_concept)
-
-         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
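
For context, a minimal usage sketch of the pipeline this file defined, as it would typically have been driven before the deletion. This is an illustration only: the model ID `SimianLuo/LCM_Dreamshaper_v7`, the resolution, and the prompt are assumptions rather than anything from this commit, and it presumes `optimum-intel` (with OpenVINO support) and `diffusers` are installed and that the referenced repository provides an LCM scheduler configuration.

import numpy as np
from lcm_ov_pipeline import OVLatentConsistencyModelPipeline

# Export the checkpoint to OpenVINO IR and build the pipeline (shapes left dynamic, compilation deferred).
pipe = OVLatentConsistencyModelPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", export=True, compile=False  # assumed model repo, for illustration
)

# Fix static shapes for the target resolution, then compile for the configured device.
pipe.reshape(batch_size=1, height=512, width=512, num_images_per_prompt=1)
pipe.compile()

# LCM needs only a few steps; guidance is injected through the `timestep_cond` embedding rather than CFG.
result = pipe(
    prompt="a photo of an astronaut riding a horse on mars",
    num_inference_steps=4,
    guidance_scale=8.0,
    generator=np.random.RandomState(42),
)
result.images[0].save("lcm_ov_sample.png")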