StevenZhang committed
Commit 4706800 · 1 Parent(s): 04d1fd1

init upload

README.md CHANGED
@@ -3,53 +3,65 @@ license: apache-2.0
Removed (text-to-video example):

---

```
import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from torchvision import transforms
import os
import cv2
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import numpy as np

pretrained_model_name_or_path = "./wan_t2v"
transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')

text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
                                                torch_dtype=torch.bfloat16)

pipe = WanPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_t2v,
    text_encoder=text_encoder,
)

# Standard Wan negative prompt (Chinese). Roughly: "garish colors, overexposed,
# static, blurry details, subtitles, style, artwork, painting, still frame,
# overall gray, worst quality, low quality, JPEG artifacts, ugly, mutilated,
# extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured,
# malformed limbs, fused fingers, motionless image, cluttered background,
# three legs, crowded background, walking backwards".
negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

device = "cuda"
seed = 0

generator = torch.Generator(device=device).manual_seed(seed)

inputs = {
    # Prompt (Chinese): "Two anthropomorphic cats in cozy boxing gear, wearing
    # bright gloves, fighting fiercely on a spotlit stage."
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,
    "generator": generator,
    "num_inference_steps": 50,
    "flow_shift": 5.0,
    "guidance_scale": 5.0,
    "height": 720,
    "width": 1280,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

pipe.enable_model_cpu_offload()

video = pipe(**inputs).frames[0]

export_to_video(video, "output.mp4", fps=16)
```
 
Added (image-to-video example):

---

```
from diffusers.utils import load_image, export_to_video
from transformers import CLIPVisionModel, CLIPImageProcessor, UMT5EncoderModel
from diffusers import WanI2VPipeline, WanTransformer3DModel
import torch

pretrained_model_name_or_path = "./wan_i2v"  # TODO replace with our hf id

image_encoder = CLIPVisionModel.from_pretrained(pretrained_model_name_or_path, subfolder='image_encoder',
                                                torch_dtype=torch.float16)
transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_480p')
# for 720p:
# transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_720p',
#                                                         torch_dtype=torch.bfloat16)

image_processor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder='image_processor')

text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
                                                torch_dtype=torch.bfloat16)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
device = "cuda"
seed = 0
prompt = ("An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
          "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot.")
generator = torch.Generator(device=device).manual_seed(seed)

# pipe.to(device)                # alternative: keep the whole pipeline on the GPU
pipe.enable_model_cpu_offload()  # offload idle submodules to CPU to reduce VRAM use

# Standard Wan negative prompt (Chinese); see the translation in the removed example above.
negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

inputs = {
    "image": image,
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "max_area": 480 * 832,  # for 720p: 720 * 1280
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    "flow_shift": 3.0,  # for 720p: 5.0
}

output = pipe(**inputs).frames[0]

export_to_video(output, "output.mp4", fps=16)
```
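The 720p settings are scattered across the inline "# for 720p" comments above. As a convenience, here is a minimal consolidated sketch of the 720p variant, assuming the same checkpoint layout and reusing the encoders already built in the example; the subfolder, dtype, max_area, and flow_shift values are all taken from those comments and are not independently verified here:

```
# Hypothetical consolidated 720p configuration, assembled from the
# "# for 720p" comments in the example above.
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder='transformer_i2v_720p',
    torch_dtype=torch.bfloat16)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)
pipe.enable_model_cpu_offload()

inputs["max_area"] = 720 * 1280  # instead of 480 * 832
inputs["flow_shift"] = 5.0       # instead of 3.0

output = pipe(**inputs).frames[0]
export_to_video(output, "output_720p.mp4", fps=16)
```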
wan_t2v_fp32_example.py → wan_i2v_example.py RENAMED
@@ -1,48 +1,59 @@

Old (wan_t2v_fp32_example.py):

```
import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from torchvision import transforms
import os
import cv2
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import numpy as np

pretrained_model_name_or_path = "./wan_t2v"
transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')

text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
                                                torch_dtype=torch.bfloat16)

pipe = WanPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_t2v,
    text_encoder=text_encoder,
)

# Standard Wan negative prompt (Chinese); translated in the README diff above.
negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

device = "cuda"
seed = 0

generator = torch.Generator(device=device).manual_seed(seed)

inputs = {
    # Prompt (Chinese): two anthropomorphic cats in boxing gear fighting on a spotlit stage.
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,
    "generator": generator,
    "num_inference_steps": 50,
    "flow_shift": 5.0,
    "guidance_scale": 5.0,
    "height": 720,
    "width": 1280,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

pipe.enable_model_cpu_offload()

video = pipe(**inputs).frames[0]

export_to_video(video, "output.mp4", fps=16)
```
New (wan_i2v_example.py):

```
from diffusers.utils import load_image, export_to_video
from transformers import CLIPVisionModel, CLIPImageProcessor, UMT5EncoderModel
from diffusers import WanI2VPipeline, WanTransformer3DModel
import torch

pretrained_model_name_or_path = "./wan_i2v"  # TODO replace with our hf id

image_encoder = CLIPVisionModel.from_pretrained(pretrained_model_name_or_path, subfolder='image_encoder',
                                                torch_dtype=torch.float16)
transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_480p')
# for 720p:
# transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_720p',
#                                                         torch_dtype=torch.bfloat16)

image_processor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder='image_processor')

text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
                                                torch_dtype=torch.bfloat16)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
device = "cuda"
seed = 0
prompt = ("An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
          "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot.")
generator = torch.Generator(device=device).manual_seed(seed)

# pipe.to(device)                # alternative: keep the whole pipeline on the GPU
pipe.enable_model_cpu_offload()  # offload idle submodules to CPU to reduce VRAM use

# Standard Wan negative prompt (Chinese); translated in the README diff above.
negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

inputs = {
    "image": image,
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "max_area": 480 * 832,  # for 720p: 720 * 1280
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    "flow_shift": 3.0,  # for 720p: 5.0
}

output = pipe(**inputs).frames[0]

export_to_video(output, "output.mp4", fps=16)
```
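As a usage note, both scripts render 81 frames and export at 16 fps, i.e. roughly a five-second clip. Below is a minimal sketch, reusing `pipe`, `inputs`, and `device` from wan_i2v_example.py above, of sweeping a few seeds while keeping every other input fixed:

```
# Seed sweep: only the generator changes between runs, so differences
# in the resulting clips are attributable to the seed alone.
for seed in (0, 1, 2):
    inputs["generator"] = torch.Generator(device=device).manual_seed(seed)
    frames = pipe(**inputs).frames[0]
    export_to_video(frames, f"output_seed{seed}.mp4", fps=16)
```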