wyysf committed on
Commit
9505fe5
·
1 Parent(s): a480bdb
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. ckpts/.gitattributes +35 -0
  2. ckpts/README.md +117 -0
  3. ckpts/craftsman-v1-5 +1 -0
  4. craftsman/__pycache__/__init__.cpython-310.pyc +0 -0
  5. craftsman/__pycache__/__init__.cpython-311.pyc +0 -0
  6. craftsman/__pycache__/pipeline.cpython-310.pyc +0 -0
  7. craftsman/__pycache__/pipeline.cpython-311.pyc +0 -0
  8. craftsman/data/__pycache__/Objaverse.cpython-310.pyc +0 -0
  9. craftsman/data/__pycache__/__init__.cpython-310.pyc +0 -0
  10. craftsman/data/__pycache__/base.cpython-310.pyc +0 -0
  11. craftsman/data/base.py +9 -8
  12. craftsman/models/__pycache__/__init__.cpython-310.pyc +0 -0
  13. craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc +0 -0
  14. craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc +0 -0
  15. craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc +0 -0
  16. craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc +0 -0
  17. craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc +0 -0
  18. craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc +0 -0
  19. craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc +0 -0
  20. craftsman/models/conditional_encoders/cond_encoder.py +1 -22
  21. craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc +0 -0
  22. craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc +0 -0
  23. craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc +0 -0
  24. craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc +0 -0
  25. craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc +0 -0
  26. craftsman/models/denoisers/pixart_denoiser.py +2 -22
  27. craftsman/models/denoisers/utils.py +1 -185
  28. craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc +0 -0
  29. craftsman/models/geometry/__pycache__/base.cpython-310.pyc +0 -0
  30. craftsman/models/geometry/__pycache__/utils.cpython-310.pyc +0 -0
  31. craftsman/models/transformers/__pycache__/attention.cpython-310.pyc +0 -0
  32. craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc +0 -0
  33. craftsman/models/transformers/__pycache__/utils.cpython-310.pyc +0 -0
  34. craftsman/models/transformers/attention.py +1 -121
  35. craftsman/models/transformers/perceiver_1d.py +0 -0
  36. craftsman/models/transformers/utils.py +0 -0
  37. craftsman/pipeline.py +13 -0
  38. craftsman/systems/__pycache__/__init__.cpython-310.pyc +0 -0
  39. craftsman/systems/__pycache__/base.cpython-310.pyc +0 -0
  40. craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc +0 -0
  41. craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc +0 -0
  42. craftsman/systems/__pycache__/utils.cpython-310.pyc +0 -0
  43. craftsman/systems/pixart_diffusion.py +3 -3
  44. craftsman/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  45. craftsman/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  46. craftsman/utils/__pycache__/base.cpython-310.pyc +0 -0
  47. craftsman/utils/__pycache__/base.cpython-311.pyc +0 -0
  48. craftsman/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
  49. craftsman/utils/__pycache__/config.cpython-310.pyc +0 -0
  50. craftsman/utils/__pycache__/config.cpython-311.pyc +0 -0
ckpts/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ckpts/README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: craftsman-v1-5
3
+ license: creativeml-openrail-m
4
+ license_name: creativeml-openrail-m
5
+ license_link: https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE
6
+ pipeline_tag: image-to-3d
7
+ language:
8
+ - en
9
+ - zh
10
+ ---
11
+
12
+ ## **CraftsMan-v1-5**
13
+
14
+ <p align="center">
15
+ <img src="./assets/teaser.png" height=200>
16
+ </p>
17
+
18
+ ### <div align="center">CraftsMan: High-fidelity Mesh Generation <br> with 3D Native Generation and Interactive Geometry Refiner</div>
19
+ ##### <p align="center"> [Weiyu Li<sup>*1,2</sup>](https://wyysf-98.github.io/), Jiarui Liu<sup>*1,2</sup>, Hongyu Yan<sup>*1,2</sup>, [Rui Chen<sup>1,2</sup>](https://aruichen.github.io/), [Yixun Liang<sup>2,3</sup>](https://yixunliang.github.io/), [Xuelin Chen<sup>4</sup>](https://xuelin-chen.github.io/), [Ping Tan<sup>1,2</sup>](https://ece.hkust.edu.hk/pingtan), [Xiaoxiao Long<sup>1,2</sup>](https://www.xxlong.site/)</p>
20
+ ##### <p align="center"> <sup>1</sup>HKUST, <sup>2</sup>LightIllusions, <sup>3</sup>HKUST(GZ), <sup>4</sup>Tencent AI Lab</p>
21
+ <div align="center">
22
+ <a href="https://craftsman3d.github.io/"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
23
+ <a href="http://algodemo.bj.lightions.top:24926"><img src="https://www.gradio.app/_app/immutable/assets/gradio.CHB5adID.svg" height="25"/></a> &ensp;
24
+ <a href="https://arxiv.org/pdf/2405.14979"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv&color=red&logo=arxiv"></a> &ensp;
25
+ </div>
26
+
27
+ # Usage
28
+
29
+ To use the model, please refer to the [official repository](https://github.com/wyysf-98/CraftsMan) for installation and usage instructions.
30
+
31
+ ```
32
+
33
+ from craftsman import CraftsManPipeline
34
+ import torch
35
+
36
+ pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32) # load from local ckpt
37
+ mesh = pipeline("https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png").meshes[0]
38
+ mesh.export("werewolf.obj")
39
+
40
+ ```
41
+
42
+ ## 🔥🔥🔥 News!!
43
+
44
+ * Nov 16, 2024: 💬 We released CraftsMan-v1-5.
45
+
46
+
47
+ ## 📑 Open-source Plan
48
+
49
+ - [x] Inference
50
+ - [x] Checkpoints
51
+ - [x] Training
52
+ - [ ] ComfyUI
53
+
54
+ ## 🎉 **CraftsMan-v1-5 Architecture**
55
+
56
+ <p align="center">
57
+ <img src="./assets/arch.png" height=400>
58
+ </p>
59
+
60
+
61
+ ## Get Started
62
+
63
+ #### Begin by cloning the repository:
64
+
65
+ ```shell
66
+ git clone https://github.com/wyysf-98/CraftsMan
67
+ cd CraftsMan
68
+ ```
69
+
70
+ #### Installation Guide for Linux
71
+
72
+ We provide an `env_install.sh` script for setting up the environment.
73
+
74
+ ```
75
+ # step 1. create conda env
76
+ conda create -n CraftsMan python=3.10
77
+ conda activate CraftsMan
78
+
79
+
80
+ # step 2. install torch related packages
81
+ conda install -c pytorch pytorch=2.3.0 torchvision=0.18.0 cudatoolkit=11.8
82
+
83
+ # step 3. install other packages
84
+ pip install -r docker/requirements.txt
85
+ ```
86
+
87
+
88
+
89
+ #### Using Gradio
90
+
91
+ We have prepared a Gradio demo for you to try out the model. Run the following command to start the demo.
92
+
93
+ ```shell
94
+ # std
95
+ python3 gradio.py
96
+ ```
97
+
98
+ Then the demo can be accessed through the output link.
99
+
100
+
101
+ ## Citation
102
+
103
+ If you found this repository helpful, please cite our report:
104
+ ```bibtex
105
+ @misc{li2024craftsman,
106
+ title = {CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner},
107
+ author = {Weiyu Li and Jiarui Liu and Rui Chen and Yixun Liang and Xuelin Chen and Ping Tan and Xiaoxiao Long},
108
+ year = {2024},
109
+ archivePrefix = {arXiv preprint arXiv:2405.14979},
110
+ primaryClass = {cs.CG}
111
+ }
112
+ ```
113
+
114
+
115
+ # License
116
+
117
+ [creativeml-openrail-m](https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE)
ckpts/craftsman-v1-5 ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 9a5e9189c2dfab20cf838885dd6acaf99b41844e
craftsman/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/__pycache__/__init__.cpython-310.pyc and b/craftsman/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.24 kB). View file
 
craftsman/__pycache__/pipeline.cpython-310.pyc CHANGED
Binary files a/craftsman/__pycache__/pipeline.cpython-310.pyc and b/craftsman/__pycache__/pipeline.cpython-310.pyc differ
 
craftsman/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (16.6 kB). View file
 
craftsman/data/__pycache__/Objaverse.cpython-310.pyc CHANGED
Binary files a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc and b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc differ
 
craftsman/data/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/data/__pycache__/__init__.cpython-310.pyc and b/craftsman/data/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/data/__pycache__/base.cpython-310.pyc CHANGED
Binary files a/craftsman/data/__pycache__/base.cpython-310.pyc and b/craftsman/data/__pycache__/base.cpython-310.pyc differ
 
craftsman/data/base.py CHANGED
@@ -53,7 +53,7 @@ class BaseDataModuleConfig:
53
  # for occupancy and sdf data
54
  n_samples: int = 4096 # number of points in input point cloud
55
  upsample_ratio: int = 1 # upsample ratio for input point cloud
56
- sampling_strategy: str = "random" # sampling strategy for input point cloud
57
  scale: float = 1.0 # scale of the input point cloud and target supervision
58
  load_supervision: bool = True # whether to load supervision
59
  supervision_type: str = "occupancy" # occupancy, sdf, tsdf
@@ -70,6 +70,8 @@ class BaseDataModuleConfig:
70
  idx: Optional[List[int]] = None # index of the image to load
71
  n_views: int = 1 # number of views
72
  marign_pix_dis: int = 30 # margin of the bounding box
 
 
73
 
74
 
75
  class BaseDataset(Dataset):
@@ -78,7 +80,7 @@ class BaseDataset(Dataset):
78
  self.cfg: BaseDataModuleConfig = cfg
79
  self.split = split
80
 
81
- self.uids = json.load(open(f'{cfg.root_dir}/{split}.json'))
82
  print(f"Loaded {len(self.uids)} {split} uids")
83
 
84
  def __len__(self):
@@ -94,10 +96,7 @@ class BaseDataset(Dataset):
94
  surface = np.concatenate([surface, normal], axis=1)
95
  elif self.cfg.geo_data_type == "sdf":
96
  # for sdf data with our own format
97
- if re.match(r"\.\.", self.uids[index]):
98
- data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
99
- else:
100
- data = np.load(f'{self.uids[index]}.npz')
101
  # for input point cloud
102
  surface = data["surface"]
103
  else:
@@ -112,6 +111,8 @@ class BaseDataset(Dataset):
112
  import fpsample
113
  kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5)
114
  surface = surface[kdline_fps_samples_idx]
 
 
115
  else:
116
  raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented")
117
  # rescale data
@@ -189,9 +190,9 @@ class BaseDataset(Dataset):
189
  sel_idx = random.choice(self.cfg.idx)
190
  ret["sel_image_idx"] = sel_idx
191
  if self.cfg.image_type == "rgb":
192
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.png"
193
  elif self.cfg.image_type == "normal":
194
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.png"
195
  ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis)
196
 
197
  else:
 
53
  # for occupancy and sdf data
54
  n_samples: int = 4096 # number of points in input point cloud
55
  upsample_ratio: int = 1 # upsample ratio for input point cloud
56
+ sampling_strategy: Optional[str] = None # sampling strategy for input point cloud
57
  scale: float = 1.0 # scale of the input point cloud and target supervision
58
  load_supervision: bool = True # whether to load supervision
59
  supervision_type: str = "occupancy" # occupancy, sdf, tsdf
 
70
  idx: Optional[List[int]] = None # index of the image to load
71
  n_views: int = 1 # number of views
72
  marign_pix_dis: int = 30 # margin of the bounding box
73
+ batch_size: int = 32
74
+ num_workers: int = 8
75
 
76
 
77
  class BaseDataset(Dataset):
 
80
  self.cfg: BaseDataModuleConfig = cfg
81
  self.split = split
82
 
83
+ self.uids = json.load(open(f'{cfg.local_dir}/{split}.json'))
84
  print(f"Loaded {len(self.uids)} {split} uids")
85
 
86
  def __len__(self):
 
96
  surface = np.concatenate([surface, normal], axis=1)
97
  elif self.cfg.geo_data_type == "sdf":
98
  # for sdf data with our own format
99
+ data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
 
 
 
100
  # for input point cloud
101
  surface = data["surface"]
102
  else:
 
111
  import fpsample
112
  kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5)
113
  surface = surface[kdline_fps_samples_idx]
114
+ elif self.cfg.sampling_strategy is None:
115
+ pass
116
  else:
117
  raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented")
118
  # rescale data
 
190
  sel_idx = random.choice(self.cfg.idx)
191
  ret["sel_image_idx"] = sel_idx
192
  if self.cfg.image_type == "rgb":
193
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.jpeg"
194
  elif self.cfg.image_type == "normal":
195
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.jpeg"
196
  ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis)
197
 
198
  else:
craftsman/models/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/models/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc CHANGED
Binary files a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/cond_encoder.py CHANGED
@@ -46,7 +46,6 @@ class CondEmbedder(BaseEmbedder):
46
  enable_gradient_checkpointing: bool = False
47
  embeds_fusion_mode: int = 1 # 0: sum | 1: concat
48
  linear_proj_init: str = "constant"
49
- text_model_type: str = "clip"
50
  text_max_length: int = 77
51
  image_size_clip: int = 224
52
  image_size_dino: int = 224
@@ -277,29 +276,9 @@ class CondEmbedder(BaseEmbedder):
277
  else:
278
  return vision_outputs.last_hidden_state
279
 
280
- def post_process_embeds(self, text_embeds, visual_embeds):
281
- clip_embeds, dino_embeds = visual_embeds.chunk(2, dim=2)
282
- if self.cfg.normalize_embeds:
283
- # post-process the text/visual embeds
284
- if text_embeds is not None:
285
- text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
286
- if clip_embeds is not None:
287
- clip_embeds = clip_embeds / clip_embeds.norm(dim=-1, keepdim=True)
288
- if dino_embeds is not None:
289
- dino_embeds = dino_embeds / dino_embeds.norm(dim=-1, keepdim=True)
290
-
291
- assert text_embeds is not None or dino_embeds is not None or clip_embeds is not None
292
-
293
- if text_embeds is not None and visual_embeds is not None:
294
- return torch.cat([text_embeds, visual_embeds], dim=1)
295
- elif text_embeds is not None:
296
- return text_embeds
297
- else:
298
- return visual_embeds
299
-
300
  def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor:
301
  clip_embeds = self.encode_image_clip(images, cameras)
302
  dino_embeds = self.encode_image_dino(images, cameras)
303
  dino_embeds = self.linear_proj(dino_embeds)
304
  visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1)
305
- return visual_embeds
 
46
  enable_gradient_checkpointing: bool = False
47
  embeds_fusion_mode: int = 1 # 0: sum | 1: concat
48
  linear_proj_init: str = "constant"
 
49
  text_max_length: int = 77
50
  image_size_clip: int = 224
51
  image_size_dino: int = 224
 
276
  else:
277
  return vision_outputs.last_hidden_state
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor:
280
  clip_embeds = self.encode_image_clip(images, cameras)
281
  dino_embeds = self.encode_image_dino(images, cameras)
282
  dino_embeds = self.linear_proj(dino_embeds)
283
  visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1)
284
+ return visual_embeds
craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc differ
 
craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc CHANGED
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc differ
 
craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc differ
 
craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc differ
 
craftsman/models/denoisers/pixart_denoiser.py CHANGED
@@ -25,15 +25,11 @@ class PixArtDinoDenoiser(BaseModule):
25
  context_dim: int = 1024
26
  n_views: int = 1
27
  context_ln: bool = True
28
- skip_ln: bool = False
29
  init_scale: float = 0.25
30
  use_checkpoint: bool = False
31
  drop_path: float = 0.
32
- variance_type: str = ""
33
- img_pos_embed: bool = False
34
  clip_weight: float = 1.0
35
  dino_weight: float = 1.0
36
- dit_block: str = ""
37
 
38
  cfg: Config
39
 
@@ -63,9 +59,8 @@ class PixArtDinoDenoiser(BaseModule):
63
 
64
  init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
65
  drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
66
- ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block)
67
  self.blocks = nn.ModuleList([
68
- ditblock(
69
  width=self.cfg.width,
70
  heads=self.cfg.heads,
71
  init_scale=init_scale,
@@ -82,11 +77,7 @@ class PixArtDinoDenoiser(BaseModule):
82
  )
83
 
84
  # final layer
85
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
86
- self.output_channels = self.cfg.output_channels * 2
87
- else:
88
- self.output_channels = self.cfg.output_channels
89
- self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels)
90
 
91
  self.identity_initialize()
92
 
@@ -99,17 +90,6 @@ class PixArtDinoDenoiser(BaseModule):
99
  self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v
100
  self.load_state_dict(self.denoiser_ckpt, strict=False)
101
 
102
- def forward_with_dpmsolver(self, model_input, timestep, context):
103
- """
104
- dpm solver donnot need variance prediction
105
- """
106
- # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
107
- model_out = self.forward(model_input, timestep, context)
108
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
109
- return model_out.chunk(2, dim=-1)[0]
110
- else:
111
- return model_out
112
-
113
  def identity_initialize(self):
114
  for block in self.blocks:
115
  nn.init.constant_(block.attn.c_proj.weight, 0)
 
25
  context_dim: int = 1024
26
  n_views: int = 1
27
  context_ln: bool = True
 
28
  init_scale: float = 0.25
29
  use_checkpoint: bool = False
30
  drop_path: float = 0.
 
 
31
  clip_weight: float = 1.0
32
  dino_weight: float = 1.0
 
33
 
34
  cfg: Config
35
 
 
59
 
60
  init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
61
  drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
 
62
  self.blocks = nn.ModuleList([
63
+ DiTBlock(
64
  width=self.cfg.width,
65
  heads=self.cfg.heads,
66
  init_scale=init_scale,
 
77
  )
78
 
79
  # final layer
80
+ self.final_layer = T2IFinalLayer(self.cfg.width, self.cfg.output_channels)
 
 
 
 
81
 
82
  self.identity_initialize()
83
 
 
90
  self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v
91
  self.load_state_dict(self.denoiser_ckpt, strict=False)
92
 
 
 
 
 
 
 
 
 
 
 
 
93
  def identity_initialize(self):
94
  for block in self.blocks:
95
  nn.init.constant_(block.attn.c_proj.weight, 0)
craftsman/models/denoisers/utils.py CHANGED
@@ -10,126 +10,6 @@ from timm.models.layers import DropPath
10
  from craftsman.models.transformers.utils import MLP
11
  from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention
12
 
13
- class PatchEmbed(nn.Module):
14
- """ 2D Image to Patch Embedding
15
- """
16
- def __init__(
17
- self,
18
- patch_size=16,
19
- in_chans=3,
20
- embed_dim=768,
21
- norm_layer=None,
22
- flatten=True,
23
- bias=True,
24
- ):
25
- super().__init__()
26
- patch_size = to_2tuple(patch_size)
27
- self.patch_size = patch_size
28
- self.flatten = flatten
29
- self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
30
- self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
31
-
32
- def forward(self, x):
33
- x = self.proj(x)
34
- if self.flatten:
35
- x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
36
- x = self.norm(x)
37
- return x
38
-
39
- class DiTBlock(nn.Module):
40
- """
41
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
42
- """
43
-
44
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
45
- super().__init__()
46
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
47
- self.attn = MultiheadAttention(
48
- n_ctx=None,
49
- width=width,
50
- heads=heads,
51
- init_scale=init_scale,
52
- qkv_bias=qkv_bias,
53
- use_flash=use_flash
54
- )
55
- self.cross_attn = MultiheadCrossAttention(
56
- n_data=None,
57
- width=width,
58
- heads=heads,
59
- data_width=None,
60
- init_scale=init_scale,
61
- qkv_bias=qkv_bias,
62
- use_flash=use_flash,
63
- )
64
-
65
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
66
-
67
- self.mlp = MLP(width=width, init_scale=init_scale)
68
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
69
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
70
-
71
- def forward(self, x, visual_cond, t, **kwargs):
72
- B, N, C = x.shape
73
-
74
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
75
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
76
- x = x + self.cross_attn(x, visual_cond)
77
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
78
-
79
- return x
80
-
81
- class DiTBlock_text(nn.Module):
82
- """
83
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
84
- """
85
-
86
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
87
- super().__init__()
88
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
89
- self.attn = MultiheadAttention(
90
- n_ctx=None,
91
- width=width,
92
- heads=heads,
93
- init_scale=init_scale,
94
- qkv_bias=qkv_bias,
95
- use_flash=use_flash
96
- )
97
- self.cross_attn = MultiheadCrossAttention(
98
- n_data=None,
99
- width=width,
100
- heads=heads,
101
- data_width=None,
102
- init_scale=init_scale,
103
- qkv_bias=qkv_bias,
104
- use_flash=use_flash,
105
- )
106
-
107
- self.cross_attn_extra = MultiheadCrossAttention(
108
- n_data=None,
109
- width=width,
110
- heads=heads,
111
- data_width=None,
112
- init_scale=init_scale,
113
- qkv_bias=qkv_bias,
114
- use_flash=use_flash,
115
- )
116
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
117
-
118
- self.mlp = MLP(width=width, init_scale=init_scale)
119
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
120
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
121
-
122
- def forward(self, x, visual_cond, text_cond, t, **kwargs):
123
- B, N, C = x.shape
124
-
125
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
126
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
127
- x = x + self.cross_attn(x, visual_cond)
128
- x = x + self.cross_attn_extra(x, text_cond)
129
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
130
-
131
- return x
132
-
133
  class DiTBlock(nn.Module):
134
  """
135
  A DiT block with adaptive layer norm (adaLN-single) conditioning.
@@ -174,11 +54,6 @@ class DiTBlock(nn.Module):
174
  def t2i_modulate(x, shift, scale):
175
  return x * (1 + scale) + shift
176
 
177
- # def t2i_modulate(x, shift, scale):
178
- # a = torch.ones_like(scale)
179
- # a[..., 768:] = 0
180
- # return x * (a + scale) + shift
181
-
182
  def auto_grad_checkpoint(module, *args, **kwargs):
183
  if getattr(module, 'grad_checkpointing', False):
184
  if not isinstance(module, Iterable):
@@ -268,63 +143,4 @@ class T2IFinalLayer(nn.Module):
268
  shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
269
  x = t2i_modulate(self.norm_final(x), shift, scale)
270
  x = self.linear(x)
271
- return x
272
-
273
- def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
274
- """
275
- embed_dim: output dimension for each position
276
- pos: a list of positions to be encoded: size (M,)
277
- out: (M, D)
278
- """
279
- assert embed_dim % 2 == 0
280
- omega = np.arange(embed_dim // 2, dtype=np.float64)
281
- omega /= embed_dim / 2.
282
- omega = 1. / 10000 ** omega # (D/2,)
283
-
284
- pos = pos.reshape(-1) # (M,)
285
- out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
286
-
287
- emb_sin = np.sin(out) # (M, D/2)
288
- emb_cos = np.cos(out) # (M, D/2)
289
-
290
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
291
- return emb
292
-
293
- def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
294
- assert embed_dim % 2 == 0
295
-
296
- # use half of dimensions to encode grid_h
297
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
298
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
299
-
300
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
301
- return emb
302
-
303
- def _ntuple(n):
304
- def parse(x):
305
- if isinstance(x, Iterable) and not isinstance(x, str):
306
- return x
307
- return tuple(repeat(x, n))
308
- return parse
309
-
310
- to_1tuple = _ntuple(1)
311
- to_2tuple = _ntuple(2)
312
-
313
- def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0, base_size=16):
314
- """
315
- grid_size: int of the grid height and width
316
- return:
317
- pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
318
- """
319
- if isinstance(grid_size, int):
320
- grid_size = to_2tuple(grid_size)
321
- grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / pe_interpolation
322
- grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / pe_interpolation
323
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
324
- grid = np.stack(grid, axis=0)
325
- grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
326
-
327
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
328
- if cls_token and extra_tokens > 0:
329
- pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
330
- return pos_embed
 
10
  from craftsman.models.transformers.utils import MLP
11
  from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class DiTBlock(nn.Module):
14
  """
15
  A DiT block with adaptive layer norm (adaLN-single) conditioning.
 
54
  def t2i_modulate(x, shift, scale):
55
  return x * (1 + scale) + shift
56
 
 
 
 
 
 
57
  def auto_grad_checkpoint(module, *args, **kwargs):
58
  if getattr(module, 'grad_checkpointing', False):
59
  if not isinstance(module, Iterable):
 
143
  shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
144
  x = t2i_modulate(self.norm_final(x), shift, scale)
145
  x = self.linear(x)
146
+ return x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/models/geometry/__pycache__/base.cpython-310.pyc CHANGED
Binary files a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc differ
 
craftsman/models/geometry/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc differ
 
craftsman/models/transformers/__pycache__/attention.cpython-310.pyc CHANGED
Binary files a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc differ
 
craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc CHANGED
Binary files a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc differ
 
craftsman/models/transformers/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc differ
 
craftsman/models/transformers/attention.py CHANGED
@@ -9,126 +9,6 @@ from craftsman.utils.checkpoint import checkpoint
9
  from .utils import init_linear, MLP
10
  from timm.models.vision_transformer import Attention
11
 
12
- def scaled_dot_product_gqa(
13
- query: Tensor,
14
- key: Tensor,
15
- value: Tensor,
16
- dropout: float = 0.0,
17
- scale: Optional[float] = None,
18
- mask: Optional[Tensor] = None,
19
- is_causal: Optional[bool] = None,
20
- need_weights: bool = False,
21
- average_attn_weights: bool = False,
22
- force_grouped: bool = False,
23
- ):
24
- """Scaled dot product attention with support for grouped queries.
25
-
26
- Einstein notation:
27
- - b: batch size
28
- - n / s: sequence length
29
- - h: number of heads
30
- - g: number of groups
31
- - d: dimension of query/key/value
32
-
33
- Args:
34
- query: Query tensor of shape (b, n, h, d)
35
- key: Key tensor of shape (b, s, h, d)
36
- value: Value tensor of shape (b, s, h, d)
37
- dropout: Dropout probability (default: 0.0)
38
- scale: Scale factor for query (default: d_query ** 0.5)
39
- mask: Mask tensor of shape (b, n, s) or (b, s). If 'ndim == 2', the mask is
40
- applied to all 'n' rows of the attention matrix. (default: None)
41
- force_grouped: If True, apply grouped-query attention even if the number of
42
- heads is equal for query, key, and value. (default: False)
43
-
44
- Returns:
45
- 2-tuple of:
46
- - Attention output with shape (b, n, h, d)
47
- - (Optional) Attention weights with shape (b, h, n, s). Only returned if
48
- 'need_weights' is True.
49
- """
50
- if (mask is not None) and (is_causal is not None):
51
- raise ValueError(
52
- "Only one of 'mask' and 'is_causal' should be provided, but got both."
53
- )
54
- elif not query.ndim == key.ndim == value.ndim == 4:
55
- raise ValueError(
56
- f"Expected query, key, and value to be 4-dimensional, but got shapes "
57
- f"{query.shape}, {key.shape}, and {value.shape}."
58
- )
59
-
60
- # Move sequence length dimension to axis 2.
61
- # This makes the attention operations below *much* faster.
62
- query = rearrange(query, "b n h d -> b h n d")
63
- key = rearrange(key, "b s h d -> b h s d")
64
- value = rearrange(value, "b s h d -> b h s d")
65
-
66
- bq, hq, nq, dq = query.shape
67
- bk, hk, nk, dk = key.shape
68
- bv, hv, nv, dv = value.shape
69
- if not (bq == bk == bv and dq == dk == dv):
70
- raise ValueError(
71
- "Expected query, key, and value to have the same batch size (dim=0) and "
72
- f"embedding dimension (dim=3), but got query: {query.shape}, "
73
- f"key: {key.shape}, and value: {value.shape}."
74
- )
75
- elif (hk != hv) or (nk != nv):
76
- raise ValueError(
77
- "Expected key and value to have the same size in dimensions 1 and 2, but "
78
- f"got key: {key.shape} and value: {value.shape}."
79
- )
80
- elif hq % hk != 0:
81
- raise ValueError(
82
- "Expected query heads to be a multiple of key/value heads, but got "
83
- f"query: {query.shape} and key/value: {key.shape}."
84
- )
85
-
86
- if scale is None:
87
- scale = query.size(-1) ** 0.5
88
- query = query / scale
89
-
90
- num_head_groups = hq // hk
91
- query = rearrange(query, "b (h g) n d -> b g h n d", g=num_head_groups)
92
- similarity = einsum(query, key, "b g h n d, b h s d -> b g h n s")
93
-
94
- if is_causal:
95
- # Mask out the upper triangular portion of the attention matrix. This prevents
96
- # the model from attending to tokens in the future.
97
- mask = torch.ones((bq, nq, nk), device=query.device, dtype=torch.bool).tril_()
98
-
99
- if mask is not None:
100
- # Expand mask to match the shape of the attention matrix.
101
- # If mask is 2D, assume that it is applied to the key/value sequence dimension.
102
- # Else if mask is 3D, assume that it is applied to the query/key/value sequence
103
- # dimension for all attention heads.
104
- #
105
- if mask.ndim == 2:
106
- mask = rearrange(mask, "b s -> b () () () s")
107
- elif mask.ndim == 3:
108
- mask = rearrange(mask, "b n s -> b () () n s")
109
- # Mask similarity values by setting them to negative infinity. This guarantees
110
- # that they will not contribute to the softmax computation below.
111
- similarity.masked_fill_(~mask, torch.finfo(similarity.dtype).min)
112
-
113
- attention = F.softmax(similarity, dim=-1)
114
- if dropout > 0.0:
115
- attention = F.dropout(attention, p=dropout)
116
-
117
- # Apply attention matrix to the value Tensor.
118
- out = einsum(attention, value, "b g h n s, b h s d -> b g h n d")
119
- # Move head dimension back to axis 2
120
- out = rearrange(out, "b g h n d -> b n (h g) d")
121
-
122
- attn_weights: Optional[Tensor] = None
123
- if need_weights:
124
- # Move the sequence dimensions back to positions 1, 2. Move the head dimension
125
- # to position 3. This more closely matches the return shape of the attention
126
- # output: (b, n, h, d).
127
- attn_weights = rearrange(attention, "b g h n s -> b n s (h g)")
128
- if average_attn_weights:
129
- attn_weights = attn_weights.mean(dim=1)
130
-
131
- return out, attn_weights
132
 
133
  class MultiheadAttention(nn.Module):
134
  def __init__(
@@ -327,4 +207,4 @@ class ResidualCrossAttentionBlock(nn.Module):
327
  def forward(self, x: torch.Tensor, data: torch.Tensor):
328
  x = x + self.attn(self.ln_1(x), self.ln_2(data))
329
  x = x + self.mlp(self.ln_3(x))
330
- return x
 
9
  from .utils import init_linear, MLP
10
  from timm.models.vision_transformer import Attention
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  class MultiheadAttention(nn.Module):
14
  def __init__(
 
207
  def forward(self, x: torch.Tensor, data: torch.Tensor):
208
  x = x + self.attn(self.ln_1(x), self.ln_2(data))
209
  x = x + self.mlp(self.ln_3(x))
210
+ return x
craftsman/models/transformers/perceiver_1d.py CHANGED
File without changes
craftsman/models/transformers/utils.py CHANGED
File without changes
craftsman/pipeline.py CHANGED
@@ -158,6 +158,7 @@ class CraftsManPipeline():
158
  background_color: List[int] = [255, 255, 255],
159
  foreground_ratio: float = 0.95,
160
  mc_depth: int = 8,
 
161
  ):
162
  r"""
163
  Function invoked when calling the pipeline for generation.
@@ -198,6 +199,9 @@ class CraftsManPipeline():
198
  mc_depth (`int`, *optional*, defaults to 8):
199
  The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z.
200
  8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be.
 
 
 
201
  Examples:
202
 
203
  Returns:
@@ -258,6 +262,15 @@ class CraftsManPipeline():
258
  if output_type == "trimesh":
259
  import trimesh
260
  cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1])
 
 
 
 
 
 
 
 
 
261
  mesh.append(cur_mesh)
262
  elif output_type == "np":
263
  mesh.append(mesh_v_f[0])
 
158
  background_color: List[int] = [255, 255, 255],
159
  foreground_ratio: float = 0.95,
160
  mc_depth: int = 8,
161
+ only_max_component: bool = False,
162
  ):
163
  r"""
164
  Function invoked when calling the pipeline for generation.
 
199
  mc_depth (`int`, *optional*, defaults to 8):
200
  The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z.
201
  8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be.
202
+ only_max_component (`bool`, *optional*, defaults to `False`):
203
+ Whether to only keep the largest connected component of the mesh. This is useful when the mesh has
204
+ multiple components and only the largest one is needed.
205
  Examples:
206
 
207
  Returns:
 
262
  if output_type == "trimesh":
263
  import trimesh
264
  cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1])
265
+ if only_max_component:
266
+ components = cur_mesh.split(only_watertight=False)
267
+ bbox = []
268
+ for c in components:
269
+ bbmin = c.vertices.min(0)
270
+ bbmax = c.vertices.max(0)
271
+ bbox.append((bbmax - bbmin).max())
272
+ max_component = np.argmax(bbox)
273
+ cur_mesh = components[max_component]
274
  mesh.append(cur_mesh)
275
  elif output_type == "np":
276
  mesh.append(mesh_v_f[0])
craftsman/systems/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/systems/__pycache__/__init__.cpython-310.pyc and b/craftsman/systems/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/systems/__pycache__/base.cpython-310.pyc CHANGED
Binary files a/craftsman/systems/__pycache__/base.cpython-310.pyc and b/craftsman/systems/__pycache__/base.cpython-310.pyc differ
 
craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc CHANGED
Binary files a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc and b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc differ
 
craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc CHANGED
Binary files a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc and b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc differ
 
craftsman/systems/__pycache__/utils.cpython-310.pyc CHANGED
File without changes
craftsman/systems/pixart_diffusion.py CHANGED
@@ -251,9 +251,9 @@ class PixArtDiffusionSystem(BaseSystem):
251
  return {
252
  "loss_diffusion": loss,
253
  "latents": latents,
254
- "x_t": x_t,
255
  "noise": noise,
256
- "noise_pred": pred_noise,
257
  "timesteps": timesteps,
258
  }
259
 
@@ -373,4 +373,4 @@ class PixArtDiffusionSystem(BaseSystem):
373
  return outputs
374
 
375
  def on_validation_epoch_end(self):
376
- pass
 
251
  return {
252
  "loss_diffusion": loss,
253
  "latents": latents,
254
+ "x_t": noisy_z,
255
  "noise": noise,
256
+ "noise_pred": noise_pred,
257
  "timesteps": timesteps,
258
  }
259
 
 
373
  return outputs
374
 
375
  def on_validation_epoch_end(self):
376
+ pass
craftsman/utils/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/craftsman/utils/__pycache__/__init__.cpython-310.pyc and b/craftsman/utils/__pycache__/__init__.cpython-310.pyc differ
 
craftsman/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (223 Bytes). View file
 
craftsman/utils/__pycache__/base.cpython-310.pyc CHANGED
Binary files a/craftsman/utils/__pycache__/base.cpython-310.pyc and b/craftsman/utils/__pycache__/base.cpython-310.pyc differ
 
craftsman/utils/__pycache__/base.cpython-311.pyc ADDED
Binary file (7.57 kB). View file
 
craftsman/utils/__pycache__/checkpoint.cpython-310.pyc CHANGED
Binary files a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc and b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc differ
 
craftsman/utils/__pycache__/config.cpython-310.pyc CHANGED
Binary files a/craftsman/utils/__pycache__/config.cpython-310.pyc and b/craftsman/utils/__pycache__/config.cpython-310.pyc differ
 
craftsman/utils/__pycache__/config.cpython-311.pyc ADDED
Binary file (9.1 kB). View file