ashawkey commited on
Commit
4fb334a
1 Parent(s): c7c5a2f

Upload folder using huggingface_hub

Browse files
clip_camera_projection/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "CLIPCameraProjection",
3
+ "_diffusers_version": "0.22.2",
4
+ "additional_embeddings": 4,
5
+ "embedding_dim": 768
6
+ }
clip_camera_projection/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de49f3ac9d5b93f4ff521dda6c2ad5f9852c7a7f5e865b8dbebad7ec4bc6bf89
3
+ size 1187512
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ },
28
+ "use_square_size": false
29
+ }
image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "lambdalabs/sd-image-variations-diffusers",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "quick_gelu",
9
+ "hidden_size": 1024,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "patch_size": 14,
20
+ "projection_dim": 768,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.35.0"
23
+ }
image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd0c1777bb75e2e9b7aa29b799b3ee8ebaaaa731e2c471a9ec589f12542cce7
3
+ size 607980096
model_index.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Zero123Pipeline",
3
+ "_diffusers_version": "0.22.2",
4
+ "clip_camera_projection": [
5
+ "zero123",
6
+ "CLIPCameraProjection"
7
+ ],
8
+ "feature_extractor": [
9
+ "transformers",
10
+ "CLIPImageProcessor"
11
+ ],
12
+ "image_encoder": [
13
+ "transformers",
14
+ "CLIPVisionModelWithProjection"
15
+ ],
16
+ "requires_safety_checker": false,
17
+ "safety_checker": [
18
+ null,
19
+ null
20
+ ],
21
+ "scheduler": [
22
+ "diffusers",
23
+ "DDIMScheduler"
24
+ ],
25
+ "unet": [
26
+ "diffusers",
27
+ "UNet2DConditionModel"
28
+ ],
29
+ "vae": [
30
+ "diffusers",
31
+ "AutoencoderKL"
32
+ ]
33
+ }
readme.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Uses
6
+ _Note: This section is originally taken from the [Stable Diffusion v2 model card](https://huggingface.co/stabilityai/stable-diffusion-2), but applies in the same way to Zero-1-to-3._
7
+
8
+ ## Direct Use
9
+ The model is intended for research purposes only. Possible research areas and tasks include:
10
+
11
+ - Safe deployment of large-scale models.
12
+ - Probing and understanding the limitations and biases of generative models.
13
+ - Generation of artworks and use in design and other artistic processes.
14
+ - Applications in educational or creative tools.
15
+ - Research on generative models.
16
+
17
+ Excluded uses are described below.
18
+
19
+ ### Misuse, Malicious Use, and Out-of-Scope Use
20
+ The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
21
+
22
+ #### Out-of-Scope Use
23
+ The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
24
+
25
+ #### Misuse and Malicious Use
26
+ Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
27
+
28
+ - Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
29
+ - Intentionally promoting or propagating discriminatory content or harmful stereotypes.
30
+ - Impersonating individuals without their consent.
31
+ - Sexual content without consent of the people who might see it.
32
+ - Mis- and disinformation
33
+ - Representations of egregious violence and gore
34
+ - Sharing of copyrighted or licensed material in violation of its terms of use.
35
+ - Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
36
+
37
+ ## Limitations and Bias
38
+
39
+ ### Limitations
40
+
41
+ - The model does not achieve perfect photorealism.
42
+ - The model cannot render legible text.
43
+ - Faces and people in general may not be parsed or generated properly.
44
+ - The autoencoding part of the model is lossy.
45
+ - Stable Diffusion was trained on a subset of the large-scale dataset [LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, Stability AI has filtered the dataset using LAION's NSFW detector.
46
+ - Zero-1-to-3 was subsequently finetuned on a subset of the large-scale dataset [Objaverse](https://objaverse.allenai.org/), which might also potentially contain inappropriate content. To partially mitigate this, our demo applies a safety check to every uploaded image.
47
+
48
+ ### Bias
49
+ While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
50
+ Stable Diffusion was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/), which consists of images that are limited to English descriptions.
51
+ Images and concepts from communities and cultures that use other languages are likely to be insufficiently accounted for.
52
+ This affects the overall output of the model, as Western cultures are often overrepresented.
53
+ Stable Diffusion mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
54
+
55
+
56
+ ### Safety Module
57
+ The intended use of this model is with the [Safety Checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) in Diffusers.
58
+ This checker works by checking model inputs against known hard-coded NSFW concepts.
59
+ Specifically, the checker compares the class probability of harmful concepts in the embedding space of the uploaded input images.
60
+ The concepts are passed into the model with the image and compared to a hand-engineered weight for each NSFW concept.
61
+
62
+ ## Citation
63
+ ```
64
+ @misc{liu2023zero1to3,
65
+ title={Zero-1-to-3: Zero-shot One Image to 3D Object},
66
+ author={Ruoshi Liu and Rundi Wu and Basile Van Hoorick and Pavel Tokmakov and Sergey Zakharov and Carl Vondrick},
67
+ year={2023},
68
+ eprint={2303.11328},
69
+ archivePrefix={arXiv},
70
+ primaryClass={cs.CV}
71
+ }
72
+ ```
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.22.2",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "epsilon",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "steps_offset": 1,
16
+ "thresholding": false,
17
+ "timestep_spacing": "leading",
18
+ "trained_betas": null
19
+ }
unet/config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.22.2",
4
+ "act_fn": "silu",
5
+ "addition_embed_type": null,
6
+ "addition_embed_type_num_heads": 64,
7
+ "addition_time_embed_dim": null,
8
+ "attention_head_dim": 8,
9
+ "attention_type": "default",
10
+ "block_out_channels": [
11
+ 320,
12
+ 640,
13
+ 1280,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 768,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "CrossAttnDownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "DownBlock2D"
28
+ ],
29
+ "downsample_padding": 1,
30
+ "dropout": 0.0,
31
+ "dual_cross_attention": false,
32
+ "encoder_hid_dim": null,
33
+ "encoder_hid_dim_type": null,
34
+ "flip_sin_to_cos": true,
35
+ "freq_shift": 0,
36
+ "in_channels": 8,
37
+ "layers_per_block": 2,
38
+ "mid_block_only_cross_attention": null,
39
+ "mid_block_scale_factor": 1,
40
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
41
+ "norm_eps": 1e-05,
42
+ "norm_num_groups": 32,
43
+ "num_attention_heads": null,
44
+ "num_class_embeds": null,
45
+ "only_cross_attention": false,
46
+ "out_channels": 4,
47
+ "projection_class_embeddings_input_dim": null,
48
+ "resnet_out_scale_factor": 1.0,
49
+ "resnet_skip_time_act": false,
50
+ "resnet_time_scale_shift": "default",
51
+ "reverse_transformer_layers_per_block": null,
52
+ "sample_size": 32,
53
+ "time_cond_proj_dim": null,
54
+ "time_embedding_act_fn": null,
55
+ "time_embedding_dim": null,
56
+ "time_embedding_type": "positional",
57
+ "timestep_post_act": null,
58
+ "transformer_layers_per_block": 1,
59
+ "up_block_types": [
60
+ "UpBlock2D",
61
+ "CrossAttnUpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D"
64
+ ],
65
+ "upcast_attention": null,
66
+ "use_linear_projection": false
67
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b5e327399abc3a6d54e89f16a10b728dfd2162b4fc70258dbf934859ff7520f
3
+ size 1719148344
vae/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.2",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "layers_per_block": 2,
21
+ "norm_num_groups": 32,
22
+ "out_channels": 3,
23
+ "sample_size": 256,
24
+ "scaling_factor": 0.18215,
25
+ "up_block_types": [
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D"
30
+ ]
31
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342