yfdeng committed on
Commit 744eb4e · 1 Parent(s): 5df226f
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. Anymate/.gitignore +26 -0
  2. Anymate/__init__.py +0 -0
  3. Anymate/args.py +22 -0
  4. Anymate/blender_script.py +747 -0
  5. Anymate/checkpoints/.gitkeep +0 -0
  6. Anymate/configs/.gitkeep +0 -0
  7. Anymate/configs/conn.yaml +40 -0
  8. Anymate/configs/conn_token.yaml +40 -0
  9. Anymate/configs/diffusion.yaml +49 -0
  10. Anymate/configs/diffusion_concat.yaml +46 -0
  11. Anymate/configs/diffusion_cross.yaml +51 -0
  12. Anymate/configs/joints.yaml +40 -0
  13. Anymate/configs/joints_implicit.yaml +40 -0
  14. Anymate/configs/joints_triplane.yaml +40 -0
  15. Anymate/configs/skin.yaml +40 -0
  16. Anymate/configs/skin_multi.yaml +40 -0
  17. Anymate/dataset.py +62 -0
  18. Anymate/get_checkpoints.sh +22 -0
  19. Anymate/get_datasets.sh +12 -0
  20. Anymate/model.py +360 -0
  21. Anymate/models/__init__.py +0 -0
  22. Anymate/models/conn.py +195 -0
  23. Anymate/models/diffusion.py +483 -0
  24. Anymate/models/joint.py +282 -0
  25. Anymate/models/skin.py +309 -0
  26. Anymate/tmp/.gitkeep +0 -0
  27. Anymate/utils/dataset_utils.py +129 -0
  28. Anymate/utils/diffusion_encoder.py +258 -0
  29. Anymate/utils/diffusion_utils.py +314 -0
  30. Anymate/utils/eval_utils.py +225 -0
  31. Anymate/utils/loss_utils.py +56 -0
  32. Anymate/utils/render_utils.py +1169 -0
  33. Anymate/utils/train_utils.py +406 -0
  34. Anymate/utils/ui_utils.py +284 -0
  35. Anymate/utils/ui_utils_bpy.py +134 -0
  36. Anymate/utils/utils.py +77 -0
  37. Anymate/utils/vol_utils.py +135 -0
  38. Render.py +17 -0
  39. ThirdParty/PointLLM/.gitignore +12 -0
  40. ThirdParty/PointLLM/README.md +353 -0
  41. ThirdParty/PointLLM/__init__.py +0 -0
  42. ThirdParty/PointLLM/pointllm/__init__.py +1 -0
  43. ThirdParty/PointLLM/pointllm/conversation.py +375 -0
  44. ThirdParty/PointLLM/pointllm/data/__init__.py +3 -0
  45. ThirdParty/PointLLM/pointllm/data/modelnet.py +147 -0
  46. ThirdParty/PointLLM/pointllm/data/modelnet_config/ModelNet40.yaml +8 -0
  47. ThirdParty/PointLLM/pointllm/data/object_point_dataset.py +250 -0
  48. ThirdParty/PointLLM/pointllm/data/utils.py +236 -0
  49. ThirdParty/PointLLM/pointllm/eval/PointLLM_chat.py +157 -0
  50. ThirdParty/PointLLM/pointllm/eval/chat_gradio.py +394 -0
Anymate/.gitignore ADDED
@@ -0,0 +1,26 @@
+__pycache__
+*.pt
+*.tar
+*.tar
+*.txt
+*.glb*
+*.obj
+*.ckpt
+*.blend
+*.blend1
+test_*
+
+blender-*
+*.json*
+*.glb
+*.gltf
+*.fbx
+*.FBX
+*.dae
+*.obj
+*.mtl
+*.binvox
+*.csv
+*.tga
+*.png
+*.jpg
Anymate/__init__.py ADDED
File without changes
Anymate/args.py ADDED
@@ -0,0 +1,22 @@
+class AnymateArgs:
+    def __init__(self):
+        # self.encoder = "miche"
+        # self.decoder = "transformer_latent"
+        # self.dataset = "train"
+        # self.run_name = "miche-transformer_latent-train-8gpu-finetune"
+        self.checkpoint_joint = "Anymate/checkpoints/joint/bert-transformer_latent-train-8gpu-finetune.pth.tar"
+        self.checkpoint_conn = "Anymate/checkpoints/conn/bert-attendjoints_con_combine-train-8gpu-finetune.pth.tar"
+        self.checkpoint_skin = "Anymate/checkpoints/skin/bert-attendjoints_combine-train-8gpu-finetune.pth.tar"
+
+        self.device = "cuda"
+        self.num_joints = 96
+
+
+class UIArgs:
+    def __init__(self):
+        self.checkpoint_joint = "Anymate/checkpoints/joint/bert-transformer_latent-train-8gpu-finetune.pth.tar"
+        self.checkpoint_conn = "Anymate/checkpoints/conn/bert-attendjoints_con_combine-train-8gpu-finetune.pth.tar"
+        self.checkpoint_skin = "Anymate/checkpoints/skin/bert-attendjoints_combine-train-8gpu-finetune.pth.tar"
+
+ui_args = UIArgs()
+anymate_args = AnymateArgs()
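Note: these module-level instances are plain attribute containers, so downstream code can read the default checkpoint paths directly. A minimal usage sketch (assuming the repository root is on PYTHONPATH):

# Sketch: read the defaults defined in Anymate/args.py above.
from Anymate.args import anymate_args, ui_args

print(anymate_args.device)        # "cuda"
print(anymate_args.num_joints)    # 96
print(ui_args.checkpoint_joint)   # path to the joint-prediction checkpoint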
Anymate/blender_script.py ADDED
@@ -0,0 +1,747 @@
+import bpy
+import mathutils
+from mathutils import Vector, Matrix
+
+import os
+import sys
+import random
+import numpy as np
+import json
+import argparse
+
+
+IMPORT_FUNCTIONS = {
+    "obj": bpy.ops.wm.obj_import,
+    "glb": bpy.ops.import_scene.gltf,
+    "gltf": bpy.ops.import_scene.gltf,
+    "usd": bpy.ops.import_scene.usd,
+    "fbx": bpy.ops.import_scene.fbx,
+    "stl": bpy.ops.import_mesh.stl,
+    "usda": bpy.ops.import_scene.usda,
+    "dae": bpy.ops.wm.collada_import,
+    "ply": bpy.ops.import_mesh.ply,
+    "abc": bpy.ops.wm.alembic_import,
+    "blend": bpy.ops.wm.append,
+}
+
+def load_object(object_path: str) -> None:
+    """Loads a model with a supported file extension into the scene.
+
+    Args:
+        object_path (str): Path to the model file.
+
+    Raises:
+        ValueError: If the file extension is not supported.
+
+    Returns:
+        None
+    """
+    file_extension = object_path.split(".")[-1].lower()
+    if file_extension is None:
+        raise ValueError(f"Unsupported file type: {object_path}")
+
+    # load from existing import functions
+    import_function = IMPORT_FUNCTIONS[file_extension]
+
+    if file_extension == "blend":
+        import_function(directory=object_path, link=False)
+    elif file_extension in {"glb", "gltf"}:
+        import_function(filepath=object_path, merge_vertices=True)
+    else:
+        import_function(filepath=object_path)
+
+####################### save json ################################
+def save_json(output_path, mesh_obj, armature_obj, extra=None, arm_name=False):
+    # makedirs output_path
+    os.makedirs(output_path, exist_ok=True)
+
+    # start retrieving the information of mesh, skinning and rigging
+
+    # 1. retrieve the information of rigging, save the world matrix of the armature object
+    total_armature_info = {}
+    for obj in armature_obj:
+        # depsgraph = bpy.context.evaluated_depsgraph_get()
+        # obj = obj.evaluated_get(depsgraph)
+        armature_info = {}
+        armature_info["world_matrix"] = [list(row) for row in obj.matrix_world.copy()]
+        translation = obj.matrix_world.translation
+        for bone in obj.pose.bones:
+            bone_info = {}
+            bone_info["head_local"] = list(bone.head.copy())
+            bone_info["head_world"] = list((obj.matrix_world.to_3x3() @ bone.head+translation).copy())
+            # bone_info["matrix_local"] = [list(row) for row in bone.matrix_local.copy()]
+            bone_info["tail_local"] = list(bone.tail.copy())
+            bone_info["tail_world"] = list((obj.matrix_world.to_3x3() @ bone.tail+translation).copy())
+
+            if bone.parent:
+                bone_info["parent"] = bone.parent.name.replace(" ", "_")
+                if arm_name:
+                    bone_info["parent"] = obj.name + "--" + bone_info["parent"]
+            else:
+                bone_info["parent"] = None
+            bone_info["children"] = []
+            if bone.children:
+                for child in bone.children:
+                    if arm_name:
+                        bone_info["children"].append(obj.name + "--" + child.name.replace(" ", "_"))
+                    else:
+                        bone_info["children"].append(child.name.replace(" ", "_"))
+            bone_name = bone.name.replace(" ", "_")
+            if arm_name:
+                bone_name = obj.name + "--" + bone_name
+            armature_info[bone_name] = bone_info
+        obj_name = obj.name.replace(" ", "_")
+        total_armature_info[obj.name] = armature_info
+
+
+    # 2. retrieve the information of skinning
+    total_skinning_info = {}
+    for obj in mesh_obj:
+        vertex_groups = obj.vertex_groups
+        # if not vertex_groups:
+        #     continue
+        # for group in vertex_groups:
+        skinning_info = {}
+        skinning_info["world_matrix"] = [list(row) for row in obj.matrix_world.copy()]
+        weight_info = []
+        for vertex in obj.data.vertices:
+            vertex_info = {}
+            for group in vertex.groups:
+                name = vertex_groups[group.group].name
+                name = name.replace(" ", "_")
+                if arm_name:
+                    arm_modifier = [modifier for modifier in obj.modifiers if modifier.type == 'ARMATURE']
+                    assert(len(arm_modifier) == 1)
+                    name = arm_modifier[0].object.name + "--" + name
+                weight = group.weight
+                vertex_info[name] = weight
+            weight_info.append(vertex_info)
+        skinning_info["weight"] = weight_info
+        obj_name = obj.name.replace(" ", "_")
+        total_skinning_info[obj_name] = skinning_info
+
+
+    rigging_file_path = os.path.join(output_path, "rigging.json")
+    if extra:
+        rigging_file_path = rigging_file_path.replace("rigging.json", f'rigging_{extra}.json')
+    with open(rigging_file_path, "w") as f:
+        json.dump(total_armature_info, f, indent=2)
+
+    skining_file_path = os.path.join(output_path, "skining.json")
+    if extra:
+        skining_file_path = skining_file_path.replace("skining.json", f'skining_{extra}.json')
+    with open(skining_file_path, "w") as f:
+        json.dump(total_skinning_info, f, indent=2)
+
+
+    return rigging_file_path
+
+
+def apply_skinning_weights(json_file):
+
+    with open(json_file, "r") as f:
+        skinning_data = json.load(f)
+
+    armature_obj = bpy.data.objects.get("Armature")
+    if not armature_obj:
+        print("Error: Armature object 'Armature' not found.")
+        return
+
+    # parent all mesh objects to the armature object
+    count = 0
+    for obj in bpy.context.scene.objects:
+        if obj.type == 'MESH':
+            obj.parent = armature_obj
+            count += 1
+
+    print("total mesh count:", count)
+
+    for obj in bpy.context.scene.objects:
+        vertex_index = 0
+        if obj.type == 'MESH':
+            mesh_name = obj.name
+            if mesh_name in skinning_data:
+                skinning_info = skinning_data[mesh_name]
+                if "weight" in skinning_info:
+                    print("Applying skinning data for mesh:", mesh_name)
+                    vertex_index = 0
+                    for vertex_weight in skinning_info["weight"]:
+                        for bone_name, weight_value in vertex_weight.items():
+                            vertex_group = obj.vertex_groups.get(bone_name)
+                            if vertex_group is None:
+                                vertex_group = obj.vertex_groups.new(name=bone_name)
+                                print("Vertex group created:", bone_name)
+                            vertex_group.add([vertex_index], weight_value, 'REPLACE')
+                        vertex_index += 1
+            else:
+                print("No skinning data found for mesh:", mesh_name)
+    for obj in bpy.context.scene.objects:
+        if obj.type == 'MESH':
+            modifier = obj.modifiers.new(name="Armature", type='ARMATURE')
+            modifier.object = armature_obj
+            modifier.use_vertex_groups = True
+            print("Armature modifier added to mesh:", obj.name)
+
+def reload_rigging(rigging_file_path):
+    with open(rigging_file_path, "r") as f:
+        total_armature_info = json.load(f)
+
+    bpy.ops.object.armature_add()
+    armature_obj = bpy.context.object
+    armature_obj.name = "Armature"
+
+    bpy.ops.object.mode_set(mode='EDIT')
+    bpy.ops.armature.select_all(action='SELECT')
+    bpy.ops.armature.delete()
+    bpy.ops.object.mode_set(mode='OBJECT')
+    bpy.ops.object.mode_set(mode='EDIT')
+
+    world_matrix = mathutils.Matrix([[1, 0, 0, 0],
+                                     [0, 1, 0, 0],
+                                     [0, 0, 1, 0],
+                                     [0, 0, 0, 1]])
+    armature_obj.matrix_world = world_matrix
+
+    for armature_name, armature_info in total_armature_info.items():
+        for bone_name, bone_info in armature_info.items():
+            if bone_name == "world_matrix":
+                continue
+            bone = armature_obj.data.edit_bones.new(bone_name)
+            bone.head = bone_info["head_world"]
+            bone.tail = bone_info["tail_world"]
+
+        for bone_name, bone_info in armature_info.items():
+            if bone_name == "world_matrix":
+                continue
+            bone = armature_obj.data.edit_bones[bone_name]
+            parent_name = bone_info["parent"]
+            if parent_name:
+                parent_bone = armature_obj.data.edit_bones[parent_name]
+                bone.parent = parent_bone
+    edit_len = len(armature_obj.data.edit_bones.keys())
+    bpy.ops.object.mode_set(mode='OBJECT')
+    bone_len = len(armature_obj.data.bones.keys())
+    assert edit_len == bone_len, "bone number not match!" + str(edit_len) + " " + str(bone_len)
+    bpy.ops.object.select_all(action='DESELECT')
+    armature_obj.select_set(True)
+    bpy.context.view_layer.objects.active = armature_obj
+    print("Rigging information has been reloaded!")
+
+############################# reload json ################################
+def reload_json(folder_path, version=0, export=None):
+    bpy.ops.wm.read_homefile(use_empty=True)
+    if version == 0:
+        obj_path = os.path.join(folder_path, "object.obj")
+        skinning_file_path = os.path.join(folder_path, "skining.json")
+        rigging_file_path = os.path.join(folder_path, "rigging.json")
+    elif version == 1:
+        obj_path = os.path.join(folder_path, "join.obj")
+        skinning_file_path = os.path.join(folder_path, "skining_norig.json")
+        rigging_file_path = os.path.join(folder_path, "rigging_norig.json")
+    elif version == 2:
+        obj_path = os.path.join(folder_path, "join.obj")
+        skinning_file_path = os.path.join(folder_path, "skining_norig2.json")
+        rigging_file_path = os.path.join(folder_path, "rigging_norig2.json")
+    # import_obj(obj_path)
+    load_object(obj_path)
+    reload_rigging(rigging_file_path)
+    apply_skinning_weights(skinning_file_path)
+    if export:
+        bpy.ops.wm.save_as_mainfile(filepath=export)
+    print("Done!")
+
+
+def reset_scene() -> None:
+    """Resets the scene to a clean state.
+
+    Returns:
+        None
+    """
+    # delete everything that isn't part of a camera or a light
+    for obj in bpy.data.objects:
+        if obj.type not in {"CAMERA", "LIGHT"}:
+            bpy.data.objects.remove(obj, do_unlink=True)
+
+    # delete all the materials
+    for material in bpy.data.materials:
+        bpy.data.materials.remove(material, do_unlink=True)
+
+    # delete all the textures
+    for texture in bpy.data.textures:
+        bpy.data.textures.remove(texture, do_unlink=True)
+
+    # delete all the images
+    for image in bpy.data.images:
+        bpy.data.images.remove(image, do_unlink=True)
+
+
+def save_mesh(path, mtl=False, obj_path=None):
+    if mtl:
+        # save the blend file
+        bpy.ops.wm.save_as_mainfile(filepath=obj_path + '/object.blend')
+        # reopen the blend file
+        bpy.ops.wm.open_mainfile(filepath=obj_path + '/object.blend')
+        # unpack all the materials and textures to obj_path
+        bpy.ops.file.unpack_all(method='WRITE_LOCAL')
+    # save to .obj without material
+    bpy.ops.wm.obj_export(filepath=path, export_materials=mtl, export_uv=mtl, export_triangulated_mesh=True)
+
+
+def get_root_obj(obj):
+    if not obj.parent:
+        return obj
+    return get_root_obj(obj.parent)
+
+def normalize(objs):
+    # bpy.ops.object.select_all(action='DESELECT')
+    # # select objs and join them
+    # for obj in objs:
+    #     obj.select_set(True)
+    # bpy.context.view_layer.objects.active = objs[0]
+    # name_join = objs[0].name
+    # bpy.ops.object.join()
+    # obj_join = bpy.context.active_object
+    # print(obj_join.matrix_world)
+    # print(name_join)
+    # assert(name_join == obj_join.name)
+
+    objs_eval = []
+    depsgraph = bpy.context.evaluated_depsgraph_get()
+    for obj in objs:
+        objs_eval.append(obj.evaluated_get(depsgraph))
+
+    vertices = []
+    for obj in objs_eval:
+        for v in obj.data.vertices:
+            vertices.append(obj.matrix_world @ Vector((v.co.x, v.co.y, v.co.z, 1)))
+
+    vertices = np.array(vertices)
+    min_x, min_y, min_z, _ = np.min(vertices, axis=0)
+    max_x, max_y, max_z, _ = np.max(vertices, axis=0)
+
+    # print(min_x, min_y, min_z)
+    # print(max_x, max_y, max_z)
+
+    scale_x = 1 / (max_x - min_x)
+    scale_y = 1 / (max_y - min_y)
+    scale_z = 1 / (max_z - min_z)
+    scale_min = min(scale_x, scale_y, scale_z)
+
+    assert scale_min < 1e6
+
+    translate_x = - (max_x + min_x) / 2 * scale_min
+    translate_y = - (max_y + min_y) / 2 * scale_min
+    translate_z = - min_z * scale_min
+
+    # form transformation matrix
+    trans = Matrix.Translation((translate_x, translate_y, translate_z))
+
+    scale = Matrix.Scale(scale_min, 4, (1, 0, 0)) @ Matrix.Scale(scale_min, 4, (0, 1, 0)) @ Matrix.Scale(scale_min, 4, (0, 0, 1))
+
+    # print(trans, scale)
+
+
+    root = get_root_obj(objs[0])
+    # print(root.name)
+    # print(root.scale)
+    # print(root.location)
+    # print(root.matrix_world)
+    # root.location = mathutils.Vector(root.location) + mathutils.Vector((translate_x, translate_y, translate_z))
+    # root.scale = mathutils.Vector(root.scale) * mathutils.Vector((scale_x, scale_y, scale_z))
+
+    # add the extra transformation to the root object's world matrix
+    root.matrix_world = trans @ scale @ root.matrix_world
+    # print(root.name)
+    # print(root.scale)
+    # print(root.location)
+    # print(root.matrix_world)
+
+    # refresh
+    bpy.context.view_layer.update()
+
+    ######### check if its successful
+    # objs_eval = []
+    # depsgraph = bpy.context.evaluated_depsgraph_get()
+    # for obj in objs:
+    #     objs_eval.append(obj.evaluated_get(depsgraph))
+
+    # vertices = []
+    # for obj in objs_eval:
+    #     for v in obj.data.vertices:
+    #         vertices.append(obj.matrix_world @ Vector((v.co.x, v.co.y, v.co.z, 1)))
+
+    # vertices = np.array(vertices)
+    # min_x, min_y, min_z, _ = np.min(vertices, axis=0)
+    # max_x, max_y, max_z, _ = np.max(vertices, axis=0)
+
+    # print(min_x, min_y, min_z)
+    # print(max_x, max_y, max_z)
+
+def remesh(objs, target=5000):
+    num_v = {}
+    for obj in objs:
+        num_v[obj] = len(obj.data.vertices)
+
+    # sort the num_v dict and make it a dict again
+    num_v_sort = sorted(num_v.items(), key=lambda x: x[1], reverse=True)
+
+    # print(num_v_sort)
+    total_v = sum([num_v[obj] for obj in num_v])
+
+    iters = 0
+    while total_v > target and iters < 20:
+        reduce = []
+        for obj, v in num_v_sort:
+            reduce.append(obj)
+            if sum([num_v[oo] for oo in reduce]) > 0.5 * total_v:
+                break
+        for obj in reduce:
+            # check if have shape key
+            if obj.data.shape_keys is not None:
+                # remove obj from num_v
+                num_v.pop(obj)
+                continue
+
+            ratio = 0.5
+            # apply decimate modifier
+            bpy.context.view_layer.objects.active = obj
+            bpy.ops.object.modifier_add(type='DECIMATE')
+            bpy.context.object.modifiers["Decimate"].ratio = ratio
+            bpy.ops.object.modifier_apply(modifier="Decimate")
+            # update num_v
+            num_v[obj] = len(obj.data.vertices)
+        total_v = sum([num_v[obj] for obj in num_v])
+        num_v_sort = sorted(num_v.items(), key=lambda x: x[1], reverse=True)
+        # print(num_v_sort)
+        iters += 1
+
+
+def get_parents(obj):
+    if not obj.parent:
+        return [obj.name]
+    parents = get_parents(obj.parent)
+    parents.append(obj.name)
+    return parents
+
+def check(objs, arm):
+    # assert('Sketchfab_model' in bpy.data.objects)
+
+    # root_arm = get_root_obj(arm)
+    # for obj in objs:
+    #     if root_arm != get_root_obj(obj):
+    #         print('not same root')
+    #         return -1
+    # return 1
+
+    # action_num = 0
+    # actions = bpy.data.actions
+    # for act in actions:
+    #     action_num += 1
+    #     fcurves = act.fcurves
+    #     data_paths = []
+    #     not_pose = False
+    #     for fcurve in fcurves:
+    #         data_paths.append(fcurve.data_path)
+    #         if not fcurve.data_path.startswith('pose.bones'):
+    #             # print(fcurve.data_path)
+    #             not_pose = True
+    #             # return -1
+    #     if not_pose:
+    #         print('zyhsb')
+    #         print(data_paths)
+    #         return -1
+    # return action_num
+
+    for obj in objs:
+        vertex_groups = obj.vertex_groups
+        # if not vertex_groups:
+        #     continue
+        # for group in vertex_groups:
+        for vertex in obj.data.vertices:
+            vertex_info = {}
+            for group in vertex.groups:
+                name = vertex_groups[group.group].name
+                name = name.replace(" ", "_")
+                if True:
+                    arm_modifier = [modifier for modifier in obj.modifiers if modifier.type == 'ARMATURE']
+                    if len(arm_modifier) != 1:
+                        print('zyhsb', len(arm_modifier))
+                        return -2
+                    # name = arm_modifier[0].object.name + "--" + name
+    return 1
+
+    # for obj in objs:
+    #     if obj.data.shape_keys is not None:
+    #         return 1
+    # # only 942!!!
+    # return 0
+
+
+def delete(objs):
+    # check if the mesh object has skinning weight
+    for obj in objs:
+        vertex_groups = obj.vertex_groups
+        if not vertex_groups:
+            # delete the object
+            bpy.data.objects.remove(obj)
+            # print('delete!!!')
+    meshes = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "MESH":
+            meshes.append(obj)
+
+    return meshes
+
+
+def merge_mesh(folder_path, export=None, save_join=True):
+    # output_path = os.path.join(folder_path, "rigging_norig.json")
+    # if os.path.exists(output_path):
+    #     print("Already processed folder:", folder_path)
+    #     return
+    bpy.ops.wm.read_homefile(use_empty=True)
+    try:
+        reload_json(folder_path)
+    except:
+        print("Error in reloading json file")
+        # remove the folder
+        os.system(f"rm -r {folder_path}")
+        return None, None
+
+    bpy.ops.object.select_all(action='DESELECT')
+    if export:
+        bpy.ops.wm.save_as_mainfile(filepath='reload_' + export)
+
+    meshes = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "MESH":
+            bpy.context.view_layer.objects.active = obj
+            obj.select_set(True)
+            meshes.append(obj)
+    print("meshes length", len(meshes))
+
+    bpy.ops.object.join()
+    if export:
+        bpy.ops.wm.save_as_mainfile(filepath='join_' + export)
+
+    meshes = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "MESH":
+            meshes.append(obj)
+    if len(meshes) != 1:
+        bpy.ops.wm.save_as_mainfile(filepath='join_f.blend')
+    assert len(meshes) == 1
+    # remesh(meshes[0])
+
+
+    if save_join:
+        obj_path = os.path.join(folder_path, "object.obj")
+        bpy.ops.wm.obj_export(filepath=obj_path, export_materials=False, export_uv=False, export_triangulated_mesh=True)
+        # mesh = trimesh.load(glb_file_path)
+        # mesh.export(obj_path, file_type='obj')
+
+
+    # save to json file
+    total_armature_count = 0
+    armature_obj = []
+    mesh_obj = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "ARMATURE":
+            total_armature_count += 1
+            armature_obj.append(obj)
+        if obj.type == "MESH":
+            mesh_obj.append(obj)
+    if total_armature_count == 0:
+        print("No rigging information for the file:", folder_path + "\n")
+        return None, None
+
+
+    ######### delete bones that are not in the vertex group
+    vertex_group_name = [group.name for group in mesh_obj[0].vertex_groups]
+    bpy.context.view_layer.objects.active = armature_obj[0]
+    bpy.ops.object.mode_set(mode='EDIT')
+    edit_bones = armature_obj[0].data.edit_bones
+    bone_delete = set([bone.name for bone in edit_bones]) - set(vertex_group_name)
+    print(f"Deleting {len(bone_delete)} bones")
+    for bone in bone_delete:
+        # if the bone is root, then do not delete it
+        if edit_bones[bone].parent == None:
+            # return len([1 for child in edit_bones[bone].children if child.name in bone_delete])
+            num_children = len(edit_bones[bone].children)
+            if num_children <= 1:
+                edit_bones.remove(edit_bones[bone])
+                continue
+            if num_children > 1:
+                center = mathutils.Vector((0, 0, 0))
+                for child in edit_bones[bone].children:
+                    center += child.head
+                center /= num_children
+                min_dist = 1e9
+                for child in edit_bones[bone].children:
+                    dist = (child.head - center).length
+                    if dist < min_dist:
+                        min_dist = dist
+                        min_child = child
+                for child in edit_bones[bone].children:
+                    if child != min_child:
+                        child.parent = min_child
+                edit_bones.remove(edit_bones[bone])
+                continue
+            continue
+        # assign bone's children to bone's parent
+        bone_obj = edit_bones[bone]
+        for child in bone_obj.children:
+            child.parent = bone_obj.parent
+
+        edit_bones.remove(edit_bones[bone])
+    bpy.ops.object.mode_set(mode='OBJECT')
+
+    if export:
+        bpy.ops.wm.save_as_mainfile(filepath='delete_' + export)
+
+    mesh_obj = []
+    armature_obj = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "MESH":
+            mesh_obj.append(obj)
+        if obj.type == "ARMATURE":
+            armature_obj.append(obj)
+    assert len(mesh_obj) == 1
+    assert len(armature_obj) == 1
+
+    return mesh_obj, armature_obj
+
+
+def process(file_path, obj_path=None, stamp=None, tex=False):
+    # check if obj_path exists
+    # if os.path.exists(obj_path + '/object.obj'):
+    #     print('object.obj exists')
+    #     return True
+    reset_scene()
+    load_object(file_path)
+    # bpy.ops.import_scene.gltf(filepath=glb_file_path)
+
+    # delete hierarchy collections['glTF_not_exported']
+    if 'glTF_not_exported' in bpy.data.collections:
+        print('DELETE glTF_not_exported')
+        bpy.data.collections.remove(bpy.data.collections['glTF_not_exported'])
+
+    if stamp is not None:
+        # Set the current frame to the stamp value
+        bpy.context.scene.frame_set(stamp)
+        print(f'Set the current frame to {stamp}')
+
+        # Ensure all objects are updated to this frame
+        bpy.context.view_layer.update()
+
+    mesh_obj = []
+    armature_obj = []
+    for obj in bpy.context.scene.objects:
+        if obj.type == "ARMATURE":
+            # if len(armature_obj) > 0:
+            #     print(file_path, 'has more than 1 armature')
+            #     return -2
+            armature_obj.append(obj)
+            # obj.show_in_front = True
+            armature_obj[-1].data.pose_position = 'POSE'
+        if obj.type == "MESH":
+            mesh_obj.append(obj)
+            # if obj.data.shape_keys is not None:
+            #     return False
+
+    # mesh_obj = delete(mesh_obj)
+    # if len(mesh_obj) == 0:
+    #     # print('zyhsb -1', file_path, obj_path)
+    #     return -1
+    # return check(mesh_obj, armature_obj)
+
+
+    # total_vertices = np.array([len(obj.data.vertices) for obj in mesh_obj]).sum()
+    # if total_vertices < 1000: return
+    # if total_vertices > 10000: remesh(mesh_obj)
+
+
+    # bpy.ops.object.select_all(action='DESELECT')
+    # armature_obj.select_set(True)
+    # execute(bpy.context)
+
+
+    # normalize(mesh_obj)
+
+
+    mesh_obj = delete(mesh_obj)
+    if len(mesh_obj) == 0:
+        # print('zyhsb -1', file_path, obj_path)
+        return -1
+
+
+    save_json(obj_path, mesh_obj, armature_obj, arm_name=True)
+
+
+    if not tex:
+        save_mesh(obj_path + '/object.obj')
+    else:
+        save_mesh(obj_path + '/object.obj', mtl=True, obj_path=obj_path)
+
+
+    mesh_obj, armature_obj = merge_mesh(obj_path)
+    if mesh_obj is None or armature_obj is None:
+        # print('zyhsb -2', file_path, obj_path)
+        return -2
+
+
+    try:
+        normalize(mesh_obj)
+    except:
+        os.system(f"rm -r {obj_path}")
+        # print('zyhsb -3', file_path, obj_path)
+        return -3
+
+
+    save_json(obj_path, mesh_obj, armature_obj)
+
+    if not tex:
+        save_mesh(obj_path + '/object.obj')
+    else:
+        save_mesh(obj_path + '/object.obj', mtl=True, obj_path=obj_path)
+
+
+    return 1
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--object_path",
+        type=str,
+        required=True,
+        help="Path to the object file",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help="Path to the directory where the rendered images and metadata will be saved.",
+    )
+    parser.add_argument(
+        "--stamp",
+        type=int,
+        required=False,
+        help="Stamp to be used for the rendering.",
+    )
+    parser.add_argument(
+        "--tex",
+        type=bool,
+        required=False,
+        help="Save the texture.",
+    )
+    argv = sys.argv[sys.argv.index("--") + 1 :]
+    args = parser.parse_args(argv)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    stamp = args.stamp if args.stamp else None
+    print(f'Stamp: {stamp}')
+    result = process(args.object_path, obj_path=args.output_dir, stamp=stamp, tex=args.tex)
+    # import numpy as np
+    # os.makedirs(args.output_dir, exist_ok=True)  # the directory may be removed
+    # np.save(args.output_dir + '/result.npy', np.array(result))
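Note: the `__main__` block parses only the arguments after Blender's `--` separator, so this script is meant to run inside a headless Blender process. A hedged sketch of driving it from Python; the `blender` executable name and the sample input/output paths are assumptions, not part of this commit:

# Sketch: invoke blender_script.py through headless Blender and collect the
# exported object.obj / rigging.json / skining.json files in the output dir.
import subprocess

cmd = [
    "blender", "--background", "--python", "Anymate/blender_script.py", "--",
    "--object_path", "assets/example.glb",   # hypothetical input file
    "--output_dir", "Anymate/tmp/example",   # hypothetical output directory
]
subprocess.run(cmd, check=True)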
Anymate/checkpoints/.gitkeep ADDED
File without changes
Anymate/configs/.gitkeep ADDED
File without changes
Anymate/configs/conn.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: ce
+  mode: conn
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: attendjoints_con_combine
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
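Note: all configs in this commit share the same three top-level groups (args, optimizer, model), so they can be read with a generic YAML loader. A minimal sketch, assuming PyYAML is available; the project's own training code may use a different loader:

# Sketch: load a config and inspect the three groups defined above.
import yaml

with open("Anymate/configs/conn.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["args"]["mode"], cfg["args"]["loss"])          # conn ce
print(cfg["optimizer"]["lr"])                            # 0.0001
print(cfg["model"]["encoder"], cfg["model"]["decoder"])  # bert attendjoints_con_combine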
Anymate/configs/conn_token.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: ce
+  mode: conn
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: attendjoints_con_combine
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/configs/diffusion.yaml ADDED
@@ -0,0 +1,49 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 4000
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: diffusion
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 50
+  num_train_step: 100
+  num_training_points: 128
+  seed: 42
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  encoder: transformer
+  decoder: Cross_Attention_Diffusion
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  input_channels: 3
+  output_channels: 3
+  num_z: 16
+  num_x: 128
+  z_dim: 768
+  x_dim: 512
+  num_blocks: 4
+  num_compute_layers: 4
+  num_heads: 8
+  mlp_ratio: 4.0
+  qkv_bias: true
+  drop: 0.0
+  attn_drop: 0.0
+  drop_path: 0.0
+  num_latents: 16
+  use_projection: true
Anymate/configs/diffusion_concat.yaml ADDED
@@ -0,0 +1,46 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 4000
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: diffusion
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 1000
+  num_train_step: 100
+  num_training_points: 128
+  seed: 42
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  encoder: bert
+  decoder: Pointe_Diffusion
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  input_channels: 3
+  output_channels: 3
+  n_ctx: 128
+  width: 768
+  layers: 12
+  heads: 8
+  init_scale: 0.25
+  time_token_cond: true
+  cond_drop_prob: 0.1
+  use_projection: true
+
+
+
Anymate/configs/diffusion_cross.yaml ADDED
@@ -0,0 +1,51 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 4000
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: diffusion
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 32
+  trainset: Anymate_train
+  test_freq: 1000
+  num_train_step: 100
+  num_training_points: 128
+  seed: 42
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  encoder: miche
+  decoder: Cross_Attention_Diffusion
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  input_channels: 3
+  output_channels: 3
+  num_z: 16
+  num_x: 128
+  z_dim: 768
+  x_dim: 512
+  num_blocks: 4
+  num_compute_layers: 4
+  num_heads: 8
+  mlp_ratio: 4.0
+  qkv_bias: true
+  drop: 0.0
+  attn_drop: 0.0
+  drop_path: 0.0
+  num_latents: 16
+  use_projection: true
+
+
Anymate/configs/joints.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: joints
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: transformer_latent
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/configs/joints_implicit.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: joints
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 8
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: implicit_transformer
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/configs/joints_triplane.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: chamfer
+  mode: joints
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: triplane
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/configs/skin.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: cos_clamp
+  mode: skin
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 16
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: attendjoints_combine
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/configs/skin_multi.yaml ADDED
@@ -0,0 +1,40 @@
+args:
+  aggr: max
+  checkpoint: Anymate/checkpoints
+  device: cuda
+  epochs: 200
+  finetune: true
+  gamma: 0.2
+  input_normal: false
+  logdir: Anymate/logs
+  loss: cos_clamp
+  mode: skin
+  resume: ''
+  root: Anymate/data
+  schedule: []
+  start_epoch: 0
+  test_batch: 1
+  testset: Anymate_test
+  train_batch: 4
+  trainset: Anymate_train
+  test_freq: 10
+
+optimizer:
+  weight_decay: 1.0e-05
+  lr: 0.0001
+
+model:
+  decoder: attendjoints_multi
+  encoder: bert
+  config_path: ./ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml
+  ckpt_path: ./ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt
+  load_encoder: ''
+  num_joints: 96
+  out_channels: 3
+  width: 768
+  heads: 12
+  init_scale: 0.25
+  flash: False
+  use_checkpoint: False
+  qkv_bias: False
+  separate: False
Anymate/dataset.py ADDED
@@ -0,0 +1,62 @@
+import torch
+from torch.utils.data import Dataset
+import os
+import numpy as np
+from Anymate.utils.dataset_utils import create_mask, index_to_sparse, index_to_sparse_con
+
+def my_collate(batch):
+    # print(len(batch))
+    data = {}
+    for key in batch[0]:
+        if key=='vox' or key=='name' or key=='joints_num' or key=='skins_index' or key=='skins_weight' or key=='parent_index' or key=='conns' or key=='joints' or key=='bones' or key=='mesh_skins_index' or key=='mesh_skins_weight' or key=='mesh_pc' or key=='mesh_face':
+            data[key] = [sample[key] for sample in batch]
+        elif key=='pc':
+            data['points_cloud'] = torch.stack([sample['pc'] for sample in batch])
+        elif key=='skins':
+            continue
+        elif key=='bones_num':
+            data[key] = torch.tensor([sample['bones_num'] for sample in batch])
+        else:
+            data[key] = torch.stack([sample[key] for sample in batch])
+
+    if 'skins_index' in batch[0]:
+        max_joints = max(data['joints_num'])
+        max_bones = max(data['bones_num'])
+        # max_joints = 64
+        skin_list = [index_to_sparse(data['skins_index'][i].unsqueeze(0), data['skins_weight'][i].unsqueeze(0), [1, 8192, max_bones])[0] for i in range(len(data['skins_index']))]
+        data['skins'] = torch.stack(skin_list, dim=0)
+        data['joints_mask'] = torch.stack([create_mask(sample['joints_num'], max_len=max_joints) for sample in batch])
+        data['bones_mask'] = torch.stack([create_mask(sample['bones_num'], max_len=max_bones) for sample in batch])
+
+    if 'conns' in batch[0]:
+        max_joints = max(data['joints_num'])
+        conn_matrix = torch.zeros(len(data['conns']), 96, max_joints)
+        for i in range(len(data['conns'])):
+            for j in range(data['joints_num'][i]):
+                conn_matrix[i, j, data['conns'][i][j].long()] = 1
+        data['conns'] = conn_matrix
+    if 'joints' in batch[0]:
+        padded_joints_matrix = torch.ones(len(data['name']), 96, 3) * (-3)
+        for i in range(len(data['name'])):
+            padded_joints_matrix[i, :data['joints_num'][i], :] = data['joints'][i]
+        data['joints'] = padded_joints_matrix
+    if 'bones' in batch[0]:
+        padded_bones_matrix = torch.ones(len(data['name']), 64, 6) * (-3)
+        for i in range(len(data['name'])):
+            padded_bones_matrix[i, :data['bones_num'][i], :] = data['bones'][i]
+        data['bones'] = padded_bones_matrix
+    return data
+
+class AnymateDataset(Dataset):
+    def __init__(self, name='Anymate_test', root='Anymate/data'):
+
+        if os.path.exists(os.path.join(root, name) + '.pt'):
+            self.data_list = torch.load(os.path.join(root, name) + '.pt')
+        else:
+            raise ValueError('Dataset not found at path: {}'.format(os.path.join(root, name) + '.pt'))
+
+    def __len__(self):
+        return len(self.data_list)
+
+    def __getitem__(self, idx):
+        return self.data_list[idx]
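Note: AnymateDataset returns raw per-sample dicts, and my_collate is what builds the padded joint/bone/skin tensors, so the two are meant to be used together. A minimal sketch of wiring them into a PyTorch DataLoader; the batch size and worker count are illustrative only:

# Sketch: iterate the test split with the custom collate function defined above.
from torch.utils.data import DataLoader
from Anymate.dataset import AnymateDataset, my_collate

dataset = AnymateDataset(name='Anymate_test', root='Anymate/data')
loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=2, collate_fn=my_collate)

for batch in loader:
    print(batch['points_cloud'].shape)  # stacked from each sample's 'pc' entry
    break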
Anymate/get_checkpoints.sh ADDED
@@ -0,0 +1,22 @@
+cd Anymate/checkpoints
+mkdir joint
+cd joint
+
+echo "Downloading joint checkpoints..."
+wget "https://huggingface.co/yfdeng/Anymate/resolve/main/checkpoints/joint/bert-transformer_latent-train-8gpu-finetune.pth.tar?download=true" -O bert-transformer_latent-train-8gpu-finetune.pth.tar
+
+cd ..
+mkdir conn
+cd conn
+
+echo "Downloading conn checkpoints..."
+wget "https://huggingface.co/yfdeng/Anymate/resolve/main/checkpoints/conn/bert-attendjoints_con_combine-train-8gpu-finetune.pth.tar?download=true" -O bert-attendjoints_con_combine-train-8gpu-finetune.pth.tar
+
+cd ..
+mkdir skin
+cd skin
+
+echo "Downloading skin checkpoints..."
+wget "https://huggingface.co/yfdeng/Anymate/resolve/main/checkpoints/skin/bert-attendjoints_combine-train-8gpu-finetune.pth.tar?download=true" -O bert-attendjoints_combine-train-8gpu-finetune.pth.tar
+
+echo "Finished downloading checkpoints!"
Anymate/get_datasets.sh ADDED
@@ -0,0 +1,12 @@
+cd Anymate/data
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_test.pt?download=true" -O Anymate_test.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_0.pt?download=true" -O Anymate_train_0.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_1.pt?download=true" -O Anymate_train_1.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_2.pt?download=true" -O Anymate_train_2.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_3.pt?download=true" -O Anymate_train_3.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_4.pt?download=true" -O Anymate_train_4.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_5.pt?download=true" -O Anymate_train_5.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_6.pt?download=true" -O Anymate_train_6.pt
+wget "https://huggingface.co/datasets/yfdeng/Anymate/resolve/main/Anymate_train_7.pt?download=true" -O Anymate_train_7.pt
+
+echo "Finished downloading datasets!"
Anymate/model.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from ThirdParty.michelangelo.utils.misc import get_config_from_file, instantiate_from_config
4
+ # from ThirdParty.PointLLM.pointllm.model.pointllm import PointLLMLlamaForCausalLM
5
+ from ThirdParty.michelangelo.models.modules.distributions import DiagonalGaussianDistribution
6
+ from ThirdParty.michelangelo.models.modules.embedder import components_from_spherical_harmonics
7
+ from Anymate.utils.diffusion_encoder import TransformerEncoder
8
+ from Anymate.models.joint import TransformerDecoder, ImplicitTransformerDecoder, TriPlaneDecoder
9
+ from Anymate.models.conn import AttendjointsDecoder_con_combine, AttendjointsDecoder_con_token
10
+ from Anymate.models.skin import AttendjointsDecoder_combine, AttendjointsDecoder_multi
11
+ from Anymate.models.diffusion import Pointe_Diffusion, Cross_Attention_Diffusion
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(self,
15
+ only_embed = True,
16
+ config_path = './ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml',
17
+ ckpt_path = './ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt',
18
+ num_latents = 257,
19
+ device = 'cuda'):
20
+
21
+ super().__init__()
22
+
23
+ model_config = get_config_from_file(config_path)
24
+ if hasattr(model_config, "model"):
25
+ model_config = model_config.model
26
+
27
+ if ckpt_path is not None:
28
+ model = instantiate_from_config(model_config, ckpt_path=ckpt_path)
29
+ else:
30
+ model = instantiate_from_config(model_config)
31
+ model.model.shape_model.encoder.num_latents = num_latents
32
+ model.model.shape_model.encoder.query = nn.Parameter(torch.randn((num_latents, 768), device=device, dtype=torch.float32) * 0.02)
33
+
34
+ self.shape_projection = model.model.shape_projection
35
+ self.encoder = model.model.shape_model.encoder
36
+ self.normal_embedder = components_from_spherical_harmonics
37
+ old_linear_proj = self.encoder.input_proj
38
+ self.encoder.input_proj = nn.Linear(old_linear_proj.in_features + 25, old_linear_proj.out_features)
39
+ self.encoder.input_proj.weight.data[:, :old_linear_proj.in_features] = old_linear_proj.weight.data[:, :old_linear_proj.in_features].clone()
40
+ self.encoder.input_proj.bias.data = old_linear_proj.bias.data.clone()
41
+ if not only_embed:
42
+ self.embed_dim = model.model.shape_model.embed_dim
43
+ self.pre_kl = model.model.shape_model.pre_kl
44
+ self.post_kl = model.model.shape_model.post_kl
45
+ self.transformer = model.model.shape_model.transformer
46
+
47
+
48
+ def encode_latents(self,
49
+ pc: torch.FloatTensor,
50
+ feats = None):
51
+
52
+ feats_embed = self.normal_embedder(feats)
53
+ feats = torch.cat([feats, feats_embed], dim=-1)
54
+
55
+ x, _ = self.encoder(pc, feats)
56
+
57
+ shape_embed = x[:, 0]
58
+ latents = x[:, 1:]
59
+
60
+ return shape_embed, latents
61
+
62
+
63
+ def encode_shape_embed(self, surface, return_latents: bool = False):
64
+ """
65
+
66
+ Args:
67
+ surface (torch.FloatTensor): [bs, n, 3 + c]
68
+ return_latents (bool):
69
+
70
+ Returns:
71
+ x (torch.FloatTensor): [bs, projection_dim]
72
+ shape_latents (torch.FloatTensor): [bs, m, d]
73
+ """
74
+
75
+ pc = surface[..., 0:3]
76
+ feats = surface[..., 3:]
77
+
78
+ shape_embed, shape_latents = self.encode_latents(pc, feats)
79
+ x = shape_embed @ self.shape_projection
80
+
81
+ if return_latents:
82
+ return x, shape_latents
83
+ else:
84
+ return x
85
+
86
+
87
+ def encode_kl_embed(self, latents: torch.FloatTensor, sample_posterior: bool = True):
88
+ posterior = None
89
+ if self.embed_dim > 0:
90
+ moments = self.pre_kl(latents)
91
+ posterior = DiagonalGaussianDistribution(moments, feat_dim=-1)
92
+
93
+ if sample_posterior:
94
+ kl_embed = posterior.sample()
95
+ else:
96
+ kl_embed = posterior.mode()
97
+ else:
98
+ kl_embed = latents
99
+
100
+ return kl_embed, posterior
101
+
102
+
103
+ def decode(self, latents: torch.FloatTensor):
104
+ latents = self.post_kl(latents)
105
+ return self.transformer(latents)
106
+
107
+
108
+ class EncoderDecoder(nn.Module):
109
+ def __init__(self,
110
+ decoder = 'mlp',
111
+ encoder = 'miche',
112
+ config_path = './ThirdParty/michelangelo/configs/aligned_shape_latents/shapevae-256.yaml',
113
+ ckpt_path = './ThirdParty/michelangelo/checkpoints/aligned_shape_latents/shapevae-256.ckpt',
114
+ load_encoder = '',
115
+ num_joints = 96,
116
+ out_channels = 3,
117
+ width = 768,
118
+ device = 'cuda',
119
+ dtype = torch.float32,
120
+ heads = 12,
121
+ init_scale: float = 0.25,
122
+ flash = False,
123
+ use_checkpoint = False,
124
+ qkv_bias = False,
125
+ separate = False,
126
+ **kwargs):
127
+
128
+ super().__init__()
129
+ self.decoder_name = decoder
130
+ self.encoder_name = encoder
131
+ self.dtype = dtype
132
+ self.load_encoder = load_encoder
133
+
134
+ if decoder == 'transformer_latent':
135
+ self.only_embed = False
136
+ self.return_latents = True
137
+ self.decoder = TransformerDecoder(
138
+ num_latents = num_joints,
139
+ out_channels = out_channels,
140
+ width = width,
141
+ device = device,
142
+ dtype = dtype,
143
+ heads = heads,
144
+ init_scale = init_scale,
145
+ flash = flash,
146
+ use_checkpoint = use_checkpoint,
147
+ qkv_bias = qkv_bias
148
+ )
149
+ elif decoder == 'implicit_transformer':
150
+ self.only_embed = False
151
+ self.return_latents = True
152
+ self.decoder = ImplicitTransformerDecoder(
153
+ device = device,
154
+ dtype = dtype,
155
+ num_latents = 257,
156
+ out_channels = 1,
157
+ width = width,
158
+ heads = heads,
159
+ init_scale = init_scale,
160
+ flash = flash,
161
+ use_checkpoint = use_checkpoint,
162
+ qkv_bias = qkv_bias
163
+ )
164
+ elif decoder == 'triplane': #consider add these parameters to config
165
+ self.only_embed = True
166
+ self.return_latents = False
167
+ self.decoder = TriPlaneDecoder(
168
+ z_dim = 768,
169
+ c_dim = 0,
170
+ w_dim = 768,
171
+ mapping_kwargs = {'num_layers': 2},
172
+ synthesis_kwargs = {'num_fp16_res': 0, 'conv_clamp': None, 'fused_modconv_default': 'inference_only'}
173
+ )
174
+
175
+ elif decoder == 'Pointe_Diffusion':
176
+ self.only_embed = False
177
+ self.return_latents = True
178
+ self.decoder = Pointe_Diffusion(**kwargs)
179
+
180
+ elif decoder == 'Cross_Attention_Diffusion':
181
+ self.only_embed = False
182
+ self.return_latents = True
183
+ self.decoder = Cross_Attention_Diffusion(**kwargs)
184
+
185
+ elif decoder == 'attendjoints_combine':
186
+ self.only_embed = False
187
+ self.return_latents = True
188
+ self.decoder = AttendjointsDecoder_combine(
189
+ width = width,
190
+ device = device,
191
+ dtype = dtype,
192
+ heads = heads,
193
+ init_scale = init_scale,
194
+ flash = flash,
195
+ use_checkpoint = use_checkpoint,
196
+ separate = separate,
197
+ qkv_bias = qkv_bias
198
+ )
199
+ elif decoder == 'attendjoints_multi':
200
+ self.only_embed = False
201
+ self.return_latents = True
202
+ self.decoder = AttendjointsDecoder_multi(
203
+ width = width,
204
+ device = device,
205
+ dtype = dtype,
206
+ heads = heads,
207
+ init_scale = init_scale,
208
+ flash = flash,
209
+ use_checkpoint = use_checkpoint,
210
+ qkv_bias = qkv_bias,
211
+ separate=separate
212
+ )
213
+ elif decoder == 'attendjoints_con_combine':
214
+ self.only_embed = False
215
+ self.return_latents = True
216
+ self.decoder = AttendjointsDecoder_con_combine(
217
+ width = width,
218
+ device = device,
219
+ dtype = dtype,
220
+ heads = heads,
221
+ init_scale = init_scale,
222
+ flash = flash,
223
+ use_checkpoint = use_checkpoint,
224
+ qkv_bias = qkv_bias
225
+ )
226
+ elif decoder == 'attendjoints_con_token':
227
+ self.only_embed = False
228
+ self.return_latents = True
229
+ self.decoder = AttendjointsDecoder_con_token(
230
+ width = width,
231
+ device = device,
232
+ dtype = dtype,
233
+ heads = heads,
234
+ init_scale = init_scale,
235
+ flash = flash,
236
+ use_checkpoint = use_checkpoint,
237
+ qkv_bias = qkv_bias,
238
+ separate = separate
239
+ )
240
+
241
+ if encoder == 'miche':
242
+ if not self.load_encoder:
243
+ self.encoder = Encoder(only_embed=self.only_embed, config_path=config_path, ckpt_path=ckpt_path, device=device)
244
+ else:
245
+ self.encoder = Encoder(only_embed=self.only_embed, config_path=config_path, ckpt_path=None, device=device)
246
+ try:
247
+ print("=> loading encoder checkpoint '{}'".format(self.load_encoder))
248
+ checkpoint = torch.load(self.load_encoder, map_location='cpu')
249
+ state_dict = {k[8:]: v for k, v in checkpoint['state_dict'].items() if k.startswith('encoder')}
250
+ self.encoder.load_state_dict(state_dict)
251
+ print("=> loaded encoder checkpoint '{}'".format(self.load_encoder))
252
+ except:
253
+ print("=> no encoder checkpoint found at '{}'".format(self.load_encoder))
254
+ if self.load_encoder:
255
+ self.point_proj = nn.Sequential(
256
+ nn.Linear(768, 768, dtype=dtype),
257
+ nn.GELU(),
258
+ nn.Linear(768, 768, dtype=dtype),
259
+ )
260
+
261
+ if encoder == 'bert':
262
+ # model_name = 'RunsenXu/PointLLM_7B_v1.2'
263
+ # model = PointLLMLlamaForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=False, use_cache=True, torch_dtype=dtype)
264
+ # self.encoder = model.model.point_backbone.to(device)
265
+ # model = None
266
+ from ThirdParty.PointLLM.pointllm.model import PointTransformer
267
+ from ThirdParty.PointLLM.pointllm.utils import cfg_from_yaml_file
268
+ import os
269
+ # path to the config file, located in the same directory as this file
270
+ point_bert_config_name = "PointTransformer_8192point_2layer" # * default for v1.2, v1.1 uses PointTransformer_base_8192point.yaml
271
+ point_bert_config_addr = os.path.join("./ThirdParty/PointLLM/pointllm/model/pointbert/PointTransformer_8192point_2layer.yaml")
272
+ print(f"Loading PointBERT config from {point_bert_config_addr}.")
273
+ point_bert_config = cfg_from_yaml_file(point_bert_config_addr)
274
+ point_bert_config.model.point_dims = 6
275
+ use_max_pool = getattr(point_bert_config.model, "use_max_pool", False) # * default is false
276
+
277
+ self.encoder = PointTransformer(point_bert_config.model, use_max_pool=use_max_pool).to(device)
278
+ if self.return_latents:
279
+ self.point_proj = nn.Sequential(
280
+ nn.Linear(384, 512, dtype=dtype),
281
+ nn.GELU(),
282
+ nn.Linear(512, 512, dtype=dtype),
283
+ nn.GELU(),
284
+ nn.Linear(512, 768, dtype=dtype)
285
+ )
286
+ else:
287
+ self.point_proj = nn.ModuleList([
288
+ nn.Sequential(
289
+ nn.Linear(384, 512, dtype=dtype),
290
+ nn.GELU(),
291
+ nn.Linear(512, 512, dtype=dtype),
292
+ nn.GELU(),
293
+ nn.Linear(512, 768, dtype=dtype)
294
+ ),
295
+ nn.Linear(513, 1, dtype=dtype)
296
+ ])
297
+ if encoder == 'transformer':
298
+ self.points_cloud_embed = nn.Linear(
299
+ 768, 768, device=device, dtype=dtype
300
+ )
301
+ self.encoder = TransformerEncoder(device=device,dtype=dtype, num_latents=kwargs['num_latents'])
302
+
303
+
304
+
305
+ def encode(self, data, device='cuda'):
306
+ assert self.encoder_name in ['miche', 'bert', 'transformer'], f'Encoder {self.encoder_name} not supported'
307
+ if self.encoder_name == 'miche':
308
+ surface = data['points_cloud'].to(self.dtype).to(device)
309
+
310
+ # encoding
311
+ shape_embed, shape_latents = self.encoder.encode_shape_embed(surface, return_latents=True) # ShapeAsLatentPerceiver.encode_latents(): encoder
312
+
313
+ if self.only_embed:
314
+ if self.return_latents:
315
+ if self.load_encoder:
316
+ return self.point_proj(torch.cat([shape_embed.unsqueeze(1), shape_latents], dim=1))
317
+ return torch.cat([shape_embed.unsqueeze(1), shape_latents], dim=1) # torch.Size([bs, 257, 768]
318
+ return shape_embed # shape_embed: torch.Size([bs, 768])
319
+
320
+ shape_zq, posterior = self.encoder.encode_kl_embed(shape_latents) # ShapeAsLatentPerceiver.encode_kl_embed(): pre_kl + DiagonalGaussianDistribution()
321
+ # shape_zq, posterior = self.encoder.encode_kl_embed(shape_latents, sample_posterior=False) # not sample
322
+ # pretrained weight has 0 +- 0.7 mean and 0.5 +- 0.5 std
323
+ # trained weight has 0 +- 1.8 mean and 0.1 +- 0.1 std
324
+ # generally okay
325
+
326
+ latents = self.encoder.decode(shape_zq) # ShapeAsLatentPerceiver.decode(): post_kl + transformer
327
+
328
+ if not self.return_latents:
329
+ latents = torch.cat([shape_latents, latents], dim=1) # torch.Size([bs, 512, 768])
330
+
331
+ if self.load_encoder:
332
+ return self.point_proj(torch.cat([shape_embed.unsqueeze(1), latents], dim=1))
333
+ return torch.cat([shape_embed.unsqueeze(1), latents], dim=1) # torch.Size([bs, 257 / 513, 768])
334
+
335
+ if self.encoder_name == 'bert':
336
+ points = data['points_cloud'].to(self.dtype).to(device)
337
+ points = points[:, :, :3] / 2
338
+ points = torch.cat([points, torch.zeros_like(points)], dim=-1)
339
+ points = self.encoder(points)
340
+
341
+ if self.return_latents:
342
+ points = self.point_proj(points)
343
+ else:
344
+ points = self.point_proj[0](points)
345
+ points = self.point_proj[1](points.permute(0, 2, 1)).squeeze(-1)
346
+ return points
347
+
348
+ if self.encoder_name == 'transformer':
349
+ points = data['points_cloud'].to(self.dtype).to(device)
350
+ cond = self.encoder.encode_pc(points)
351
+ cond = self.points_cloud_embed(cond)
352
+ return cond
353
+
354
+ def forward(self, data, device='cuda', downsample=False, **kwargs):
355
+ latents = self.encode(data, device)
356
+ # print('latents shape', latents.shape)
357
+
358
+ logits = self.decoder(latents, data, device=device, downsample=downsample,**kwargs)
359
+
360
+ return logits
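A minimal sketch of the preprocessing that the 'bert' branch of encode() applies before calling PointTransformer, shown for illustration only (the tensor below is random stand-in data, not a repository asset): xyz coordinates are halved and zero-padded to the 6 input channels expected above.

import torch

# Stand-in for data['points_cloud']: (batch, num_points, 6) with xyz in the first 3 channels.
points_cloud = torch.rand(1, 8192, 6) - 0.5
points = points_cloud[:, :, :3] / 2                             # rescale xyz as encode() does
points = torch.cat([points, torch.zeros_like(points)], dim=-1)  # zero-pad the feature channels
print(points.shape)                                             # torch.Size([1, 8192, 6])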
Anymate/models/__init__.py ADDED
File without changes
Anymate/models/conn.py ADDED
@@ -0,0 +1,195 @@
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from ThirdParty.michelangelo.models.modules.transformer_blocks import ResidualCrossAttentionBlock, ResidualAttentionBlock, Transformer
4
+ from ThirdParty.michelangelo.models.modules.embedder import FourierEmbedder, components_from_spherical_harmonics
5
+
6
+ class AttendjointsDecoder_con_combine(nn.Module):
7
+ def __init__(self,
8
+ width = 768,
9
+ layers = 2,
10
+ device = 'cuda',
11
+ dtype = torch.float32,
12
+ heads = 12,
13
+ init_scale: float = 0.25,
14
+ flash = False,
15
+ use_checkpoint = False,
16
+ qkv_bias = False,
17
+ num_freqs: int = 8,
18
+ include_pi: bool = True,
19
+ separate = False,
20
+ use_mask = True):
21
+
22
+ super().__init__()
23
+
24
+ self.use_checkpoint = use_checkpoint
25
+ self.separate = separate
26
+ self.use_mask = use_mask
27
+ # self.num_latents = num_latents
28
+
29
+ # self.query = nn.Parameter(torch.randn((num_latents, width), device=device, dtype=dtype) * 0.02)
30
+
31
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
32
+ self.co_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)
33
+
34
+ # self.proj_attn = nn.Linear(width, width, device=device, dtype=dtype)
35
+
36
+ self.cross_attn = nn.ModuleList([ResidualCrossAttentionBlock(
37
+ device=device,
38
+ dtype=dtype,
39
+ width=width,
40
+ heads=heads,
41
+ init_scale=init_scale,
42
+ qkv_bias=qkv_bias,
43
+ flash=flash,
44
+ ) for _ in range(layers)])
45
+
46
+ self.self_attn = nn.ModuleList([ResidualAttentionBlock(
47
+ device=device,
48
+ dtype=dtype,
49
+ n_ctx=-1,
50
+ width=width,
51
+ heads=heads,
52
+ init_scale=init_scale,
53
+ qkv_bias=qkv_bias,
54
+ flash=flash,
55
+ ) for _ in range(layers * 2)])
56
+
57
+ # self.joint_embed_proj = nn.ModuleList([nn.Linear(width, width, device=device, dtype=dtype) for _ in range(layers)])
58
+
59
+
60
+ self.q_proj = nn.Linear(width, width, device=device, dtype=dtype)
61
+ self.k_proj = nn.Linear(width, width, device=device, dtype=dtype)
62
+ self.ln_1 = nn.LayerNorm(width, device=device, dtype=dtype)
63
+ self.ln_2 = nn.LayerNorm(width, device=device, dtype=dtype)
64
+
65
+ # self.last_cross_attn = ResidualCrossAttentionBlock(
66
+ # device=device,
67
+ # dtype=dtype,
68
+ # width=width,
69
+ # heads=heads,
70
+ # init_scale=init_scale,
71
+ # qkv_bias=qkv_bias,
72
+ # flash=flash,
73
+ # )
74
+ # self.mlp = MLP(device=device, dtype=dtype, width=width, init_scale=init_scale)
75
+ # self.output_proj = nn.Linear(width, 1, device=device, dtype=dtype)
76
+
77
+ def forward(self, latents, data=None, device='cuda', downsample=None, dtype=torch.float32):
78
+
79
+ joints = data['joints'].to(device)
80
+ max_joints = max(data['joints_num'])
81
+ joints = joints[:, :max_joints, :3]
82
+
83
+ joints_embeds = self.fourier_embedder(joints)
84
+ joints_embeds = self.co_proj(joints_embeds)
85
+
86
+ joints_num = joints_embeds.shape[-2]
87
+
88
+ x = [joints_embeds, joints_embeds.clone()]
89
+
90
+ for i in range(2):
91
+ for j, layer in enumerate(self.cross_attn):
92
+
93
+ x[i] = layer(x[i], latents)
94
+
95
+ if self.use_mask:
96
+ x[i] = self.self_attn[2*i+j](x[i], mask=data['joints_mask'].to(device))
97
+ else:
98
+ x[i] = self.self_attn[2*i+j](x[i])
99
+
100
+ # Dot product between the two joint-embedding streams -> (b, n, m) connectivity logits
101
+ logits = torch.einsum('bnc,bmc->bnm', self.k_proj(self.ln_1(x[0])), self.q_proj(self.ln_2(x[1]))) # (b, n, m)
102
+
103
+ if self.use_mask:
104
+ mask = data['joints_mask'].to(device)
105
+ logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e8)
106
+
107
+ return logits
108
+
109
+ class AttendjointsDecoder_con_token(nn.Module):
110
+ def __init__(self,
111
+ width = 768,
112
+ layers = 4,
113
+ device = 'cuda',
114
+ dtype = torch.float32,
115
+ heads = 12,
116
+ init_scale: float = 0.25,
117
+ flash = False,
118
+ use_checkpoint = False,
119
+ qkv_bias = False,
120
+ num_freqs: int = 8,
121
+ include_pi: bool = True,
122
+ head_token_length =128,
123
+ separate = False,
124
+ use_mask = True):
125
+
126
+ super().__init__()
127
+
128
+ self.use_checkpoint = use_checkpoint
129
+ self.use_mask = use_mask
130
+ self.layer_norm = nn.LayerNorm(width)
131
+ self.head_token = nn.Parameter(torch.randn((1, 1, head_token_length), device=device, dtype=dtype) * 0.02)
132
+ self.tail_token = nn.Parameter(torch.randn((1, 1, head_token_length), device=device, dtype=dtype) * 0.02)
133
+ self.head_mlp = nn.ModuleList([
134
+ nn.Linear(width + head_token_length, 512, device=device, dtype=dtype),
135
+ nn.Linear(512, 512, device=device, dtype=dtype),
136
+ nn.Linear(512, width, device=device, dtype=dtype),
137
+ nn.LayerNorm(width)
138
+
139
+ ])
140
+ self.tail_mlp = nn.ModuleList([
141
+ nn.Linear(width + head_token_length, 512, device=device, dtype=dtype),
142
+ nn.Linear(512, 512, device=device, dtype=dtype),
143
+ nn.Linear(512, width, device=device, dtype=dtype),
144
+ nn.LayerNorm(width)
145
+ ])
146
+
147
+ self.self_attn = Transformer(
148
+ device=device,
149
+ dtype=dtype,
150
+ n_ctx=-1,
151
+ width=width,
152
+ layers=layers,
153
+ heads=heads,
154
+ init_scale=init_scale,
155
+ qkv_bias=qkv_bias,
156
+ flash=flash,
157
+ use_checkpoint=False,
158
+ )
159
+ self.separate = separate
160
+ self.normal_embedder = components_from_spherical_harmonics
161
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
162
+ self.joints_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)
163
+ self.output_proj_joints = nn.Linear(width, width, device=device, dtype=dtype)
164
+
165
+ def forward(self, latents, data=None,device='cuda', downsample=None, dtype='float32'):
166
+ joints = data['joints'].to(device)
167
+ max_joints = max(data['joints_num'])
168
+ joints = joints[:, :max_joints, :3]
169
+ joints_embeds_fourier = self.fourier_embedder(joints)
170
+ joints_embeds = self.joints_proj(joints_embeds_fourier)
171
+ # Concatenate embeddings
172
+ x = torch.cat([joints_embeds, latents], dim=-2) # (b, max_joint+token_num, c)
173
+ # Pass through self-attention
174
+ if self.use_mask:
175
+ mask = data['mask'].to(device)
176
+ append_size = x.shape[1] - mask.shape[1]  # number of latent tokens that need mask entries appended (filled with ones below)
177
+ batch_size = mask.shape[0]
178
+
179
+ mask_extend = torch.ones((batch_size,append_size)).to(device)
180
+ mask = torch.cat([mask,mask_extend],dim=-1).to(device)
181
+
182
+ x = self.self_attn(x,mask)
183
+ else:
184
+ x = self.self_attn(x)
185
+ joints, _= x.split([joints_embeds.shape[1], latents.shape[1]], dim=1)
186
+ joints = self.output_proj_joints(self.layer_norm(joints))
187
+ joints_head = torch.concat([joints, self.head_token.repeat(joints.shape[0],joints.shape[1],1)], dim=-1)
188
+ joints_tail = torch.concat([joints, self.tail_token.repeat(joints.shape[0],joints.shape[1],1)], dim=-1)
189
+ for layer in self.head_mlp:
190
+ joints_head = layer(joints_head)
191
+ for layer in self.tail_mlp:
192
+ joints_tail = layer(joints_tail)
193
+ logits = torch.einsum('bik,bjk->bij', joints_head, joints_tail)
194
+
195
+ return logits
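A small post-processing sketch for the connectivity decoders above: they return a (B, J, J) logit matrix, so one plausible way to read out a parent per joint is an argmax over the last axis. Which axis is child and which is parent is an assumption here, and the tensor below is synthetic.

import torch

logits = torch.randn(1, 24, 24)            # stand-in for decoder output over 24 joints
parents = logits.argmax(dim=-1)            # (1, 24): one candidate parent index per joint
probs = torch.softmax(logits, dim=-1)      # per-joint distribution over candidate parents
print(parents[0].tolist(), probs.shape)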
Anymate/models/diffusion.py ADDED
@@ -0,0 +1,483 @@
 
 
1
+ F"""
2
+ Adapted from: https://github.com/openai/openai/blob/55363aa496049423c37124b440e9e30366db3ed6/orc/orc/diffusion/vit.py
3
+ """
4
+
5
+ import math
6
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, Callable
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from einops import repeat
12
+ from Anymate.utils.diffusion_utils import *
13
+ from ThirdParty.michelangelo.models.modules.transformer_blocks import Transformer, ResidualCrossAttentionBlock
14
+
15
+ from diffusers import DDPMScheduler, DDIMScheduler
16
+ from sklearn.cluster import DBSCAN
17
+
18
+ def init_linear(l, stddev):
19
+ nn.init.normal_(l.weight, std=stddev)
20
+ if l.bias is not None:
21
+ nn.init.constant_(l.bias, 0.0)
22
+
23
+ class projection_transformer(nn.Module):
24
+ def __init__(self, num_latents=16, width = 16, heads=8, dtype = torch.float32):
25
+ super().__init__()
26
+ self.num_latents = num_latents
27
+ self.query = nn.Parameter(torch.randn((num_latents, width), dtype=dtype) * 0.02)
28
+
29
+ self.cross_attn = ResidualCrossAttentionBlock(
30
+ device= 'cuda',
31
+ dtype=dtype,
32
+ width=width,
33
+ heads=heads,
34
+ init_scale=0.25,
35
+ qkv_bias=True,
36
+ flash=False,
37
+ )
38
+ self.output_proj = nn.Linear(width, width,dtype=dtype)
39
+
40
+ def forward(self, latents):
41
+ bs = latents.shape[0]
42
+ query = repeat(self.query, "m c -> b m c", b=bs)
43
+ embed = self.cross_attn(query, latents)
44
+ logits = self.output_proj(embed)
45
+
46
+ return logits
47
+
48
+ def timestep_embedding(timesteps, dim, max_period=10000):
49
+ """
50
+ Create sinusoidal timestep embeddings.
51
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
52
+ These may be fractional.
53
+ :param dim: the dimension of the output.
54
+ :param max_period: controls the minimum frequency of the embeddings.
55
+ :return: an [N x dim] Tensor of positional embeddings.
56
+ """
57
+ half = dim // 2
58
+ freqs = torch.exp(
59
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
60
+ ).to(device=timesteps.device)
61
+ args = timesteps[:, None].to(timesteps.dtype) * freqs[None]
62
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
63
+ if dim % 2:
64
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
65
+ return embedding
66
+
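A quick shape check for timestep_embedding above (the import path is assumed; the function itself is defined a few lines up): cosine terms fill the first half of the output and sine terms the second half.

import torch
from Anymate.models.diffusion import timestep_embedding  # assumed import path

t = torch.tensor([0.0, 10.0, 999.0])
emb = timestep_embedding(t, dim=768)
print(emb.shape)   # torch.Size([3, 768]); emb[:, :384] are cos terms, emb[:, 384:] are sin terms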
67
+ class MultiheadAttention(nn.Module):
68
+ def __init__(
69
+ self,
70
+ *,
71
+ dtype: torch.dtype,
72
+ n_ctx: int,
73
+ width: int,
74
+ heads: int,
75
+ init_scale: float,
76
+ ):
77
+ super().__init__()
78
+ self.n_ctx = n_ctx
79
+ self.width = width
80
+ self.heads = heads
81
+ self.c_qkv = nn.Linear(width, width * 3, dtype=dtype)
82
+ self.c_proj = nn.Linear(width, width, dtype=dtype)
83
+ self.attention = QKVMultiheadAttention(dtype=dtype, heads=heads, n_ctx=n_ctx)
84
+ init_linear(self.c_qkv, init_scale)
85
+ init_linear(self.c_proj, init_scale)
86
+
87
+ def forward(self, x):
88
+ x = self.c_qkv(x)
89
+ x = self.attention(x)
90
+ x = self.c_proj(x)
91
+ return x
92
+
93
+ class MLP(nn.Module):
94
+ def __init__(self, *, dtype: torch.dtype, width: int, init_scale: float):
95
+ super().__init__()
96
+ self.width = width
97
+ self.c_fc = nn.Linear(width, width * 4, dtype=dtype)
98
+ self.c_proj = nn.Linear(width * 4, width, dtype=dtype)
99
+ self.gelu = nn.GELU()
100
+ init_linear(self.c_fc, init_scale)
101
+ init_linear(self.c_proj, init_scale)
102
+
103
+ def forward(self, x):
104
+ return self.c_proj(self.gelu(self.c_fc(x)))
105
+
106
+ class QKVMultiheadAttention(nn.Module):
107
+ def __init__(self, *, dtype: torch.dtype, heads: int, n_ctx: int):
108
+ super().__init__()
109
+ self.dtype = dtype
110
+ self.heads = heads
111
+ self.n_ctx = n_ctx
112
+
113
+ def forward(self, qkv):
114
+ bs, n_ctx, width = qkv.shape
115
+ attn_ch = width // self.heads // 3
116
+ scale = 1 / math.sqrt(math.sqrt(attn_ch))
117
+ qkv = qkv.view(bs, n_ctx, self.heads, -1)
118
+ q, k, v = torch.split(qkv, attn_ch, dim=-1)
119
+ weight = torch.einsum(
120
+ "bthc,bshc->bhts", q * scale, k * scale
121
+ ) # More stable with f16 than dividing afterwards
122
+ wdtype = weight.dtype
123
+ weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
124
+ return torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
125
+
126
+ class ResidualAttentionBlock(nn.Module):
127
+ def __init__(
128
+ self,
129
+ *,
130
+ dtype: torch.dtype,
131
+ n_ctx: int,
132
+ width: int,
133
+ heads: int,
134
+ init_scale: float = 1.0,
135
+ ):
136
+ super().__init__()
137
+
138
+ self.attn = MultiheadAttention(
139
+ dtype=dtype,
140
+ n_ctx=n_ctx,
141
+ width=width,
142
+ heads=heads,
143
+ init_scale=init_scale,
144
+ )
145
+ self.ln_1 = nn.LayerNorm(width, dtype=dtype)
146
+ self.mlp = MLP(dtype=dtype, width=width, init_scale=init_scale)
147
+ self.ln_2 = nn.LayerNorm(width, dtype=dtype)
148
+
149
+ def forward(self, x: torch.Tensor):
150
+ x = x + self.attn(self.ln_1(x))
151
+ x = x + self.mlp(self.ln_2(x))
152
+ return x
153
+
154
+ class Transformer(nn.Module):
155
+ def __init__(
156
+ self,
157
+ *,
158
+ dtype: torch.dtype,
159
+ n_ctx: int,
160
+ width: int,
161
+ layers: int,
162
+ heads: int,
163
+ init_scale: float = 0.25,
164
+ ):
165
+ super().__init__()
166
+ self.n_ctx = n_ctx
167
+ self.width = width
168
+ self.layers = layers
169
+ init_scale = init_scale * math.sqrt(1.0 / width)
170
+ self.resblocks = nn.ModuleList(
171
+ [
172
+ ResidualAttentionBlock(
173
+ dtype=dtype,
174
+ n_ctx=n_ctx,
175
+ width=width,
176
+ heads=heads,
177
+ init_scale=init_scale,
178
+ )
179
+ for _ in range(layers)
180
+ ]
181
+ )
182
+
183
+ def forward(self, x: torch.Tensor):
184
+ for block in self.resblocks:
185
+ x = block(x)
186
+ return x
187
+
188
+ class PointDiffusionTransformer(nn.Module):
189
+ def __init__(
190
+ self,
191
+ *,
192
+ dtype: torch.dtype,
193
+ input_channels: int = 3,
194
+ output_channels: int = 3,
195
+ n_ctx: int = 1024,
196
+ width: int = 768,
197
+ layers: int = 12,
198
+ heads: int = 8,
199
+ init_scale: float = 0.25,
200
+ time_token_cond: bool = True,
201
+ ):
202
+ super().__init__()
203
+ self.input_channels = input_channels
204
+ self.output_channels = output_channels
205
+ self.n_ctx = n_ctx
206
+ self.time_token_cond = time_token_cond
207
+ self.time_embed = MLP(
208
+ dtype=dtype, width=width, init_scale=init_scale * math.sqrt(1.0 / width)
209
+ )
210
+ self.ln_pre = nn.LayerNorm(width, dtype=dtype)
211
+ self.backbone = Transformer(
212
+ dtype=dtype,
213
+ n_ctx=n_ctx + int(time_token_cond),
214
+ width=width,
215
+ layers=layers,
216
+ heads=heads,
217
+ init_scale=init_scale,
218
+ )
219
+ self.ln_post = nn.LayerNorm(width,dtype=dtype)
220
+ self.input_proj = nn.Linear(input_channels, width, dtype=dtype)
221
+ self.output_proj = nn.Linear(width, output_channels,dtype=dtype)
222
+ with torch.no_grad():
223
+ self.output_proj.weight.zero_()
224
+ self.output_proj.bias.zero_()
225
+
226
+ def forward(self, x: torch.Tensor, t: torch.Tensor):
227
+ """
228
+ :param x: an [N x C x T] tensor.
229
+ :param t: an [N] tensor.
230
+ :return: an [N x C' x T] tensor.
231
+ """
232
+ assert x.shape[-1] == self.n_ctx
233
+ t_embed = self.time_embed(timestep_embedding(t, self.backbone.width))
234
+ return self._forward_with_cond(x, [(t_embed, self.time_token_cond)])
235
+
236
+ def _forward_with_cond(
237
+ self, x: torch.Tensor, cond_as_token: List[Tuple[torch.Tensor, bool]]
238
+ ) -> torch.Tensor:
239
+ h = self.input_proj(x.permute(0, 2, 1)) # NCL -> NLC
240
+ for emb, as_token in cond_as_token:
241
+ if not as_token:
242
+ h = h + emb[:, None]
243
+ extra_tokens = [
244
+ (emb[:, None] if len(emb.shape) == 2 else emb)
245
+ for emb, as_token in cond_as_token
246
+ if as_token
247
+ ]
248
+ if len(extra_tokens):
249
+ h = torch.cat(extra_tokens + [h], dim=1)
250
+
251
+ h = self.ln_pre(h)
252
+ h = self.backbone(h)
253
+ h = self.ln_post(h)
254
+ if len(extra_tokens):
255
+ h = h[:, sum(h.shape[1] for h in extra_tokens) :]
256
+ h = self.output_proj(h)
257
+ return h.permute(0, 2, 1)
258
+
259
+ class Pointe_Diffusion(PointDiffusionTransformer):
260
+ '''
261
+ input: data: data dict
262
+ x: [N x C x T] tensor
263
+ t: [N] tensor
264
+ init:
265
+ n_ctx: int = 1024: context length
266
+ '''
267
+ def __init__(
268
+ self,
269
+ *,
270
+ device = 'cuda',
271
+ dtype = torch.float32,
272
+ encoder = 'miche',
273
+ n_ctx: int = 1024,
274
+ token_cond: bool = True,
275
+ cond_drop_prob: float = 0.1,
276
+ fix_emb: bool = False,
277
+
278
+ **kwargs,
279
+ ):
280
+ super().__init__(dtype=dtype, n_ctx=n_ctx + int(token_cond), **kwargs)
281
+ self.n_ctx = n_ctx
282
+ self.token_cond = token_cond
283
+ # self.proj_transformer = projection_transformer(**kwargs)
284
+ self.encoder_name = encoder
285
+ self.cond_drop_prob = cond_drop_prob
286
+ self.fix_emb = fix_emb
287
+ self.dtype = dtype
288
+ self.inference = False
289
+ def cached_model_kwargs(self, batch_size: int, model_kwargs: Dict[str, Any]) -> Dict[str, Any]:
290
+ with torch.no_grad():
291
+ return dict(embeddings=self.clip(batch_size, **model_kwargs))
292
+
293
+ def inference_mode(self,eps=0.03):
294
+ self.inference = True
295
+
296
+ def forward_func(
297
+ self,
298
+ latent: torch.Tensor,
299
+ data,
300
+ device='cuda',
301
+ downsample = False,
302
+ **kwargs,
303
+ ):
304
+ t = kwargs['timesteps'].to(latent.device)
305
+ x = kwargs['noisy_joints'].to(latent.device)
306
+ assert x.shape[-1] == self.n_ctx, f"x shape: {x.shape}, n_ctx: {self.n_ctx}"
307
+ t_embed = self.time_embed(timestep_embedding(t, self.backbone.width))
308
+
309
+ if self.training:
310
+ mask = torch.rand(size=[len(x)]) >= self.cond_drop_prob
311
+ latent = latent * mask[:,None,None].to(latent.device)
312
+
313
+ latent = [(latent, self.token_cond), (t_embed, self.time_token_cond)]
314
+ return self._forward_with_cond(x, latent)
315
+
316
+ def forward(self, latent, data, device='cuda', downsample = False, **kwargs):
317
+ if not self.inference:
318
+ return self.forward_func(latent, data, device, downsample, **kwargs)
319
+ else:
320
+ generator=torch.Generator(device='cpu')
321
+ scheduler = DDIMScheduler(100)
322
+ scheduler.set_timesteps(100)
323
+ points_shape = [1, self.n_ctx, 3]
324
+
325
+ points_noise = randn_tensor(points_shape, generator=generator)
326
+ points = points_noise.permute(0, 2, 1).to(latent.device)
327
+ for t in scheduler.timesteps:
328
+ with torch.no_grad():
329
+ time_steps = torch.ones(1, 1, dtype=torch.long) * t
330
+ model_output = self.forward_func(latent, data, noisy_joints=points, timesteps = time_steps)
331
+
332
+ points = scheduler.step(model_output, t, points, generator=generator).prev_sample
333
+ points = points.permute(0, 2, 1).cpu()
334
+ assert points.shape[0] == 1, "Inference mode only supports batch size 1"
335
+ joints = points[0].detach().cpu().numpy()
336
+ clustering = DBSCAN(eps=0.05, min_samples=1).fit(joints)
337
+ cluster_centers = []
338
+ for cluster in set(clustering.labels_):
339
+ cluster_centers.append(joints[clustering.labels_ == cluster].mean(axis=0))
340
+ return cluster_centers
341
+
342
+ class Cross_Attention_Diffusion(nn.Module):
343
+ def __init__(self,
344
+ input_channels=3, output_channels=3,
345
+ num_z=16, num_x=1024, z_dim=768, x_dim=512,
346
+ num_blocks=6, num_compute_layers=4, num_heads=8,
347
+ mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
348
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,num_latents=16,
349
+ device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
350
+ use_projection = True,):
351
+ super().__init__()
352
+ self.use_projection = use_projection
353
+ self.device = device
354
+ self.num_z = num_z
355
+ self.num_x = num_x
356
+ self.z_dim = z_dim
357
+ if use_projection:
358
+ self.proj_transformer = projection_transformer(num_latents=num_latents, width=z_dim, heads=num_heads)
359
+ self.prev_latent = nn.Parameter(torch.zeros(1, self.num_z + num_latents + 1, z_dim))
360
+ self.inference = False
361
+
362
+ self.input_proj = nn.Linear(input_channels, x_dim)
363
+ self.ln_pre = nn.LayerNorm(x_dim)
364
+ self.z_init = nn.Parameter(torch.zeros(1, num_z, z_dim))
365
+
366
+ mlp_hidden_dim = int(z_dim * mlp_ratio)
367
+ self.time_embed = Mlp(in_features=z_dim, hidden_features=mlp_hidden_dim)
368
+
369
+ self.latent_mlp = Mlp(in_features=z_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
370
+ self.ln_latent = nn.LayerNorm(z_dim)
371
+ self.blocks = nn.ModuleList([
372
+ RCW_Block(z_dim, x_dim, num_compute_layers=num_compute_layers,
373
+ num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
374
+ drop=drop, attn_drop=attn_drop, drop_path=drop_path,
375
+ act_layer=act_layer, norm_layer=norm_layer)
376
+ for _ in range(num_blocks)
377
+ ])
378
+
379
+ # output blocks
380
+ self.ln_post = nn.LayerNorm(x_dim)
381
+ self.output_proj = nn.Linear(x_dim, output_channels)
382
+
383
+ self.initialize_weights()
384
+
385
+ def initialize_weights(self):
386
+ nn.init.normal_(self.z_init, std=.02)
387
+
388
+ # initialize nn.Linear and nn.LayerNorm
389
+ self.apply(self._init_weights)
390
+
391
+ nn.init.constant_(self.ln_latent.weight, 0)
392
+ nn.init.constant_(self.ln_latent.bias, 0)
393
+
394
+ def _init_weights(self, m):
395
+ if isinstance(m, nn.Linear):
396
+ torch.nn.init.xavier_uniform_(m.weight)
397
+ if isinstance(m, nn.Linear) and m.bias is not None:
398
+ nn.init.constant_(m.bias, 0)
399
+ elif isinstance(m, nn.LayerNorm):
400
+ nn.init.constant_(m.bias, 0)
401
+ nn.init.constant_(m.weight, 1.0)
402
+
403
+ def inference_mode(self,eps=0.03):
404
+ self.inference = True
405
+
406
+ def forward_func(self, latent, data, device='cuda', downsample = False, **kwargs):
407
+ """
408
+ Forward pass of the model.
409
+
410
+ Parameters:
411
+ x: [B, num_x, C_in]
412
+ t: [B]
413
+ cond: [B, num_cond, C_latent]
414
+ prev_latent: [B, num_z + num_cond + 1, C_latent]
415
+
416
+ Returns:
417
+ x_denoised: [B, num_x, C_out]
418
+ z: [B, num_z + num_cond + 1, C_latent]
419
+ """
420
+ t = kwargs['timesteps'].to(latent.device)
421
+ x = kwargs['noisy_joints'].to(latent.device)
422
+ x = x.permute(0, 2, 1)
423
+ B, num_x, _ = x.shape
424
+ if self.use_projection:
425
+ latent = self.proj_transformer(latent)
426
+ assert num_x == self.num_x, f"x shape: {x.shape}, num_x: {self.num_x}"
427
+ # if prev_latent is not None:
428
+ # _, num_z, _ = prev_latent.shape
429
+ # assert num_z == self.num_z + num_cond + 1
430
+ # else:
431
+ # prev_latent = torch.zeros(B, self.num_z + num_cond + 1, self.z_dim).to(x.device)
432
+
433
+ # timestep embedding, [B, 1, z_dim]
434
+ t_embed = self.time_embed(timestep_embedding(t, self.z_dim))
435
+ if t_embed.dim() == 2:
436
+ t_embed = t_embed.unsqueeze(1)
437
+
438
+ # project x -> [B, num_x, C_x]
439
+ x = self.input_proj(x)
440
+ x = self.ln_pre(x)
441
+
442
+ # latent self-conditioning
443
+ z = self.z_init.repeat(B, 1, 1)  # [B, num_z, z_dim]
444
+ z = torch.cat([z, latent, t_embed], dim=1) # [B, num_z + num_cond + 1, z_dim]
445
+ prev_latent = self.prev_latent + self.latent_mlp(self.prev_latent.detach())
446
+ z = z + (self.ln_latent(prev_latent))
447
+
448
+ # compute
449
+ for blk in self.blocks:
450
+ z, x = blk(z, x)
451
+
452
+ # output proj
453
+ x = self.ln_post(x)
454
+ x_denoised = self.output_proj(x)
455
+ return x_denoised.permute(0, 2, 1)
456
+
457
+ def forward(self, latent, data, device='cuda', downsample = False, **kwargs):
458
+ if not self.inference:
459
+ return self.forward_func(latent, data, device, downsample, **kwargs)
460
+ else:
461
+ generator=torch.Generator(device='cpu')
462
+ scheduler = DDIMScheduler(100)
463
+ scheduler.set_timesteps(100)
464
+ points_shape = [1, self.num_x, 3]
465
+
466
+ points_noise = randn_tensor(points_shape, generator=generator)
467
+ points = points_noise.permute(0, 2, 1).to(latent.device)
468
+ for t in scheduler.timesteps:
469
+ with torch.no_grad():
470
+ time_steps = torch.ones(1, 1, dtype=torch.long) * t
471
+ time_steps = time_steps.to(latent.device)
472
+ model_output = self.forward_func(latent, data, noisy_joints=points, timesteps = time_steps)
473
+
474
+ points = scheduler.step(model_output, t, points, generator=generator).prev_sample
475
+ points = points.permute(0, 2, 1).cpu()
476
+ assert points.shape[0] == 1, "Inference mode only supports batch size 1"
477
+ joints = points[0].detach().cpu().numpy()
478
+ clustering = DBSCAN(eps=0.05, min_samples=1).fit(joints)
479
+ cluster_centers = []
480
+ for cluster in set(clustering.labels_):
481
+ cluster_centers.append(joints[clustering.labels_ == cluster].mean(axis=0))
482
+ return cluster_centers
483
+
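Both diffusion heads above collapse the sampled 3D point set into joint predictions by clustering with DBSCAN and averaging each cluster; a standalone sketch of that step, on synthetic points, looks like this.

import numpy as np
from sklearn.cluster import DBSCAN

# Two tight blobs standing in for the (n_ctx, 3) points produced by the DDIM sampler above.
rng = np.random.default_rng(0)
joints = np.concatenate([
    rng.normal([0.0, 0.5, 0.0], 0.01, size=(100, 3)),
    rng.normal([0.0, -0.5, 0.0], 0.01, size=(100, 3)),
])
clustering = DBSCAN(eps=0.05, min_samples=1).fit(joints)
cluster_centers = [joints[clustering.labels_ == c].mean(axis=0)
                   for c in set(clustering.labels_)]
print(len(cluster_centers))   # expected: 2 cluster centers -> 2 predicted joints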
Anymate/models/joint.py ADDED
@@ -0,0 +1,282 @@
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from ThirdParty.michelangelo.models.modules.embedder import FourierEmbedder
4
+ from ThirdParty.michelangelo.models.modules.transformer_blocks import ResidualCrossAttentionBlock
5
+ from ThirdParty.eg3d.training.networks_stylegan2 import Generator as StyleGAN2Backbone
6
+ from ThirdParty.eg3d.training.networks_stylegan2 import FullyConnectedLayer
7
+ from Anymate.utils.vol_utils import get_co, sample_from_planes, generate_planes
8
+ from einops import repeat
9
+ from sklearn.cluster import DBSCAN
10
+ from Anymate.utils.vol_utils import extract_keypoints
11
+
12
+ class TransformerDecoder(nn.Module):
13
+ def __init__(self,
14
+ num_latents = 96,
15
+ num_kv_latents = 257,
16
+ out_channels = 3,
17
+ width = 768,
18
+ layers = 7,
19
+ device = 'cuda',
20
+ dtype = torch.float32,
21
+ heads = 12,
22
+ init_scale: float = 0.25,
23
+ flash = False,
24
+ use_checkpoint = False,
25
+ qkv_bias = False):
26
+
27
+ super().__init__()
28
+
29
+ self.use_checkpoint = use_checkpoint
30
+ self.num_latents = num_latents
31
+ self.inference = False
32
+ self.eps = 0.03
33
+
34
+ self.query = nn.Parameter(torch.randn((num_latents, width), device=device, dtype=dtype) * 0.02)
35
+
36
+ self.cross_attn_decoder = ResidualCrossAttentionBlock(
37
+ device=device,
38
+ dtype=dtype,
39
+ n_data=num_kv_latents,
40
+ width=width,
41
+ heads=heads,
42
+ init_scale=init_scale,
43
+ qkv_bias=qkv_bias,
44
+ flash=flash
45
+ )
46
+
47
+ self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
48
+ self.output_proj = nn.Linear(width, out_channels, device=device, dtype=dtype)
49
+
50
+ def inference_mode(self, eps=0.03, min_samples=1):
51
+ self.inference = True
52
+ self.eps = eps
53
+ self.min_samples = min_samples
54
+
55
+ def forward(self, latents, data=None, device='cuda', downsample=False, dtype=torch.float32):
56
+
57
+ bs = latents.shape[0]
58
+ query = repeat(self.query, "m c -> b m c", b=bs)
59
+ logits = self.cross_attn_decoder(query, latents)
60
+ logits = self.ln_post(logits)
61
+ logits = self.output_proj(logits)
62
+ if self.inference:
63
+ assert logits.shape[0] == 1, "Inference mode only supports batch size 1"
64
+ joints = logits[0].detach().cpu().numpy()
65
+ clustering = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(joints)
66
+ cluster_centers = []
67
+ for cluster in set(clustering.labels_):
68
+ cluster_centers.append(joints[clustering.labels_ == cluster].mean(axis=0))
69
+ return cluster_centers
70
+ return logits
71
+
72
+
73
+ class ImplicitTransformerDecoder(nn.Module):
74
+
75
+ def __init__(self, *,
76
+ device = 'cuda',
77
+ dtype = torch.float32,
78
+ num_latents = 257,
79
+ out_channels = 1,
80
+ width = 768,
81
+ heads = 12,
82
+ num_freqs: int = 8,
83
+ include_pi: bool = True,
84
+ init_scale: float = 0.25,
85
+ qkv_bias: bool = False,
86
+ flash: bool = False,
87
+ use_checkpoint: bool = False):
88
+
89
+ super().__init__()
90
+
91
+ self.use_checkpoint = use_checkpoint
92
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
93
+ self.inference = False
94
+
95
+ self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)
96
+
97
+ self.cross_attn_decoder = ResidualCrossAttentionBlock(
98
+ device=device,
99
+ dtype=dtype,
100
+ n_data=num_latents,
101
+ width=width,
102
+ heads=heads,
103
+ init_scale=init_scale,
104
+ qkv_bias=qkv_bias,
105
+ flash=flash
106
+ )
107
+
108
+ self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
109
+ self.output_proj = nn.Linear(width, out_channels, device=device, dtype=dtype)
110
+
111
+ # self.queries = get_vol().to(device)
112
+
113
+ def inference_mode(self):
114
+ self.inference = True
115
+
116
+ def forward(self, latents: torch.FloatTensor, data=None, device='cuda', downsample=False):
117
+ bs = latents.shape[0]
118
+ # queries = repeat(self.queries, "m c -> b m c", b=bs)
119
+ out = []
120
+ for b in range(bs):
121
+ queries = get_co(data['vox'][b]).to(device).unsqueeze(0)
122
+ if downsample and data['vox'][b].shape[0] > 50000:
123
+ # random sample
124
+ idx = torch.randperm(data['vox'][b].shape[0])[:50000]
125
+ queries = queries[:, idx]
126
+ queries = self.query_proj(self.fourier_embedder(queries))
127
+ x = self.cross_attn_decoder(queries, latents[b:b+1])
128
+ x = self.ln_post(x)
129
+ x = self.output_proj(x)
130
+ if downsample and data['vox'][b].shape[0] > 50000:
131
+ out.append((x.squeeze(0), idx))
132
+ else:
133
+ out.append(x.squeeze(0))
134
+ if self.inference:
135
+ assert len(out) == 1, "Inference mode only supports batch size 1"
136
+ return extract_keypoints(out[0], data['vox'][0])
137
+
138
+ return out
139
+
140
+
141
+ class TriPlaneDecoder(torch.nn.Module):
142
+ def __init__(self,
143
+ z_dim = 768, # Input latent (Z) dimensionality.
144
+ c_dim = 0, # Conditioning label (C) dimensionality.
145
+ w_dim = 768, # Intermediate latent (W) dimensionality.
146
+ # img_resolution, # Output resolution.
147
+ # img_channels, # Number of output color channels.
148
+ # sr_num_fp16_res = 0,
149
+ mapping_kwargs = {'num_layers': 2}, # Arguments for MappingNetwork.
150
+ # rendering_kwargs = {},
151
+ # sr_kwargs = {},
152
+ synthesis_kwargs = {'num_fp16_res': 0, 'conv_clamp': None, 'fused_modconv_default': 'inference_only'}, # Arguments for SynthesisNetwork.
153
+ ):
154
+ super().__init__()
155
+ self.z_dim=z_dim
156
+ self.c_dim=c_dim
157
+ self.w_dim=w_dim
158
+ # self.img_resolution=img_resolution
159
+ # self.img_channels=img_channels
160
+ # self.renderer = ImportanceRenderer()
161
+ # self.ray_sampler = RaySampler()
162
+ self.backbone = StyleGAN2Backbone(z_dim, c_dim, w_dim, img_resolution=256, img_channels=32*3, mapping_kwargs=mapping_kwargs, **synthesis_kwargs)
163
+ # self.superresolution = dnnlib.util.construct_class_by_name(class_name=rendering_kwargs['superresolution_module'], channels=32, img_resolution=img_resolution, sr_num_fp16_res=sr_num_fp16_res, sr_antialias=rendering_kwargs['sr_antialias'], **sr_kwargs)
164
+ self.decoder = OSGDecoder(32, {'decoder_output_dim': 0})
165
+ self.inference = False
166
+ # self.neural_rendering_resolution = 64
167
+ # self.rendering_kwargs = rendering_kwargs
168
+
169
+ self._last_planes = None
170
+ self.plane_axes = generate_planes()
171
+
172
+ def mapping(self, z, c=None, truncation_psi=1, truncation_cutoff=None, update_emas=False):
173
+ # if self.rendering_kwargs['c_gen_conditioning_zero']:
174
+ # c = torch.zeros_like(c)
175
+ # return self.backbone.mapping(z, c * self.rendering_kwargs.get('c_scale', 0), truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas)
176
+ return self.backbone.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas)
177
+
178
+ def synthesis(self, ws, c=None, neural_rendering_resolution=None, update_emas=False, cache_backbone=False, use_cached_backbone=False, **synthesis_kwargs):
179
+ # cam2world_matrix = c[:, :16].view(-1, 4, 4)
180
+ # intrinsics = c[:, 16:25].view(-1, 3, 3)
181
+
182
+ # if neural_rendering_resolution is None:
183
+ # neural_rendering_resolution = self.neural_rendering_resolution
184
+ # else:
185
+ # self.neural_rendering_resolution = neural_rendering_resolution
186
+
187
+ # Create a batch of rays for volume rendering
188
+ # ray_origins, ray_directions = self.ray_sampler(cam2world_matrix, intrinsics, neural_rendering_resolution)
189
+
190
+ # Create triplanes by running StyleGAN backbone
191
+ # N, M, _ = ray_origins.shape
192
+ if use_cached_backbone and self._last_planes is not None:
193
+ planes = self._last_planes
194
+ else:
195
+ planes = self.backbone.synthesis(ws, update_emas=update_emas, **synthesis_kwargs)
196
+ if cache_backbone:
197
+ self._last_planes = planes
198
+
199
+ # Reshape output into three 32-channel planes
200
+ planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1])
201
+ return planes  # early return: the volume-rendering branch below is unreachable here
202
+
203
+ # Perform volume rendering
204
+ feature_samples, depth_samples, weights_samples = self.renderer(planes, self.decoder, ray_origins, ray_directions, self.rendering_kwargs) # channels last
205
+
206
+ # Reshape into 'raw' neural-rendered image
207
+ H = W = self.neural_rendering_resolution
208
+ feature_image = feature_samples.permute(0, 2, 1).reshape(N, feature_samples.shape[-1], H, W).contiguous()
209
+ depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
210
+
211
+ # Run superresolution to get final image
212
+ rgb_image = feature_image[:, :3]
213
+ sr_image = self.superresolution(rgb_image, feature_image, ws, noise_mode=self.rendering_kwargs['superresolution_noise_mode'], **{k:synthesis_kwargs[k] for k in synthesis_kwargs.keys() if k != 'noise_mode'})
214
+
215
+ return {'image': sr_image, 'image_raw': rgb_image, 'image_depth': depth_image}
216
+
217
+ def sample(self, coordinates, directions, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs):
218
+ # Compute RGB features, density for arbitrary 3D coordinates. Mostly used for extracting shapes.
219
+ ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas)
220
+ planes = self.backbone.synthesis(ws, update_emas=update_emas, **synthesis_kwargs)
221
+ planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1])
222
+ return self.renderer.run_model(planes, self.decoder, coordinates, directions, self.rendering_kwargs)
223
+
224
+ def sample_mixed(self, coordinates, directions, ws, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs):
225
+ # Same as sample, but expects latent vectors 'ws' instead of Gaussian noise 'z'
226
+ planes = self.backbone.synthesis(ws, update_emas = update_emas, **synthesis_kwargs)
227
+ planes = planes.view(len(planes), 3, 32, planes.shape[-2], planes.shape[-1])
228
+ return self.renderer.run_model(planes, self.decoder, coordinates, directions, self.rendering_kwargs)
229
+
230
+ def inference_mode(self):
231
+ self.inference = True
232
+
233
+ def forward(self, z, data=None, device='cuda', downsample=False, c=None, truncation_psi=1, truncation_cutoff=None, neural_rendering_resolution=None, update_emas=False, cache_backbone=False, use_cached_backbone=False, **synthesis_kwargs):
234
+ # Render a batch of generated images.
235
+ assert z.shape[-1] == self.z_dim
236
+ ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas)
237
+ planes = self.synthesis(ws, c, update_emas=update_emas, neural_rendering_resolution=neural_rendering_resolution, cache_backbone=cache_backbone, use_cached_backbone=use_cached_backbone, **synthesis_kwargs)
238
+ bs = planes.shape[0]
239
+ logits = []
240
+ for b in range(bs):
241
+ queries = get_co(data['vox'][b]).to(device).unsqueeze(0)
242
+ if downsample and data['vox'][b].shape[0] > 50000:
243
+ # random sample
244
+ idx = torch.randperm(data['vox'][b].shape[0])[:50000]
245
+ queries = queries[:, idx]
246
+ out = sample_from_planes(self.plane_axes.to(device), planes[b:b+1], queries)
247
+ out = self.decoder(out)
248
+ if downsample and data['vox'][b].shape[0] > 50000:
249
+ logits.append((out.squeeze(0), idx))
250
+ else:
251
+ logits.append(out.squeeze(0))
252
+ if self.inference:
253
+ assert len(logits) == 1, "Inference mode only supports batch size 1"
254
+ return extract_keypoints(logits[0], data['vox'][0])
255
+ return logits
256
+
257
+
258
+ class OSGDecoder(torch.nn.Module):
259
+ def __init__(self, n_features, options):
260
+ super().__init__()
261
+ self.hidden_dim = 64
262
+
263
+ self.net = torch.nn.Sequential(
264
+ FullyConnectedLayer(n_features, self.hidden_dim),
265
+ torch.nn.Softplus(),
266
+ FullyConnectedLayer(self.hidden_dim, 1 + options['decoder_output_dim'])
267
+ )
268
+
269
+ def forward(self, sampled_features, ray_directions=None):
270
+ # Aggregate features
271
+ sampled_features = sampled_features.mean(1)
272
+ x = sampled_features
273
+
274
+ N, M, C = x.shape
275
+ x = x.view(N*M, C)
276
+
277
+ x = self.net(x)
278
+ x = x.view(N, M, -1)
279
+ return x  # early return: the rgb/sigma branch below is unreachable
280
+ rgb = torch.sigmoid(x[..., 1:])*(1 + 2*0.001) - 0.001 # Uses sigmoid clamping from MipNeRF
281
+ sigma = x[..., 0:1]
282
+ return {'rgb': rgb, 'sigma': sigma}
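A hedged shape check for OSGDecoder above, assuming ThirdParty.eg3d is importable (it supplies FullyConnectedLayer) and that the class is reachable as Anymate.models.joint.OSGDecoder: features sampled from the three planes are averaged, then mapped to one logit per query point since decoder_output_dim is 0 here.

import torch
from Anymate.models.joint import OSGDecoder  # assumed import path

dec = OSGDecoder(32, {'decoder_output_dim': 0})
feats = torch.randn(2, 3, 4096, 32)   # (batch, 3 planes, M query points, 32 channels)
out = dec(feats)                      # planes averaged, then the small Softplus MLP applied
print(out.shape)                      # torch.Size([2, 4096, 1])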
Anymate/models/skin.py ADDED
@@ -0,0 +1,309 @@
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from ThirdParty.michelangelo.models.modules.transformer_blocks import ResidualCrossAttentionBlock, Transformer
4
+ from ThirdParty.michelangelo.models.modules.embedder import components_from_spherical_harmonics, FourierEmbedder
5
+ from einops import repeat, rearrange
6
+
7
+ class AttendjointsDecoder_combine(nn.Module):
8
+ def __init__(self,
9
+ width = 768,
10
+ layers = 2,
11
+ device = 'cuda',
12
+ dtype = torch.float32,
13
+ heads = 12,
14
+ init_scale: float = 0.25,
15
+ flash = False,
16
+ use_checkpoint = False,
17
+ qkv_bias = False,
18
+ num_freqs: int = 8,
19
+ include_pi: bool = True,
20
+ separate = False,
21
+ use_mask = True,
22
+ use_bone = True,
23
+ inference= False):
24
+
25
+ super().__init__()
26
+ self.inference = inference
27
+ self.use_checkpoint = use_checkpoint
28
+ self.separate = separate
29
+ self.use_mask = use_mask
30
+ # self.num_latents = num_latents
31
+
32
+ # self.query = nn.Parameter(torch.randn((num_latents, width), device=device, dtype=dtype) * 0.02)
33
+
34
+ self.normal_embedder = components_from_spherical_harmonics
35
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
36
+ self.bone_proj = None if not use_bone else nn.Linear(self.fourier_embedder.out_dim * 2, width, device=device, dtype=dtype)
37
+ self.use_bone = use_bone
38
+
39
+ if not self.separate:
40
+ self.co_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)
41
+ self.normal_proj = nn.Linear(25, width, device=device, dtype=dtype)
42
+ else:
43
+ self.pc_proj = nn.Linear(self.fourier_embedder.out_dim + 25, width, device=device, dtype=dtype)
44
+
45
+
46
+ # self.proj_attn = nn.Linear(width, width, device=device, dtype=dtype)
47
+
48
+ self.cross_attn = nn.ModuleList([ResidualCrossAttentionBlock(
49
+ device=device,
50
+ dtype=dtype,
51
+ width=width,
52
+ heads=heads,
53
+ init_scale=init_scale,
54
+ qkv_bias=qkv_bias,
55
+ flash=flash,
56
+ ) for _ in range(layers)])
57
+
58
+ self.cross_attn_joint = nn.ModuleList([ResidualCrossAttentionBlock(
59
+ device=device,
60
+ dtype=dtype,
61
+ width=width,
62
+ heads=heads,
63
+ init_scale=init_scale,
64
+ qkv_bias=qkv_bias,
65
+ flash=flash,
66
+ ) for _ in range(layers)])
67
+
68
+ # self.joint_embed_proj = nn.ModuleList([nn.Linear(width, width, device=device, dtype=dtype) for _ in range(layers)])
69
+
70
+
71
+ self.q_proj = nn.Linear(width, width, device=device, dtype=dtype)
72
+ self.k_proj = nn.Linear(width, width, device=device, dtype=dtype)
73
+ self.ln_1 = nn.LayerNorm(width, device=device, dtype=dtype)
74
+ self.ln_2 = nn.LayerNorm(width, device=device, dtype=dtype)
75
+
76
+ # self.last_cross_attn = ResidualCrossAttentionBlock(
77
+ # device=device,
78
+ # dtype=dtype,
79
+ # width=width,
80
+ # heads=heads,
81
+ # init_scale=init_scale,
82
+ # qkv_bias=qkv_bias,
83
+ # flash=flash,
84
+ # )
85
+ # self.mlp = MLP(device=device, dtype=dtype, width=width, init_scale=init_scale)
86
+ # self.output_proj = nn.Linear(width, 1, device=device, dtype=dtype)
87
+
88
+ def forward(self, latents, data=None, device='cuda', downsample=None, dtype=torch.float32):
89
+ joints = data['bones'].to(device) if self.use_bone else data['joints'].to(device)
90
+ max_joints = max(data['bones_num']) if self.use_bone else max(data['joints_num'])
91
+ mask = data['bones_mask'].to(device) if self.use_bone else data['joints_mask']
92
+
93
+ pc = data['vertices'][..., 0:3].to(device) if self.inference else data['points_cloud'][..., 0:3].to(device)
94
+ feats = data['vertices'][..., 3:].to(device) if self.inference else data['points_cloud'][..., 3:].to(device)
95
+
96
+ if downsample and not self.inference:
97
+ # random sample
98
+ idx = torch.randperm(pc.shape[1])[:downsample].to(device)
99
+ pc = pc[:, idx]
100
+ feats = feats[:, idx]
101
+
102
+ # Embed the input data
103
+ co_embeds = self.fourier_embedder(pc)
104
+ if not self.separate:
105
+ co_embeds = self.co_proj(co_embeds)
106
+
107
+ if self.use_bone:
108
+ # joints_fourier = torch.cat((self.fourier_embedder(joints[:,:max_joints*2:2, :3]), self.fourier_embedder(joints[:,1:max_joints*2:2, :3])), dim=-1)
109
+ joints_fourier = torch.cat((self.fourier_embedder(joints[:,:max_joints,:3]), self.fourier_embedder(joints[:,:max_joints, 3:])), dim=-1)
110
+ else:
111
+ joints_fourier = self.fourier_embedder(joints[:,:max_joints, :3])
112
+
113
+ if not self.separate:
114
+ joints_embeds = self.co_proj(joints_fourier) if not self.use_bone else self.bone_proj(joints_fourier)
115
+
116
+ normal_embeds = self.normal_proj(self.normal_embedder(feats)) if not self.separate else self.normal_embedder(feats)
117
+
118
+ if not self.separate:
119
+ pc_embeds = co_embeds + normal_embeds
120
+ else:
121
+ joints_embeds = self.co_proj(joints_fourier.to(dtype)) if not self.use_bone else self.bone_proj(joints_fourier.to(dtype))
122
+ pc_embeds = self.pc_proj(torch.cat([co_embeds.to(dtype), normal_embeds.to(dtype)], dim=-1))
123
+
124
+ pc_num = pc_embeds.shape[-2]
125
+ joints_num = joints_embeds.shape[-2]
126
+ x = torch.cat([pc_embeds, joints_embeds], dim=-2)
127
+ for i, layer in enumerate(self.cross_attn):
128
+
129
+ x = layer(x, latents)
130
+ if self.use_mask:
131
+ x = self.cross_attn_joint[i](x, x[:, pc_num:], mask=mask.to(device))
132
+ else:
133
+ x = self.cross_attn_joint[i](x, x[:, pc_num:])
134
+ pc_embeds, joints_embeds = x.split([pc_num, joints_num], dim=1)
135
+
136
+ logits = torch.einsum('bnc,bmc->bnm', self.k_proj(self.ln_1(pc_embeds)), self.q_proj(self.ln_2(joints_embeds))) # (b, n, m)
137
+
138
+ if self.use_mask:
139
+ logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e8)
140
+
141
+ if downsample and not self.inference:
142
+ return logits, idx
143
+
144
+ return logits
145
+
146
+ class AttendjointsDecoder_multi(nn.Module):
147
+ def __init__(self,
148
+ # num_latents = 64,
149
+ # num_kv_latents = 257,
150
+ # out_channels = 3,
151
+ width = 768,
152
+ layers = 4,
153
+ device = 'cuda',
154
+ dtype = torch.float32,
155
+ heads = 12,
156
+ init_scale: float = 0.25,
157
+ flash = False,
158
+ use_checkpoint = False,
159
+ qkv_bias = False,
160
+ num_freqs: int = 8,
161
+ concat_num: int = 512,
162
+ include_pi: bool = True,
163
+ separate = False,
164
+ use_mask = True,
165
+ inference_with_repeat=False,
166
+ use_bone = True,
167
+ inference = False):
168
+
169
+ super().__init__()
170
+
171
+ self.use_checkpoint = use_checkpoint
172
+ self.use_mask = use_mask
173
+ self.inference_with_repeat = inference_with_repeat
174
+ self.inference = inference
175
+
176
+ self.self_attn = Transformer(
177
+ device=device,
178
+ dtype=dtype,
179
+ n_ctx=-1,
180
+ width=width,
181
+ layers=layers,
182
+ heads=heads,
183
+ init_scale=init_scale,
184
+ qkv_bias=qkv_bias,
185
+ flash=flash,
186
+ use_checkpoint=False,
187
+
188
+ )
189
+ self.concat_number = concat_num
190
+ self.separate = separate
191
+ self.normal_embedder = components_from_spherical_harmonics
192
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
193
+ self.bone_proj = None if not use_bone else nn.Linear(self.fourier_embedder.out_dim * 2, width, device=device, dtype=dtype)
194
+ self.use_bone = use_bone
195
+ if not self.separate:
196
+ self.co_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)
197
+ self.normal_proj = nn.Linear(25, width, device=device, dtype=dtype)
198
+ else:
199
+ self.pc_proj = nn.Linear(self.fourier_embedder.out_dim + 25, width, device=device, dtype=dtype)
200
+
201
+ # self.proj_attn = nn.Linear(width, width, device=device, dtype=dtype)
202
+
203
+ # self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
204
+ self.output_proj_joints = nn.Linear(width, width, device=device, dtype=dtype)
205
+ self.output_proj_points = nn.Linear(width, width, device=device, dtype=dtype)
206
+ self.layer_norm = nn.LayerNorm(width)
207
+
208
+ # def inference(self, latents, data=None,device='cuda', dtype='float32', use_mask=False):
209
+ def inference_mode(self):
210
+ self.inference = True
211
+
212
+ def forward(self, latents, data=None,device='cuda', downsample=None, dtype='float32'):
213
+ joints = data['bones'].to(device) if self.use_bone else data['joints'].to(device)
214
+ max_joints = max(data['bones_num']) if self.use_bone else max(data['joints_num'])
215
+
216
+ pc = data['points_cloud'][..., 0:3].to(device)
217
+ feats = data['points_cloud'][..., 3:].to(device)
218
+
219
+ if downsample:
220
+ # random sample
221
+ idx = torch.randperm(pc.shape[1])[:downsample].to(device)
222
+ pc = pc[:, idx]
223
+ feats = feats[:, idx]
224
+
225
+ bs = pc.shape[1]//self.concat_number
226
+
227
+ # Embed the input data
228
+ if self.use_bone:
229
+ # joints_fourier = torch.cat((self.fourier_embedder(joints[:,:max_joints*2:2, :3]), self.fourier_embedder(joints[:,1:max_joints*2:2, :3])), dim=-1)
230
+ joints_fourier = torch.cat((self.fourier_embedder(joints[:,:max_joints,:3]), self.fourier_embedder(joints[:,:max_joints, 3:])), dim=-1)
231
+ else:
232
+ joints_fourier = self.fourier_embedder(joints[:,:max_joints, :3])
233
+
234
+ if self.separate:
235
+ joints_embeds = self.co_proj(joints_fourier.to(dtype)) if not self.use_bone else self.bone_proj(joints_fourier.to(dtype))
236
+ points_embeds = self.fourier_embedder(pc)
237
+ normal_embeds = self.normal_embedder(feats)
238
+ points = self.pc_proj(torch.cat([points_embeds, normal_embeds], dim=-1))
239
+ else:
240
+ joints_embeds = self.co_proj(joints_fourier) if not self.use_bone else self.bone_proj(joints_fourier)
241
+ co_embeds = self.fourier_embedder(pc)
242
+ co_embeds = self.co_proj(co_embeds)
243
+ # Embed the normals
244
+ normal_embeds = self.normal_embedder(feats)
245
+ normal_embeds = self.normal_proj(normal_embeds) # (b, n, c)
246
+ points = (co_embeds + normal_embeds)
247
+
248
+ repeated_latents = repeat(latents, "b m c -> b n m c", n=bs)
249
+ repeated_joints = repeat(joints_embeds, "b m c -> b n m c", n=bs)
250
+ points = points.reshape( latents.shape[0], bs, self.concat_number, -1)
251
+
252
+ # Concatenate embeddings
253
+ x = torch.cat([repeated_joints, points, repeated_latents], dim=-2) # (b, bs, concat_number+latent_num+joints_num, c)
254
+
255
+ # Pass through self-attention
256
+ if self.use_mask:
257
+ mask = data['bones_mask'].to(device)
258
+ append_size = x.shape[2]-mask.shape[1] # the zero needs to append after mask
259
+ batch_size = mask.shape[0]
260
+ mask_extend = torch.ones((batch_size,append_size)).to(device)
261
+ mask = torch.cat([mask,mask_extend],dim=-1).repeat(bs,1).to(device)
262
+ x = rearrange(x, "b n m c -> (b n) m c")
263
+ x = self.self_attn(x,mask)
264
+ else:
265
+ x = rearrange(x, "b n m c -> (b n) m c")
266
+ x = self.self_attn(x)
267
+ joints, points, _ = x.split([joints_embeds.shape[1],self.concat_number, latents.shape[1]], dim=1)
268
+ joints = self.output_proj_joints(self.layer_norm(joints))
269
+ points = self.output_proj_points(self.layer_norm(points))
270
+
271
+ logits = torch.einsum('bik,bjk->bij', points, joints)
272
+ logits = rearrange(logits, '(b n) m c -> b (n m) c', b=pc.shape[0],n=bs) # (b, n, c)
273
+
274
+ if self.use_mask:
275
+ mask = data['bones_mask'].to(device)
276
+ logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e8)
277
+
278
+ if self.inference:
279
+ vertices = data['vertice']
280
+ points_cloud = data['points_cloud'][0,..., 0:3].to(device)
281
+ vertices_exp = vertices[0,...,:3] # (batch_size, num_vertices, 1, 3)
282
+ logits = compute_nearest_points(vertices_exp, points_cloud, logits[0], device)
283
+
284
+ if downsample:
285
+ return logits, idx
286
+
287
+ return logits
288
+
289
+ def compute_nearest_points(vertices, points, logits, device, batch_size=1024):
290
+ # vertices: [N, 3]
291
+ # points: [M, 3]
292
+ # logits: [M, K] (K is the number of skinning weights)
293
+
294
+ num_vertices = vertices.shape[0]
295
+ # Initialize the output tensor for skinning weights
296
+ skin_predict = torch.zeros((num_vertices, logits.shape[1]), device=device)
297
+
298
+ # Split vertices into batches
299
+ for i in range(0, num_vertices, batch_size):
300
+
301
+ batch_vertices = vertices[i:i+batch_size] # [batch_size, 3]
302
+ vertices_exp = batch_vertices.unsqueeze(1) # [batch_size, 1, 3]
303
+ points_exp = points.unsqueeze(0) # [1, num_points, 3]
304
+ distances = torch.sum((vertices_exp - points_exp) ** 2, dim=-1) # [batch_size, num_points]
305
+ nearest_idx = torch.argmin(distances, dim=-1) # [batch_size]
306
+ skin_predict_batch = logits[nearest_idx] # [batch_size, K]
307
+ skin_predict[i:i+batch_size] = skin_predict_batch
308
+
309
+ return skin_predict
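The skinning decoders above emit per-point logits over joints with padded joints masked to -1e8; a softmax over the joint axis turns them into normalized skin weights, and compute_nearest_points() transfers those weights from sampled points to mesh vertices. A synthetic sketch of both steps (nearest neighbours done with torch.cdist here for brevity):

import torch

logits = torch.randn(6, 4)                     # 6 sampled points, 4 joints
logits[:, 3] = -1e8                            # pretend the last joint is padding
weights = torch.softmax(logits, dim=-1)        # rows sum to 1, padded joint gets ~0 weight

points = torch.rand(6, 3)                      # sampled point positions
vertices = torch.rand(10, 3)                   # mesh vertex positions
nearest = torch.cdist(vertices, points).argmin(dim=-1)
vertex_weights = weights[nearest]              # (10, 4) skin weights copied per vertex
print(vertex_weights.sum(dim=-1))              # ~1.0 everywhere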
Anymate/tmp/.gitkeep ADDED
File without changes
Anymate/utils/dataset_utils.py ADDED
@@ -0,0 +1,129 @@
 
 
1
+ import numpy as np
2
+ import torch
3
+ import trimesh
4
+ from ThirdParty.Rignet_utils import binvox_rw
5
+
6
+
7
+ def sparse_to_index(sparse_matrix):
8
+ index = []
9
+ weight = []
10
+ for j in range(len(sparse_matrix)):
11
+ if sparse_matrix[j] > 0:
12
+ index.append(j)
13
+ weight.append(sparse_matrix[j])
14
+
15
+ return index, weight
16
+
17
+ def index_to_sparse(index, weight, shape):
18
+ sparse_matrix = np.zeros([shape[0], shape[1], shape[2]+1])
19
+
20
+ row_indices, col_indices = np.meshgrid(np.arange(sparse_matrix.shape[0]), np.arange(sparse_matrix.shape[1]), indexing='ij')
21
+
22
+ row_indices = np.expand_dims(row_indices, axis=-1)
23
+ col_indices = np.expand_dims(col_indices, axis=-1)
24
+
25
+ sparse_matrix[row_indices, col_indices, index] = weight
26
+
27
+
28
+ return torch.from_numpy(sparse_matrix[:, :, :-1])
29
+
30
+ def index_to_sparse_con(index, shape):
31
+
32
+ sparse_matrix = np.zeros([shape[0], shape[1], shape[2]+1],dtype=np.int8)
33
+ row_indices, col_indices = np.meshgrid(np.arange(sparse_matrix.shape[0]), np.arange(sparse_matrix.shape[1]), indexing='ij')
34
+
35
+ row_indices = np.expand_dims(row_indices, axis=-1)
36
+ col_indices = np.expand_dims(col_indices, axis=-1)
37
+
38
+ sparse_matrix[row_indices, col_indices, index] = 1
39
+
40
+
41
+ return torch.from_numpy(sparse_matrix[:, :, :-1])
42
+
43
+ def create_mask(n, max_len=64):
44
+ mask = torch.zeros(max_len, dtype=torch.bool)
45
+ mask[:n] = 1
46
+ return mask
47
+
48
+ def reduce(vox):
49
+ new_data = np.zeros((vox.dims[0] // 2, vox.dims[1] // 2, vox.dims[2] // 2)).astype(bool)
50
+ new_data = np.logical_or(new_data, vox.data[::2, ::2, ::2])
51
+ new_data = np.logical_or(new_data, vox.data[1::2, ::2, ::2])
52
+ new_data = np.logical_or(new_data, vox.data[::2, 1::2, ::2])
53
+ new_data = np.logical_or(new_data, vox.data[::2, ::2, 1::2])
54
+ new_data = np.logical_or(new_data, vox.data[1::2, 1::2, ::2])
55
+ new_data = np.logical_or(new_data, vox.data[1::2, ::2, 1::2])
56
+ new_data = np.logical_or(new_data, vox.data[::2, 1::2, 1::2])
57
+ new_data = np.logical_or(new_data, vox.data[1::2, 1::2, 1::2])
58
+ # dilate the new voxel
59
+ new_data[:-1, :, :] = np.logical_or(new_data[:-1, :, :], new_data[1:, :, :])
60
+ new_data[:, :-1, :] = np.logical_or(new_data[:, :-1, :], new_data[:, 1:, :])
61
+ new_data[:, :, :-1] = np.logical_or(new_data[:, :, :-1], new_data[:, :, 1:])
62
+ return binvox_rw.Voxels(new_data, new_data.shape, vox.translate, vox.scale, vox.axis_order)
63
+
64
+ def align(vox, y_max):
65
+ new_data = np.zeros(vox.dims).astype(bool)
66
+ ind = np.argwhere(vox.data)
67
+ ind = ind + (np.array(vox.translate) - np.array([-0.5, -0.5 * (1 - y_max), -0.5])) * vox.dims[0]
68
+ # snap to integer voxel indices (ceil is used; rounding kept below for reference)
69
+ # ind = np.round(ind).astype(int)
70
+ ind = np.ceil(ind).astype(int)
71
+ # clip to the valid range
72
+ ind = np.clip(ind, 0, vox.dims[0] - 1)
73
+ # new_data[ind[:, 0], ind[:, 1], ind[:, 2]] = True
74
+ return ind
75
+
76
+ def get_skin_direction(joint_idx, data, parent_index, joints_matrix):
77
+ # Get points influenced by this joint (weight > 0)
78
+ weights = index_to_sparse(data['skins_index'].unsqueeze(0), data['skins_weight'].unsqueeze(0), [1, 8192, data['bones_num']])[0][:,joint_idx]
79
+ mask = weights > 0
80
+
81
+ if not torch.any(mask):
82
+ # If no points are influenced, return the opposite direction of its parent
83
+ parent_idx = parent_index[joint_idx].item()
84
+ if parent_idx == joint_idx:
85
+ return torch.tensor([0, 0, 0.001])
86
+ parent_pos = joints_matrix[parent_idx, :3]
87
+ joint_pos = joints_matrix[joint_idx, :3]
88
+ direction = joint_pos - parent_pos
89
+ norm = torch.norm(direction)
90
+ if norm < 1e-8: # Add check for zero norm
91
+ return torch.tensor([0, 0, 0.001])
92
+ normalized_direction = direction / norm
93
+ return normalized_direction * 0.01
94
+
95
+ # Get joint position
96
+ joint_pos = joints_matrix[joint_idx, :3]
97
+
98
+ # Get weighted average direction from joint to influenced points
99
+ points = data['pc'][mask][:,:3]
100
+ point_weights = weights[mask]
101
+
102
+ # Calculate directions from joint to each point
103
+ directions = points - joint_pos
104
+
105
+ # Calculate weighted average direction
106
+ avg_direction = torch.sum(directions * point_weights.unsqueeze(1), dim=0) / torch.sum(point_weights)
107
+ if torch.norm(avg_direction) < 1e-5:
108
+ return torch.tensor([0, 0, 0.001])
109
+ return avg_direction * 1.25
110
+
111
+ def obj2mesh(obj_path):
112
+ # open the obj as txt
113
+ vertices = []
114
+ faces = []
115
+ with open(obj_path, 'r') as f:
116
+ obj = f.readlines()
117
+ for line in obj:
118
+ if line.startswith('v '):
119
+ vertices.append(list(map(float, line.split()[1:])))
120
+ elif line.startswith('f '):
121
+ faces.append(list(map(int, [i.split('/')[0] for i in line.split()[1:]])))
122
+ vertices = np.array(vertices)
123
+ faces = np.array(faces) - 1
124
+ # print(vertices.shape, faces.shape)
125
+
126
+ # create trimesh mesh with given vertices and faces
127
+ mesh = trimesh.Trimesh(vertices, faces, process=False)
128
+ # print(mesh.vertices.shape, mesh.faces.shape)
129
+ return mesh
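
A small round-trip sketch of the padded-index convention that `index_to_sparse` and `sparse_to_index` above appear to assume (unused slots point at the extra dummy column, which is dropped); the toy sizes are hypothetical and the two helpers are assumed importable from this module:

import numpy as np

# 1 mesh, 4 vertices, 3 bones, at most 2 influences per vertex;
# index 3 (== num_bones) is the dummy slot that index_to_sparse discards.
num_bones = 3
index = np.array([[[0, 1], [2, 3], [1, 3], [0, 2]]])                    # (1, 4, 2)
weight = np.array([[[0.7, 0.3], [1.0, 0.0], [1.0, 0.0], [0.5, 0.5]]])   # (1, 4, 2)

dense = index_to_sparse(index, weight, [1, 4, num_bones])               # (1, 4, 3) torch tensor
print(dense[0])                                  # dense per-vertex skinning weights

idx, w = sparse_to_index(dense[0, 0].numpy())    # recover vertex 0
print(idx, w)                                    # indices [0, 1], weights [0.7, 0.3]
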
Anymate/utils/diffusion_encoder.py ADDED
@@ -0,0 +1,258 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Optional
4
+ from einops import repeat
5
+ import math
6
+ from ThirdParty.michelangelo.models.modules.transformer_blocks import ResidualCrossAttentionBlock,Transformer, checkpoint
7
+ from torch.nn import Sequential, Dropout, Linear, ReLU, Parameter, BatchNorm1d
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ class ShapeAsLatentModule(nn.Module):
11
+ latent_shape: Tuple[int, int]
12
+
13
+ def __init__(self, *args, **kwargs):
14
+ super().__init__()
15
+
16
+ def encode(self, *args, **kwargs):
17
+ raise NotImplementedError
18
+
19
+ def decode(self, *args, **kwargs):
20
+ raise NotImplementedError
21
+
22
+ def query_geometry(self, *args, **kwargs):
23
+ raise NotImplementedError
24
+
25
+ class FourierEmbedder(nn.Module):
26
+
27
+ def __init__(self,
28
+ num_freqs: int = 6,
29
+ logspace: bool = True,
30
+ input_dim: int = 3,
31
+ include_input: bool = True,
32
+ include_pi: bool = True) -> None:
33
+
34
+ """The initialization"""
35
+
36
+ super().__init__()
37
+
38
+ if logspace:
39
+ frequencies = 2.0 ** torch.arange(
40
+ num_freqs,
41
+ dtype=torch.float32
42
+ )
43
+ else:
44
+ frequencies = torch.linspace(
45
+ 1.0,
46
+ 2.0 ** (num_freqs - 1),
47
+ num_freqs,
48
+ dtype=torch.float32
49
+ )
50
+
51
+ if include_pi:
52
+ frequencies *= torch.pi
53
+
54
+ self.register_buffer("frequencies", frequencies, persistent=False)
55
+ self.include_input = include_input
56
+ self.num_freqs = num_freqs
57
+
58
+ self.out_dim = self.get_dims(input_dim)
59
+
60
+ def get_dims(self, input_dim):
61
+ temp = 1 if self.include_input or self.num_freqs == 0 else 0
62
+ out_dim = input_dim * (self.num_freqs * 2 + temp)
63
+
64
+ return out_dim
65
+
66
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
67
+
68
+ if self.num_freqs > 0:
69
+ self.frequencies = self.frequencies.to(x.device)
70
+ embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
71
+
72
+ if self.include_input:
73
+ return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
74
+ else:
75
+ return torch.cat((embed.sin(), embed.cos()), dim=-1)
76
+ else:
77
+ return x
78
+
79
+ def MLP(channels, batch_norm=True):
80
+ if batch_norm:
81
+ return Sequential(*[Sequential(Linear(channels[i - 1], channels[i]), ReLU(), BatchNorm1d(channels[i], momentum=0.1))
82
+ for i in range(1, len(channels))])
83
+ else:
84
+ return Sequential(*[Sequential(Linear(channels[i - 1], channels[i]), ReLU()) for i in range(1, len(channels))])
85
+
86
+ class CrossAttentionEncoder(nn.Module):
87
+
88
+ def __init__(self, *,
89
+ device: Optional[torch.device],
90
+ dtype: Optional[torch.dtype],
91
+ num_latents: int,
92
+ fourier_embedder: FourierEmbedder,
93
+ point_feats: int,
94
+ width: int,
95
+ heads: int,
96
+ layers: int,
97
+ init_scale: float = 0.25,
98
+ qkv_bias: bool = True,
99
+ flash: bool = False,
100
+ use_ln_post: bool = False,
101
+ use_checkpoint: bool = False):
102
+
103
+ super().__init__()
104
+
105
+ self.use_checkpoint = use_checkpoint
106
+ self.num_latents = num_latents
107
+ self.query = nn.Parameter(torch.randn((num_latents, width), device=device, dtype=dtype) * 0.02)
108
+
109
+ self.fourier_embedder = fourier_embedder
110
+ self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width, device=device, dtype=dtype)
111
+ self.cross_attn = ResidualCrossAttentionBlock(
112
+ device=device,
113
+ dtype=dtype,
114
+ width=width,
115
+ heads=heads,
116
+ init_scale=init_scale,
117
+ qkv_bias=qkv_bias,
118
+ flash=flash,
119
+ )
120
+
121
+ self.self_attn = Transformer(
122
+ device=device,
123
+ dtype=dtype,
124
+ n_ctx=num_latents,
125
+ width=width,
126
+ layers=layers,
127
+ heads=heads,
128
+ init_scale=init_scale,
129
+ qkv_bias=qkv_bias,
130
+ flash=flash,
131
+ use_checkpoint=False
132
+ )
133
+
134
+ if use_ln_post:
135
+ self.ln_post = nn.LayerNorm(width, dtype=dtype, device=device)
136
+ else:
137
+ self.ln_post = None
138
+
139
+ def _forward(self, pc, feats):
140
+ """
141
+
142
+ Args:
143
+ pc (torch.FloatTensor): [B, N, 3]
144
+ feats (torch.FloatTensor or None): [B, N, C]
145
+
146
+ Returns:
147
+
148
+ """
149
+
150
+ bs = pc.shape[0]
151
+
152
+ data = self.fourier_embedder(pc)
153
+ if feats is not None:
154
+ data = torch.cat([data, feats], dim=-1)
155
+ data = self.input_proj(data)
156
+
157
+ query = repeat(self.query, "m c -> b m c", b=bs)
158
+ latents = self.cross_attn(query, data)
159
+ latents = self.self_attn(latents)
160
+
161
+ if self.ln_post is not None:
162
+ latents = self.ln_post(latents)
163
+
164
+ return latents, pc
165
+
166
+ def forward(self, pc: torch.FloatTensor, feats: Optional[torch.FloatTensor] = None):
167
+ """
168
+
169
+ Args:
170
+ pc (torch.FloatTensor): [B, N, 3]
171
+ feats (torch.FloatTensor or None): [B, N, C]
172
+
173
+ Returns:
174
+ dict
175
+ """
176
+
177
+ return checkpoint(self._forward, (pc, feats), self.parameters(), self.use_checkpoint)
178
+
179
+
180
+
181
+ class TransformerEncoder(ShapeAsLatentModule):
182
+ def __init__(self, *,
183
+ device: Optional[torch.device]='cuda',
184
+ dtype: Optional[torch.dtype],
185
+ num_latents: int = 16,
186
+ point_feats: int = 3,
187
+ embed_dim: int = 64,
188
+ num_freqs: int = 8,
189
+ include_pi: bool = True,
190
+ width: int = 768,
191
+ heads: int = 12,
192
+ num_encoder_layers: int = 8,
193
+ init_scale: float = 0.25,
194
+ qkv_bias: bool = True,
195
+ flash: bool = False,
196
+ use_ln_post: bool = False,
197
+ use_checkpoint: bool = False,
198
+ out_channels: int = 4):
199
+
200
+ super().__init__()
201
+
202
+ self.use_checkpoint = use_checkpoint
203
+
204
+ self.num_latents = num_latents
205
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
206
+
207
+ init_scale = init_scale * math.sqrt(1.0 / width)
208
+ self.encoder = CrossAttentionEncoder(
209
+ device=device,
210
+ dtype=dtype,
211
+ fourier_embedder=self.fourier_embedder,
212
+ num_latents=num_latents,
213
+ point_feats=point_feats,
214
+ width=width,
215
+ heads=heads,
216
+ layers=num_encoder_layers,
217
+ init_scale=init_scale,
218
+ qkv_bias=qkv_bias,
219
+ flash=flash,
220
+ use_ln_post=use_ln_post,
221
+ use_checkpoint=use_checkpoint
222
+ )
223
+ self.width = width
224
+ self.out_channels = out_channels
225
+ self.device = device
226
+
227
+ self.embed_dim = embed_dim
228
+
229
+ def encode(self,data):
230
+ input_points = data['points_cloud'].to(self.device)
231
+ bs = input_points.shape[0]
232
+ pc, feats = input_points[...,:3], input_points[..., 3:]
233
+ latents, _ = self.encoder(pc, feats)
234
+ # print_time('after encoder')
235
+ latents = latents.reshape(bs,-1, self.width)
236
+ return latents
237
+ def encode_pc(self,points_cloud):
238
+ bs = points_cloud.shape[0]
239
+ input_points = points_cloud.to(self.device)
240
+ pc, feats = input_points[...,:3], input_points[..., 3:]
241
+ latents, _ = self.encoder(pc, feats)
242
+
243
+ latents = latents.reshape(bs,-1, self.width)
244
+ return latents
245
+ def forward(self, data):
246
+
247
+ # input_points = torch.from_numpy(np.array(data.points_cloud)).cuda()
248
+ input_points = data['points_cloud'].to(self.device)
249
+ pc, feats = input_points[...,:3], input_points[..., 3:]
250
+ latents, _ = self.encoder(pc, feats)
251
+
252
+ latents = latents.reshape(-1, self.width)
253
+ latents = latents.reshape(-1, self.num_latents, self.out_channels)
254
+ latents[..., :3] = torch.tanh(latents[..., :3])
255
+ latents[..., 3:] = torch.sigmoid(latents[..., 3:])
256
+
257
+
258
+ return latents
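
A quick shape check for the `FourierEmbedder` above, a sketch assuming only torch; with the defaults used here (3-D input, 8 frequencies, input included) the output width is 3 * (2 * 8 + 1) = 51:

import torch

emb = FourierEmbedder(num_freqs=8, include_pi=True)   # assumed importable from this module
pts = torch.rand(2, 1024, 3)                          # hypothetical batch of point clouds
print(emb.out_dim)                                    # 51
print(emb(pts).shape)                                 # torch.Size([2, 1024, 51])
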
Anymate/utils/diffusion_utils.py ADDED
@@ -0,0 +1,314 @@
1
+
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from mpl_toolkits.mplot3d import Axes3D
5
+ from torchvision.utils import make_grid
6
+ import torch
7
+ from typing import List, Optional, Tuple, Union
8
+ import torch.nn as nn
9
+ import math
10
+ from timm.models.vision_transformer import Mlp, DropPath
11
+
12
+ def my_collate_diff(batch,return_joints_num=128,random=False):
13
+ data = {}
14
+ for key in batch[0]:
15
+ if key in ('vox', 'name', 'joints_num', 'skins_index', 'skins_weight', 'parent_index', 'conns', 'joints', 'bones', 'mesh_skins_index', 'mesh_skins_weight', 'mesh_pc', 'mesh_face'):
16
+ data[key] = [sample[key] for sample in batch]
17
+ elif key=='pc':
18
+ data['points_cloud'] = torch.stack([sample['pc'] for sample in batch])
19
+ elif key=='skins':
20
+ continue
21
+ elif key=='bones_num':
22
+ data[key] = torch.tensor([sample['bones_num'] for sample in batch])
23
+ else:
24
+ data[key] = torch.stack([sample[key] for sample in batch])
25
+
26
+ if 'joints' in batch[0]:
27
+ padded_joints_matrix = torch.ones(len(data['name']), return_joints_num, 3) * (-3)
28
+ joints_matrix = torch.ones(len(data['name']), 96, 3) * (-3)
29
+ for i in range(len(data['name'])):
30
+ joints_matrix[i, :data['joints_num'][i], :] = data['joints'][i]
31
+ if not random:
32
+ for i in range(len(data['name'])):
33
+ padded_joints_matrix[i] = data['joints'][i].repeat(return_joints_num//data['joints_num'][i]+1,1)[:return_joints_num,:]
34
+ else:
35
+ for i in range(len(data['name'])):
36
+ padded_joints_matrix[i] = data['joints'][i][torch.randint(0, data['joints_num'][i], (return_joints_num,))]
37
+ data['joints_repeat'] = padded_joints_matrix
38
+ data['joints'] = joints_matrix
39
+
40
+ return data
41
+
42
+ def randn_tensor(
43
+ shape: Union[Tuple, List],
44
+ generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
45
+ device: Optional["torch.device"] = None,
46
+ dtype: Optional["torch.dtype"] = None,
47
+ layout: Optional["torch.layout"] = None,
48
+ ):
49
+ """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
50
+ passing a list of generators, you can seed each batch element individually. If CPU generators are passed, the tensor
51
+ is always created on the CPU.
52
+ """
53
+ # device on which tensor is created defaults to device
54
+ rand_device = device
55
+ batch_size = shape[0]
56
+
57
+ layout = layout or torch.strided
58
+ device = device or torch.device("cpu")
59
+
60
+ if generator is not None:
61
+ gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type
62
+ if gen_device_type != device.type and gen_device_type == "cpu":
63
+ rand_device = "cpu"
64
+
65
+ elif gen_device_type != device.type and gen_device_type == "cuda":
66
+ raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.")
67
+
68
+ # make sure generator list of length 1 is treated like a non-list
69
+ if isinstance(generator, list) and len(generator) == 1:
70
+ generator = generator[0]
71
+
72
+ if isinstance(generator, list):
73
+ shape = (1,) + shape[1:]
74
+ latents = [
75
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout)
76
+ for i in range(batch_size)
77
+ ]
78
+ latents = torch.cat(latents, dim=0).to(device)
79
+ else:
80
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device)
81
+
82
+ return latents
83
+
84
+ def timestep_embedding(timesteps, dim, max_period=10000):
85
+ """
86
+ Create sinusoidal timestep embeddings.
87
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
88
+ These may be fractional.
89
+ :param dim: the dimension of the output.
90
+ :param max_period: controls the minimum frequency of the embeddings.
91
+ :return: an [N x dim] Tensor of positional embeddings.
92
+ """
93
+ half = dim // 2
94
+ freqs = torch.exp(
95
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
96
+ ).to(device=timesteps.device)
97
+ args = timesteps[:, None].to(timesteps.dtype) * freqs[None]
98
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
99
+ if dim % 2:
100
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
101
+ return embedding
102
+
103
+ class CrossAttention(nn.Module):
104
+ def __init__(
105
+ self,
106
+ dim,
107
+ kv_dim=None,
108
+ num_heads=16,
109
+ qkv_bias=False,
110
+ attn_drop=0.,
111
+ proj_drop=0.,
112
+ ):
113
+ super().__init__()
114
+ self.num_heads = num_heads
115
+ head_dim = dim // num_heads
116
+ self.scale = head_dim ** -0.5
117
+
118
+ kv_dim = dim if not kv_dim else kv_dim
119
+ self.wq = nn.Linear(dim, dim, bias=qkv_bias)
120
+ self.wk = nn.Linear(kv_dim, dim, bias=qkv_bias)
121
+ self.wv = nn.Linear(kv_dim, dim, bias=qkv_bias)
122
+ self.attn_drop_rate = attn_drop
123
+ self.attn_drop = nn.Dropout(self.attn_drop_rate)
124
+ self.proj = nn.Linear(dim, dim)
125
+ self.proj_drop = nn.Dropout(proj_drop)
126
+
127
+ def forward(self, x_q, x_kv):
128
+ B, N_q, C = x_q.shape
129
+ B, N_kv, _ = x_kv.shape
130
+ # [B, N_q, C] -> [B, N_q, H, C/H] -> [B, H, N_q, C/H]
131
+ q = self.wq(x_q).reshape(B, N_q, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
132
+ # [B, N_kv, C] -> [B, N_kv, H, C/H] -> [B, H, N_kv, C/H]
133
+ k = self.wk(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
134
+ # [B, N_kv, C] -> [B, N_kv, H, C/H] -> [B, H, N_kv, C/H]
135
+ v = self.wv(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
136
+
137
+ # [B, H, N_q, C/H] @ [B, H, C/H, N_kv] -> [B, H, N_q, N_kv]
138
+ attn = (q @ k.transpose(-2, -1)) * self.scale
139
+ attn = attn.softmax(dim=-1)
140
+ attn = self.attn_drop(attn)
141
+
142
+ # [B, H, N_q, N_kv] @ [B, H, N_kv, C/H] -> [B, H, N_q, C/H]
143
+ x = attn @ v
144
+
145
+ # [B, H, N_q, C/H] -> [B, N_q, C]
146
+ x = x.transpose(1, 2).reshape(B, N_q, C)
147
+ x = self.proj(x)
148
+ x = self.proj_drop(x)
149
+ return x
150
+
151
+ class Compute_Block(nn.Module):
152
+
153
+ def __init__(self, z_dim, num_heads=16, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
154
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
155
+ super().__init__()
156
+ self.norm_z1 = norm_layer(z_dim)
157
+ self.attn = CrossAttention(
158
+ z_dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
159
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
160
+ self.norm_z2 = norm_layer(z_dim)
161
+ mlp_hidden_dim = int(z_dim * mlp_ratio)
162
+ self.mlp = Mlp(in_features=z_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
163
+
164
+ def forward(self, z):
165
+ zn = self.norm_z1(z)
166
+ z = z + self.drop_path(self.attn(zn, zn))
167
+ z = z + self.drop_path(self.mlp(self.norm_z2(z)))
168
+ return z
169
+
170
+ class Read_Block(nn.Module):
171
+
172
+ def __init__(self, z_dim, x_dim, num_heads=16, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
173
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
174
+ super().__init__()
175
+ self.norm_x = norm_layer(x_dim)
176
+ self.norm_z1 = norm_layer(z_dim)
177
+ self.attn = CrossAttention(
178
+ z_dim, x_dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
179
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
180
+ self.norm_z2 = norm_layer(z_dim)
181
+ mlp_hidden_dim = int(z_dim * mlp_ratio)
182
+ self.mlp = Mlp(in_features=z_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
183
+
184
+ def forward(self, z, x):
185
+ z = z + self.drop_path(self.attn(self.norm_z1(z), self.norm_x(x)))
186
+ z = z + self.drop_path(self.mlp(self.norm_z2(z)))
187
+ return z
188
+
189
+ class Write_Block(nn.Module):
190
+
191
+ def __init__(self, z_dim, x_dim, num_heads=16, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
192
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
193
+ super().__init__()
194
+ self.norm_z = norm_layer(z_dim)
195
+ self.norm_x1 = norm_layer(x_dim)
196
+ self.attn = CrossAttention(
197
+ x_dim, z_dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
198
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
199
+ self.norm_x2 = norm_layer(x_dim)
200
+ mlp_hidden_dim = int(x_dim * mlp_ratio)
201
+ self.mlp = Mlp(in_features=x_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
202
+
203
+ def forward(self, z, x):
204
+ x = x + self.drop_path(self.attn(self.norm_x1(x), self.norm_z(z)))
205
+ x = x + self.drop_path(self.mlp(self.norm_x2(x)))
206
+ return x
207
+
208
+ class RCW_Block(nn.Module):
209
+
210
+ def __init__(self, z_dim, x_dim, num_compute_layers=4, num_heads=16,
211
+ mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
212
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
213
+ super().__init__()
214
+ self.read = Read_Block(z_dim, x_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop,
215
+ attn_drop=attn_drop, drop_path=drop_path, act_layer=act_layer, norm_layer=norm_layer)
216
+ self.write = Write_Block(z_dim, x_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop,
217
+ attn_drop=attn_drop, drop_path=drop_path, act_layer=act_layer, norm_layer=norm_layer)
218
+ self.compute = nn.ModuleList([
219
+ Compute_Block(z_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop,
220
+ attn_drop=attn_drop, drop_path=drop_path, act_layer=act_layer, norm_layer=norm_layer)
221
+ for _ in range(num_compute_layers)
222
+ ])
223
+
224
+ def forward(self, z, x):
225
+ z = self.read(z, x)
226
+ for layer in self.compute:
227
+ z = layer(z)
228
+ x = self.write(z, x)
229
+ return z, x
230
+
231
+ def pairwise_distances(x, y):
232
+ # Input: x is an Nxd matrix
233
+ # y is an Mxd matrix
234
+ # Output: dist is an NxM matrix where dist[i,j] is the squared norm between x[i,:] and y[j,:],
235
+ # i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2
236
+ # (values are clamped to [0, inf) before returning)
237
+ x_norm = (x ** 2).sum(1).view(-1, 1)
238
+ y_t = torch.transpose(y, 0, 1)
239
+ y_norm = (y ** 2).sum(1).view(1, -1)
240
+ dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t)
241
+ return torch.clamp(dist, 0.0, np.inf)
242
+
243
+ def meanshift_cluster(pts_in, bandwidth, weights=None, max_iter=20):
244
+ """
245
+ Meanshift clustering
246
+ :param pts_in: input points
247
+ :param bandwidth: bandwidth
248
+ :param weights: weights per pts indicting its importance in the clustering
249
+ :return: points after clustering
250
+ """
251
+ diff = 1e10
252
+ num_iter = 1
253
+ while diff > 1e-3 and num_iter < max_iter:
254
+ Y = np.sum(((pts_in[np.newaxis, ...] - pts_in[:, np.newaxis, :]) ** 2), axis=2)
255
+ K = np.maximum(bandwidth**2 - Y, np.zeros(Y.shape))
256
+ if weights is not None:
257
+ K = K * weights
258
+ row_sums = K.sum(axis=0, keepdims=True)
259
+ P = K / (row_sums + 1e-10)
260
+ P = P.transpose()
261
+ pts_in_prim = 0.3 * (np.matmul(P, pts_in) - pts_in) + pts_in
262
+ diff = np.sqrt(np.sum((pts_in_prim - pts_in)**2))
263
+ pts_in = pts_in_prim
264
+ num_iter += 1
265
+ return pts_in
266
+
267
+ def nms_meanshift(pts_in, density, bandwidth):
268
+ """
269
+ NMS to extract modes after meanshift. Code refers to scikit-learn.
270
+ :param pts_in: input points
271
+ :param density: density at each point
272
+ :param bandwidth: bandwidth used in meanshift. Used here as neighbor region for NMS
273
+ :return: extracted clusters.
274
+ """
275
+ Y = np.sum(((pts_in[np.newaxis, ...] - pts_in[:, np.newaxis, :]) ** 2), axis=2)
276
+ sorted_ids = np.argsort(density)[::-1]
277
+ unique = np.ones(len(sorted_ids), dtype=bool)
278
+ dist = np.sqrt(Y)
279
+ for i in sorted_ids:
280
+ if unique[i]:
281
+ neighbor_idxs = np.argwhere(dist[:, i] <= bandwidth)
282
+ unique[neighbor_idxs.squeeze()] = 0
283
+ unique[i] = 1 # leave the current point as unique
284
+ pts_in = pts_in[unique]
285
+ return pts_in
286
+
287
+ def get_predictions(y_pred_np, attn_pred_np=None,bandwidth=0.05, threshold=0.001):
288
+ """
289
+ get the final predictions
290
+ :param y_pred_np: predicted joint positions to cluster
291
+ :param attn_pred_np: per-point weights used during clustering
292
+ :return: clustered joint positions
293
+ """
294
+ # if attn_pred_np is None:
295
+ # attn_pred_np = np.ones(y_pred_np.shape[0])
296
+ y_pred_np = meanshift_cluster(y_pred_np, bandwidth, attn_pred_np, max_iter=40)
297
+
298
+
299
+ Y_dist = np.sum(((y_pred_np[np.newaxis, ...] - y_pred_np[:, np.newaxis, :]) ** 2), axis=2)
300
+ density = np.maximum(bandwidth ** 2 - Y_dist, np.zeros(Y_dist.shape))
301
+ density = np.sum(density, axis=0)
302
+ density_sum = np.sum(density)
303
+ y_pred_np = y_pred_np[density / density_sum > threshold]
304
+
305
+ density = density[density / density_sum > threshold]
306
+ pred_joints = nms_meanshift(y_pred_np, density, bandwidth)
307
+ return pred_joints
308
+
309
+
310
+ if __name__ == '__main__':
311
+ points_cloud = np.ones((100, 3))
312
+ predict_out = get_predictions(points_cloud, bandwidth=0.05, threshold=0.001)
313
+ print(predict_out.shape)
314
+
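
Two small shape checks for the building blocks above, a sketch that assumes torch is available and that the timm version in use exposes Mlp and DropPath as imported at the top of this file; all dimensions are hypothetical:

import torch

# sinusoidal timestep embedding: one 128-d vector per timestep
t = torch.tensor([0, 10, 999])
print(timestep_embedding(t, dim=128).shape)        # torch.Size([3, 128])

# read-compute-write block: latent tokens z attend to data tokens x and back
block = RCW_Block(z_dim=256, x_dim=256, num_compute_layers=2)
z = torch.rand(2, 8, 256)                          # hypothetical latent tokens
x = torch.rand(2, 16, 256)                         # hypothetical data tokens
z, x = block(z, x)
print(z.shape, x.shape)                            # torch.Size([2, 8, 256]) torch.Size([2, 16, 256])
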
Anymate/utils/eval_utils.py ADDED
@@ -0,0 +1,225 @@
1
+ from tqdm import tqdm
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import point_cloud_utils as pcu
6
+ from Anymate.utils.loss_utils import chamfer_distance_with_average, cross_entropy_with_probs_batch, cos_loss, cos_loss_clamp
7
+ from ThirdParty.Rignet_utils.utils import get_skel
8
+ from ThirdParty.Rignet_utils.Rignet_loss import edit_dist, chamfer_dist, joint2bone_chamfer_dist, bone2bone_chamfer_dist
9
+ from scipy.optimize import linear_sum_assignment
10
+
11
+ def evaluate_joint(joints, joints_gt, threshold=1e-1):
12
+ """
13
+ joints: list of predicted joints: tensor of shape (n,joints_num,3)
14
+ joints_gt: list of ground truth joints : tensor of shape (n,joints_num,3)
15
+ """
16
+ chamfer_loss_all = 0
17
+ emd_loss_all = 0
18
+ precision = 0
19
+ recall = 0
20
+ count = 0
21
+
22
+ for i in tqdm(range(len(joints))):
23
+ joint_predict = joints[i].cpu()
24
+ joint_gt = joints_gt[i].cpu()
25
+ distance_matrix = torch.cdist(joint_gt, joint_predict) # (n_gt, n_predict)
26
+ n_gt,n_predict = distance_matrix.shape
27
+ min_distance_pred = torch.min(distance_matrix, dim=0)
28
+ min_distance_gt = torch.min(distance_matrix, dim=1)
29
+ precision += torch.sum(min_distance_pred.values < threshold).item()/n_predict
30
+ recall += torch.sum(min_distance_gt.values < threshold).item()/n_gt
31
+
32
+ chamfer_loss_all += chamfer_distance_with_average(joint_predict.unsqueeze(0), joint_gt.unsqueeze(0))
33
+ joint_predict = joint_predict.numpy().astype(np.float64)
34
+ joint_gt = joint_gt.numpy().astype(np.float64)
35
+ emd,_ = pcu.earth_movers_distance(joint_predict, joint_gt)
36
+ emd_loss_all += emd
37
+
38
+ count += 1
39
+
40
+ print('------------------------------------')
41
+ print('Evaluation results for joint:')
42
+ print('chamfer_loss:', chamfer_loss_all/count)
43
+ print('emd_loss:', emd_loss_all/count)
44
+ print('precision:', precision/count)
45
+ print('recall:', recall/count)
46
+ print('count:', count)
47
+ print('------------------------------------')
48
+ return chamfer_loss_all/count, emd_loss_all/count, precision/count, recall/count
49
+
50
+ def evaluate_connectivity(conns, conns_gt, joints_gt, vox_list):
51
+
52
+ """
53
+ conns: list of predicted connections probability: tensor of shape (n,joints_num,joints_num)
54
+ conns_gt: list of ground truth connections: tensor of shape (n,joints_num,joints_num)
55
+ """
56
+
57
+ precision_all = 0
58
+ recall_all = 0
59
+ cross_entropy_all = 0
60
+ bone2bone_dist_con = 0
61
+ count = 0
62
+ for i in tqdm(range(len(conns))):
63
+
64
+ conn_predict = conns[i].cpu().numpy()
65
+ conn_gt = conns_gt[i].cpu().numpy()
66
+ joints = joints_gt[i].cpu().numpy()
67
+ vox = vox_list[i]
68
+
69
+ cross_entropy_all += cross_entropy_with_probs_batch(torch.from_numpy(conn_predict).unsqueeze(0), torch.from_numpy(conn_gt).unsqueeze(0), reduction='mean')
70
+ # consider to add tree edit distance (need joint and vox information)
71
+ pred_skel, parent_matrix = get_skel(joints, conn_predict, vox=vox)
72
+ gt_skel, parent_matrix = get_skel(joints, conn_gt, vox=vox)
73
+ bone2bone_dist_con += bone2bone_chamfer_dist(pred_skel, gt_skel)
74
+
75
+ conn_predict = np.argmax(conn_predict, axis=1)
76
+ conn_gt = np.argmax(conn_gt, axis=1)
77
+ connection_matrix_pre = torch.zeros((len(conn_predict),len(conn_predict)))
78
+ connection_matrix_gt = torch.zeros((len(conn_predict),len(conn_predict)))
79
+
80
+ for i in range(len(conn_predict)):
81
+ connection_matrix_pre[i][conn_predict[i]] = 1
82
+ connection_matrix_pre[conn_predict[i]][i] = 1
83
+ connection_matrix_gt[i][conn_gt[i]] = 1
84
+ connection_matrix_gt[conn_gt[i]][i] = 1
85
+
86
+ TP = 0
87
+ FP = 0
88
+ FN = 0
89
+ FP = 0
90
+
91
+ for i in range(len(conn_predict)):
92
+ if connection_matrix_gt[i][conn_predict[i]] == 1:
93
+ TP += 1
94
+ if connection_matrix_gt[i][conn_predict[i]] == 0:
95
+ FP += 1
96
+ if connection_matrix_pre[i][conn_gt[i]] == 0:
97
+ FN += 1
98
+
99
+ precision = TP/(TP+FP)
100
+ recall = TP/(TP+FN)
101
+
102
+ precision_all += precision
103
+ recall_all += recall
104
+ count+=1
105
+ print('------------------------------------')
106
+ print('Evaluation results for connectivity:')
107
+ print('precision:',precision_all/count)
108
+ print('recall:',recall_all/count)
109
+ print('cross_entropy:',cross_entropy_all/count)
110
+ print('bone2bone_dist_con:',bone2bone_dist_con/count)
111
+ print('count:',count)
112
+ print('------------------------------------')
113
+ return precision_all/count, recall_all/count
114
+
115
+ def evaluate_skinning(skins, skins_gt, threshold=5e-2):
116
+ """
117
+ skins: list of predicted skinning weights: tensor of shape (n,vertices_num, bones_num)
118
+ skins_gt: list of ground truth skinning weights: tensor of shape (n,vertices_num, bones_num)
119
+ """
120
+ cs_loss = 0
121
+ ce_loss = 0
122
+ cs_loss_clamp = 0
123
+ count = 0
124
+ L1_loss = 0
125
+ precision = 0
126
+ recall = 0
127
+ mean_l1_dist = 0
128
+
129
+ for i in tqdm(range(len(skins))):
130
+ skin_predict = skins[i].cpu().unsqueeze(0)
131
+ skin_gt = skins_gt[i].cpu().unsqueeze(0)
132
+
133
+ precision_one = 0
134
+ recall_one = 0
135
+
136
+ ce_loss += cross_entropy_with_probs_batch(skin_predict, skin_gt)
137
+ cs_loss += cos_loss(skin_predict, skin_gt)
138
+ cs_loss_clamp += cos_loss_clamp(skin_predict, skin_gt)
139
+ L1_loss += F.l1_loss(skin_predict, skin_gt)
140
+ skin_predict = skin_predict[0].cpu().detach().numpy()
141
+ skin_gt = skin_gt[0].cpu().detach().numpy()
142
+ mean_l1_dist += np.sum(np.abs(skin_predict - skin_gt )) / len(skin_predict)
143
+
144
+ for i in range(len(skin_predict)):
145
+ influencial_bone_predict = skin_predict[i] >=threshold
146
+ influencial_bone_gt = skin_gt[i] >=threshold
147
+ influencial_bone_correct = influencial_bone_predict*influencial_bone_gt
148
+
149
+ if np.sum(influencial_bone_predict)==0 or np.sum(influencial_bone_gt)==0:
150
+ continue
151
+ precision_one += np.sum(influencial_bone_correct)/np.sum(influencial_bone_predict)
152
+ recall_one += np.sum(influencial_bone_correct)/np.sum(influencial_bone_gt)
153
+
154
+ precision += precision_one/len(skin_predict)
155
+ recall += recall_one/len(skin_predict)
156
+ count +=1
157
+
158
+ print('------------------------------------')
159
+ print('Evaluation results for skinning:')
160
+ print('cos loss: ', cs_loss/count)
161
+ print('ce loss: ', ce_loss/count)
162
+ print('cs_loss_clamp: ', cs_loss_clamp/count)
163
+ print('L1 loss: ', L1_loss/count)
164
+ print('mean_l1_dist: ', mean_l1_dist/count)
165
+ print('precision: ', precision/count)
166
+ print('recall: ', recall/count)
167
+ print('count: ', count)
168
+ print('------------------------------------')
169
+
170
+ def evaluate_skeleton(joints,joints_gt,conns,conns_gt,vox_list,fs_threshold=0.2):
171
+
172
+ """
173
+ joints: list of predicted joints: tensor of shape (n,joints_num,3)
174
+ joints_gt: list of ground truth joints : tensor of shape (n,joints_num,3)
175
+ conns: list of predicted connections probability: tensor of shape (n,joints_num,joints_num)
176
+ conns_gt: list of ground truth connections: tensor of shape (n,joints_num,joints_num)
177
+ vox_list: list of voxel: (n,88,88,88)
178
+ """
179
+
180
+ data_count = 0
181
+ chamfer_score = 0
182
+ j2b_chamfer_joint = 0
183
+ bone2bone_dist_joint = 0
184
+ edit_distance_joint = 0
185
+ joint_IoU_total = 0
186
+ joint_precision_total = 0
187
+ joint_recall_total = 0
188
+
189
+ for i in tqdm(range(len(joints))):
190
+ joint_predict = joints[i].cpu().numpy()
191
+ joint_gt = joints_gt[i].cpu().numpy()
192
+ conn_predict = conns[i].cpu().numpy()
193
+ conn_gt = conns_gt[i].cpu().numpy()
194
+ vox = vox_list[i]
195
+
196
+ # add shape diameter after we have vertex and faces
197
+ # shape_diameter = get_shape_diameter(mesh, points, parent_index[:,0])
198
+
199
+ dist_matrix = np.sqrt(np.sum((joint_predict[np.newaxis, ...] - joint_gt[:, np.newaxis, :]) ** 2, axis=2))
200
+ row_ind, col_ind = linear_sum_assignment(dist_matrix)
201
+ # fs_threshold = shape_diameter[row_ind]
202
+ joint_IoU = 2 * np.sum(dist_matrix[row_ind, col_ind] < fs_threshold) / (len(joint_predict) + len(joint_gt))
203
+ joint_IoU_total += joint_IoU
204
+ joint_precision = np.sum(dist_matrix[row_ind, col_ind] < fs_threshold) / len(joint_predict)
205
+ joint_precision_total += joint_precision
206
+ joint_recall = np.sum(dist_matrix[row_ind, col_ind] < fs_threshold) / len(joint_gt)
207
+ joint_recall_total += joint_recall
208
+
209
+ pred_skel_joint,parent_matrix = get_skel(joint_predict,conn_predict,vox=vox)
210
+ gt_skel, parent_matrix = get_skel(joint_gt,conn_gt,vox=vox)
211
+ chamfer_score += chamfer_dist(joint_predict, joint_gt)
212
+ j2b_chamfer_joint += joint2bone_chamfer_dist(pred_skel_joint, gt_skel)
213
+ bone2bone_dist_joint += bone2bone_chamfer_dist(pred_skel_joint, gt_skel)
214
+ edit_distance_joint += edit_dist(pred_skel_joint, gt_skel)
215
+ data_count+=1
216
+
217
+ print('------------------------------------')
218
+ print('Evaluation results for skeleton:')
219
+ print('chamfer_score:', chamfer_score/data_count)
220
+ print('j2b_chamfer_joint:', j2b_chamfer_joint/data_count)
221
+ print('bone2bone_dist_joint:', bone2bone_dist_joint/data_count)
222
+ print('joint_IoU:', joint_IoU_total/data_count)
223
+ print('joint_precision:', joint_precision_total/data_count)
224
+ print('joint_recall:', joint_recall_total/data_count)
225
+ print('------------------------------------')
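
A minimal call sketch for `evaluate_joint` above, assuming this module and its dependencies (point_cloud_utils, the ThirdParty RigNet utilities) are importable; the joint counts and random tensors are purely illustrative:

import torch

pred_joints = [torch.rand(24, 3) for _ in range(4)]   # 4 hypothetical predictions
gt_joints = [torch.rand(30, 3) for _ in range(4)]     # ground truth; counts may differ per mesh
chamfer, emd, precision, recall = evaluate_joint(pred_joints, gt_joints, threshold=0.1)
print(chamfer, emd, precision, recall)
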
Anymate/utils/loss_utils.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ def chamfer_distance_with_average(p1, p2):
5
+
6
+ '''
7
+ Calculate Chamfer Distance between two point sets
8
+ :param p1: size[1, N, D]
9
+ :param p2: size[1, M, D]
10
+ :param debug: whether need to output debug info
11
+ :return: sum of Chamfer Distance of two point sets
12
+ '''
13
+
14
+ assert p1.size(0) == 1 and p2.size(0) == 1
15
+ assert p1.size(2) == p2.size(2)
16
+ p1 = p1.repeat(p2.size(1), 1, 1)
17
+ p1 = p1.transpose(0, 1)
18
+ p2 = p2.repeat(p1.size(0), 1, 1)
19
+ dist = torch.add(p1, torch.neg(p2))
20
+ dist_norm = torch.norm(dist, 2, dim=2)
21
+ dist1 = torch.min(dist_norm, dim=1)[0]
22
+ dist2 = torch.min(dist_norm, dim=0)[0]
23
+ loss = 0.5 * ((torch.mean(dist1)) + (torch.mean(dist2)))
24
+ return loss
25
+
26
+ def cross_entropy_with_probs_batch(input, target, weight=None, reduction="mean"): # tested, same as nn.CrossEntropyLoss at dim=1, CE can be negative
27
+ # input_logsoftmax = F.log_softmax(input, dim=2)
28
+ input_logsoftmax = torch.log(input+1e-6)
29
+ cum_losses = -target * input_logsoftmax
30
+ if weight is not None:
31
+ cum_losses = cum_losses * weight.unsqueeze(1) # Broadcasting the weight
32
+
33
+ if reduction == "none":
34
+ return cum_losses
35
+ elif reduction == "mean":
36
+ return cum_losses.sum(dim=2).mean(dim=1).mean(dim=0)
37
+ elif reduction == "sum":
38
+ return cum_losses.sum(dim=2).sum(dim=1).mean(dim=0)
39
+ else:
40
+ raise ValueError("Keyword 'reduction' must be one of ['none', 'mean', 'sum']")
41
+
42
+ def cos_loss(input, target):
43
+ # input = F.softmax(input, dim=-1)
44
+ cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
45
+ similarity = cos(input, target)
46
+ loss = 1 - similarity.mean()
47
+ return loss
48
+
49
+ def cos_loss_clamp(input, target):
50
+ # input = F.softmax(input, dim=-1)*(1 + 2*0.001) - 0.001
51
+ input = input*(1 + 2*0.001) - 0.001
52
+ input = torch.clamp(input, 0, 1)
53
+ cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
54
+ similarity = cos(input, target)
55
+ loss = 1 - similarity.mean()
56
+ return loss
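
A tiny sanity-check sketch for the losses above, assuming only torch; the batch of 100 points and 32 bones is hypothetical:

import torch

pred = torch.softmax(torch.rand(2, 100, 32), dim=-1)   # predicted skin weights
gt = torch.softmax(torch.rand(2, 100, 32), dim=-1)     # ground-truth skin weights
print(cross_entropy_with_probs_batch(pred, gt).item())
print(cos_loss(pred, gt).item())

p1 = torch.rand(1, 24, 3)   # predicted joints (batch size must be 1)
p2 = torch.rand(1, 30, 3)   # ground-truth joints
print(chamfer_distance_with_average(p1, p2).item())
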
Anymate/utils/render_utils.py ADDED
@@ -0,0 +1,1169 @@
1
+ import bpy
2
+ import numpy as np
3
+ from mathutils import Vector, Matrix
4
+ from tqdm import tqdm
5
+ import glob
6
+ import os
7
+ import torch
8
+ from PIL import Image
9
+ import matplotlib.pyplot as plt
10
+ cmap = plt.get_cmap('viridis')
11
+ import torch
12
+ import torchvision.io as io
13
+ import cv2
14
+ import trimesh
15
+
16
+ def get_data(ids, root, animate=False, shift_rig=True, id2=None, rignet=False):
17
+ dataset= torch.load('/data2/aod/testJointDataSet_9.pt')
18
+ joints = []
19
+ conns = []
20
+ skins = []
21
+
22
+ for id in ids:
23
+ if id2 is None:
24
+ for data in dataset:
25
+ if id in data['name']:
26
+ print(data['name'])
27
+ break
28
+ else:
29
+ for data in dataset:
30
+ if id2 in data['name']:
31
+ print(data['name'])
32
+ break
33
+
34
+ joint = torch.tensor(torch.load(root + '/joints/' + id + '.pt')).cpu()
35
+ if shift_rig and id2 is None:
36
+ y_max = data['points_cloud'][:,1].max()
37
+ joint = joint/2 + torch.tensor([0,y_max/2,0])
38
+ temp = joint[:, 1].clone()
39
+ joint[:, 1] = -joint[:, 2]
40
+ joint[:, 2] = temp
41
+
42
+ conn = torch.tensor(torch.load(root + '/connectivity/' + id + '.pt')).long()
43
+ if not animate:
44
+ skin = torch.load(root + '/skinning/' + id + '.pt')
45
+ if rignet:
46
+ skins.append(skin[0])
47
+ elif id2 is None:
48
+ skins.append(skin[0].softmax(dim=-1).cpu().numpy())
49
+ else:
50
+ skins.append(skin)
51
+
52
+ joints.append(joint)
53
+ conns.append(conn)
54
+
55
+ return joints, conns, skins
56
+
57
+ def index_to_sparse(index, weight, shape):
58
+ sparse_matrix = np.zeros([shape[0], shape[1], shape[2]+1])
59
+
60
+ row_indices, col_indices = np.meshgrid(np.arange(sparse_matrix.shape[0]), np.arange(sparse_matrix.shape[1]), indexing='ij')
61
+
62
+ row_indices = np.expand_dims(row_indices, axis=-1)
63
+ col_indices = np.expand_dims(col_indices, axis=-1)
64
+
65
+ sparse_matrix[row_indices, col_indices, index] = weight
66
+
67
+
68
+ return torch.from_numpy(sparse_matrix[:, :, :-1])
69
+
70
+ def get_gt(ids, root):
71
+ dataset= torch.load('/data2/aod/testJointDataSet_9.pt')
72
+ joints = []
73
+ conns = []
74
+ skins = []
75
+
76
+ for id in ids:
77
+ for data in dataset:
78
+ if id in data['name']:
79
+ print(data['name'])
80
+ break
81
+
82
+ joint = data['joints_matrix'][:data['joints_num'], :3]
83
+ y_max = data['points_cloud'][:,1].max()
84
+ joint = joint/2 + torch.tensor([0,y_max/2,0])
85
+ temp = joint[:, 1].clone()
86
+ joint[:, 1] = -joint[:, 2]
87
+ joint[:, 2] = temp
88
+
89
+ conn = data['parent_index'][:data['joints_num']].long().unsqueeze(1)
90
+
91
+ skin = index_to_sparse(data['skin_index'].unsqueeze(0), data['skin_weight'].unsqueeze(0), [1, 8192, data['joints_num']])
92
+
93
+ joints.append(joint)
94
+ conns.append(conn)
95
+ skins.append(skin[0])
96
+
97
+ return joints, conns, skins
98
+
99
+ def empty():
100
+ bpy.ops.wm.read_homefile(use_empty=True)
101
+ # Delete all mesh objects from the scene
102
+ # for obj in bpy.context.scene.objects:
103
+ # bpy.data.objects.remove(obj, do_unlink=True)
104
+
105
+ def add_mesh(filepath, co=None, tex=False, color=(0.5, 0.5, 0.5, 1)):
106
+ bpy.ops.wm.obj_import(filepath=filepath)
107
+ obj = bpy.context.object
108
+
109
+ if not tex:
110
+ # give the mesh a material
111
+ bpy.context.view_layer.objects.active = obj
112
+ bpy.ops.object.shade_smooth()
113
+ bpy.ops.object.mode_set(mode='EDIT')
114
+ bpy.ops.mesh.select_all(action='SELECT')
115
+ bpy.ops.mesh.normals_make_consistent(inside=False)
116
+ bpy.ops.object.mode_set(mode='OBJECT')
117
+ mat = bpy.data.materials.new(name='mat')
118
+ obj.data.materials.clear()
119
+ obj.data.materials.append(mat)
120
+ mat.use_nodes = True
121
+ mat.node_tree.nodes.clear()
122
+ bsdf = mat.node_tree.nodes.new('ShaderNodeBsdfPrincipled')
123
+ output = mat.node_tree.nodes.new('ShaderNodeOutputMaterial')
124
+ mat.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
125
+ mat.node_tree.nodes['Principled BSDF'].inputs['Roughness'].default_value = 0.8
126
+ # mat.node_tree.nodes['Principled BSDF'].inputs['Specular'].default_value = 0.5
127
+ # mat.node_tree.nodes['Principled BSDF'].inputs['Metallic'].default_value = 0.5
128
+ mat.node_tree.nodes['Principled BSDF'].inputs['Base Color'].default_value = color
129
+ if co is not None:
130
+ obj.parent = co
131
+
132
+ def create_sphere(location, size=0.01, color=(1.0, 0.0, 0.0, 1.0), reduced=False):
133
+ if reduced:
134
+ bpy.ops.mesh.primitive_uv_sphere_add(radius=size, location=location, segments=8, ring_count=4)
135
+ else:
136
+ bpy.ops.mesh.primitive_uv_sphere_add(radius=size, location=location)
137
+ sphere = bpy.context.active_object
138
+
139
+ material_name = f"ColorMaterial_{color}"
140
+ material = bpy.data.materials.get(material_name)
141
+
142
+ if not material:
143
+ material = bpy.data.materials.new(name=material_name)
144
+ material.use_nodes = True
145
+ material.node_tree.nodes.clear()
146
+ bsdf = material.node_tree.nodes.new('ShaderNodeBsdfPrincipled')
147
+ output = material.node_tree.nodes.new('ShaderNodeOutputMaterial')
148
+ material.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
149
+ material.node_tree.nodes['Principled BSDF'].inputs['Base Color'].default_value = color
150
+
151
+ sphere.data.materials.append(material)
152
+
153
+ return sphere
154
+
155
+ def add_co(location=(0,0,0), rotation=(0,0,0), scale=(1,1,1)):
156
+ co = bpy.data.objects.new("CoordinateSystem", None)
157
+ bpy.context.collection.objects.link(co)
158
+ bpy.context.view_layer.objects.active = co
159
+ co.empty_display_size = 0.1
160
+ co.empty_display_type = 'ARROWS'
161
+ co.location = location
162
+ co.rotation_euler = rotation
163
+ co.scale = scale
164
+
165
+ return co
166
+
167
+ def add_joint(joints_matrix, co=None):
168
+
169
+ for i, joint in enumerate(joints_matrix):
170
+ sphere = create_sphere((joint[0], joint[1], joint[2]), size=0.01)
171
+ if co is not None:
172
+ sphere.parent = co
173
+
174
+ def create_blue_cone(base_point, apex_point, radius=0.1):
175
+ # Calculate the direction and length of the cone
176
+ direction = apex_point - base_point
177
+ length = direction.length
178
+
179
+ # Create cone mesh
180
+ bpy.ops.mesh.primitive_cone_add(vertices=32, radius1=radius, depth=length, location=(base_point + direction * 0.5))
181
+ cone = bpy.context.active_object
182
+
183
+ # Create or get the blue material
184
+ blue_material = bpy.data.materials.get("BlueMaterial")
185
+ if not blue_material:
186
+ blue_material = bpy.data.materials.new(name="BlueMaterial")
187
+ blue_material.use_nodes = True
188
+ blue_material.node_tree.nodes.clear()
189
+ bsdf = blue_material.node_tree.nodes.new('ShaderNodeBsdfPrincipled')
190
+ output = blue_material.node_tree.nodes.new('ShaderNodeOutputMaterial')
191
+ blue_material.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
192
+ blue_material.node_tree.nodes['Principled BSDF'].inputs['Base Color'].default_value = (0.0, 0.0, 1.0, 1.0)
193
+
194
+ cone.data.materials.append(blue_material)
195
+
196
+ # Set the cone's orientation
197
+ cone.rotation_euler = direction.to_track_quat('Z', 'Y').to_euler()
198
+
199
+ return cone
200
+
201
+ def add_conn(con_index, joints_matrix, co=None):
202
+ for i, parent in enumerate(con_index):
203
+ parent = parent.item()
204
+ if parent != i:
205
+ parent_co = Vector((joints_matrix[parent][0], joints_matrix[parent][1], joints_matrix[parent][2]))
206
+ position = Vector((joints_matrix[i][0], joints_matrix[i][1], joints_matrix[i][2]))
207
+ cone = create_blue_cone(parent_co, position, radius=0.008)
208
+ if co is not None:
209
+ cone.parent = co
210
+
211
+ def merge_images(img1, img2, output_path, alpha=1):
212
+ image_mesh = Image.open(img1)
213
+ image_rig = Image.open(img2)
214
+
215
+ if alpha == 1:
216
+ image_mesh.paste(image_rig, (0, 0), image_rig)
217
+ image_mesh.save(output_path)
218
+ return
219
+
220
+ data = image_rig.getdata()
221
+ data2 = image_mesh.getdata()
222
+ new_data = []
223
+ for item, item2 in zip(data, data2):
224
+ if item[3] == 0:
225
+ new_data.append(item2)
226
+ else:
227
+ new_data.append((int(item[0]*alpha + item2[0]*(1-alpha)), int(item[1]*alpha + item2[1]*(1-alpha)), int(item[2]*alpha + item2[2]*(1-alpha)), 255))
228
+ image_mesh.putdata(new_data)
229
+
230
+ # image_mesh.paste(image_rig, (0, 0), image_rig)
231
+
232
+ image_mesh.save(output_path)
233
+
234
+ def merge_videos(video1, video2, output_path):
235
+
236
+ # overlap two videos together, video1 is the background, video2 is the foreground
237
+ # os.system(f'ffmpeg -i {video1} -i {video2} -filter_complex "[0:v][1:v] overlay=0:0:enable=\'between(t,0,60)\'" -pix_fmt yuv420p -c:a copy {output_path}')
238
+
239
+ frames_path_1 = glob.glob(video1 + '*.png')
240
+ total_frames = len(frames_path_1)
241
+ combined_frames = []
242
+ for i in range(total_frames):
243
+ frame1 = Image.open(f'{video1}{i:04d}.png')
244
+ frame2 = Image.open(f'{video2}{i:04d}.png')
245
+ frame1.paste(frame2, (0, 0), frame2)
246
+ combined_frames.append(frame1)
247
+
248
+ # paste the combined frames on a pure white background
249
+ combined_frames_white = []
250
+ for frame in combined_frames:
251
+ white = Image.new('RGB', frame.size, (255, 255, 255))
252
+ white.paste(frame, (0, 0), frame)
253
+ combined_frames_white.append(white)
254
+
255
+ combined_frames=combined_frames_white
256
+
257
+ combined_videos = torch.stack([torch.tensor(np.array(frame)) for frame in combined_frames])[..., :3]
258
+
259
+ # write the video with high quality
260
+ # io.write_video(output_path, combined_videos, 24)
261
+ io.write_video(output_path, combined_videos, 24, video_codec='libx264', options={'crf': '18'})
262
+
263
+ # convert the frames to an mp4 video
264
+
265
+ # video = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'H264'), 30, (frame1.size[0], frame1.size[1]))
266
+ # for frame in combined_frames:
267
+ # video.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
268
+ # video.release()
269
+
270
+ # video_1, audio_1, fps_1 = io.read_video(video1, pts_unit="sec")
271
+ # video_2, audio_2, fps_2 = io.read_video(video2, pts_unit="sec")
272
+ # non_zero = video_2.sum(dim=-1) != 0
273
+ # non_zero = torch.stack([non_zero, non_zero, non_zero], dim=-1)
274
+ # video_1[non_zero] = video_2[non_zero]
275
+ # io.write_video(output_path, video_1, int(fps_1['video_fps']))
276
+
277
+ def add_skin(filepath, skin, bone_index, co=None, pc=None):
278
+ bpy.ops.wm.obj_import(filepath=filepath)
279
+ obj = bpy.context.object
280
+
281
+ bpy.context.view_layer.objects.active = obj
282
+ bpy.ops.object.shade_smooth()
283
+ bpy.ops.object.mode_set(mode='EDIT')
284
+ bpy.ops.mesh.select_all(action='SELECT')
285
+ bpy.ops.mesh.normals_make_consistent(inside=False)
286
+ bpy.ops.object.mode_set(mode='OBJECT')
287
+
288
+ if co is not None:
289
+ obj.parent = co
290
+
291
+ if pc is not None:
292
+ skin = np.array(skin)
293
+ pc = pc[:, :3].numpy()
294
+ y_max = pc[:, 1].max()
295
+ pc = pc + np.array([0, y_max, 0])
296
+ pc = pc / 2
297
+ new_skin = np.zeros((len(obj.data.vertices), skin.shape[1]))
298
+ for i, v in enumerate(obj.data.vertices):
299
+ v_co = np.array(v.co)
300
+
301
+ dist = np.linalg.norm(pc - v_co, axis=1)
302
+ # min_idx = np.argmin(dist)
303
+ # sort and take the indices of the 3 nearest points
304
+ min_idx_list = np.argsort(dist)[:3]
305
+
306
+ for min_idx in min_idx_list:
307
+ # get inverse distance weight
308
+ interpolate_weight = np.square(1 / dist[min_idx]) / np.square(1 / dist[min_idx_list]).sum()
309
+ new_skin[i] = new_skin[i] + interpolate_weight * skin[min_idx]
310
+
311
+ skin = new_skin
312
+
313
+ color_list = skin
314
+
315
+ color_list = color_list[:,bone_index]
316
+
317
+ vertex_colors = obj.data.vertex_colors.new()
318
+
319
+ for poly in obj.data.polygons:
320
+ for loop_index in poly.loop_indices:
321
+
322
+ vertex_index = obj.data.loops[loop_index].vertex_index
323
+ # Get the weight for the vertex
324
+ weight = color_list[vertex_index]
325
+
326
+ color = cmap(weight)
327
+
328
+ # Assign the weight to the vertex color (RGBA)
329
+ vertex_colors.data[loop_index].color = color # Use the weight for RGB
330
+
331
+ # let bsdf use vertex color and then output to surface
332
+ mat = bpy.data.materials.new(name='mat')
333
+ # delete all material of obj
334
+ obj.data.materials.clear()
335
+ obj.data.materials.append(mat)
336
+ mat.use_nodes = True
337
+ mat.node_tree.nodes.clear()
338
+ vertex_color = mat.node_tree.nodes.new('ShaderNodeVertexColor')
339
+ bsdf = mat.node_tree.nodes.new('ShaderNodeBsdfPrincipled')
340
+ output = mat.node_tree.nodes.new('ShaderNodeOutputMaterial')
341
+ mat.node_tree.links.new(vertex_color.outputs['Color'], bsdf.inputs['Base Color'])
342
+ mat.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
343
+ mat.node_tree.nodes['Principled BSDF'].inputs['Roughness'].default_value = 0.5
344
+
345
+
346
+
347
+ def add_pc(points):
348
+ base_sphere = create_sphere((points[0][0], points[0][1], points[0][2]), size=0.003, color=cmap(0), reduced=True)
349
+ # copy the base sphere to create the rest of the spheres
350
+ for i in tqdm(range(1, points.shape[0])):
351
+ new_sphere = base_sphere.copy()
352
+ new_sphere.location = (points[i][0], points[i][1], points[i][2])
353
+ bpy.context.collection.objects.link(new_sphere)
354
+
355
+ def add_floor(back=False):
356
+ # create a plane as floor
357
+ bpy.ops.mesh.primitive_plane_add(size=50, enter_editmode=False, align='WORLD', location=(0, 20, 0))
358
+ floor = bpy.context.object
359
+ floor.name = 'floor'
360
+ # set white material for floor
361
+ mat = bpy.data.materials.new(name='floor_mat')
362
+ floor.data.materials.append(mat)
363
+ mat.use_nodes = True
364
+ mat.node_tree.nodes.clear()
365
+ bsdf = mat.node_tree.nodes.new('ShaderNodeBsdfDiffuse')
366
+ output = mat.node_tree.nodes.new('ShaderNodeOutputMaterial')
367
+ mat.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
368
+ mat.node_tree.nodes['Diffuse BSDF'].inputs['Color'].default_value = (1, 1, 1, 1)
369
+
370
+ if back:
371
+ # create a plane as background
372
+ bpy.ops.mesh.primitive_plane_add(size=30, enter_editmode=False, align='WORLD', location=(0, 15, 0), rotation=(-0.5*np.pi, 0, 0))
373
+ background = bpy.context.object
374
+ background.name = 'background'
375
+ # set white material for background
376
+ mat = bpy.data.materials.new(name='background_mat')
377
+ background.data.materials.append(mat)
378
+ mat.use_nodes = True
379
+ mat.node_tree.nodes.clear()
380
+ bsdf = mat.node_tree.nodes.new('ShaderNodeBsdfDiffuse')
381
+ output = mat.node_tree.nodes.new('ShaderNodeOutputMaterial')
382
+ mat.node_tree.links.new(bsdf.outputs['BSDF'], output.inputs['Surface'])
383
+ mat.node_tree.nodes['Diffuse BSDF'].inputs['Color'].default_value = (1, 1, 1, 1)
384
+
385
+ def setup_render():
386
+ # color management
387
+ bpy.context.scene.view_settings.view_transform = 'Standard'
388
+
389
+ # set the render engine to Cycles
390
+ bpy.context.scene.render.engine = 'CYCLES'
391
+ # enable cuda
392
+ bpy.context.preferences.addons['cycles'].preferences.get_devices()
393
+ bpy.context.preferences.addons['cycles'].preferences.compute_device_type = 'CUDA'
394
+ bpy.context.scene.cycles.device = 'GPU'
395
+
396
+ # set render background to transparent
397
+ bpy.context.scene.render.film_transparent = True
398
+
399
+ def render(output_path, shadow=True, shading=True, quick=False):
400
+
401
+ if shadow:
402
+ add_floor()
403
+
404
+ if shading:
405
+ # create a sun light
406
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
407
+ light = bpy.context.object
408
+ light.data.energy = 5
409
+ # angle pointing to the origin
410
+ light.rotation_euler = (0.1*np.pi, 0, 0)
411
+ # set angle
412
+ light.data.angle = 0.08*np.pi
413
+
414
+ else:
415
+ # global illumination via a world light
416
+ world = bpy.data.worlds.new('World')
417
+ bpy.context.scene.world = world
418
+ world.use_nodes = True
419
+ world_light = world.node_tree.nodes['Background']
420
+ world_light.inputs['Strength'].default_value = 1
421
+ world_light.inputs['Color'].default_value = (1, 1, 1, 1)
422
+
423
+ # create a camera
424
+ cam = bpy.data.cameras.new("Camera")
425
+ cam_ob = bpy.data.objects.new("Camera", cam)
426
+ camera = bpy.data.objects['Camera']
427
+ bpy.context.scene.collection.objects.link(camera)
428
+ camera.location = Vector((2, -1.5, 2))
429
+ look_at = Vector((0, 0, 0.36))
430
+ # compute the rotation
431
+ camera.rotation_mode = 'QUATERNION'
432
+ camera.rotation_quaternion = (camera.location - look_at).to_track_quat('Z', 'Y')
433
+ # set size
434
+ camera.data.sensor_width = 26
435
+ # set the camera to be active
436
+ bpy.context.scene.camera = camera
437
+
438
+
439
+
440
+ # make the rendered image square
441
+ bpy.context.scene.render.resolution_x = 2048
442
+ bpy.context.scene.render.resolution_y = 2048
443
+
444
+ setup_render()
445
+
446
+ if quick:
447
+ # reduce the number of samples
448
+ bpy.context.scene.cycles.samples = 128
449
+ bpy.context.scene.cycles.preview_samples = 128
450
+ bpy.context.scene.cycles.max_bounces = 1
451
+ bpy.context.scene.cycles.min_bounces = 1
452
+ bpy.context.scene.cycles.diffuse_bounces = 1
453
+ bpy.context.scene.cycles.glossy_bounces = 1
454
+ else:
455
+ bpy.context.scene.cycles.samples = 1024
456
+ bpy.context.scene.cycles.preview_samples = 1024
457
+ bpy.context.scene.cycles.max_bounces = 4
458
+ bpy.context.scene.cycles.min_bounces = 4
459
+ bpy.context.scene.cycles.diffuse_bounces = 4
460
+ bpy.context.scene.cycles.glossy_bounces = 4
461
+
462
+ # output path
463
+ # output_path = '/home/ydengbd/objaverse/test.png'
464
+ bpy.context.scene.render.filepath = output_path
465
+ bpy.ops.render.render(write_still=True)
466
+
467
+ def render_spin(output_path, co, shadow=True, shading=True, quick=False):
468
+ # create a new coordinate system at the origin
469
+ new_co = add_co(location=(0, 0, 0), rotation=(0, 0, 0), scale=(1, 1, 1))
470
+ # set the object to be the child of the new coordinate system
471
+ co.parent = new_co
472
+
473
+ # add spin animation to the new coordinate system
474
+ new_co.rotation_mode = 'XYZ'
475
+ new_co.rotation_euler = (0, 0, 0)
476
+ new_co.keyframe_insert(data_path='rotation_euler', index=2, frame=0)
477
+ new_co.rotation_euler = (0, 0, 2*np.pi)
478
+ new_co.keyframe_insert(data_path='rotation_euler', index=2, frame=60)
479
+
480
+ if shadow:
481
+ add_floor()
482
+
483
+ if shading:
484
+ # create a sun light
485
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
486
+ light = bpy.context.object
487
+ light.data.energy = 5
488
+ # angle pointing to the origin
489
+ light.rotation_euler = (0.1*np.pi, 0, 0)
490
+ # set angle
491
+ light.data.angle = 0.08*np.pi
492
+
493
+ else:
494
+ # global illumination via a world light
495
+ world = bpy.data.worlds.new('World')
496
+ bpy.context.scene.world = world
497
+ world.use_nodes = True
498
+ world_light = world.node_tree.nodes['Background']
499
+ world_light.inputs['Strength'].default_value = 1
500
+ world_light.inputs['Color'].default_value = (1, 1, 1, 1)
501
+
502
+ # create a camera
503
+ cam = bpy.data.cameras.new("Camera")
504
+ cam_ob = bpy.data.objects.new("Camera", cam)
505
+ camera = bpy.data.objects['Camera']
506
+ bpy.context.scene.collection.objects.link(camera)
507
+ camera.location = Vector((2, -1.5, 2))
508
+ look_at = Vector((0, 0, 0.36))
509
+ # compute the rotation
510
+ camera.rotation_mode = 'QUATERNION'
511
+ camera.rotation_quaternion = (camera.location - look_at).to_track_quat('Z', 'Y')
512
+ # set size
513
+ camera.data.sensor_width = 26
514
+ # set the camera to be active
515
+ bpy.context.scene.camera = camera
516
+
517
+
518
+ # render the animation
519
+ bpy.context.scene.frame_start = 0
520
+ bpy.context.scene.frame_end = 60
521
+
522
+ # make the rendered image square
523
+ bpy.context.scene.render.resolution_x = 1024
524
+ bpy.context.scene.render.resolution_y = 1024
525
+
526
+ setup_render()
527
+
528
+ if quick:
529
+ # reduce the number of samples
530
+ bpy.context.scene.cycles.samples = 128
531
+ bpy.context.scene.cycles.preview_samples = 128
532
+ bpy.context.scene.cycles.max_bounces = 1
533
+ bpy.context.scene.cycles.min_bounces = 1
534
+ bpy.context.scene.cycles.diffuse_bounces = 1
535
+ bpy.context.scene.cycles.glossy_bounces = 1
536
+ else:
537
+ bpy.context.scene.cycles.samples = 512
538
+ bpy.context.scene.cycles.preview_samples = 512
539
+ bpy.context.scene.cycles.max_bounces = 4
540
+ bpy.context.scene.cycles.min_bounces = 4
541
+ bpy.context.scene.cycles.diffuse_bounces = 4
542
+ bpy.context.scene.cycles.glossy_bounces = 4
543
+
544
+ # output path
545
+ bpy.context.scene.render.filepath = output_path
546
+ if output_path.endswith('.mp4'):
547
+ # render a mp4 video
548
+ bpy.context.scene.render.image_settings.file_format = 'FFMPEG'
549
+ bpy.context.scene.render.ffmpeg.format = 'MPEG4'
550
+ bpy.context.scene.render.ffmpeg.codec = 'H264'
551
+
552
+ bpy.ops.render.render(animation=True, write_still=True)
553
+
554
+ def setup_anim(armature, arti):
555
+ # enter pose mode
556
+ print('Arti shape', arti.shape)
557
+ bpy.ops.object.mode_set(mode='POSE')
558
+ print('total bones', len(armature.pose.bones))
559
+ for i, pose_bone in enumerate(armature.pose.bones):
560
+ pose_bone.rotation_mode = 'XYZ'
561
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=0)
562
+
563
+ pose_bone.rotation_euler = arti[i]
564
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=30)
565
+
566
+ pose_bone.rotation_euler = Vector((0, 0, 0))
567
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=60)
568
+ bpy.ops.object.mode_set(mode='OBJECT')
569
+
570
+ def render_anim(output_path, armature, arti, quick=False):
571
+ # enter pose mode
572
+ setup_anim(armature, arti)
573
+
574
+ # save blend file
575
+ # bpy.ops.wm.save_as_mainfile(filepath='/data2/ydengbd/objaverse/test.blend')
576
+
577
+ add_floor()
578
+
579
+ # create a sun light
580
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
581
+ light = bpy.context.object
582
+ light.data.energy = 5
583
+ # angle pointing to the origin
584
+ light.rotation_euler = (50/180*np.pi, 0, -20/180*np.pi)
585
+ # set angle
586
+ light.data.angle = 12/180*np.pi
587
+
588
+ # create a camera
589
+ cam = bpy.data.cameras.new("Camera")
590
+ cam_ob = bpy.data.objects.new("Camera", cam)
591
+ camera = bpy.data.objects['Camera']
592
+ bpy.context.scene.collection.objects.link(camera)
593
+ camera.location = Vector((0, -3, 1.3))
594
+ camera.rotation_euler = Vector((1.309, 0, 0))
595
+ # set size
596
+ camera.data.sensor_width = 36
597
+ # set the camera to be active
598
+ bpy.context.scene.camera = camera
599
+
600
+ # render the animation
601
+ bpy.context.scene.frame_start = 0
602
+ bpy.context.scene.frame_end = 60
603
+
604
+ # set the render resolution to 1920x1080
605
+ bpy.context.scene.render.resolution_x = 1920
606
+ bpy.context.scene.render.resolution_y = 1080
607
+
608
+ setup_render()
609
+
610
+ if quick:
611
+ # reduce the number of samples
612
+ bpy.context.scene.cycles.samples = 128
613
+ bpy.context.scene.cycles.preview_samples = 128
614
+ bpy.context.scene.cycles.max_bounces = 1
615
+ bpy.context.scene.cycles.min_bounces = 1
616
+ bpy.context.scene.cycles.diffuse_bounces = 1
617
+ bpy.context.scene.cycles.glossy_bounces = 1
618
+ else:
619
+ bpy.context.scene.cycles.samples = 1024
620
+ bpy.context.scene.cycles.preview_samples = 1024
621
+ bpy.context.scene.cycles.max_bounces = 4
622
+ bpy.context.scene.cycles.min_bounces = 4
623
+ bpy.context.scene.cycles.diffuse_bounces = 4
624
+ bpy.context.scene.cycles.glossy_bounces = 4
625
+
626
+ # output path
627
+ bpy.context.scene.render.filepath = output_path
628
+ if output_path.endswith('.mp4'):
629
+ # render a mp4 video
630
+ bpy.context.scene.render.image_settings.file_format = 'FFMPEG'
631
+ bpy.context.scene.render.ffmpeg.format = 'MPEG4'
632
+ bpy.context.scene.render.ffmpeg.codec = 'H264'
633
+
634
+ bpy.ops.render.render(animation=True, write_still=True)
635
+
636
+
637
+ def render_animspin(output_path, co, armature, arti, shadow=True, shading=True, quick=False):
638
+ # enter pose mode
639
+ print('Arti shape', arti.shape)
640
+ bpy.ops.object.mode_set(mode='POSE')
641
+ print('total bones', len(armature.pose.bones))
642
+ for i, pose_bone in enumerate(armature.pose.bones):
643
+ pose_bone.rotation_mode = 'XYZ'
644
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=0)
645
+
646
+ pose_bone.rotation_euler = arti[i]
647
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=30)
648
+
649
+ pose_bone.rotation_euler = Vector((0, 0, 0))
650
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=60)
651
+
652
+ pose_bone.rotation_euler = arti[i]
653
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=90)
654
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=150)
655
+
656
+ pose_bone.rotation_euler = Vector((0, 0, 0))
657
+ pose_bone.keyframe_insert(data_path="rotation_euler", frame=180)
658
+ bpy.ops.object.mode_set(mode='OBJECT')
659
+
660
+ # create a new coordinate system at the origin
661
+ new_co = add_co(location=(0, 0, 0), rotation=(0, 0, 0), scale=(1, 1, 1))
662
+ # set the object to be the child of the new coordinate system
663
+ co.parent = new_co
664
+
665
+ # add spin animation to the new coordinate system
666
+ new_co.rotation_mode = 'XYZ'
667
+ new_co.rotation_euler = (0, 0, 0)
668
+ new_co.keyframe_insert(data_path='rotation_euler', index=2, frame=90)
669
+ new_co.rotation_euler = (0, 0, 2*np.pi)
670
+ new_co.keyframe_insert(data_path='rotation_euler', index=2, frame=150)
671
+
672
+ if shadow:
673
+ add_floor()
674
+
675
+ if shading:
676
+ # create a sun light
677
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
678
+ light = bpy.context.object
679
+ light.data.energy = 5
680
+ # angle pointing to the origin
681
+ light.rotation_euler = (0.1*np.pi, 0, 0)
682
+ # set angle
683
+ light.data.angle = 0.08*np.pi
684
+
685
+ else:
686
+ # global illumination via a world light
687
+ world = bpy.data.worlds.new('World')
688
+ bpy.context.scene.world = world
689
+ world.use_nodes = True
690
+ world_light = world.node_tree.nodes['Background']
691
+ world_light.inputs['Strength'].default_value = 1
692
+ world_light.inputs['Color'].default_value = (1, 1, 1, 1)
693
+
694
+ # create a camera
695
+ cam = bpy.data.cameras.new("Camera")
696
+ cam_ob = bpy.data.objects.new("Camera", cam)
697
+ camera = bpy.data.objects['Camera']
698
+ bpy.context.scene.collection.objects.link(camera)
699
+ camera.location = Vector((2, -1.5, 2))
700
+ look_at = Vector((0, 0, 0.36))
701
+ # compute the rotation
702
+ camera.rotation_mode = 'QUATERNION'
703
+ camera.rotation_quaternion = (camera.location - look_at).to_track_quat('Z', 'Y')
704
+ # set size
705
+ camera.data.sensor_width = 26
706
+ # set the camera to be active
707
+ bpy.context.scene.camera = camera
708
+
709
+
710
+ # render the animation
711
+ bpy.context.scene.frame_start = 0
712
+ bpy.context.scene.frame_end = 180
713
+
714
+ # make the rendered image square
715
+ bpy.context.scene.render.resolution_x = 1024
716
+ bpy.context.scene.render.resolution_y = 1024
717
+
718
+ setup_render()
719
+
720
+ if quick:
721
+ # reduce the number of samples
722
+ bpy.context.scene.cycles.samples = 128
723
+ bpy.context.scene.cycles.preview_samples = 128
724
+ bpy.context.scene.cycles.max_bounces = 1
725
+ bpy.context.scene.cycles.min_bounces = 1
726
+ bpy.context.scene.cycles.diffuse_bounces = 1
727
+ bpy.context.scene.cycles.glossy_bounces = 1
728
+ else:
729
+ bpy.context.scene.cycles.samples = 512
730
+ bpy.context.scene.cycles.preview_samples = 512
731
+ bpy.context.scene.cycles.max_bounces = 4
732
+ bpy.context.scene.cycles.min_bounces = 4
733
+ bpy.context.scene.cycles.diffuse_bounces = 4
734
+ bpy.context.scene.cycles.glossy_bounces = 4
735
+
736
+ # output path
737
+ bpy.context.scene.render.filepath = output_path
738
+ if output_path.endswith('.mp4'):
739
+ # render a mp4 video
740
+ bpy.context.scene.render.image_settings.file_format = 'FFMPEG'
741
+ bpy.context.scene.render.ffmpeg.format = 'MPEG4'
742
+ bpy.context.scene.render.ffmpeg.codec = 'H264'
743
+
744
+ bpy.ops.render.render(animation=True, write_still=True)
745
+
746
+ def render_scene(output_path, shadow=True):
747
+
748
+ if shadow:
749
+ add_floor()
750
+
751
+
752
+ # create a sun light
753
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
754
+ light = bpy.context.object
755
+ light.data.energy = 5
756
+ # angle pointing to the origin
757
+ light.rotation_euler = (50/180*np.pi, 0, -20/180*np.pi)
758
+ # set angle
759
+ light.data.angle = 12/180*np.pi
760
+
761
+ # create a camera
762
+ cam = bpy.data.cameras.new("Camera")
763
+ cam_ob = bpy.data.objects.new("Camera", cam)
764
+ camera = bpy.data.objects['Camera']
765
+ bpy.context.scene.collection.objects.link(camera)
766
+ camera.location = Vector((0, -10, 5))
767
+ camera.rotation_euler = Vector((1.22, 0, 0))
768
+ # set size
769
+ camera.data.sensor_width = 26
770
+ # set the camera to be active
771
+ bpy.context.scene.camera = camera
772
+
773
+
774
+
775
+ # set the render resolution to 1920x1080
776
+ bpy.context.scene.render.resolution_x = 1920
777
+ bpy.context.scene.render.resolution_y = 1080
778
+
779
+ setup_render()
780
+
781
+
782
+
783
+ # output path
784
+ # output_path = '/home/ydengbd/objaverse/test.png'
785
+ bpy.context.scene.render.filepath = output_path
786
+ bpy.ops.render.render(write_still=True)
787
+
788
+
789
+ def render_teaser(output_path, shadow=True, quick=False):
790
+
791
+ if shadow:
792
+ add_floor(back=True)
793
+
794
+ # create a sun light
795
+ bpy.ops.object.light_add(type='SUN', radius=1, align='WORLD', location=(-1, -1, 3))
796
+ light = bpy.context.object
797
+ light.data.energy = 5
798
+ # angle pointing to the origin
799
+ light.rotation_euler = (50/180*np.pi, 0, -20/180*np.pi)
800
+ # set angle
801
+ light.data.angle = 12/180*np.pi
802
+
803
+ # create a camera
804
+ cam = bpy.data.cameras.new("Camera")
805
+ cam_ob = bpy.data.objects.new("Camera", cam)
806
+ camera = bpy.data.objects['Camera']
807
+ bpy.context.scene.collection.objects.link(camera)
808
+ camera.location = Vector((0, -3, 1.3))
809
+ camera.rotation_euler = Vector((80/180*np.pi, 0, 0))
810
+ # set size
811
+ camera.data.sensor_width = 48
812
+ # set the camera to be active
813
+ bpy.context.scene.camera = camera
814
+
815
+ # render the animation
816
+ bpy.context.scene.frame_start = 0
817
+ bpy.context.scene.frame_end = 60
818
+
819
+ # set the render resolution to 2400x1080
820
+ bpy.context.scene.render.resolution_x = 2400
821
+ bpy.context.scene.render.resolution_y = 1080
822
+
823
+ setup_render()
824
+
825
+ if quick:
826
+ # reduce the number of samples
827
+ bpy.context.scene.cycles.samples = 128
828
+ bpy.context.scene.cycles.preview_samples = 128
829
+ bpy.context.scene.cycles.max_bounces = 1
830
+ bpy.context.scene.cycles.min_bounces = 1
831
+ bpy.context.scene.cycles.diffuse_bounces = 1
832
+ bpy.context.scene.cycles.glossy_bounces = 1
833
+ else:
834
+ bpy.context.scene.cycles.samples = 1024
835
+ bpy.context.scene.cycles.preview_samples = 1024
836
+ bpy.context.scene.cycles.max_bounces = 4
837
+ bpy.context.scene.cycles.min_bounces = 4
838
+ bpy.context.scene.cycles.diffuse_bounces = 4
839
+ bpy.context.scene.cycles.glossy_bounces = 4
840
+
841
+ # output path
842
+ bpy.context.scene.render.filepath = output_path
843
+ if output_path.endswith('.mp4'):
844
+ # render a mp4 video
845
+ bpy.context.scene.render.image_settings.file_format = 'FFMPEG'
846
+ bpy.context.scene.render.ffmpeg.format = 'MPEG4'
847
+ bpy.context.scene.render.ffmpeg.codec = 'H264'
848
+
849
+ bpy.ops.render.render(animation=True, write_still=True)
850
+
851
+ def setup_armature(path, tex=False, save=True):
852
+ joints_matrix = torch.load(os.path.join(path, 'joints.pt'))
853
+ connectivity = torch.load(os.path.join(path, 'conns.pt'))
854
+ skinning_weights = torch.load(os.path.join(path, 'skins.pt'))
855
+ obj_file_path = os.path.join(path, 'object.obj')
856
+
857
+ # bpy.ops.wm.obj_import(filepath=obj_file_path)
858
+ add_mesh(obj_file_path, tex=tex)
859
+ mesh_object = bpy.context.selected_objects[0]
860
+
861
+ # pack textures
862
+ bpy.ops.file.pack_all()
863
+
864
+ temp = torch.tensor(joints_matrix)[:, 1].clone()
865
+ joints_matrix[:, 1] = -joints_matrix[:, 2]
866
+ joints_matrix[:, 2] = temp
867
+
868
+ bpy.ops.object.armature_add()
869
+ armature_obj = bpy.context.object
870
+
871
+
872
+ bpy.ops.object.mode_set(mode='EDIT')
873
+ bpy.ops.armature.select_all(action='SELECT')
874
+ bpy.ops.armature.delete()
875
+
876
+ world_matrix = Matrix([[1, 0, 0, 0],
877
+ [0, 1, 0, 0],
878
+ [0, 0, 1, 0],
879
+ [0, 0, 0, 1]])
880
+ armature_obj.matrix_world = world_matrix
881
+
882
+ bone_dict = {}
883
+
884
+ i_name = 0
885
+
886
+ for i in range(len(joints_matrix)):
887
+
888
+ if connectivity[i] == i:
889
+ continue
890
+ bone_name = str(i_name)
891
+ bone = armature_obj.data.edit_bones.new(bone_name)
892
+ bone.head = joints_matrix[connectivity[i]].cpu().numpy()
893
+ bone.tail = joints_matrix[i].cpu().numpy()
894
+ bone_dict[bone_name] = bone
895
+ i_name += 1
896
+
897
+ for bone_name, bone in bone_dict.items():
898
+ # Find parent bone by checking if current bone's head matches any other bone's tail
899
+ for other_bone_name, other_bone in bone_dict.items():
900
+ if other_bone != bone and bone.head == other_bone.tail:
901
+ bone.parent = other_bone
902
+ break
903
+
904
+ assert i_name == skinning_weights.shape[1]
905
+
906
+ for i, skinning_weight in enumerate(skinning_weights):
907
+ # print("skinning_weight", skinning_weight)
908
+ vertex_index = i
909
+ for j,weight in enumerate(skinning_weight):
910
+ bone_name = str(j)
911
+ bone_weight = float(weight)
912
+
913
+ vertex_group_name = f"{bone_name}"
914
+ vertex_group = mesh_object.vertex_groups.get(vertex_group_name)
915
+ if vertex_group is None:
916
+ vertex_group = mesh_object.vertex_groups.new(name=vertex_group_name)
917
+ vertex_group.add([vertex_index], bone_weight, 'ADD')
918
+
919
+ # for obj in bpy.context.scene.objects:
920
+ # if obj.type == 'MESH':
921
+ modifier = mesh_object.modifiers.new(name="Armature", type='ARMATURE')
922
+ modifier.object = armature_obj
923
+ modifier.use_vertex_groups = True
924
+ print("Armature modifier added to mesh:", mesh_object.name)
925
+
926
+ bpy.ops.object.mode_set(mode='OBJECT')
927
+ if save:
928
+ bpy.ops.wm.save_as_mainfile(filepath= os.path.join(path, 'blender_output.blend'))
929
+
930
+ return armature_obj
931
+
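(For context, setup_armature() below expects three tensors saved next to object.obj. A minimal sketch of writing files in that layout follows; the shapes are assumptions inferred from how the tensors are indexed in this file, and save_rig is a hypothetical helper, not part of the repo.)

# Hypothetical helper illustrating the on-disk layout setup_armature(path) loads.
# Assumed shapes: joints (J, 3) positions, conns (J,) parent index per joint (root maps to itself),
# skins (V, B) per-vertex weights with one column per bone.
import os
import torch

def save_rig(path, joints, conns, skins):
    os.makedirs(path, exist_ok=True)
    torch.save(joints, os.path.join(path, 'joints.pt'))
    torch.save(conns, os.path.join(path, 'conns.pt'))
    torch.save(skins, os.path.join(path, 'skins.pt'))
    # object.obj must also be placed in `path` before calling setup_armature(path)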
932
+ def reload_tensor_skinning(data, bone_name_list):
933
+
934
+ # with open(json_file, "r") as f:
935
+ # skinning_data = json.load(f)
936
+
937
+ armature_obj = bpy.data.objects.get("Armature")
938
+ if not armature_obj:
939
+ print("Error: Armature object 'Armature' not found.")
940
+ return
941
+
942
+ # parent all mesh objects to the armature object
943
+ count = 0
944
+ for obj in bpy.context.scene.objects:
945
+ if obj.type == 'MESH':
946
+ obj.parent = armature_obj
947
+ count += 1
948
+
949
+ print("total mesh count:", count)
950
+
951
+ for obj in bpy.context.scene.objects:
952
+ vertex_index = 0
953
+ if obj.type == 'MESH':
954
+ # mesh_name = obj.name
955
+ # if mesh_name in skinning_data:
956
+ # skinning_info = skinning_data[mesh_name]
957
+ # if "weight" in skinning_info:
958
+ # print("Applying skinning data for mesh:", mesh_name)
959
+ # vertex_index = 0
960
+ # for vertex_weight in skinning_info["weight"]:
961
+ # for bone_name, weight_value in vertex_weight.items():
962
+ # vertex_group = obj.vertex_groups.get(bone_name)
963
+ # if vertex_group is None:
964
+ # vertex_group = obj.vertex_groups.new(name=bone_name)
965
+ # print("Vertex group created:", bone_name)
966
+ # vertex_group.add([vertex_index], weight_value, 'REPLACE')
967
+ # vertex_index += 1
968
+ # else:
969
+ # print("No skinning data found for mesh:", mesh_name)
970
+
971
+ for i, v in enumerate(obj.data.vertices):
972
+ v_co = np.array(v.co)
973
+ pc = data['pc'][:, :3].numpy()
974
+ y_max = pc[:, 1].max()
975
+ pc = pc + np.array([0, y_max, 0])
976
+ pc = pc / 2
977
+ dist = np.linalg.norm(pc - v_co, axis=1)
978
+ # min_idx = np.argmin(dist)
979
+ # sort, and then get top 3 index
980
+ min_idx_list = np.argsort(dist)[:3]
981
+
982
+ for min_idx in min_idx_list:
983
+ # get inverse distance weight
984
+ interpolate_weight = np.square(1 / dist[min_idx]) / np.square(1 / dist[min_idx_list]).sum()
985
+
986
+ for idx, j in enumerate(data['skins_index'][min_idx]):
987
+ if j == -1:
988
+ break
989
+ bone_name = bone_name_list[j]
990
+ vertex_group = obj.vertex_groups.get(str(int(bone_name)))
991
+ if vertex_group is None:
992
+ vertex_group = obj.vertex_groups.new(name=str(int(bone_name)))
993
+ print("Vertex group created:", bone_name)
994
+
995
+ vertex_group.add([i], interpolate_weight * data['skins_weight'][min_idx][idx], 'ADD')
996
+
997
+
998
+ for obj in bpy.context.scene.objects:
999
+ if obj.type == 'MESH':
1000
+ modifier = obj.modifiers.new(name="Armature", type='ARMATURE')
1001
+ modifier.object = armature_obj
1002
+ modifier.use_vertex_groups = True
1003
+ print("Armature modifier added to mesh:", obj.name)
1004
+
1005
+ def reload_tensor(data, root='data', save=True):
1006
+ joints_matrix = data['joints'].clone()
1007
+ connectivity = data['conns']
1008
+ obj_file_path = os.path.join(root, data['name'], 'object.obj')
1009
+
1010
+ # bpy.ops.wm.obj_import(filepath=obj_file_path)
1011
+ add_mesh(obj_file_path)
1012
+ mesh_object = bpy.context.selected_objects[0]
1013
+
1014
+ # pack textures
1015
+ bpy.ops.file.pack_all()
1016
+
1017
+ y_max = data['pc'][:, 1].max()
1018
+ joints_matrix = joints_matrix + torch.tensor([0, y_max, 0])
1019
+ joints_matrix = joints_matrix / 2
1020
+
1021
+ temp = joints_matrix[:, 1].clone()
1022
+ joints_matrix[:, 1] = -joints_matrix[:, 2]
1023
+ joints_matrix[:, 2] = temp
1024
+
1025
+ bpy.ops.object.armature_add()
1026
+ armature_obj = bpy.context.object
1027
+
1028
+
1029
+ bpy.ops.object.mode_set(mode='EDIT')
1030
+ bpy.ops.armature.select_all(action='SELECT')
1031
+ bpy.ops.armature.delete()
1032
+
1033
+ world_matrix = Matrix([[1, 0, 0, 0],
1034
+ [0, 1, 0, 0],
1035
+ [0, 0, 1, 0],
1036
+ [0, 0, 0, 1]])
1037
+ armature_obj.matrix_world = world_matrix
1038
+
1039
+ bone_dict = {}
1040
+ bone_name_list = np.zeros(data['bones_num'])
1041
+ i_name = 0
1042
+
1043
+ for i in range(len(joints_matrix)):
1044
+
1045
+ if connectivity[i] == i:
1046
+ continue
1047
+ bone_name = str(i_name)
1048
+ bone = armature_obj.data.edit_bones.new(bone_name)
1049
+ bone.head = joints_matrix[connectivity[i]].cpu().numpy()
1050
+ bone.tail = joints_matrix[i].cpu().numpy()
1051
+ bone_dict[bone_name] = bone
1052
+ for j, skinbone in enumerate(data['bones']):
1053
+ if torch.equal(skinbone[:3], data['joints'][connectivity[i]]) and torch.equal(skinbone[3:], data['joints'][i]):
1054
+ bone_name_list[j] = i_name
1055
+ i_name += 1
1056
+
1057
+ for bone_name, bone in bone_dict.items():
1058
+ # Find parent bone by checking if current bone's head matches any other bone's tail
1059
+ for other_bone_name, other_bone in bone_dict.items():
1060
+ if other_bone != bone and bone.head == other_bone.tail:
1061
+ bone.parent = other_bone
1062
+ break
1063
+
1064
+ print(bone_name_list)
1065
+
1066
+ reload_tensor_skinning(data, bone_name_list)
1067
+
1068
+ print("Armature modifier added to mesh:", mesh_object.name)
1069
+
1070
+ bpy.ops.object.mode_set(mode='OBJECT')
1071
+ if save:
1072
+ bpy.ops.wm.save_as_mainfile(filepath=os.path.join(root, data['name'], 'blender_output.blend'))
1073
+
1074
+ return armature_obj
1075
+
1076
+ def load_blender(blender_path):
1077
+
1078
+ bpy.ops.wm.read_homefile(use_empty=True)
1079
+ # bpy.ops.wm.append(directory=object_path, link=False)
1080
+ # load_object(object_path)
1081
+ bpy.ops.wm.open_mainfile(filepath=blender_path)
1082
+ armature_obj = []
1083
+ mesh_obj = []
1084
+ for obj in bpy.context.scene.objects:
1085
+ if obj.type == "ARMATURE":
1086
+ armature_obj.append(obj)
1087
+ if obj.type == "MESH":
1088
+ mesh_obj.append(obj)
1089
+
1090
+ print('mesh obj:', len(mesh_obj))
1091
+
1092
+
1093
+
1094
+ # start retrieving the mesh, skinning and rigging information
1095
+
1096
+ #1. retrieve the rigging information and save the world matrix of the armature object
1097
+ total_armature_info = {}
1098
+ joints_matrix = []
1099
+ bone_dict = {}
1100
+ parent_name= []
1101
+ bone_count = 0
1102
+ for obj in armature_obj:
1103
+ # depsgraph = bpy.context.evaluated_depsgraph_get()
1104
+ # obj = obj.evaluated_get(depsgraph)
1105
+ armature_info = {}
1106
+ armature_info["world_matrix"] = [list(row) for row in obj.matrix_world.copy()]
1107
+ translation = obj.matrix_world.translation
1108
+ for bone in obj.pose.bones:
1109
+
1110
+ joints_matrix.append(np.array(list((obj.matrix_world.to_3x3() @ bone.head+translation).copy())))
1111
+
1112
+ if bone.parent:
1113
+ parent_name.append(bone.parent.name)
1114
+ else:
1115
+ parent_name.append('root')
1116
+ bone_dict[bone.name] = bone_count
1117
+ bone_count += 1
1118
+ connectivity = torch.zeros(bone_count, dtype=torch.int32)
1119
+
1120
+ for i, bone_name in enumerate(parent_name):
1121
+ if bone_name == 'root':
1122
+ connectivity[i] = i
1123
+ else:
1124
+ connectivity[i] = bone_dict[bone_name]
1125
+ joints_matrix = torch.from_numpy(np.array(joints_matrix))
1126
+
1127
+ skinning_weight = torch.zeros(len(mesh_obj[0].data.vertices), joints_matrix.shape[0])
1128
+
1129
+ vertex_index = 0
1130
+ for obj in mesh_obj:
1131
+ vertex_groups = obj.vertex_groups
1132
+
1133
+
1134
+ for vertex in obj.data.vertices:
1135
+ vertex_info = {}
1136
+ for group in vertex.groups:
1137
+ name = vertex_groups[group.group].name
1138
+
1139
+ weight = group.weight
1140
+ skinning_weight[vertex.index][bone_dict[name]] = weight
1141
+
1142
+ obj_save_path = blender_path.replace('.blend', '.obj')
1143
+ bpy.ops.wm.obj_export(filepath=obj_save_path, export_materials=False)
1144
+ return joints_matrix,connectivity, skinning_weight
1145
+
1146
+
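(A hedged usage sketch for load_blender() above; the .blend path is a placeholder. The returned shapes follow the construction above: joints (J, 3), connectivity (J,), skinning weights (V, J).)

# Hypothetical call; the path is a placeholder.
joints, conns, skins = load_blender('/path/to/asset.blend')
print(joints.shape, conns.shape, skins.shape)
# Side effect: the scene geometry is also exported to /path/to/asset.obj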
1147
+ def save_scene(scene_path):
1148
+ # export the scene as a glb file
1149
+ if scene_path.endswith('.glb'):
1150
+ bpy.ops.export_scene.gltf(filepath=scene_path)
1151
+ bpy.ops.wm.save_as_mainfile(filepath=scene_path.replace('.glb', '.blend'))
1152
+ elif scene_path.endswith('.blend'):
1153
+ bpy.ops.wm.save_as_mainfile(filepath=scene_path)
1154
+ elif scene_path.endswith('.obj'):
1155
+ bpy.ops.wm.obj_export(filepath=scene_path, export_materials=False)
1156
+ else:
1157
+ raise ValueError(f"Unsupported file extension: {scene_path}")
1158
+
1159
+ if __name__ == '__main__':
1160
+ # load the mesh
1161
+ empty()
1162
+ add_mesh('/home/ydengbd/objaverse/obj/0001.obj')
1163
+ # load the joints
1164
+ joints_matrix = np.load('/home/ydengbd/objaverse/joints/0001.npy')
1165
+ add_joint(joints_matrix)
1166
+ # load the connections
1167
+ con_index = np.load('/home/ydengbd/objaverse/connections/0001.npy')
1168
+ add_conn(con_index)
1169
+ # load the skin
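(The __main__ demo above is cut off at the skin-loading step in this diff. As a rough illustration only, the remaining steps are typically a render call plus an optional scene export using the helpers defined above; the output paths below are placeholders.)

# Hypothetical continuation of the truncated demo; paths are placeholders.
render('/tmp/0001.png', shadow=True, shading=True, quick=True)   # still image via Cycles
save_scene('/tmp/0001.blend')                                    # keep a .blend copy for inspection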
Anymate/utils/train_utils.py ADDED
@@ -0,0 +1,406 @@
1
+ import os
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+
5
+ import torch
6
+ import torch.backends.cudnn as cudnn
7
+ from torch.utils.tensorboard import SummaryWriter
8
+ from torch.utils.data import DataLoader
9
+
10
+ from Anymate.dataset import AnymateDataset, my_collate
11
+ from Anymate.model import EncoderDecoder
12
+ from Anymate.utils.loss_utils import cross_entropy_with_probs_batch, cos_loss, cos_loss_clamp, chamfer_distance_with_average
13
+ from Anymate.utils.vol_utils import get_co, get_gt, extract_keypoints
14
+ from torch.nn.parallel import DistributedDataParallel as DDP
15
+ from torch.distributed import init_process_group, destroy_process_group
16
+ from torch.utils.data.distributed import DistributedSampler
17
+
18
+ import point_cloud_utils as pcu
19
+ from sklearn.cluster import DBSCAN
20
+ from diffusers import DDPMScheduler, DDIMScheduler
21
+ import torch.nn.functional as F
22
+ from Anymate.utils.diffusion_utils import my_collate_diff, randn_tensor
23
+
24
+
25
+ def ddp_setup(rank: int, world_size: int, port: int):
26
+ """
27
+ Args:
28
+ rank: Unique identifier of each process
29
+ world_size: Total number of processes
30
+ """
31
+ os.environ["MASTER_ADDR"] = "localhost"
32
+ os.environ["MASTER_PORT"] = str(port)
33
+ torch.cuda.set_device(rank)
34
+ init_process_group(backend="nccl", rank=rank, world_size=world_size)
35
+
36
+ class AverageMeter(object):
37
+ """Computes and stores the average and current value"""
38
+ def __init__(self):
39
+ self.reset()
40
+
41
+ def reset(self):
42
+ self.val = 0.0
43
+ self.avg = 0.0
44
+ self.sum = 0.0
45
+ self.count = 0.0
46
+
47
+ def update(self, val, n=1):
48
+ self.val = val
49
+ self.sum += val * n
50
+ self.count += n
51
+ self.avg = self.sum / self.count
52
+
53
+ def accumulate(self, val, n=1):
54
+ self.val = val
55
+ self.sum += val
56
+ self.count += n
57
+ self.avg = self.sum / self.count
58
+
59
+ def save_checkpoint(state, is_best, checkpoint='checkpoint', filename='model_best.pth.tar', snapshot=None):
60
+ filepath = os.path.join(checkpoint, filename)
61
+ if is_best:
62
+ torch.save(state, filepath)
63
+
64
+ if snapshot and state['epoch'] % snapshot == 0:
65
+ torch.save(state, os.path.join(checkpoint, 'checkpoint_{}.pth.tar'.format(state['epoch'])))
66
+
67
+ def train_model(rank, world_size, config, args, shared_dict, port=12355):
68
+ ddp_setup(rank, world_size, port)
69
+ lowest_loss = 1e20
70
+ model_config = config['model']
71
+ model = EncoderDecoder(device=f'cuda:{rank}', dtype=torch.float32, **model_config)
72
+ model.to(f'cuda:{rank}')
73
+
74
+ if rank == 0:
75
+ print('only_embed', model.only_embed)
76
+ print('return_latents', model.return_latents)
77
+ print(model)
78
+ if not args.finetune:
79
+ model.encoder.requires_grad_(False)
80
+ model = DDP(model, device_ids=[rank])
81
+ optimizer_config = config['optimizer']
82
+ if args.finetune:
83
+ optimizer = torch.optim.Adam(model.module.parameters(), **optimizer_config)
84
+ else:
85
+ if args.encoder == 'miche':
86
+ optimizer = torch.optim.Adam(model.module.decoder.parameters(), **optimizer_config)
87
+ elif args.encoder == 'bert':
88
+ optimizer = torch.optim.Adam(list(model.module.decoder.parameters()) + list(model.module.point_proj.parameters()), **optimizer_config)
89
+ # optionally resume from a checkpoint
90
+ if args.resume:
91
+ try:
92
+ print("=> loading checkpoint '{}'".format(args.resume))
93
+ checkpoint = torch.load(args.resume)
94
+ args.start_epoch = checkpoint['epoch']
95
+ lowest_loss = checkpoint['lowest_loss']
96
+ model.module.load_state_dict(checkpoint['state_dict'], strict=True)
97
+
98
+ print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
99
+ except:
100
+ print("=> no checkpoint found at '{}'".format(args.resume))
101
+
102
+ cudnn.benchmark = True
103
+ print(' Total params: %.2fM' % (sum(p.numel() for p in optimizer.param_groups[0]['params']) / 1000000.0))
104
+ my_collate_func = my_collate_diff if args.mode == 'diffusion' else my_collate
105
+ if world_size > 1:
106
+ if not args.split:
107
+ train_dataset = shared_dict['train_dataset']
108
+ train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
109
+ train_loader = DataLoader(train_dataset, batch_size=args.train_batch, sampler=train_sampler, collate_fn= my_collate_func)
110
+ else:
111
+ train_dataset = AnymateDataset(name=args.trainset + f'_{rank}', root=args.root) #should changed to dpp version
112
+ train_loader = DataLoader(train_dataset, batch_size=args.train_batch, shuffle=True, collate_fn= my_collate_func)
113
+ else:
114
+ train_dataset = AnymateDataset(name=args.trainset, root=args.root)
115
+ train_loader = DataLoader(train_dataset, batch_size=args.train_batch, shuffle=True, collate_fn= my_collate_func)
116
+
117
+ if rank == 0:
118
+ test_loader = DataLoader(AnymateDataset(name=args.testset, root=args.root), batch_size=args.test_batch, shuffle=False, collate_fn= my_collate_func )
119
+
120
+ if not args.schedule:
121
+ args.schedule = [args.epochs//2]
122
+ scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.schedule, gamma=args.gamma)
123
+ # step the scheduler to the start epoch
124
+ for _ in range(args.start_epoch):
125
+ scheduler.step()
126
+ if rank == 0:
127
+ logger = SummaryWriter(log_dir=args.logdir)
128
+ print('start ')
129
+ print('test_frequency', args.test_freq)
130
+ print('start from epoch', args.start_epoch)
131
+ # start training
132
+ for epoch in range(args.start_epoch, args.epochs):
133
+ test_dict = None
134
+ is_best = False
135
+ lr = scheduler.get_last_lr()
136
+ if rank == 0:
137
+ print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr[0]))
138
+ train_loss, grad_norm = train(train_loader, model, optimizer, args)
139
+ if rank == 0 and (epoch == 0 or (epoch+1)%args.test_freq== 0):
140
+ print('Testing epoch', epoch+1)
141
+ test_dict = test(test_loader, model, args, world_size=world_size)
142
+
143
+
144
+ scheduler.step()
145
+ if rank == 0:
146
+ print('Epoch{:d}. train_loss: {:.6f}.'.format(epoch + 1, train_loss))
147
+ print('Epoch{:d}. grad_norm: {:.6f}.'.format(epoch + 1, grad_norm))
148
+ info = {'train_loss': train_loss, 'grad_norm': grad_norm, 'lr': lr[0]}
149
+ # print('Epoch{:d}. val_loss: {:.6f}.'.format(epoch + 1, val_loss))
150
+ if test_dict is not None:
151
+ for key, value in test_dict.items():
152
+ print('Epoch{:d}. {:s}: {:.6f}.'.format(epoch + 1, key, value))
153
+
154
+ test_loss = test_dict['test loss'] if not args.mode == 'diffusion' else test_dict['chamfer']
155
+ is_best = test_loss < lowest_loss
156
+ lowest_loss = min(test_loss, lowest_loss)
157
+ for key, value in test_dict.items():
158
+ info[key] = value
159
+
160
+ for tag, value in info.items():
161
+ logger.add_scalar(tag, value, epoch+1)
162
+ save_dict = {'epoch': epoch + 1, 'state_dict': model.module.state_dict(), 'lowest_loss': lowest_loss, 'optimizer': optimizer.state_dict(), 'model_config': model_config}
163
+ save_checkpoint(save_dict, is_best=is_best, checkpoint=args.checkpoint, snapshot=args.epochs//20)
164
+
165
+ def get_criterion(args):
166
+ if args.loss == 'cos':
167
+ criterion = cos_loss
168
+ elif args.loss == 'ce':
169
+ criterion = cross_entropy_with_probs_batch
170
+ elif args.loss == 'cos_clamp':
171
+ criterion = cos_loss_clamp
172
+ else:
173
+ criterion = chamfer_distance_with_average
174
+ return criterion
175
+
176
+ def get_train_loss(model, data, args):
177
+ criterion = get_criterion(args)
178
+ loss = 0.0
179
+ if args.mode == 'skin':
180
+ y_pred, idx = model(data, downsample=1024)
181
+ y_pred = torch.softmax(y_pred, dim=-1)
182
+ y = data['skins'].to(args.device)
183
+ y = y[:, idx]
184
+ loss = criterion(y_pred, y)
185
+
186
+ elif args.mode == 'conn':
187
+ y_pred = model(data, args.device)
188
+ y_pred = torch.softmax(y_pred, dim=-1)
189
+ y = data['conns'].to(args.device)
190
+ y = y[:, :y_pred.shape[1], :y_pred.shape[1]].float()
191
+ loss = criterion(y_pred, y)
192
+
193
+ elif args.mode == 'joints': # joints mode
194
+ if args.decoder == 'transformer_latent':
195
+ y_pred = model(data, args.device)
196
+ joints_gt = data['joints'].to(args.device)
197
+ loss = 0.0
198
+ for i in range(joints_gt.shape[0]):
199
+ joints_gt_i = joints_gt[i,:data['joints_num'][i], :3]
200
+ loss += criterion(y_pred[i:i+1], joints_gt_i.unsqueeze(0))
201
+ loss /= joints_gt.shape[0]
202
+
203
+ elif args.decoder == 'triplane' or args.decoder == 'implicit_transformer':
204
+ criterion = torch.nn.BCEWithLogitsLoss()
205
+ y_pred = model(data, args.device, downsample=True)
206
+ joints_gt = data['joints'].to(args.device)
207
+ for i in range(joints_gt.shape[0]):
208
+ joints_gt_i = joints_gt[i,:data['joints_num'][i], :3]
209
+ vol = get_co(data['vox'][i])
210
+ if data['vox'][i].shape[0] > 50000:
211
+ vol = vol[y_pred[i][1]]
212
+ gt = get_gt(vol.to(args.device), joints_gt_i)
213
+ loss += criterion(y_pred[i][0].squeeze(-1).unsqueeze(0), gt.unsqueeze(0))
214
+ else:
215
+ gt = get_gt(vol.to(args.device), joints_gt_i)
216
+ loss += criterion(y_pred[i].squeeze(-1).unsqueeze(0), gt.unsqueeze(0))
217
+ loss /= joints_gt.shape[0]
218
+
219
+ elif args.mode == 'diffusion':
220
+ noise_scheduler = DDIMScheduler(num_train_timesteps=args.num_train_step)
221
+
222
+ samples = data['joints_repeat'].to(model.device).float()
223
+ #use 256 input joints
224
+ samples = samples[...,:args.num_training_points,:]
225
+
226
+ samples = samples.to(model.device)
227
+ noise = torch.randn(samples.shape, device=samples.device)
228
+ assert samples.device == noise.device
229
+ bs = samples.shape[0]
230
+
231
+ # Sample a random timestep for each image
232
+ timesteps = torch.randint(
233
+ 0, noise_scheduler.config.num_train_timesteps, (bs,), device=samples.device,
234
+ dtype=torch.int64
235
+ )
236
+
237
+ noisy_joints = noise_scheduler.add_noise(samples, noise, timesteps)
238
+ noisy_joints = noisy_joints.to(model.device)
239
+ noisy_joints = noisy_joints.permute(0, 2, 1)
240
+
241
+ noise_pred = model(data, noisy_joints=noisy_joints, timesteps = timesteps)
242
+ noise_pred = noise_pred.permute(0, 2, 1)
243
+ loss = F.mse_loss(noise_pred, noise)
244
+
245
+ return loss
246
+
247
+ def train(train_loader, model, optimizer, args):
248
+ if not args.finetune:
249
+ model.train()
250
+ model.module.encoder.eval()
251
+ else:
252
+ model.train()
253
+ loss_meter = AverageMeter()
254
+ grad_norm_meter = AverageMeter()
255
+
256
+ for data in tqdm(train_loader):
257
+ loss = get_train_loss(model, data, args)
258
+ optimizer.zero_grad()
259
+ loss.backward()
260
+ grad_norm = 0
261
+
262
+ for p in optimizer.param_groups[0]['params']:
263
+ grad_norm += p.grad.data.norm(2).item()
264
+ grad_norm_meter.update(grad_norm)
265
+ optimizer.step()
266
+ loss_meter.update(loss.item())
267
+
268
+ return loss_meter.avg, grad_norm_meter.avg
269
+
270
+ def test(test_loader, model, args, world_size=1):
271
+ model.eval()
272
+ assert args.mode in ['skin', 'joints', 'conn', 'diffusion'], 'mode should be choose from [skin, joints, conn, diffusion], got {}'.format(args.mode)
273
+
274
+ if args.mode == 'skin' or args.mode == 'conn':
275
+ loss_meter = AverageMeter()
276
+ cos_sim_meter = AverageMeter()
277
+ cos_clamp_meter = AverageMeter()
278
+ for i, data in enumerate(tqdm(test_loader)):
279
+ if world_size > 1 and i > 1000:
280
+ break
281
+ with torch.no_grad():
282
+ y_pred = model(data, args.device)
283
+ y_pred = torch.softmax(y_pred, dim=-1)
284
+
285
+ if args.mode == 'skin':
286
+ y = data['skins'].to(args.device)
287
+ elif args.mode == 'conn':
288
+ y = data['conns'].to(args.device)
289
+ y = y[:, :y_pred.shape[1], :y_pred.shape[1]].float()
290
+
291
+ loss = 0.0
292
+ loss = cross_entropy_with_probs_batch(y_pred, y)
293
+ loss_meter.update(loss.item())
294
+ cos_sim = cos_loss(y_pred, y)
295
+ cos_sim_meter.update(cos_sim.mean().item()) # 1 - loss.item()
296
+ cos_clamp = cos_loss_clamp(y_pred, y)
297
+ cos_clamp_meter.update(cos_clamp.mean().item())
298
+
299
+ loss_dict = {'test loss': loss_meter.avg, 'cos_sim': cos_sim_meter.avg, 'cos_clamp': cos_clamp_meter.avg}
300
+ # get the loss of the joints prediction
301
+ elif args.mode == 'joints':
302
+ if args.decoder == 'transformer_latent':
303
+ loss_meter = AverageMeter()
304
+ emd_meter = AverageMeter()
305
+ for i, data in tqdm(enumerate(test_loader)):
306
+ if world_size > 1 and i > 1000:
307
+ break
308
+ with torch.no_grad():
309
+ y_pred = model(data, args.device)
310
+ joints_gt = data['joints'].to(args.device)
311
+
312
+ loss = 0.0
313
+ emd = 0.0
314
+ for i in range(joints_gt.shape[0]):
315
+ joints_gt_i = joints_gt[i,:data['joints_num'][i], :3]
316
+ y_pred_i = y_pred[i]
317
+
318
+ y_pred_i = y_pred[i].detach().cpu().numpy()
319
+ clustering = DBSCAN(eps=0.03, min_samples=1).fit(y_pred_i) # Consider add eps and min_samples as arguments
320
+ cluster_centers = []
321
+ for cluster in set(clustering.labels_):
322
+ cluster_centers.append(y_pred_i[clustering.labels_ == cluster].mean(axis=0))
323
+ y_pred_i = torch.from_numpy(np.array(cluster_centers)).to(args.device)
324
+
325
+ if y_pred_i.shape[0] < 2:
326
+ print(data['name'][i] + ' has less than 2 points')
327
+ continue
328
+ loss += chamfer_distance_with_average(y_pred_i.unsqueeze(0), joints_gt_i.unsqueeze(0))
329
+ emd_i, pi = pcu.earth_movers_distance(y_pred_i.cpu().numpy().astype(np.float64), joints_gt_i.cpu().numpy().astype(np.float64))
330
+ emd += emd_i
331
+ if loss == 0 or emd == 0:
332
+ continue
333
+ loss /= joints_gt.shape[0]
334
+ loss_meter.update(loss.item())
335
+ emd_meter.update(emd)
336
+ loss_dict = {'test loss': loss_meter.avg, 'emd': emd_meter.avg}
337
+
338
+ elif args.decoder == 'triplane' or args.decoder == 'implicit_transformer':
339
+ loss_meter = AverageMeter()
340
+ emd_meter = AverageMeter()
341
+ chamfer_meter = AverageMeter()
342
+ criterion = torch.nn.BCEWithLogitsLoss()
343
+ for data in tqdm(test_loader):
344
+ with torch.no_grad():
345
+ y_pred = model(data, args.device)
346
+ joints_gt = data['joints'].to(args.device)
347
+ loss = 0.0
348
+ emd = 0.0
349
+ chamfer = 0.0
350
+ for i in range(joints_gt.shape[0]):
351
+ joints_gt_i = joints_gt[i,:data['joints_num'][i], :3]
352
+ vol = get_co(data['vox'][i])
353
+ gt = get_gt(vol.to(args.device), joints_gt_i)
354
+ loss += criterion(y_pred[i].squeeze(-1).unsqueeze(0), gt.unsqueeze(0))
355
+ key_points = extract_keypoints(y_pred[i].cpu(), data['vox'][i])
356
+ if len(key_points) < 2:
357
+ continue
358
+ key_points = key_points / 32 - 1
359
+ chamfer += chamfer_distance_with_average(torch.from_numpy(key_points).unsqueeze(0).to(joints_gt_i.device), joints_gt_i.unsqueeze(0))
360
+ emd_i, _ = pcu.earth_movers_distance(key_points.astype(np.float64), joints_gt_i.cpu().numpy().astype(np.float64))
361
+ emd += emd_i
362
+ if loss == 0 or emd == 0 or chamfer == 0:
363
+ continue
364
+ loss /= joints_gt.shape[0]
365
+ loss_meter.update(loss.item())
366
+ emd_meter.update(emd)
367
+ chamfer_meter.update(chamfer.item())
368
+ loss_dict = {'test loss': loss_meter.avg, 'emd': emd_meter.avg, 'chamfer': chamfer_meter.avg}
369
+
370
+ elif args.mode == 'diffusion':
371
+ loss_meter = AverageMeter()
372
+ emd_meter = AverageMeter()
373
+ chamfer_meter = AverageMeter()
374
+ generator=torch.Generator(device='cpu').manual_seed(args.seed+1)
375
+ scheduler = DDIMScheduler(num_train_timesteps=args.num_train_step)
376
+ scheduler.set_timesteps(args.num_train_step)
377
+ points_shape = [args.test_batch, args.num_training_points, 3]
378
+ for data in tqdm(test_loader):
379
+ joints_gt = data['joints'].to(dtype=torch.float64)
380
+ points_noise = randn_tensor(points_shape, generator=generator)
381
+ points = points_noise.permute(0, 2, 1).to(model.device)
382
+ for t in scheduler.timesteps:
383
+ with torch.no_grad():
384
+ time_steps = torch.ones(args.test_batch, 1, dtype=torch.long) * t
385
+ time_steps = time_steps.to(model.device)
386
+ model_output = model(data, noisy_joints=points, timesteps = time_steps)
387
+
388
+ points = scheduler.step(model_output, t, points, generator=generator).prev_sample
389
+ points = points.permute(0, 2, 1).cpu()
390
+
391
+ chamfer_sum = 0.0
392
+ emd_sum = 0.0
393
+
394
+ for i in range(args.test_batch):
395
+ joints_gt_i = joints_gt[i,:data['joints_num'][i], :3]
396
+ points_i = points[i]
397
+ points_i = points_i.reshape( -1, 3)
398
+ emd, p = pcu.earth_movers_distance(points_i.cpu().numpy(),joints_gt_i[:,:3].cpu().numpy())
399
+ emd_sum += emd
400
+ chamfer_sum += chamfer_distance_with_average(points_i.unsqueeze(0),joints_gt_i[:,:3].unsqueeze(0))
401
+
402
+ emd_meter.update(emd_sum)
403
+ chamfer_meter.update(chamfer_sum.item())
404
+ loss_dict = {'chamfer': chamfer_meter.avg, 'emd': emd_meter.avg}
405
+
406
+ return loss_dict
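(train_model() above is written to be spawned once per GPU: it calls ddp_setup(rank, world_size, port), wraps the model in DDP, and optionally shares a preloaded dataset through shared_dict. A minimal launcher sketch follows; the config path is a placeholder and `args` stands for the project's training argument namespace, which must carry every field train_model() reads, e.g. mode, trainset, root, epochs.)

# Hypothetical multi-GPU launcher; config path and `args` are placeholders.
import yaml
import torch
import torch.multiprocessing as mp
from Anymate.utils.train_utils import train_model

def launch(args, config_path='path/to/config.yaml'):
    with open(config_path) as f:
        config = yaml.safe_load(f)  # must contain the 'model' and 'optimizer' sections read above
    world_size = torch.cuda.device_count()
    shared_dict = {}                # shared dataset cache, used when args.split is False
    mp.spawn(train_model, args=(world_size, config, args, shared_dict), nprocs=world_size)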
Anymate/utils/ui_utils.py ADDED
@@ -0,0 +1,284 @@
1
+ import trimesh
2
+ import numpy as np
3
+ import torch
4
+ import os
5
+ import matplotlib.pyplot as plt
6
+ import gradio as gr
7
+ import time
8
+ bone_colors = plt.get_cmap('tab10')
9
+
10
+ from Anymate.utils.utils import load_checkpoint, get_joint, get_connectivity, get_skinning
11
+ from Anymate.utils.dataset_utils import obj2mesh
12
+ from Anymate.args import anymate_args
13
+ # from Anymate.utils.render_utils import empty, add_co, add_mesh, add_joint, add_conn, add_skin, setup_armature
14
+
15
+ def visualize_results(mesh_file=None, joints=None, conns=None, skins=None):
16
+ # Create a scene with both original and processed meshes
17
+ scene = trimesh.Scene()
18
+ vis_file = mesh_file.replace('object.obj', 'vis.glb')
19
+
20
+ if mesh_file is not None:
21
+ # Load the original mesh (in blue) with transparency
22
+ # original_mesh = trimesh.load(mesh_file)
23
+ original_mesh = obj2mesh(mesh_file)
24
+ if skins is not None:
25
+ # pdb.set_trace()
26
+ # Get per-vertex colors based on skinning weights
27
+ vertex_colors = np.zeros((len(original_mesh.vertices), 4))
28
+
29
+ # Convert skinning weights to numpy if needed
30
+ if isinstance(skins, torch.Tensor):
31
+ skins = skins.cpu().numpy()
32
+
33
+ # For each bone, blend colors based on skinning weights
34
+ for bone_idx in range(skins.shape[1]):
35
+ bone_color = np.array(bone_colors(bone_idx % 10)) # Get base color for this bone
36
+ weights = skins[:, bone_idx]
37
+ vertex_colors += np.outer(weights, bone_color) # Blend weighted colors
38
+
39
+ # Normalize and clip colors
40
+ vertex_colors = np.clip(vertex_colors, 0, 1)
41
+
42
+ # Convert to vertex colors and set alpha
43
+ vertex_colors = (vertex_colors * 255).astype(np.uint8)
44
+ vertex_colors[:, 3] = 255 # Set alpha to 255 (fully opaque)
45
+ # print(vertex_colors.shape)
46
+ # print(vertex_colors.max(axis=0), vertex_colors.min(axis=0), vertex_colors.mean(axis=0))
47
+
48
+ # Apply colors directly to vertices
49
+ original_mesh.visual.vertex_colors = vertex_colors
50
+
51
+ # face_colors = np.zeros((len(original_mesh.faces), 4))
52
+
53
+ # processed_mesh = trimesh.load(mesh_file)
54
+ processed_mesh = obj2mesh(mesh_file)
55
+ # Assign vertex colors from original_mesh to processed_mesh
56
+ # Since they might have different number of vertices, we need to find closest vertices
57
+
58
+ # Get vertices from both meshes
59
+ orig_vertices = original_mesh.vertices
60
+ proc_vertices = processed_mesh.vertices
61
+
62
+ # For each vertex in processed_mesh, find the closest vertex in original_mesh
63
+ closest_indices = []
64
+ for proc_vertex in proc_vertices:
65
+ # Calculate distances to all original vertices
66
+ distances = np.linalg.norm(orig_vertices - proc_vertex, axis=1)
67
+ # Find index of closest vertex
68
+ closest_idx = np.argmin(distances)
69
+ closest_indices.append(closest_idx)
70
+
71
+ proc_vertex_colors = original_mesh.visual.vertex_colors[closest_indices]
72
+ processed_mesh.visual.vertex_colors = proc_vertex_colors
73
+ original_mesh = processed_mesh
74
+
75
+ else:
76
+ original_mesh.visual.face_colors = [255, 255, 255, 100] # White with alpha=100 for transparency
77
+ scene.add_geometry(original_mesh)
78
+
79
+ if joints is not None:
80
+ # create a sphere for each joint
81
+ for position in joints:
82
+ sphere = trimesh.primitives.Sphere(radius=0.02)
83
+ sphere.visual.face_colors = [255, 0, 0, 255] # Red, fully opaque
84
+ sphere.apply_translation(position.cpu().numpy())
85
+ scene.add_geometry(sphere)
86
+
87
+ if conns is not None:
88
+ # create a line for each connectivity
89
+ for i, conn in enumerate(conns):
90
+ if i == conn:
91
+ continue
92
+ # Create cylinder between joints
93
+ points = [joints[i].cpu().numpy(), joints[conn].cpu().numpy()]
94
+ direction = points[1] - points[0]
95
+ height = np.linalg.norm(direction)
96
+ cylinder = trimesh.primitives.Cylinder(radius=0.01, height=height)
97
+
98
+ # Calculate rotation matrix to align cylinder with direction
99
+ direction = direction / height # Normalize direction vector
100
+ up_vector = np.array([0, 0, 1])
101
+ rotation_matrix = trimesh.geometry.align_vectors(up_vector, direction)
102
+
103
+ # Apply rotation and translation to cylinder
104
+ cylinder.apply_transform(rotation_matrix)
105
+ cylinder.apply_translation(points[0] + direction * height/2)
106
+
107
+ cylinder.visual.face_colors = [0, 0, 255, 255] # Blue
108
+ scene.add_geometry(cylinder)
109
+
110
+ # Export the scene
111
+ scene.export(vis_file)
112
+ return vis_file
113
+
114
+
115
+ def process_mesh_to_pc(obj_path, sample_num = 8192, save_path = None):
116
+ # mesh_list : list of trimesh
117
+ try :
118
+ mesh = trimesh.load_mesh(obj_path)
119
+
120
+ points, face_idx = mesh.sample(sample_num, return_index=True)
121
+ normals = mesh.face_normals[face_idx]
122
+
123
+ pc_normal = np.concatenate([points, normals], axis=-1, dtype=np.float16)
124
+
125
+
126
+ if save_path is not None:
127
+ np.save(save_path, pc_normal)
128
+
129
+ return pc_normal
130
+ except Exception as e:
131
+ print(f"Error: {obj_path} {e}")
132
+ return None
133
+
134
+
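(process_mesh_to_pc() above returns an (N, 6) float16 array of surface samples with normals, which is what the downstream models consume. A one-line usage sketch, with a placeholder path:)

# Hypothetical call; the path is a placeholder.
pc_normal = process_mesh_to_pc('path/to/object.obj', sample_num=8192)  # shape (8192, 6): xyz + normal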
135
+ def normalize_mesh(mesh):
136
+ # Check if input is a scene with multiple meshes
137
+ if isinstance(mesh, trimesh.Scene):
138
+ # Combine all meshes in the scene into a single mesh
139
+ meshes = []
140
+ for geometry in mesh.geometry.values():
141
+ if isinstance(geometry, trimesh.Trimesh):
142
+ # Transform mesh to scene coordinates
143
+ transform = mesh.graph[mesh.graph.nodes_geometry[0]][0]
144
+ geometry.apply_transform(transform)
145
+ meshes.append(geometry)
146
+
147
+ # Combine all meshes
148
+ mesh = trimesh.util.concatenate(meshes)
149
+
150
+ # Get vertices and compute bounding box
151
+ vertices = mesh.vertices
152
+ bbox_min = vertices.min(axis=0)
153
+ bbox_max = vertices.max(axis=0)
154
+
155
+ # Find center and scale
156
+ center = (bbox_min + bbox_max) * 0.5
157
+ scale = 2.0 / (bbox_max - bbox_min).max()
158
+
159
+ # Center and scale vertices
160
+ vertices = (vertices - center) * scale
161
+
162
+ # Create new mesh with normalized vertices
163
+ normalized_mesh = trimesh.Trimesh(vertices=vertices,
164
+ faces=mesh.faces,
165
+ face_normals=mesh.face_normals,
166
+ vertex_normals=mesh.vertex_normals,
167
+ process=False)
168
+
169
+ # # Copy texture from original mesh if it exists
170
+ # if hasattr(mesh, 'visual') and hasattr(mesh.visual, 'material'):
171
+ # print("copy material")
172
+ # normalized_mesh.visual.material = mesh.visual.material
173
+ # if hasattr(mesh, 'visual') and hasattr(mesh.visual, 'texture'):
174
+ # print("copy texture")
175
+ # normalized_mesh.visual.texture = mesh.visual.texture
176
+ # if hasattr(mesh, 'visual') and hasattr(mesh.visual, 'uv'):
177
+ # print("copy uv")
178
+ # normalized_mesh.visual.uv = mesh.visual.uv
179
+
180
+ return normalized_mesh
181
+
182
+
183
+ def vis_joint(normalized_mesh_file, joints):
184
+ if normalized_mesh_file is None or joints is None:
185
+ return None, None
186
+ vis_file = visualize_results(mesh_file=normalized_mesh_file, joints=joints)
187
+ return vis_file, vis_file
188
+
189
+ def vis_connectivity(normalized_mesh_file, joints, conns):
190
+ if normalized_mesh_file is None or joints is None or conns is None:
191
+ return None, None
192
+ vis_file = visualize_results(mesh_file=normalized_mesh_file, joints=joints, conns=conns)
193
+ return vis_file, vis_file
194
+
195
+ def vis_skinning(normalized_mesh_file, joints, conns, skins):
196
+ if normalized_mesh_file is None or joints is None or conns is None or skins is None:
197
+ return None, None
198
+     vis_file = visualize_results(mesh_file=normalized_mesh_file, joints=joints, conns=conns, skins=skins)
+     return vis_file, vis_file
+
+ def prepare_blender_file(normalized_mesh_file):
+     if normalized_mesh_file is None:
+         return None
+
+     if not os.path.exists(normalized_mesh_file) or not os.path.exists(normalized_mesh_file.replace('object.obj', 'joints.pt')) or not os.path.exists(normalized_mesh_file.replace('object.obj', 'conns.pt')) or not os.path.exists(normalized_mesh_file.replace('object.obj', 'skins.pt')):
+         return None
+
+     folder = normalized_mesh_file.replace('object.obj', '')
+     abs_folder = os.path.abspath(folder)
+     os.system(f"python Render.py --path {abs_folder}")
+
+     blender_file = os.path.join(folder, 'blender_output.blend')
+     # Render.py runs in a separate process, so wait until the .blend file appears.
+     while not os.path.exists(blender_file):
+         time.sleep(1)
+
+     return blender_file
+
+
+ def process_input(mesh_file):
+     """
+     Handle a new input mesh and initialize the visualization.
+
+     Args:
+         mesh_file: Path to the input mesh file.
+
+     Returns:
+         A tuple (normalized_mesh_file, vis_file, vis_file, None, pc, None, None, None),
+         where pc is the sampled point cloud consumed by the joint, connectivity, and skinning models.
+     """
+
+     # For now just visualize the input mesh
+     if mesh_file is None:
+         return None, None, None, None, None, None, None, None
+
+     # make folder for tmp files
+     os.makedirs(f"Anymate/tmp/{mesh_file.split('/')[-1].replace('.obj', '')}", exist_ok=True)
+
+     normalized_mesh = normalize_mesh(obj2mesh(mesh_file))
+     normalized_mesh_file = f"Anymate/tmp/{mesh_file.split('/')[-1].replace('.obj', '')}/object.obj"
+     normalized_mesh.export(normalized_mesh_file)
+
+     vis_file = visualize_results(mesh_file=normalized_mesh_file)
+     pc = process_mesh_to_pc(normalized_mesh_file)
+     pc = torch.from_numpy(pc).to(anymate_args.device).to(torch.float32)
+
+     return normalized_mesh_file, vis_file, vis_file, None, pc, None, None, None
+
+
+ def get_model(checkpoint):
+     model = load_checkpoint(checkpoint, anymate_args.device, anymate_args.num_joints)
+     return model, True
+
+ def get_result_joint(mesh_file, model, pc, eps=0.03, min_samples=1):
+     return get_joint(pc, model, device=anymate_args.device, save=mesh_file.replace('object.obj', 'joints.pt'), eps=eps, min_samples=min_samples)
+
+ def get_result_connectivity(mesh_file, model, pc, joints):
+     return get_connectivity(pc, joints, model, device=anymate_args.device, save=mesh_file.replace('object.obj', 'conns.pt'))
+
+ def get_result_skinning(mesh_file, model, pc, joints, conns):
+     mesh = obj2mesh(mesh_file)
+     vertices = torch.from_numpy(mesh.vertices).to(anymate_args.device).to(torch.float32)
+     vertex_normals = torch.from_numpy(mesh.vertex_normals).to(anymate_args.device).to(torch.float32)
+     vertices = torch.cat([vertices, vertex_normals], dim=-1)
+     return get_skinning(pc, joints, conns, model, vertices=vertices, device=anymate_args.device, save=mesh_file.replace('object.obj', 'skins.pt'))
+
+ def get_all_models(checkpoint_joint, checkpoint_conn, checkpoint_skin):
+     model_joint = load_checkpoint(checkpoint_joint, anymate_args.device, anymate_args.num_joints)
+     model_connectivity = load_checkpoint(checkpoint_conn, anymate_args.device, anymate_args.num_joints)
+     model_skin = load_checkpoint(checkpoint_skin, anymate_args.device, anymate_args.num_joints)
+     return model_joint, model_connectivity, model_skin, True, True, True
+
+ def get_all_results(mesh_file, model_joint, model_connectivity, model_skin, pc, eps=0.03, min_samples=1):
+     joints = get_result_joint(mesh_file, model_joint, pc, eps=eps, min_samples=min_samples)
+     conns = get_result_connectivity(mesh_file, model_connectivity, pc, joints)
+     skins = get_result_skinning(mesh_file, model_skin, pc, joints, conns)
+     return joints, conns, skins
+
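Taken together, the helpers above form the UI's full inference pipeline. The sketch below chains them end to end; it is a minimal illustration, assuming these helpers live in `Anymate/utils/ui_utils.py`, and every path in it (mesh and checkpoints) is a hypothetical placeholder rather than a real asset.

```python
# Minimal sketch of the UI pipeline (all paths below are hypothetical placeholders).
from Anymate.utils.ui_utils import (process_input, get_all_models,
                                    get_all_results, prepare_blender_file)

mesh_file = "examples/chair.obj"  # placeholder input mesh

# Normalize the mesh into Anymate/tmp/<name>/object.obj and sample its point cloud.
normalized_mesh_file, vis_file, _, _, pc, _, _, _ = process_input(mesh_file)

# Load the joint, connectivity, and skinning predictors from their checkpoints.
model_joint, model_conn, model_skin, *_ = get_all_models(
    "Anymate/checkpoints/joint.pth.tar",   # placeholder checkpoint paths
    "Anymate/checkpoints/conn.pth.tar",
    "Anymate/checkpoints/skin.pth.tar",
)

# Predict joints, parent connectivity, and skinning weights, then bake a .blend file.
joints, conns, skins = get_all_results(normalized_mesh_file, model_joint,
                                        model_conn, model_skin, pc)
blender_file = prepare_blender_file(normalized_mesh_file)
print("Rigged scene written to", blender_file)
```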
Anymate/utils/ui_utils_bpy.py ADDED
@@ -0,0 +1,134 @@
1
+ import trimesh
2
+ import numpy as np
3
+ import torch
4
+
5
+ from Anymate.utils.utils import load_checkpoint, get_joints, get_connectivity
6
+ from Anymate.args import anymate_args
7
+ from Anymate.utils.render_utils import empty, add_co, add_mesh, add_joints, add_conn, add_skin, setup_armature, save_scene
8
+
9
+ def visualize_results(mesh_file=None, joints=None, connectivity=None, skinning=None):
10
+
11
+ import bpy
12
+ # Create a scene with both original and processed meshes
13
+ vis_file = "Anymate/tmp/vis_scene.glb"
14
+
16
+ # empty()
17
+ bpy.ops.wm.read_homefile(use_empty=True)
18
+
19
+ if mesh_file is not None:
20
+ # add_mesh(mesh_file)
21
+ bpy.ops.wm.obj_import(filepath=mesh_file)
22
+
23
+ if joints is not None:
24
+ add_joints(joints)
25
+
26
+ if connectivity is not None:
27
+ add_conn(connectivity, joints)
28
+
29
+ if skinning is not None:
30
+ add_skin(mesh_file, skinning)
31
+
32
+ # setup_armature()
33
+ # save_scene(vis_file)
34
+ bpy.ops.wm.save_as_mainfile(filepath=vis_file)
35
+ return vis_file
36
+
37
+
38
+ def process_mesh_to_pc(obj_path, sample_num = 8192, save_path = None):
39
+ # mesh_list : list of trimesh
40
+ try :
41
+ mesh = trimesh.load_mesh(obj_path)
42
+
43
+ points, face_idx = mesh.sample(sample_num, return_index=True)
44
+ normals = mesh.face_normals[face_idx]
45
+
46
+ pc_normal = np.concatenate([points, normals], axis=-1, dtype=np.float16)
47
+
48
+
49
+ if save_path is not None:
50
+ np.save(save_path, pc_normal)
51
+
52
+ return pc_normal
53
+ except Exception as e:
54
+ print(f"Error: {obj_path} {e}")
55
+ return None
56
+
57
+
58
+ def normalize_mesh(mesh):
59
+ # Get vertices and compute bounding box
60
+ vertices = mesh.vertices
61
+ bbox_min = vertices.min(axis=0)
62
+ bbox_max = vertices.max(axis=0)
63
+
64
+ # Find center and scale
65
+ center = (bbox_min + bbox_max) * 0.5
66
+ scale = 2.0 / (bbox_max - bbox_min).max()
67
+
68
+ # Center and scale vertices
69
+ vertices = (vertices - center) * scale
70
+
71
+ # Create new mesh with normalized vertices
72
+ normalized_mesh = trimesh.Trimesh(vertices=vertices,
73
+ faces=mesh.faces,
74
+ face_normals=mesh.face_normals,
75
+ vertex_normals=mesh.vertex_normals)
76
+
77
+ return normalized_mesh
78
+
79
+
80
+ def vis_joint(normalized_mesh_file, joints):
81
+ vis_file = visualize_results(mesh_file=normalized_mesh_file, joints=joints)
82
+ return vis_file
83
+
84
+ def vis_connectivity(normalized_mesh_file, joints, connectivity):
85
+ vis_file = visualize_results(mesh_file=normalized_mesh_file, joints=joints, connectivity=connectivity)
86
+ return vis_file
87
+
88
+ def vis_skinning(skinning):
89
+ vis_file = visualize_results(skinning=skinning)
90
+ return vis_file
91
+
92
+
93
+ def process_input(mesh_file):
94
+ """
95
+ Function to handle input changes and initialize visualization
96
+
97
+ Args:
98
+ mesh_file: Path to input mesh file
99
+ joint_checkpoint: Path to joint prediction checkpoint
100
+ conn_checkpoint: Path to connectivity prediction checkpoint
101
+ skin_checkpoint: Path to skinning prediction checkpoint
102
+
103
+ Returns:
104
+ vis_file: Path to visualization file
105
+ """
106
+
107
+ # For now just visualize the input mesh
108
+
109
+ normalized_mesh = normalize_mesh(trimesh.load(mesh_file))
110
+ normalized_mesh_file = "Anymate/tmp/normalized_mesh.obj"
111
+ normalized_mesh.export(normalized_mesh_file)
112
+ vis_file = visualize_results(mesh_file=normalized_mesh_file)
113
+ pc = process_mesh_to_pc(normalized_mesh_file)
114
+ pc = torch.from_numpy(pc).to(anymate_args.device).to(torch.float32)
115
+
116
+ print(pc.shape, pc.max(dim=0), pc.min(dim=0))
117
+
118
+ return normalized_mesh_file, vis_file, pc, None, None, None
119
+
120
+
121
+ def get_model(checkpoint):
122
+ model = load_checkpoint(checkpoint, anymate_args.device, anymate_args.num_joints)
123
+ return model, True
124
+
125
+ def get_result_joint(model, pc):
126
+ return get_joints(pc, model, anymate_args.device)
127
+
128
+ def get_result_connectivity(model, pc, joints):
129
+ return get_connectivity(pc, joints, model, anymate_args.device)
130
+
131
+ def get_result_skinning(model, pc):
132
+ with torch.no_grad():
133
+ skinning = model(pc)
134
+ return skinning
Anymate/utils/utils.py ADDED
@@ -0,0 +1,77 @@
+ import torch
+ from Anymate.model import EncoderDecoder
+ from sklearn.cluster import DBSCAN
+
+ def load_checkpoint(path, device, num_joints):
+     print(f"Loading model from {path}")
+     model_state = torch.load(path)
+     model_weights = model_state['state_dict']
+
+     try:
+         model_config = model_state['model_config']
+         model = EncoderDecoder(device=device, dtype=torch.float32, **model_config)
+         model.to(device)
+         model.load_state_dict(model_weights, strict=True)
+     except:
+         # Older checkpoints do not store a model_config; fall back to parsing the
+         # "<encoder>-<decoder>-..." pattern from the checkpoint filename.
+         encoder = path.split('/')[-1].split('.')[0].split('-')[0]
+         decoder = path.split('/')[-1].split('.')[0].split('-')[1]
+         model = EncoderDecoder(encoder=encoder, decoder=decoder, device=device, dtype=torch.float32, num_joints=num_joints)
+         model.to(device)
+         model.load_state_dict(model_weights, strict=True)
+
+     print(f"Loaded model from {path}")
+
+     return model
+
+ def get_joint(pc, model, device='cuda', save=None, vox=None, eps=0.03, min_samples=1):
+     model.eval()
+     data = {'points_cloud': pc.unsqueeze(0)}
+     if vox is not None:
+         data['vox'] = vox.unsqueeze(0)
+     with torch.no_grad():
+         model.decoder.inference_mode(eps=eps, min_samples=min_samples)
+         joints = model(data, device=device)
+         joints = torch.tensor(joints, dtype=torch.float32).to(device)
+
+     if save is not None:
+         torch.save(joints, save)
+
+     return joints
+
+ def get_connectivity(pc, joints, model, device='cuda', return_prob=False, save=None):
+     model.eval()
+     data = {'points_cloud': pc.unsqueeze(0), 'joints': joints.unsqueeze(0), 'joints_num': torch.tensor([joints.shape[0]]),
+             'joints_mask': torch.ones(joints.shape[0], device=device).unsqueeze(0)}
+     with torch.no_grad():
+         conns = model(data, device=device).softmax(dim=-1)
+         conns = conns.squeeze(0) if return_prob else torch.argmax(conns, dim=-1).squeeze(0)
+
+     if save is not None:
+         torch.save(conns, save)
+
+     return conns
+
+ def get_skinning(pc, joints, conns, model, vertices=None, bones=None, device='cuda', save=None):
+     model.eval()
+
+     if bones is None:
+         # Build one (parent joint, child joint) bone per non-root joint; conns[i] == i marks the root.
+         bones = []
+         for i in range(joints.shape[0]):
+             if conns[i] != i:
+                 bones.append(torch.cat((joints[conns[i]], joints[i]), dim=-1))
+         bones = torch.stack(bones, dim=0)
+
+     data = {'points_cloud': pc.unsqueeze(0), 'bones': bones.unsqueeze(0), 'bones_num': torch.tensor([bones.shape[0]]),
+             'bones_mask': torch.ones(bones.shape[0], device=device).unsqueeze(0)}
+
+     if vertices is not None:
+         data['vertices'] = vertices.unsqueeze(0)
+         model.decoder.inference = True
+
+     with torch.no_grad():
+         skins = model(data, device=device).softmax(dim=-1).squeeze(0)
+
+     if save is not None:
+         torch.save(skins, save)
+
+     return skins
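The predictions returned by these helpers are easiest to read with their shapes in mind. The snippet below is an illustrative interpretation, not part of the module; the shapes are assumptions inferred from the functions above.

```python
import torch

# Assumed shapes (inferred from the functions above, not guaranteed by them):
#   joints: (J, 3)  joint positions in the normalized mesh frame
#   conns:  (J,)    index of each joint's parent; conns[i] == i marks the root
#   skins:  (V, B)  per-vertex weights over the B bones

def bones_from_connectivity(joints: torch.Tensor, conns: torch.Tensor) -> torch.Tensor:
    """Rebuild the (B, 6) bone list the same way get_skinning does internally."""
    bones = [torch.cat((joints[conns[i]], joints[i]), dim=-1)
             for i in range(joints.shape[0]) if conns[i] != i]
    return torch.stack(bones, dim=0)

# Because get_skinning applies a softmax over the last dimension, each row of
# `skins` should sum to 1, and skins.argmax(dim=-1) picks the dominant bone per vertex.
```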
Anymate/utils/vol_utils.py ADDED
@@ -0,0 +1,135 @@
+ import numpy as np
+ import torch
+ from ThirdParty.michelangelo.graphics.primitives import generate_dense_grid_points
+ from sklearn.cluster import DBSCAN
+
+ def get_vol(bounds=(-0.5, 0.0, -0.5, 0.5, 1.0, 0.5), octree_depth=6):
+
+     bbox_min = np.array(bounds[0:3])
+     bbox_max = np.array(bounds[3:6])
+     bbox_size = bbox_max - bbox_min
+
+     xyz_samples, grid_size, length = generate_dense_grid_points(
+         bbox_min=bbox_min,
+         bbox_max=bbox_max,
+         octree_depth=octree_depth,
+         indexing="ij"
+     )
+     xyz_samples = torch.FloatTensor(xyz_samples)  # ((2^d)+1)^3
+
+     return xyz_samples
+
+ def get_co(vox, bounds=(-1.0, -1.0, -1.0, 1.0, 1.0, 1.0), dtype=torch.float32):
+
+     bbox_min = torch.tensor(bounds[0:3]).to(vox.device)
+     bbox_max = torch.tensor(bounds[3:6]).to(vox.device)
+     bbox_size = bbox_max - bbox_min
+
+     # ind = torch.argwhere(vox)
+     # ind = ind.to(dtype) / (vox.shape[0]) * bbox_size + bbox_min
+     ind = vox
+     ind = ind.to(dtype) / 64 * bbox_size + bbox_min
+
+     return ind.to(dtype)
+
+ def get_gt(vol, joints, octree_depth=6):
+     sigma = 2 / 2**octree_depth
+
+     dist = torch.cdist(vol, joints)
+     dist = dist.min(dim=1).values
+
+     gt = torch.exp(-dist**2 / 2 / sigma**2)
+
+     return gt
+
+ def project_onto_planes(planes, coordinates):
+     """
+     Projects 3D points onto a batch of 2D planes, returning 2D plane coordinates.
+
+     Takes plane axes of shape (n_planes, 3, 3) and coordinates of shape (N, M, 3);
+     returns projections of shape (N*n_planes, M, 2).
+     """
+     N, M, C = coordinates.shape
+     n_planes, _, _ = planes.shape
+     coordinates = coordinates.unsqueeze(1).expand(-1, n_planes, -1, -1).reshape(N*n_planes, M, 3)
+     inv_planes = torch.linalg.inv(planes).unsqueeze(0).expand(N, -1, -1, -1).reshape(N*n_planes, 3, 3)
+     projections = torch.bmm(coordinates, inv_planes)
+     return projections[..., :2]
+
+ def sample_from_planes(plane_axes, plane_features, coordinates, mode='bilinear', padding_mode='zeros', box_warp=None):
+     assert padding_mode == 'zeros'
+     N, n_planes, C, H, W = plane_features.shape
+     _, M, _ = coordinates.shape
+     plane_features = plane_features.view(N*n_planes, C, H, W)
+
+     # coordinates = (2/box_warp) * coordinates # TODO: add specific box bounds
+
+     projected_coordinates = project_onto_planes(plane_axes, coordinates).unsqueeze(1)
+     output_features = torch.nn.functional.grid_sample(plane_features, projected_coordinates.float(), mode=mode, padding_mode=padding_mode, align_corners=False).permute(0, 3, 2, 1).reshape(N, n_planes, M, C)
+     return output_features
+
+ def generate_planes():
+     """
+     Defines planes by the three vectors that form the "axes" of the
+     plane. Should work with arbitrary number of planes and planes of
+     arbitrary orientation.
+     """
+     return torch.tensor([[[1, 0, 0],
+                           [0, 1, 0],
+                           [0, 0, 1]],
+                          [[1, 0, 0],
+                           [0, 0, 1],
+                           [0, 1, 0]],
+                          [[0, 0, 1],
+                           [1, 0, 0],
+                           [0, 1, 0]]], dtype=torch.float32)
+
+ def extract_keypoints(y_pred, vox):
+
+     y_pred = y_pred.detach().cpu().numpy()
+     vox = vox.detach().cpu().numpy()
+     volume = np.zeros([64, 64, 64])
+     volume[...] = -100
+     volume[vox[:, 0], vox[:, 1], vox[:, 2]] = y_pred.squeeze(-1)
+
+     clusters = []
+     cluster_model = DBSCAN(eps=1.8, min_samples=1)
+
+     level = min((0.85 * y_pred.max() + 0.15 * y_pred.min()).item(), 0)
+     potential_points = np.argwhere(volume >= level)
+     clustering = cluster_model.fit(potential_points)
+     for cluster in set(clustering.labels_):
+         if cluster == -1:
+             print('got noise', len(potential_points[clustering.labels_ == cluster]))
+             continue
+         clusters.append(potential_points[clustering.labels_ == cluster])
+
+     while True:
+         if np.all(np.array([(len(cluster) < 10) for cluster in clusters])):
+             break
+         new_clusters = []
+         for points in clusters:
+             if len(points) < 10:
+                 new_clusters.append(points)
+                 continue
+
+             value = volume[points[:, 0], points[:, 1], points[:, 2]]
+
+             potential_points = points[value >= (0.1 * value.max() + 0.9 * value.min())]
+             clustering = cluster_model.fit(potential_points)
+             for cluster in set(clustering.labels_):
+                 if cluster == -1:
+                     print('got noise', len(potential_points[clustering.labels_ == cluster]))
+                     continue
+                 new_clusters.append(potential_points[clustering.labels_ == cluster])
+
+         clusters = new_clusters
+
+     key_points = np.array([cluster.mean(axis=0) for cluster in clusters])
+     key_points = key_points / 32 - 1
+
+     return key_points
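The tri-plane helpers above are easiest to follow as a shape check. The sketch below uses random placeholder tensors only to trace the expected shapes through `generate_planes` and `sample_from_planes`.

```python
import torch
from Anymate.utils.vol_utils import generate_planes, sample_from_planes

N, n_planes, C, H, W, M = 2, 3, 32, 64, 64, 100

plane_axes = generate_planes()                     # (3, 3, 3) axis-aligned plane bases
plane_features = torch.rand(N, n_planes, C, H, W)  # placeholder per-plane feature maps
coordinates = torch.rand(N, M, 3) * 2 - 1          # query points in [-1, 1]^3

features = sample_from_planes(plane_axes, plane_features, coordinates)
print(features.shape)  # (N, n_planes, M, C) -> torch.Size([2, 3, 100, 32])
```

Separately, `get_gt` turns each grid sample's distance to its nearest joint into a Gaussian target, exp(-d^2 / (2*sigma^2)) with sigma = 2 / 2**octree_depth, i.e. roughly one voxel of the sampling grid.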
Render.py ADDED
@@ -0,0 +1,17 @@
+ import argparse
+ import bpy
+ import mathutils
+ from Anymate.utils.render_utils import empty, setup_armature
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Anymate rendering script')
+     parser.add_argument('--path', type=str, required=True, help='Path to the model file')
+     return parser.parse_args()
+
+ args = parse_args()
+
+ print(f"Starting to convert {args.path} to Blender format...")
+
+ empty()
+ setup_armature(args.path)
ThirdParty/PointLLM/.gitignore ADDED
@@ -0,0 +1,12 @@
+ __pycache__
+ *.egg-info
+ .vscode
+ checkpoints
+ outputs
+ wandb
+ anno_data
+ objaverse_data
+ modelnet40_data
+ *.zip
+ *.ipynb
+ serving_workdirs
ThirdParty/PointLLM/README.md ADDED
@@ -0,0 +1,353 @@
1
+ <br>
2
+ <p align="center">
3
+ <h1 align="center"><img src="assets/icon.png" align="center" width="6.5%"><strong>PointLLM: Empowering Large Language Models to Understand Point Clouds</strong></h1>
4
+ <p align="center">
5
+ <a href='https://runsenxu.com/' target='_blank'>Runsen Xu</a>&emsp;
6
+ <a href='https://guanfang12.github.io/' target='_blank'>Xiaolong Wang</a>&emsp;
7
+ <a href='https://tai-wang.github.io/' target='_blank'>Tai Wang</a>&emsp;
8
+ <a href='http://yilunchen.com/about' target='_blank'>Yilun Chen</a>&emsp;
9
+ <a href='https://oceanpang.github.io/' target='_blank'>Jiangmiao Pang*</a>&emsp;
10
+ <a href='http://dahua.site/' target='_blank'>Dahua Lin</a>&emsp;
11
+ <br>
12
+ The Chinese University of Hong Kong&emsp;Shanghai AI Laboratory&emsp;Zhejiang University
13
+ </p>
14
+ </p>
15
+
16
+ <p align="center">
17
+ <a href="http://arxiv.org/abs/2308.16911" target='_blank'>
18
+ <img src="https://img.shields.io/badge/arXiv-2308.16911-blue?">
19
+ </a>
20
+ <a href="https://arxiv.org/pdf/2308.16911.pdf" target='_blank'>
21
+ <img src="https://img.shields.io/badge/Paper-📖-blue?">
22
+ </a>
23
+ <a href="https://runsenxu.com/projects/PointLLM" target='_blank'>
24
+ <img src="https://img.shields.io/badge/Project-&#x1F680-blue">
25
+ </a>
26
+ <a href="http://101.230.144.196" target='_blank'>
27
+ <img src="https://img.shields.io/badge/Demo-&#x1f917-blue">
28
+ </a>
29
+ <a href="" target='_blank'>
30
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=OpenRobotLab.pointllm&left_color=gray&right_color=blue">
31
+ </a>
32
+ <a href="https://openxlab.org.cn/apps/detail/openxlab-app/PointLLM" target='_blank'>
33
+ <img src="https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg">
34
+ </a>
35
+ </p>
36
+
37
+ ## 🏠 About
38
+ <!-- ![Teaser](assets/teaser.jpg) -->
39
+ <div style="text-align: center;">
40
+ <img src="assets/teaser.jpg" alt="Dialogue_Teaser" width=100% >
41
+ </div>
42
+ We introduce <b>PointLLM, a multi-modal large language model capable of understanding colored point clouds of objects.</b> It perceives object types, geometric structures, and appearance without concerns for ambiguous depth, occlusion, or viewpoint dependency. <b>We collect a novel dataset comprising 660K simple and 70K complex point-text instruction pairs</b> to enable a two-stage training strategy. To rigorously evaluate our model's perceptual abilities and its generalization capabilities, <b>we establish two benchmarks: Generative 3D Object Classification and 3D Object Captioning, assessed through three different evaluation methods.</b>
43
+
44
+ ## 🔥 News
45
+ - [2024-09-06] We have uploaded the camera-ready version of PointLLM for ECCV 2024, which includes clearer writing and additional experimental results. Please check the paper [here](https://arxiv.org/abs/2308.16911).
46
+ - [2024-07-01] PointLLM has been accepted by ECCV 2024 with all "strong-accept" recommendations. 🎉 We are looking for self-motivated students to conduct research on PointLLM. Please send an email to [email protected] with your CV if you are interested!
47
+ - [2023-12-29] We release the codes of our online Gradio demo.
48
+ - [2023-12-26] We release the codes for model evaluation, including ChatGPT/GPT-4 evaluation and traditional metric evaluation.
49
+ - [2023-12-08] We release the codes for training and PointLLM-v1.2. The online demo has also been upgraded to the v1.2 version. Please enjoy! &#x1F389;
50
+ - [2023-12-01] We have released an updated version of our paper (v2), which includes additional baseline comparisons, enhanced human-evaluation metrics, improved model performance (PointLLM-v1.2), and other refinements. Please check the updated version [here](https://arxiv.org/abs/2308.16911).
51
+ - [2023-10-18] We release our instruction-following data, including both the simple-description and complex instructions. Download [here](https://huggingface.co/datasets/RunsenXu/PointLLM).
52
+ - [2023-09-26] We release the inferencing codes with checkpoints as well as the Objaverse colored point cloud files we use. You can chat with PointLLM with your own machines.
53
+ - [2023-08-31] We release the [paper](http://arxiv.org/abs/2308.16911) of PointLLM and an online gradio [demo](http://101.230.144.196). Try it! &#x1F389;
54
+
55
+ <!-- contents with emoji -->
56
+ ## 📋 Contents
57
+ - [🤖 Online Demo](#-online-demo)
58
+ - [💬 Dialogue Examples](#-dialogue-examples)
59
+ - [🔍 Overview](#-overview)
60
+ - [📦 Training and Evaluation](#-training-and-evaluation)
61
+ - [📝 TODO List](#-todo-list)
62
+ - [🔗 Citation](#-citation)
63
+ - [📄 License](#-license)
64
+ - [📚 Related Work](#-related-work)
65
+ - [👏 Acknowledgements](#-acknowledgements)
66
+
67
+ ## 🤖 Online Demo
68
+ <b>PointLLM is online! Try it at [http://101.230.144.196](http://101.230.144.196) or at [OpenXLab/PointLLM](https://openxlab.org.cn/apps/detail/openxlab-app/PointLLM).</b>
69
+
70
+ You can chat with PointLLM about the models of the [Objaverse](https://objaverse.allenai.org) dataset or about your own point clouds!
71
+
72
+ Please do not hesitate to tell us if you have any feedback! 😃
73
+
74
+ ## 💬 Dialogue Examples
75
+ | Dialogue 1 | Dialogue 2| Dialogue 3 | Dialogue 4
76
+ | :-: | :-: | :-: | :-: |
77
+ | <img width="100%" src="assets/dialogue_1.jpg"> | <img width="100%" src="assets/dialogue_2.jpg"> | <img width="100%" src="assets/dialogue_3.jpg"> | <img width="100%" src="assets/dialogue_4.jpg"> |
78
+
79
+
80
+ ## 🔍 Overview
81
+
82
+ ### Model
83
+ <p align="center">
84
+ <img src="assets/model.jpg" align="center" width="100%">
85
+ </p>
86
+ The point encoder extracts features from the input point cloud and projects them to the latent space of the LLM backbone. The LLM backbone processes sequences of point tokens and text tokens, and generates the predicted tokens as the output.
87
+
88
+ ### Experiment Results
89
+ #### Quantitative Comparisons with baselines.
90
+ Please refer to our paper for more results.
91
+ <p align="center">
92
+ <img src="assets/cls_results.png" align="center" width="100%">
93
+ </p>
94
+ <p align="center">
95
+ <img src="assets/caption_results.png" align="center" width="100%">
96
+ </p>
97
+ <b>!!!Note: Traditional metrics such as BLEU-1, ROUGE-L, and METEOR tend to favor shorter responses and may not effectively capture semantic accuracy. For a detailed discussion on this, please refer to our paper. We suggest the community not solely rely on these metrics for evaluation.</b>
98
+
99
+ #### Qualitative Comparisons with baselines.
100
+ Please refer to our paper for more results.
101
+ <p align="center">
102
+ <img src="assets/qualitative_comparisons_v2.png" align="center" width="100%">
103
+ </p>
104
+
105
+ ## 📦 Training and Evaluation
106
+ ### Installation
107
+ We test our codes under the following environment:
108
+ - Ubuntu 20.04
109
+ - NVIDIA Driver: 515.65.01
110
+ - CUDA 11.7
111
+ - Python 3.10.13
112
+ - PyTorch 2.0.1
113
+ - Transformers 4.28.0.dev(transformers.git@cae78c46)
114
+
115
+ To start:
116
+ 1. Clone this repository.
117
+ ```bash
118
+ git clone [email protected]:OpenRobotLab/PointLLM.git
119
+ cd PointLLM
120
+ ```
121
+ 2. Install packages
122
+ ```bash
123
+ conda create -n pointllm python=3.10 -y
124
+ conda activate pointllm
125
+ pip install --upgrade pip # enable PEP 660 support
126
+ pip install -e .
127
+
128
+ # * for training
129
+ pip install ninja
130
+ pip install flash-attn
131
+ ```
132
+
133
+ ### Data Preparation
134
+ #### Objaverse Training Data
135
+ 1. Download the two compressed files of 660K Objaverse colored point clouds [here](https://huggingface.co/datasets/RunsenXu/PointLLM/tree/main). They require about 77GB of storage space.
136
+ 2. Run the following command to merge the two files into one and uncompress it. This will produce a folder named `8192_npy` containing 660K point cloud files named `{Objaverse_ID}_8192.npy`. Each file is a numpy array with dimensions (8192, 6), where the first three dimensions are `xyz` and the last three dimensions are `rgb` in the [0, 1] range (see the loading sketch after this list).
137
+ ```bash
138
+ cat Objaverse_660K_8192_npy_split_a* > Objaverse_660K_8192_npy.tar.gz
139
+ tar -xvf Objaverse_660K_8192_npy.tar.gz
140
+ ```
141
+ 3. In `PointLLM` folder, create a folder `data` and create a soft link to the uncompressed file in the directory.
142
+ ```bash
143
+ cd PointLLM
144
+ mkdir data
145
+ ln -s /path/to/8192_npy data/objaverse_data
146
+ ```
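As a quick sanity check on the downloaded point clouds, each `.npy` file should load as an `(8192, 6)` array of `xyz` + `rgb`. The object ID below is a placeholder, not a real file from the dataset:

```python
import numpy as np

# Placeholder object ID; substitute any real {Objaverse_ID}_8192.npy from the folder.
pc = np.load("data/objaverse_data/OBJAVERSE_ID_8192.npy")
assert pc.shape == (8192, 6)

xyz, rgb = pc[:, :3], pc[:, 3:]                # positions and colors
assert 0.0 <= rgb.min() and rgb.max() <= 1.0   # colors are stored in [0, 1]
```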
147
+
148
+ #### Instruction-Following Data
149
+ 1. In `PointLLM/data` folder, create a directory named `anno_data`.
150
+ 2. Our instruction-following data, including both the simple-description and complex instructions, can be downloaded [here](https://huggingface.co/datasets/RunsenXu/PointLLM). If you have difficulty downloading the data (e.g. network issue), please email the authors.
151
+ - The simple-description data has 660K samples and the complex instructions have 70K samples.
152
+ - Both training data are based on the Objaverse dataset.
153
+ - The complex instructions are generated with GPT-4.
154
+ 3. Put the data files in the `anno_data` directory. The directory should look like this:
155
+ ```bash
156
+ PointLLM/data/anno_data
157
+ ├── PointLLM_brief_description_660K_filtered.json
158
+ ├── PointLLM_brief_description_660K.json
159
+ └── PointLLM_complex_instruction_70K.json
160
+ ```
161
+ 4. Note, the `PointLLM_brief_description_660K_filtered.json` is filtered from `PointLLM_brief_description_660K.json` by removing the 3000 objects we reserved as the validation set. If you want to reproduce the results in our paper, you should use the `PointLLM_brief_description_660K_filtered.json` for training. The `PointLLM_complex_instruction_70K.json` contains objects from the training set.
162
+ 5. If you want to generate the complex instructions by yourself, please refer to our paper for other details. The system prompt is at `pointllm/data/data_generation/system_prompt_gpt4_0613.txt`.
163
+
164
+ #### Evaluation Data
165
+ 1. Download the referencing GT `PointLLM_brief_description_val_200_GT.json` we use for the benchmarks on Objaverse dataset [here](https://huggingface.co/datasets/RunsenXu/PointLLM/blob/main/PointLLM_brief_description_val_200_GT.json), and put it in `PointLLM/data/anno_data`. We also provide the 3000 object ids we filter during training [here](https://huggingface.co/datasets/RunsenXu/PointLLM/blob/main/val_object_ids_3000.txt) and their corresponding referencing GT [here](https://huggingface.co/datasets/RunsenXu/PointLLM/blob/main/PointLLM_brief_description_val_3000_GT.json), which can be used to evaluate on all the 3000 objects.
166
+ 2. Create a directory named `modelnet40_data` in `PointLLM/data`. Download the test split of ModelNet40 point clouds `modelnet40_test_8192pts_fps.dat` [here](https://huggingface.co/datasets/RunsenXu/PointLLM/blob/main/modelnet40_test_8192pts_fps.dat) and put it in `PointLLM/data/modelnet40_data`.
167
+
168
+ ### Training
169
+ #### Download the Initial LLM and Point Encoder Weights
170
+ 1. In `PointLLM` folder, create a directory named `checkpoints`.
171
+ 2. Download the pre-trained LLM and point encoder: [
172
+ PointLLM_7B_v1.1_init](https://huggingface.co/RunsenXu/PointLLM_7B_v1.1_init/tree/main) or [PointLLM_13B_v1.1_init](https://huggingface.co/RunsenXu/PointLLM_13B_v1.1_init/tree/main). Put them in the `checkpoints` directory.
173
+ 3. Note that the above "v1.1" means we use the Vicuna-v1.1 checkpoints, and you do **not** need to download the original LLaMA weights again.
174
+
175
+ #### Start Training
176
+ 1. For stage-1 training, simply run:
177
+ ```bash
178
+ cd PointLLM
179
+ scripts/PointLLM_train_stage1.sh
180
+ ```
181
+ 2. After stage-1 training, start stage-2 training:
182
+ ```bash
183
+ scripts/PointLLM_train_stage2.sh
184
+ ```
185
+
186
+ #### PointLLM-v1.1 and PointLLM-v1.2
187
+ Usually, you do not have to care about the following contents. They are only for reproducing the results in our v1 paper (PointLLM-v1.1). If you want to compare with our models or use our models for downstream tasks, please use PointLLM-v1.2 (refer to our v2 paper), which has better performance.
188
+ <details>
189
+ <summary>The following steps are for reproducing PointLLM-v1.1 (click to expand)</summary>
190
+
191
+ 1. PointLLM v1.1 and v1.2 use slightly different pre-trained point encoders and projectors. If you want to reproduce PointLLM v1.1, edit the `config.json` file in the directory of initial LLM and point encoder weights, for example, `vim checkpoints/PointLLM_7B_v1.1_init/config.json`.
192
+
193
+ 2. Change the key `"point_backbone_config_name"` to specify another point encoder config:
194
+ ```bash
195
+ # change from
196
+ "point_backbone_config_name": "PointTransformer_8192point_2layer" # v1.2
197
+ # to
198
+ "point_backbone_config_name": "PointTransformer_base_8192point", # v1.1
199
+ ```
200
+
201
+ 3. Edit the checkpoint path of the point encoder in `scripts/train_stage1.sh`:
202
+ ```bash
203
+ # change from
204
+ point_backbone_ckpt=$model_name_or_path/point_bert_v1.2.pt # v1.2
205
+ # to
206
+ point_backbone_ckpt=$model_name_or_path/point_bert_v1.1.pt # v1.1
207
+ ```
208
+ </details>
209
+
210
+ ### Chatting
211
+ 1. The trained model checkpoints are available [here](https://huggingface.co/RunsenXu) (including different versions of PointLLM).
212
+ 2. Run the following command to launch a chatbot using the `torch.float32` data type for chatting about 3D models of Objaverse. The model checkpoints will be downloaded automatically. You can also manually download the model checkpoints and specify their paths. Here is an example:
213
+ ```bash
214
+ cd PointLLM
215
+ PYTHONPATH=$PWD python pointllm/eval/PointLLM_chat.py --model_name RunsenXu/PointLLM_7B_v1.2 --data_name data/objaverse_data --torch_dtype float32
216
+ ```
217
+ 3. You can also easily modify the codes for using point clouds other than those from Objaverse, as long as the point clouds input to the model have dimensions (N, 6), where the first three dimensions are `xyz` and the last three dimensions are `rgb` (in [0, 1] range). You may sample the point clouds to have 8192 points, as our model is trained on such point clouds.
218
+ 4. The following table shows GPU requirements for different models and data types. We recommend using `torch.bfloat16` if applicable, which is used in the experiments in our paper.
219
+
220
+ | Model | Data Type | GPU Memory |
221
+ |:--------:|:---------:|:----------:|
222
+ | PointLLM-7B | torch.float16 | 14GB |
223
+ | PointLLM-7B | torch.float32 | 28GB |
224
+ | PointLLM-13B | torch.float16 | 26GB |
225
+ | PointLLM-13B | torch.float32 | 52GB |
226
+
227
+ ### Gradio Demo
228
+ 1. We provide the codes for our online Gradio demo. You can run the following commands to launch the demo locally for chatting and visualization.
229
+ ```bash
230
+ cd PointLLM
231
+ PYTHONPATH=$PWD python pointllm/eval/chat_gradio.py --model_name RunsenXu/PointLLM_7B_v1.2 --data_name data/objaverse_data
232
+ ```
233
+ 2. Kind reminder: if you want to release the demo publicly, please refer to https://www.gradio.app/guides/sharing-your-app#security-and-file-access.
234
+
235
+ ### Evaluation
236
+ #### Inferencing
237
+ 1. Run the following commands to infer the results.
238
+ 2. Different commands for inferencing on different benchmarks (PointLLM_7B_v1.2 as an example):
239
+ ```bash
240
+ cd PointLLM
241
+ export PYTHONPATH=$PWD
242
+
243
+ # Open Vocabulary Classification on Objaverse
244
+ python pointllm/eval/eval_objaverse.py --model_name RunsenXu/PointLLM_7B_v1.2 --task_type classification --prompt_index 0 # or --prompt_index 1
245
+
246
+ # Object captioning on Objaverse
247
+ python pointllm/eval/eval_objaverse.py --model_name RunsenXu/PointLLM_7B_v1.2 --task_type captioning --prompt_index 2
248
+
249
+ # Close-set Zero-shot Classification on ModelNet40
250
+ python pointllm/eval/eval_modelnet_cls.py --model_name RunsenXu/PointLLM_7B_v1.2 --prompt_index 0 # or --prompt_index 1
251
+ ```
252
+ 3. Please check the default command-line arguments of these two scripts. You can specify different prompts, data paths, and other parameters.
253
+ 4. After inferencing, the results will be saved in `{model_name}/evaluation` as a dict with the following format:
254
+ ```bash
255
+ {
256
+ "prompt": "",
257
+ "results": [
258
+ {
259
+ "object_id": "",
260
+ "ground_truth": "",
261
+ "model_output": "",
262
+ "label_name": "" # only for classification on modelnet40
263
+ }
264
+ ]
265
+ }
266
+ ```
267
+
268
+ #### ChatGPT/GPT-4 Evaluation
269
+ 1. Get your OpenAI API key at [https://platform.openai.com/api-keys](https://platform.openai.com/api-keys).
270
+ 2. Run the following commands to evaluate the model outputs in parallel with ChatGPT/GPT-4 (which cost approximately $1.5 to $2.2 USD).
271
+ ```bash
272
+ cd PointLLM
273
+ export PYTHONPATH=$PWD
274
+ export OPENAI_API_KEY=sk-****
275
+
276
+ # Open Vocabulary Classification on Objaverse
277
+ python pointllm/eval/evaluator.py --results_path /path/to/model_output --model_type gpt-4-0613 --eval_type open-free-form-classification --parallel --num_workers 15
278
+
279
+ # Object captioning on Objaverse
280
+ python pointllm/eval/evaluator.py --results_path /path/to/model_output --model_type gpt-4-0613 --eval_type object-captioning --parallel --num_workers 15
281
+
282
+ # Close-set Zero-shot Classification on ModelNet40
283
+ python pointllm/eval/evaluator.py --results_path /path/to/model_output --model_type gpt-3.5-turbo-0613 --eval_type modelnet-close-set-classification --parallel --num_workers 15
284
+ ```
285
+ 3. The evaluation script supports interruption and resumption. You can interrupt the evaluation process at any time by using `Ctrl+C`. This will save the temporary results. If an error occurs during the evaluation, the script will also save the current state. You can resume the evaluation from where it left off by running the same command again.
286
+ 4. The evaluation results will be saved in `{model_name}/evaluation` as another dict.
287
+ Some of the metrics are explained as follows:
288
+ ```bash
289
+ "average_score": The GPT-evaluated captioning score we report in our paper.
290
+ "accuracy": The classification accuracy we report in our paper, including random choices made by ChatGPT when model outputs are vague or ambiguous and ChatGPT outputs "INVALID".
291
+ "clean_accuracy": The classification accuracy after removing those "INVALID" outputs.
292
+ "total_predictions": The number of predictions.
293
+ "correct_predictions": The number of correct predictions.
294
+ "invalid_responses": The number of "INVALID" outputs by ChatGPT.
295
+
296
+ # Some other statistics for calling OpenAI API
297
+ "prompt_tokens": The total number of tokens of the prompts for ChatGPT/GPT-4.
298
+ "completion_tokens": The total number of tokens of the completion results from ChatGPT/GPT-4.
299
+ "GPT_cost": The API cost of the whole evaluation process, in US Dollars 💵.
300
+ ```
301
+ 5. <b>Open-Step Evaluation.</b> You can also start evaluation immediately after inferencing by passing the `--start_eval` flag and specifying the `--gpt_type`. For example:
302
+ ```bash
303
+ python pointllm/eval/eval_objaverse.py --model_name RunsenXu/PointLLM_7B_v1.2 --task_type classification --prompt_index 0 --start_eval --gpt_type gpt-4-0613
304
+ ```
305
+
306
+ #### Traditional Metric Evaluation
307
+ 1. For the object captioning task, run the following command to evaluate model outputs with traditional metrics including BLEU, ROUGE, METEOR, Sentence-BERT, and SimCSE.
308
+ ```bash
309
+ python pointllm/eval/traditional_evaluator.py --results_path /path/to/model_captioning_output
310
+ ```
311
+ 2. Note that we recommend not using BLEU, ROUGE, and METEOR for evaluation as they favor short captions and fall short of capturing semantic accuracy and diversity.
312
+
313
+ ## 📝 TODO List
314
+ - [x] Add inferencing codes with checkpoints.
315
+ - [x] Release instruction-following data.
316
+ - [x] Add training codes.
317
+ - [x] Add evaluation codes.
318
+ - [x] Add gradio demo codes.
319
+
320
+ Community contributions are welcome!👇 If you need any support, please feel free to open an issue or contact us.
321
+ - [ ] Support Phi-2 LLM to make PointLLM more accessible to the community.
322
+ - [ ] Support Chinese LLMs like InternLM.
323
+
324
+ ## 🔗 Citation
325
+
326
+ If you find our work and this codebase helpful, please consider starring this repo 🌟 and cite:
327
+
328
+ ```bibtex
329
+ @article{xu2023pointllm,
330
+ title={PointLLM: Empowering Large Language Models to Understand Point Clouds},
331
+ author={Xu, Runsen and Wang, Xiaolong and Wang, Tai and Chen, Yilun and Pang, Jiangmiao and Lin, Dahua},
332
+ journal={arXiv preprint arXiv:2308.16911},
333
+ year={2023}
334
+ }
335
+ ```
336
+
337
+ ## 📄 License
338
+ <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/80x15.png" /></a>
339
+ <br />
340
+ This work is under the <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.
341
+
342
+ ## 📚 Related Work
343
+ Together, Let's make LLM for 3D great!
344
+ - [Point-Bind & Point-LLM](https://arxiv.org/abs/2309.00615): aligns point clouds with Image-Bind, and leverages ImageBind-LLM to reason multi-modality input without 3D-instruction data training.
345
+ - [3D-LLM](https://arxiv.org/abs/2307.12981): employs 2D foundation models to encode multi-view images of 3D point clouds.
346
+
347
+
348
+ ## 👏 Acknowledgements
349
+ - [LLaVA](https://github.com/haotian-liu/LLaVA): Our codebase is built upon LLaVA.
350
+ - [Vicuna](https://github.com/lm-sys/FastChat): We use the Vicuna-7B and Vicuna-13B checkpoints.
351
+ - [Objaverse](https://objaverse.allenai.org): We use models of the Objaverse dataset for training and evaluation.
352
+ - [Cap3D](https://github.com/crockwell/Cap3D/): We use the Cap3D captioning data for our data generation.
353
+ - [ULIP-2](https://github.com/salesforce/ULIP): We use ULIP-2 for pre-training our point cloud encoder.
ThirdParty/PointLLM/__init__.py ADDED
File without changes
ThirdParty/PointLLM/pointllm/__init__.py ADDED
@@ -0,0 +1 @@
+ # from .model import PointLLMLlamaForCausalLM
ThirdParty/PointLLM/pointllm/conversation.py ADDED
@@ -0,0 +1,375 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+
5
+
6
+ class SeparatorStyle(Enum):
7
+ """Different separator style."""
8
+ SINGLE = auto()
9
+ TWO = auto()
10
+ MPT = auto()
11
+
12
+
13
+ @dataclasses.dataclass
14
+ class Conversation:
15
+ """A class that keeps all conversation history."""
16
+ system: str
17
+ roles: List[str]
18
+ messages: List[List[str]]
19
+ offset: int
20
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
21
+ sep: str = "###"
22
+ sep2: str = None
23
+ version: str = "Unknown"
24
+
25
+ skip_next: bool = False
26
+
27
+ def reset(self):
28
+ self.messages = self.messages[:self.offset]
29
+
30
+ def get_prompt(self):
31
+ if self.sep_style == SeparatorStyle.SINGLE:
32
+ ret = self.system + self.sep
33
+ for role, message in self.messages:
34
+ if message:
35
+ if type(message) is tuple:
36
+ message, _, _ = message
37
+ ret += role + ": " + message + self.sep
38
+ else:
39
+ ret += role + ":"
40
+ return ret
41
+ elif self.sep_style == SeparatorStyle.TWO:
42
+ seps = [self.sep, self.sep2]
43
+ ret = self.system + seps[0]
44
+ for i, (role, message) in enumerate(self.messages):
45
+ if message:
46
+ if type(message) is tuple:
47
+ message, _, _ = message
48
+ ret += role + ": " + message + seps[i % 2]
49
+ else:
50
+ ret += role + ":"
51
+ return ret
52
+ if self.sep_style == SeparatorStyle.MPT:
53
+ ret = self.system + self.sep
54
+ for role, message in self.messages:
55
+ if message:
56
+ if type(message) is tuple:
57
+ message, _, _ = message
58
+ ret += role + message + self.sep
59
+ else:
60
+ ret += role
61
+ return ret
62
+ else:
63
+ raise ValueError(f"Invalid style: {self.sep_style}")
64
+
65
+ def append_message(self, role, message):
66
+ self.messages.append([role, message])
67
+
68
+ def pop_last_none_message(self):
69
+ # * pop the last message if it's None, this is used for multi-round dialogue
70
+ if self.messages[-1][1] is None:
71
+ self.messages.pop()
72
+
73
+ def get_images(self, return_pil=False):
74
+ images = []
75
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
76
+ if i % 2 == 0:
77
+ if type(msg) is tuple:
78
+ import base64
79
+ from io import BytesIO
80
+ from PIL import Image
81
+ msg, image, image_process_mode = msg
82
+ if image_process_mode == "Pad":
83
+ def expand2square(pil_img, background_color=(122, 116, 104)):
84
+ width, height = pil_img.size
85
+ if width == height:
86
+ return pil_img
87
+ elif width > height:
88
+ result = Image.new(pil_img.mode, (width, width), background_color)
89
+ result.paste(pil_img, (0, (width - height) // 2))
90
+ return result
91
+ else:
92
+ result = Image.new(pil_img.mode, (height, height), background_color)
93
+ result.paste(pil_img, ((height - width) // 2, 0))
94
+ return result
95
+ image = expand2square(image)
96
+ elif image_process_mode == "Crop":
97
+ pass
98
+ elif image_process_mode == "Resize":
99
+ image = image.resize((224, 224))
100
+ else:
101
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
102
+ max_hw, min_hw = max(image.size), min(image.size)
103
+ aspect_ratio = max_hw / min_hw
104
+ max_len, min_len = 800, 400
105
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
106
+ longest_edge = int(shortest_edge * aspect_ratio)
107
+ W, H = image.size
108
+ if H > W:
109
+ H, W = longest_edge, shortest_edge
110
+ else:
111
+ H, W = shortest_edge, longest_edge
112
+ image = image.resize((W, H))
113
+ if return_pil:
114
+ images.append(image)
115
+ else:
116
+ buffered = BytesIO()
117
+ image.save(buffered, format="JPEG")
118
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
119
+ images.append(img_b64_str)
120
+ return images
121
+
122
+ def to_gradio_chatbot(self):
123
+ ret = []
124
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
125
+ if i % 2 == 0:
126
+ if type(msg) is tuple:
127
+ import base64
128
+ from io import BytesIO
129
+ msg, image, image_process_mode = msg
130
+ max_hw, min_hw = max(image.size), min(image.size)
131
+ aspect_ratio = max_hw / min_hw
132
+ max_len, min_len = 800, 400
133
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
134
+ longest_edge = int(shortest_edge * aspect_ratio)
135
+ W, H = image.size
136
+ if H > W:
137
+ H, W = longest_edge, shortest_edge
138
+ else:
139
+ H, W = shortest_edge, longest_edge
140
+ image = image.resize((W, H))
141
+ # image = image.resize((224, 224))
142
+ buffered = BytesIO()
143
+ image.save(buffered, format="JPEG")
144
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
145
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
146
+ msg = msg.replace('<image>', img_str)
147
+ ret.append([msg, None])
148
+ else:
149
+ ret[-1][-1] = msg
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ system=self.system,
155
+ roles=self.roles,
156
+ messages=[[x, y] for x, y in self.messages],
157
+ offset=self.offset,
158
+ sep_style=self.sep_style,
159
+ sep=self.sep,
160
+ sep2=self.sep2)
161
+
162
+ def dict(self):
163
+ if len(self.get_images()) > 0:
164
+ return {
165
+ "system": self.system,
166
+ "roles": self.roles,
167
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
168
+ "offset": self.offset,
169
+ "sep": self.sep,
170
+ "sep2": self.sep2,
171
+ }
172
+ return {
173
+ "system": self.system,
174
+ "roles": self.roles,
175
+ "messages": self.messages,
176
+ "offset": self.offset,
177
+ "sep": self.sep,
178
+ "sep2": self.sep2,
179
+ }
180
+
181
+
182
+ conv_v1 = Conversation(
183
+ system="A chat between a curious human and an artificial intelligence assistant. "
184
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
185
+ roles=("Human", "Assistant"),
186
+ messages=(
187
+ ("Human", "Give three tips for staying healthy."),
188
+ ("Assistant",
189
+ "Sure, here are three tips for staying healthy:\n"
190
+ "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
191
+ "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
192
+ "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
193
+ "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
194
+ "activities at least two days per week.\n"
195
+ "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
196
+ "vegetables, whole grains, lean proteins, and healthy fats can help support "
197
+ "your overall health. Try to limit your intake of processed and high-sugar foods, "
198
+ "and aim to drink plenty of water throughout the day.\n"
199
+ "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
200
+ "and mental health. Adults should aim for seven to nine hours of sleep per night. "
201
+ "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
202
+ "help improve the quality of your sleep.")
203
+ ),
204
+ offset=2,
205
+ sep_style=SeparatorStyle.SINGLE,
206
+ sep="###",
207
+ )
208
+
209
+ conv_v1_2 = Conversation(
210
+ system="A chat between a curious human and an artificial intelligence assistant. "
211
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
212
+ roles=("Human", "Assistant"),
213
+ messages=(
214
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
215
+ ("Assistant",
216
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
217
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
218
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
219
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
220
+ "renewable and non-renewable energy sources:\n"
221
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
222
+ "energy sources are finite and will eventually run out.\n"
223
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
224
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
225
+ "and other negative effects.\n"
226
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
227
+ "have lower operational costs than non-renewable sources.\n"
228
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
229
+ "locations than non-renewable sources.\n"
230
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
231
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
232
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
233
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
234
+ ),
235
+ offset=2,
236
+ sep_style=SeparatorStyle.SINGLE,
237
+ sep="###",
238
+ )
239
+
240
+ conv_vicuna_v1_1 = Conversation(
241
+ system="A chat between a curious user and an artificial intelligence assistant. "
242
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
243
+ roles=("USER", "ASSISTANT"),
244
+ version="v1",
245
+ messages=(),
246
+ offset=0,
247
+ sep_style=SeparatorStyle.TWO,
248
+ sep=" ",
249
+ sep2="</s>",
250
+ )
251
+
252
+ conv_mpt = Conversation(
253
+ system="""<|im_start|>system
254
+ - You are a helpful language and vision assistant.
255
+ - You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
256
+ - You should follow the instructions carefully and explain your answers in detail.""",
257
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
258
+ version="mpt",
259
+ messages=(),
260
+ offset=0,
261
+ sep_style=SeparatorStyle.MPT,
262
+ sep="<|im_end|>",
263
+ )
264
+
265
+ conv_mpt_text = Conversation(
266
+ system="""<|im_start|>system
267
+ - You are a helpful assistant chatbot trained by MosaicML.
268
+ - You answer questions.
269
+ - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
270
+ - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
271
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
272
+ version="mpt",
273
+ messages=(),
274
+ offset=0,
275
+ sep_style=SeparatorStyle.MPT,
276
+ sep="<|im_end|>",
277
+ )
278
+
279
+ conv_bair_v1 = Conversation(
280
+ system="BEGINNING OF CONVERSATION:",
281
+ roles=("USER", "GPT"),
282
+ messages=(),
283
+ offset=0,
284
+ sep_style=SeparatorStyle.TWO,
285
+ sep=" ",
286
+ sep2="</s>",
287
+ )
288
+
289
+ simple_conv = Conversation(
290
+ system="A chat between a curious human and an artificial intelligence assistant. "
291
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
292
+ roles=("Human", "Assistant"),
293
+ messages=(
294
+ ("Human", "Hi!"),
295
+ ("Assistant", "Hi there! How can I help you today?")
296
+ ),
297
+ offset=2,
298
+ sep_style=SeparatorStyle.SINGLE,
299
+ sep="###",
300
+ )
301
+
302
+ simple_conv_multimodal = Conversation(
303
+ system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
304
+ "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
305
+ "Follow the instructions carefully and explain your answers in detail.",
306
+ roles=("Human", "Assistant"),
307
+ messages=(
308
+ ("Human", "Hi!"),
309
+ ("Assistant", "Hi there! How can I help you today?\n")
310
+ ),
311
+ offset=2,
312
+ sep_style=SeparatorStyle.SINGLE,
313
+ sep="###",
314
+ )
315
+
316
+ simple_conv_mpt_multimodal = Conversation(
317
+ system="""<|im_start|>system
318
+ - You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.
319
+ - You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
320
+ - You should follow the instructions carefully and explain your answers in detail.""",
321
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
322
+ version="mpt",
323
+ messages=(),
324
+ offset=0,
325
+ sep_style=SeparatorStyle.MPT,
326
+ sep="<|im_end|>",
327
+ )
328
+
329
+ simple_conv_legacy = Conversation(
330
+ system="You are LLaVA, a large language model trained by UW Madison WAIV Lab."
331
+ "You are designed to assist human with a variety of tasks using natural language."
332
+ "Follow the instructions carefully.",
333
+ roles=("Human", "Assistant"),
334
+ messages=(
335
+ ("Human", "Hi!\n\n### Response:"),
336
+ ("Assistant", "Hi there! How can I help you today?\n")
337
+ ),
338
+ offset=2,
339
+ sep_style=SeparatorStyle.SINGLE,
340
+ sep="###",
341
+ )
342
+
343
+ conv_llava_v1 = Conversation(
344
+ system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
345
+ "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
346
+ "Follow the instructions carefully and explain your answers in detail.",
347
+ roles=("USER", "ASSISTANT"),
348
+ version="v1",
349
+ messages=(),
350
+ offset=0,
351
+ sep_style=SeparatorStyle.TWO,
352
+ sep=" ",
353
+ sep2="</s>",
354
+ )
355
+
356
+ default_conversation = conv_v1_2
357
+ conv_templates = {
358
+ "default": conv_v1_2,
359
+ "simple": simple_conv,
360
+ "simple_legacy": simple_conv_legacy,
361
+ "multimodal": simple_conv_multimodal,
362
+ "mpt_multimodal": simple_conv_mpt_multimodal,
363
+ "llava_v1": conv_llava_v1,
364
+
365
+ # fastchat
366
+ "v1": conv_v1_2,
367
+ "bair_v1": conv_bair_v1,
368
+ "vicuna_v1_1": conv_vicuna_v1_1,
369
+ "mpt": conv_mpt,
370
+ "mpt_text": conv_mpt_text,
371
+ }
372
+
373
+
374
+ if __name__ == "__main__":
375
+ print(default_conversation.get_prompt())
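A minimal usage sketch of the conversation templates defined above. The question string is a placeholder, and the import assumes the `pointllm` package is on `PYTHONPATH` as described in the README:

```python
from pointllm.conversation import conv_templates, SeparatorStyle

conv = conv_templates["vicuna_v1_1"].copy()
conv.append_message(conv.roles[0], "What does this point cloud show?")  # USER turn (placeholder)
conv.append_message(conv.roles[1], None)                                # empty ASSISTANT slot to complete
prompt = conv.get_prompt()

# For the TWO separator style used by vicuna_v1_1, generation is typically stopped
# at sep2 ("</s>"); SINGLE-style templates stop at sep ("###") instead.
stop_str = conv.sep if conv.sep_style == SeparatorStyle.SINGLE else conv.sep2
```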
ThirdParty/PointLLM/pointllm/data/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .utils import load_objaverse_point_cloud, pc_norm, farthest_point_sample
+ from .object_point_dataset import ObjectPointCloudDataset, make_object_point_data_module
+ from .modelnet import ModelNet
ThirdParty/PointLLM/pointllm/data/modelnet.py ADDED
@@ -0,0 +1,147 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import pickle
5
+ from torch.utils.data import Dataset
6
+ from pointllm.utils import *
7
+ from pointllm.data.utils import *
8
+
9
+ class ModelNet(Dataset):
10
+ def __init__(self, config_path, split, subset_nums=-1, use_color=False):
11
+ """
12
+ Args:
13
+ data_args:
14
+ split: train or test
15
+ """
16
+ super(ModelNet, self).__init__()
17
+
18
+ if config_path is None:
19
+ # * use the default config file in the same dir
20
+ config_path = os.path.join(os.path.dirname(__file__), "modelnet_config", "ModelNet40.yaml")
21
+
22
+ config = cfg_from_yaml_file(config_path)
23
+ # * check data path
24
+ self.root = config["DATA_PATH"]
25
+
26
+ if not os.path.exists(self.root):
27
+ print(f"Data path {self.root} does not exist. Please check your data path.")
28
+ exit()
29
+
30
+ self.npoints = config.npoints
31
+ self.num_category = config.NUM_CATEGORY # * should be 40
32
+ self.random_sample = config.random_sampling
33
+ self.use_height = config.use_height
34
+ self.use_normals = config.USE_NORMALS
35
+ self.subset_nums = subset_nums
36
+ self.normalize_pc = True
37
+ self.use_color = use_color
38
+
39
+ if self.use_height or self.use_normals:
40
+ print(f"Warning: Usually we don't use height or normals for shapenet but use_height: {self.use_height} and \
41
+ use_normals: {self.use_normals}.")
42
+
43
+ self.split = split
44
+ assert (self.split == 'train' or self.split == 'test')
45
+
46
+ self.catfile = os.path.join(os.path.dirname(__file__), "modelnet_config", 'modelnet40_shape_names_modified.txt')
47
+
48
+ # "tv_stand" -> "tv stand"
49
+ self.categories = [line.rstrip() for line in open(self.catfile)] # * list of category names
50
+
51
+ self.save_path = os.path.join(self.root,
52
+ 'modelnet%d_%s_%dpts_fps.dat' % (self.num_category, self.split, self.npoints))
53
+
54
+ print('Load processed data from %s...' % self.save_path)
55
+ with open(self.save_path, 'rb') as f:
56
+ self.list_of_points, self.list_of_labels = pickle.load(f) # * ndarray of N, C: (8192, 6) (xyz and normals)
57
+
58
+ if self.subset_nums > 0:
59
+ # * set random seed
60
+ import random
61
+ random.seed(0)
62
+ # * random choose subset_nums
63
+ idxs = random.sample(range(len(self.list_of_labels)), self.subset_nums)
64
+ self.list_of_labels = [self.list_of_labels[idx] for idx in idxs]
65
+ self.list_of_points = [self.list_of_points[idx] for idx in idxs]
66
+
67
+ # * print len
68
+ print(f"Load {len(self.list_of_points)} data from {self.save_path}.")
69
+
70
+ def __len__(self):
71
+ return len(self.list_of_labels)
72
+
73
+ def _get_item(self, index):
74
+ point_set, label = self.list_of_points[index], self.list_of_labels[index]
75
+
76
+ if self.npoints < point_set.shape[0]:
77
+ if self.random_sample:
78
+ # * random sample
79
+ point_set = point_set[np.random.choice(point_set.shape[0], self.npoints, replace=False)]
80
+ else:
81
+ point_set = farthest_point_sample(point_set, self.npoints)
82
+
83
+ point_set[:, 0:3] = pc_normalize(point_set[:, 0:3])
84
+ if not self.use_normals:
85
+ point_set = point_set[:, 0:3]
86
+
87
+ if self.use_height:
88
+ self.gravity_dim = 1
89
+ height_array = point_set[:, self.gravity_dim:self.gravity_dim + 1] - point_set[:,
90
+ self.gravity_dim:self.gravity_dim + 1].min()
91
+ point_set = np.concatenate((point_set, height_array), axis=1)
92
+
93
+ point_set = np.concatenate((point_set, np.zeros_like(point_set)), axis=-1) if self.use_color else point_set
94
+
95
+ return point_set, label.item() # * ndarray, int
96
+
97
+ def pc_norm(self, pc):
98
+ """ pc: NxC, return NxC """
99
+ xyz = pc[:, :3]
100
+ other_feature = pc[:, 3:]
101
+
102
+ centroid = np.mean(xyz, axis=0)
103
+ xyz = xyz - centroid
104
+ m = np.max(np.sqrt(np.sum(xyz ** 2, axis=1)))
105
+ xyz = xyz / m
106
+
107
+ pc = np.concatenate((xyz, other_feature), axis=1)
108
+ return pc
109
+
110
+ def __getitem__(self, index):
111
+ points, label = self._get_item(index)
112
+ pt_idxs = np.arange(0, points.shape[0]) # 2048
113
+ if self.split == 'train':
114
+ np.random.shuffle(pt_idxs)
115
+ current_points = points[pt_idxs].copy()
116
+
117
+ if self.normalize_pc:
118
+ # * modelnet point cloud is already normalized
119
+ current_points = self.pc_norm(current_points)
120
+
121
+ current_points = torch.from_numpy(current_points).float() # * N, C tensors
122
+ label_name = self.categories[int(label)]
123
+
124
+ data_dict = {
125
+ "indice": index, # * int
126
+ "point_clouds": current_points, # * tensor of N, C
127
+ "labels": label, # * int
128
+ "label_names": label_name # * str
129
+ }
130
+
131
+ return data_dict
132
+
133
+ if __name__ == '__main__':
134
+ import argparse
135
+
136
+ parser = argparse.ArgumentParser(description='ModelNet Dataset')
137
+
138
+ parser.add_argument("--config_path", type=str, default=None, help="config file path.")
139
+ parser.add_argument("--split", type=str, default="test", help="train or test.")
140
+ parser.add_argument("--subset_nums", type=int, default=200)
141
+
142
+ args = parser.parse_args()
143
+
144
+ dataset = ModelNet(config_path=args.config_path, split=args.split, subset_nums=args.subset_nums)
145
+
146
+ # * get the first item
147
+ print(dataset[0])
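A minimal usage sketch for the ModelNet dataset above with a standard DataLoader; the batch size, worker count, and the assumption that the processed .dat file referenced by the config exists are illustrative, not values taken from the repository.

from torch.utils.data import DataLoader

def build_modelnet_loader(split="test", batch_size=32):
    # Each item is a dict with a fixed-size "point_clouds" tensor plus int "labels",
    # so the default collate_fn can stack a batch without a custom collator.
    dataset = ModelNet(config_path=None, split=split, subset_nums=-1, use_color=False)
    return DataLoader(dataset, batch_size=batch_size, shuffle=(split == "train"), num_workers=4)

# batch = next(iter(build_modelnet_loader()))
# print(batch["point_clouds"].shape, batch["labels"].shape)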
ThirdParty/PointLLM/pointllm/data/modelnet_config/ModelNet40.yaml ADDED
@@ -0,0 +1,8 @@
1
+ NAME: ModelNet
2
+ DATA_PATH: data/modelnet40_data
3
+ NUM_CATEGORY: 40
4
+ USE_NORMALS: FALSE
5
+ npoints: 8192
6
+ random_sampling: TRUE
7
+ use_height: FALSE
8
+ use_normals: FALSE
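For reference, the ModelNet class above reads this config both as a dict (config["DATA_PATH"]) and by attribute (config.npoints), so cfg_from_yaml_file is assumed to return an attribute-accessible dict; a rough, hypothetical equivalent is sketched below.

import yaml
from easydict import EasyDict  # assumed dependency; any attribute-accessible dict works

def load_modelnet_cfg(path="modelnet_config/ModelNet40.yaml"):
    # Parse the YAML and wrap it so both cfg["KEY"] and cfg.key access patterns work.
    with open(path) as f:
        return EasyDict(yaml.safe_load(f))

# cfg = load_modelnet_cfg()
# print(cfg["DATA_PATH"], cfg.npoints, cfg.USE_NORMALS)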
ThirdParty/PointLLM/pointllm/data/object_point_dataset.py ADDED
@@ -0,0 +1,250 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import numpy as np
5
+
6
+ import copy
7
+ import transformers
8
+ from torch.utils.data import Dataset
9
+
10
+ from .utils import *
11
+
12
+
13
+ def make_object_point_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
14
+ """Make dataset and collator for Joint3Ddataset with text and point cloud data."""
15
+ """Initialize datasets."""
16
+
17
+ data_collator = DataCollatorForPointTextDataset(tokenizer=tokenizer)
18
+ if data_args.split_train_val:
19
+ print("Loading training datasets.")
20
+ train_dataset = ObjectPointCloudDataset(
21
+ split='train',
22
+ data_path=data_args.data_path,
23
+ anno_path=data_args.anno_path,
24
+ pointnum=data_args.pointnum,
25
+ conversation_types=data_args.conversation_types,
26
+ tokenizer=tokenizer,
27
+ use_color=data_args.use_color,
28
+ data_args=data_args
29
+ )
30
+ print("Done!")
31
+ if data_args.data_debug_num > 0:
32
+ print('Debug mode, using training set as val set.')
33
+ val_dataset = train_dataset
34
+ else:
35
+ # * make a val dataset
36
+ print("Loading validation datasets.")
37
+ val_dataset = ObjectPointCloudDataset(
38
+ split='val', # * load val split
39
+ data_path=data_args.data_path,
40
+ anno_path=data_args.anno_path,
41
+ pointnum=data_args.pointnum,
42
+ conversation_types=data_args.conversation_types,
43
+ tokenizer=tokenizer,
44
+ use_color=data_args.use_color,
45
+ data_args=data_args
46
+ )
47
+ return dict(train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=data_collator)
48
+ else:
49
+ # * use all data as training data
50
+ train_dataset = ObjectPointCloudDataset(
51
+ split='train',
52
+ data_path=data_args.data_path,
53
+ anno_path=data_args.anno_path,
54
+ pointnum=data_args.pointnum,
55
+ conversation_types=data_args.conversation_types,
56
+ use_color=data_args.use_color,
57
+ tokenizer=tokenizer,
58
+ data_args=data_args
59
+ )
60
+ return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
61
+
62
+ class ObjectPointCloudDataset(Dataset):
63
+ """Dataset utilities for objaverse."""
64
+ def __init__(self,
65
+ data_path=None,
66
+ anno_path=None,
67
+ tokenizer=None,
68
+ pointnum=8192,
69
+ split='train',
70
+ conversation_types=None, # * default is simple_des, used for stage1 pre-train
71
+ use_color=True,
72
+ data_args=None):
73
+
74
+ """
75
+ split: only considered when data_args.split_train_val is True.
76
+ conversation_types: tuple used to filter the data; default is ('simple_description',). Other types are:
77
+ "detailed_description", "single_round", "multi_round".
78
+ tokenizer: if None, only point clouds (no tokenized text) are returned by __getitem__.
79
+ """
80
+ super(ObjectPointCloudDataset, self).__init__()
81
+
82
+ """Initialize dataset with object point clouds and text"""
83
+ self.data_path = data_path
84
+ self.anno_path = anno_path
85
+ self.tokenizer = tokenizer
86
+ self.split = split
87
+ if conversation_types is None:
88
+ self.conversation_types = ("simple_description",)
89
+ else:
90
+ self.conversation_types = conversation_types
91
+
92
+ self.data_args = data_args
93
+ self.normalize_pc = True
94
+ self.use_color = use_color
95
+
96
+ self.pointnum = pointnum
97
+ self.point_backbone_config = data_args.point_backbone_config if data_args is not None else None
98
+ self.point_indicator = '<point>'
99
+
100
+ # Load the data list from JSON
101
+ print(f"Loading anno file from {anno_path}.")
102
+ with open(anno_path, "r") as json_file:
103
+ self.list_data_dict = json.load(json_file)
104
+
105
+ # * print the conversations_type
106
+ print(f"Using conversation_type: {self.conversation_types}")
107
+ # * print before filtering
108
+ print(f"Before filtering, the dataset size is: {len(self.list_data_dict)}.")
109
+
110
+ # * iterate the list and filter
111
+ # * these two ids have corrupted colored point files, so filter them when use_color is True
112
+ filter_ids = ['6760e543e1d645d5aaacd3803bcae524', 'b91c0711149d460a8004f9c06d3b7f38'] if self.use_color else []
113
+
114
+ # Iterate the list, filter those "conversation_type" not in self.conversation_types
115
+ self.list_data_dict = [
116
+ data for data in self.list_data_dict
117
+ if data.get('conversation_type', 'simple_description') in self.conversation_types
118
+ and data.get('object_id') not in filter_ids
119
+ ]
120
+
121
+ # * print after filtering
122
+ print(f"After filtering, the dataset size is: {len(self.list_data_dict)}.")
123
+ # * print the size of different conversation_type
124
+ for conversation_type in self.conversation_types:
125
+ print(f"Number of {conversation_type}: {len([data for data in self.list_data_dict if data.get('conversation_type', 'simple_description') == conversation_type])}")
126
+
127
+ if self.data_args is not None and self.data_args.data_debug_num > 0:
128
+ self.list_data_dict = self.list_data_dict[:self.data_args.data_debug_num]
129
+ # * print all the scan_id in debug mode, not using for loop
130
+ print('Debug mode, using: ' + ' '.join([data['object_id'] for data in self.list_data_dict]))
131
+ elif self.data_args is not None and self.data_args.split_train_val:
132
+ # * split train and val with 9:1 ratios
133
+ if self.split == 'train':
134
+ self.list_data_dict = self.list_data_dict[:int(self.data_args.split_ratio * len(self.list_data_dict))]
135
+ print(f"Train set size: {len(self.list_data_dict)}")
136
+ else:
137
+ self.list_data_dict = self.list_data_dict[int(self.data_args.split_ratio * len(self.list_data_dict)):]
138
+ print(f"Val set size: {len(self.list_data_dict)}")
139
+
140
+ def _load_point_cloud(self, object_id, type='objaverse'):
141
+ if type == 'objaverse':
142
+ return self._load_objaverse_point_cloud(object_id)
143
+
144
+ def _load_objaverse_point_cloud(self, object_id):
145
+ filename = f"{object_id}_{self.pointnum}.npy"
146
+ point_cloud = np.load(os.path.join(self.data_path, filename))
147
+
148
+ if not self.use_color:
149
+ point_cloud = point_cloud[:, :3]
150
+
151
+ return point_cloud
152
+
153
+ def pc_norm(self, pc):
154
+ """ pc: NxC, return NxC """
155
+ xyz = pc[:, :3]
156
+ other_feature = pc[:, 3:]
157
+
158
+ centroid = np.mean(xyz, axis=0)
159
+ xyz = xyz - centroid
160
+ m = np.max(np.sqrt(np.sum(xyz ** 2, axis=1)))
161
+ xyz = xyz / m
162
+
163
+ pc = np.concatenate((xyz, other_feature), axis=1)
164
+ return pc
165
+
166
+ def __getitem__(self, index):
167
+ sources = self.list_data_dict[index]
168
+ if isinstance(index, int):
169
+ sources = [sources]
170
+ assert len(sources) == 1, "sources should be a list"
171
+ if self.point_indicator in sources[0]['conversations'][0]['value']:
172
+
173
+ object_id = self.list_data_dict[index]['object_id']
174
+
175
+ # Point cloud representation
176
+ point_cloud = self._load_point_cloud(object_id) # * N, C
177
+ if self.normalize_pc:
178
+ point_cloud = self.pc_norm(point_cloud) # * need to norm since point encoder is norm
179
+
180
+ if self.tokenizer is None:
181
+ data_dict = dict(
182
+ point_clouds=torch.from_numpy(point_cloud.astype(np.float32)),
183
+ object_ids=object_id
184
+ )
185
+ return data_dict
186
+
187
+ sources = preprocess_multimodal_point_cloud(
188
+ copy.deepcopy([e["conversations"] for e in sources]), self.point_backbone_config, point_indicator=self.point_indicator)
189
+ else:
190
+ sources = copy.deepcopy([e["conversations"] for e in sources])
191
+
192
+ data_dict = preprocess_v1(
193
+ sources,
194
+ self.tokenizer)
195
+
196
+ if isinstance(index, int):
197
+ data_dict = dict(input_ids=data_dict["input_ids"][0],
198
+ labels=data_dict["labels"][0])
199
+
200
+ # point exist in the data
201
+ if self.point_indicator in self.list_data_dict[index]['conversations'][0]['value']:
202
+ data_dict['point_clouds'] = torch.from_numpy(point_cloud.astype(np.float32))
203
+
204
+ return data_dict
205
+
206
+ def __len__(self):
207
+ """Return number of utterances."""
208
+ return len(self.list_data_dict)
209
+
210
+ if __name__ == '__main__':
211
+ import argparse
212
+ parser = argparse.ArgumentParser()
213
+
214
+ parser.add_argument("--data_path", default="data/objaverse_data", type=str,
215
+ help="Path to the data directory.")
216
+ parser.add_argument("--anno_path", default=None, type=str, required=True,
217
+ help="Path to the annotation file.")
218
+ parser.add_argument("--split", default='train', type=str,
219
+ help="Whether to use the train or validation dataset.")
220
+ parser.add_argument("--pointnum", default=8192, type=int,
221
+ help="Number of points in the point cloud.")
222
+ parser.add_argument("--data_debug_num", default=0, type=int,
223
+ help="Number of data to debug with.")
224
+ parser.add_argument("--split_train_val", default=False, type=bool,
225
+ help="Whether to split the dataset into training and validation.")
226
+ parser.add_argument("--split_ratio", default=0.9, type=float,
227
+ help="The ratio of training to validation data.")
228
+ parser.add_argument("--tokenizer_path", default=None, type=str, required=True,
229
+ help="Path to the tokenizer config file.")
230
+
231
+ args = parser.parse_args()
232
+
233
+ # Initialize tokenizer
234
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_path)
235
+
236
+ args.point_backbone_config = None
237
+
238
+ # Initialize dataset
239
+ dataset = ObjectPointCloudDataset(
240
+ data_path=args.data_path,
241
+ anno_path=args.anno_path,
242
+ pointnum=args.pointnum,
243
+ split=args.split,
244
+ tokenizer=tokenizer,
245
+ data_args=args
246
+ )
247
+
248
+ # Example usage
249
+ print(f'Dataset length: {len(dataset)}')
250
+
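A hypothetical sketch of how make_object_point_data_module is consumed by a training script; the data_args fields simply mirror the attributes read above, and the tokenizer/Trainer wiring is an assumption about the caller rather than code from the repository.

import transformers
from types import SimpleNamespace

def build_data_module(tokenizer_path, data_path, anno_path, point_backbone_config):
    # Bundle the attributes ObjectPointCloudDataset expects into a lightweight namespace.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
    data_args = SimpleNamespace(
        data_path=data_path,
        anno_path=anno_path,
        pointnum=8192,
        conversation_types=("simple_description",),
        use_color=True,
        data_debug_num=0,
        split_train_val=False,
        split_ratio=0.9,
        point_backbone_config=point_backbone_config,
    )
    return make_object_point_data_module(tokenizer=tokenizer, data_args=data_args)

# module = build_data_module(tokenizer_path, data_path, anno_path, point_backbone_config)
# trainer = transformers.Trainer(model=model, args=training_args, **module)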
ThirdParty/PointLLM/pointllm/data/utils.py ADDED
@@ -0,0 +1,236 @@
1
+ from collections import OrderedDict, defaultdict
2
+
3
+ import transformers
4
+ from pointllm import conversation as conversation_lib
5
+ from dataclasses import dataclass
6
+ from typing import Optional, Dict, Sequence
7
+ import torch
8
+
9
+ import numpy as np
10
+ import os
11
+
12
+ IGNORE_INDEX = -100
13
+
14
+ # * Sample Usage:
15
+ # * from utils import LRUCache
16
+ # * cache = LRUCache(capacity, max_access_count)
17
+ # if self.cache is None:
18
+ # info_data = self.multiview_scannet[info_index]
19
+ # else:
20
+ # info_data = self.cache.get(info_index)
21
+ # if info_data is None or self.cache.get_access_count(info_index) >= self.cache.max_access_count:
22
+ # # If not in cache, or accessed max_access_count times, load it and put it in cache
23
+ # info_data = self.multiview_scannet[info_index]
24
+ # self.cache.put(info_index, info_data)
25
+ # self.cache.reset_access_count(info_index)
26
+
27
+ class LRUCache:
28
+ def __init__(self, capacity, max_access_count):
29
+ self.cache = OrderedDict()
30
+ self.access_count = defaultdict(int)
31
+ self.capacity = capacity
32
+ self.max_access_count = max_access_count
33
+
34
+ def get(self, key):
35
+ if key not in self.cache:
36
+ return None
37
+ value = self.cache.pop(key)
38
+ self.cache[key] = value # Put key as the newest one
39
+ self.access_count[key] += 1
40
+ return value
41
+
42
+ def put(self, key, value):
43
+ if key in self.cache: # Update the value and put it as newest
44
+ self.cache.pop(key)
45
+ elif len(self.cache) == self.capacity: # If cache is full
46
+ oldest_key = next(iter(self.cache))
47
+ self.cache.popitem(last=False) # Remove oldest item
48
+ del self.access_count[oldest_key] # Remove the corresponding access count
49
+ self.cache[key] = value
50
+ self.access_count[key] = 1
51
+
52
+ def get_access_count(self, key):
53
+ return self.access_count.get(key, 0)
54
+
55
+ def reset_access_count(self, key):
56
+ self.access_count[key] = 0
57
+
58
+
59
+ def preprocess_v1(
60
+ sources,
61
+ tokenizer: transformers.PreTrainedTokenizer,
62
+ ) -> Dict:
63
+ conv = conversation_lib.default_conversation.copy()
64
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
65
+
66
+ # Apply prompt templates
67
+ conversations = []
68
+ for i, source in enumerate(sources):
69
+ if roles[source[0]["from"]] != conv.roles[0]:
70
+ # Skip the first one if it is not from human
71
+ source = source[1:]
72
+
73
+ conv.messages = []
74
+ for j, sentence in enumerate(source):
75
+ role = roles[sentence["from"]]
76
+ assert role == conv.roles[j % 2], f"{i}"
77
+ conv.append_message(role, sentence["value"])
78
+ conversations.append(conv.get_prompt())
79
+
80
+ # Tokenize conversations
81
+ input_ids = tokenizer(
82
+ conversations,
83
+ return_tensors="pt",
84
+ padding="longest",
85
+ max_length=tokenizer.model_max_length,
86
+ truncation=True,
87
+ ).input_ids
88
+ targets = input_ids.clone()
89
+
90
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
91
+
92
+ # Mask targets
93
+ sep = conv.sep + conv.roles[1] + ": "
94
+ for conversation, target in zip(conversations, targets):
95
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
96
+
97
+ rounds = conversation.split(conv.sep2)
98
+ cur_len = 1
99
+ target[:cur_len] = IGNORE_INDEX
100
+ for i, rou in enumerate(rounds):
101
+ if rou == "":
102
+ break
103
+
104
+ parts = rou.split(sep)
105
+ if len(parts) != 2: # * can handle padded tokens
106
+ break
107
+ parts[0] += sep
108
+ round_len = len(tokenizer(rou).input_ids)
109
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
110
+
111
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
112
+
113
+ cur_len += round_len
114
+ target[cur_len:] = IGNORE_INDEX # * this is necessary for padded tokens
115
+
116
+ if cur_len < tokenizer.model_max_length:
117
+ if cur_len != total_len: # * unk tokens in the dialogue will cause this.
118
+ target[:] = IGNORE_INDEX
119
+ print(
120
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
121
+ f" (ignored)"
122
+ )
123
+
124
+ return dict(
125
+ input_ids=input_ids,
126
+ labels=targets,
127
+ )
128
+
129
+ def preprocess_multimodal_point_cloud(
130
+ sources: Sequence[str],
131
+ point_backbone_config: dict,
132
+ point_indicator: str = "<point>",
133
+ ) -> Dict:
134
+ point_token_len = point_backbone_config['point_token_len']
135
+ default_point_patch_token = point_backbone_config['default_point_patch_token']
136
+
137
+ for source in sources:
138
+ for sentence in source:
139
+ replace_token = default_point_patch_token * point_token_len
140
+ if point_backbone_config['mm_use_point_start_end']:
141
+ replace_token = point_backbone_config['default_point_start_token']+ replace_token + point_backbone_config['default_point_end_token']
142
+ sentence["value"] = sentence["value"].replace(point_indicator, replace_token)
143
+
144
+ return sources
145
+
146
+ def pc_norm(pc):
147
+ """ pc: NxC, return NxC """
148
+ xyz = pc[:, :3]
149
+ other_feature = pc[:, 3:]
150
+
151
+ centroid = np.mean(xyz, axis=0)
152
+ xyz = xyz - centroid
153
+ m = np.max(np.sqrt(np.sum(xyz ** 2, axis=1)))
154
+ xyz = xyz / m
155
+
156
+ pc = np.concatenate((xyz, other_feature), axis=1)
157
+ return pc
158
+
159
+ def load_objaverse_point_cloud(data_path, object_id, pointnum=8192, use_color=False):
160
+ filename = f"{object_id}_{pointnum}.npy"
161
+ point_cloud = np.load(os.path.join(data_path, filename))
162
+
163
+ # * normalize
164
+ point_cloud = pc_norm(point_cloud)
165
+
166
+ if not use_color:
167
+ point_cloud = point_cloud[:, :3]
168
+
169
+ return point_cloud
170
+
171
+ @dataclass
172
+ class DataCollatorForPointTextDataset(object):
173
+ """Collate examples for mixed dataset with text and point cloud data."""
174
+
175
+ tokenizer: transformers.PreTrainedTokenizer
176
+
177
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
178
+ input_ids, labels = tuple([instance[key] for instance in instances]
179
+ for key in ("input_ids", "labels"))
180
+ input_ids = torch.nn.utils.rnn.pad_sequence(
181
+ input_ids,
182
+ batch_first=True,
183
+ padding_value=self.tokenizer.pad_token_id)
184
+ labels = torch.nn.utils.rnn.pad_sequence(labels,
185
+ batch_first=True,
186
+ padding_value=IGNORE_INDEX)
187
+ batch = dict(
188
+ input_ids=input_ids,
189
+ labels=labels,
190
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
191
+ )
192
+
193
+ if 'point_clouds' in instances[0]:
194
+ point_clouds = [instance['point_clouds'] for instance in instances]
195
+ if all(x is not None and x.shape == point_clouds[0].shape for x in point_clouds): # * stack only if all point clouds share the same shape
196
+ batch['point_clouds'] = torch.stack(point_clouds)
197
+ else:
198
+ batch['point_clouds'] = point_clouds # * return as lists
199
+
200
+ return batch
201
+
202
+ def farthest_point_sample(point, npoint):
203
+ """
204
+ Input:
205
+ xyz: pointcloud data, [N, D]
206
+ npoint: number of samples
207
+ Return:
208
+ centroids: sampled pointcloud index, [npoint, D]
209
+ """
210
+ N, D = point.shape
211
+ xyz = point[:,:3]
212
+ centroids = np.zeros((npoint,))
213
+ distance = np.ones((N,)) * 1e10
214
+ farthest = np.random.randint(0, N)
215
+ for i in range(npoint):
216
+ centroids[i] = farthest
217
+ centroid = xyz[farthest, :]
218
+ dist = np.sum((xyz - centroid) ** 2, -1)
219
+ mask = dist < distance
220
+ distance[mask] = dist[mask]
221
+ farthest = np.argmax(distance, -1)
222
+ point = point[centroids.astype(np.int32)]
223
+ return point
224
+
225
+ def pc_normalize(pc):
226
+ """
227
+ pc: Nx3 array
228
+ This functions normalizes a point cloud to fit within a unit sphere.
229
+ It first calculates the centroid of the point cloud and then subtracts
230
+ it from all points before scaling all points to fit within a unit sphere.
231
+ """
232
+ centroid = np.mean(pc, axis=0)
233
+ pc = pc - centroid
234
+ m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
235
+ pc = pc / m
236
+ return pc
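A quick self-contained check of the sampling and normalization helpers above on a random cloud; the 1024-point target is an arbitrary illustrative choice.

import numpy as np

pts = np.random.rand(5000, 6)                   # xyz + rgb
sampled = farthest_point_sample(pts, 1024)      # (1024, 6); FPS runs on the xyz columns
sampled[:, :3] = pc_normalize(sampled[:, :3])   # center xyz and scale into the unit sphere
print(sampled.shape, np.abs(sampled[:, :3]).max() <= 1.0)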
ThirdParty/PointLLM/pointllm/eval/PointLLM_chat.py ADDED
@@ -0,0 +1,157 @@
1
+ import argparse
2
+ from transformers import AutoTokenizer
3
+ import torch
4
+ import os
5
+ from pointllm.conversation import conv_templates, SeparatorStyle
6
+ from pointllm.utils import disable_torch_init
7
+ from pointllm.model import *
8
+ from pointllm.model.utils import KeywordsStoppingCriteria
9
+
10
+ from pointllm.data import load_objaverse_point_cloud
11
+
12
+ import os
13
+
14
+ def load_point_cloud(args):
15
+ object_id = args.object_id
16
+ print(f"[INFO] Loading point clouds using object_id: {object_id}")
17
+ point_cloud = load_objaverse_point_cloud(args.data_path, object_id, pointnum=8192, use_color=True)
18
+
19
+ return object_id, torch.from_numpy(point_cloud).unsqueeze_(0).to(torch.float32)
20
+
21
+ def init_model(args):
22
+ # Model
23
+ disable_torch_init()
24
+
25
+ model_path = args.model_name # * matches the --model_name argument defined below
26
+ print(f'[INFO] Model name: {model_path}')
27
+
28
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
29
+ model = PointLLMLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=False, use_cache=True, torch_dtype=args.torch_dtype).cuda()
30
+ model.initialize_tokenizer_point_backbone_config_wo_embedding(tokenizer)
31
+
32
+ model.eval()
33
+
34
+ mm_use_point_start_end = getattr(model.config, "mm_use_point_start_end", False)
35
+ # Add special tokens ind to model.point_config
36
+ point_backbone_config = model.get_model().point_backbone_config
37
+
38
+ if mm_use_point_start_end:
39
+ if "v1" in model_path.lower():
40
+ conv_mode = "vicuna_v1_1"
41
+ else:
42
+ raise NotImplementedError
43
+
44
+ conv = conv_templates[conv_mode].copy()
45
+
46
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
47
+ keywords = [stop_str]
48
+
49
+ return model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv
50
+
51
+ def start_conversation(args, model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv):
52
+ point_token_len = point_backbone_config['point_token_len']
53
+ default_point_patch_token = point_backbone_config['default_point_patch_token']
54
+ default_point_start_token = point_backbone_config['default_point_start_token']
55
+ default_point_end_token = point_backbone_config['default_point_end_token']
56
+ # The while loop will keep running until the user decides to quit
57
+ print("[INFO] Starting conversation... Enter 'q' to exit the program and enter 'exit' to exit the current conversation.")
58
+ while True:
59
+ print("-" * 80)
60
+ # Prompt for object_id
61
+ object_id = input("[INFO] Please enter the object_id or 'q' to quit: ")
62
+
63
+ # Check if the user wants to quit
64
+ if object_id.lower() == 'q':
65
+ print("[INFO] Quitting...")
66
+ break
67
+ else:
68
+ # print info
69
+ print(f"[INFO] Chatting with object_id: {object_id}.")
70
+
71
+ # Update args with new object_id
72
+ args.object_id = object_id.strip()
73
+
74
+ # Load the point cloud data
75
+ try:
76
+ id, point_clouds = load_point_cloud(args)
77
+ except Exception as e:
78
+ print(f"[ERROR] {e}")
79
+ continue
80
+ point_clouds = point_clouds.cuda().to(args.torch_dtype)
81
+
82
+ # Reset the conversation template
83
+ conv.reset()
84
+
85
+ print("-" * 80)
86
+
87
+ # Start a loop for multiple rounds of dialogue
88
+ for i in range(100):
89
+ # This if-else block ensures the initial question from the user is included in the conversation
90
+ qs = input(conv.roles[0] + ': ')
91
+ if qs == 'exit':
92
+ break
93
+
94
+ if i == 0:
95
+ if mm_use_point_start_end:
96
+ qs = default_point_start_token + default_point_patch_token * point_token_len + default_point_end_token + '\n' + qs
97
+ else:
98
+ qs = default_point_patch_token * point_token_len + '\n' + qs
99
+
100
+ # Append the new message to the conversation history
101
+ conv.append_message(conv.roles[0], qs)
102
+ conv.append_message(conv.roles[1], None)
103
+ prompt = conv.get_prompt()
104
+ inputs = tokenizer([prompt])
105
+
106
+ input_ids = torch.as_tensor(inputs.input_ids).cuda()
107
+
108
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
109
+ stop_str = keywords[0]
110
+
111
+ with torch.inference_mode():
112
+ output_ids = model.generate(
113
+ input_ids,
114
+ point_clouds=point_clouds,
115
+ do_sample=True,
116
+ temperature=1.0,
117
+ top_k=50,
118
+ max_length=2048,
119
+ top_p=0.95,
120
+ stopping_criteria=[stopping_criteria])
121
+
122
+ input_token_len = input_ids.shape[1]
123
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
124
+ if n_diff_input_output > 0:
125
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
126
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
127
+ outputs = outputs.strip()
128
+ if outputs.endswith(stop_str):
129
+ outputs = outputs[:-len(stop_str)]
130
+ outputs = outputs.strip()
131
+
132
+ # Append the model's response to the conversation history
133
+ conv.pop_last_none_message()
134
+ conv.append_message(conv.roles[1], outputs)
135
+ print(f'{conv.roles[1]}: {outputs}\n')
136
+
137
+ if __name__ == "__main__":
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument("--model_name", type=str, \
140
+ default="RunsenXu/PointLLM_7B_v1.2")
141
+
142
+ parser.add_argument("--data_path", type=str, default="data/objaverse_data")
143
+ parser.add_argument("--torch_dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"])
144
+
145
+ args = parser.parse_args()
146
+
147
+ dtype_mapping = {
148
+ "float32": torch.float32,
149
+ "float16": torch.float16,
150
+ "bfloat16": torch.bfloat16,
151
+ }
152
+
153
+ args.torch_dtype = dtype_mapping[args.torch_dtype]
154
+
155
+ model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv = init_model(args)
156
+
157
+ start_conversation(args, model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv)
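The round-0 prompt layout used in start_conversation (point placeholder tokens prepended to the user question) can be summarized as a standalone helper; this is only a restatement of the logic above, not additional code from the script.

def build_first_round_question(question, point_backbone_config, mm_use_point_start_end):
    # Prepend the point placeholder tokens so the point features can be spliced in later.
    n = point_backbone_config['point_token_len']
    patch = point_backbone_config['default_point_patch_token']
    if mm_use_point_start_end:
        return (point_backbone_config['default_point_start_token']
                + patch * n
                + point_backbone_config['default_point_end_token']
                + '\n' + question)
    return patch * n + '\n' + question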
ThirdParty/PointLLM/pointllm/eval/chat_gradio.py ADDED
@@ -0,0 +1,394 @@
1
+ import argparse
2
+ from transformers import AutoTokenizer
3
+ import torch
4
+ import os
5
+ from pointllm.conversation import conv_templates, SeparatorStyle
6
+ from pointllm.utils import disable_torch_init
7
+ from pointllm.model import *
8
+ from pointllm.model.utils import KeywordsStoppingCriteria
9
+ import numpy as np
10
+
11
+ from pointllm.data import pc_norm, farthest_point_sample
12
+
13
+ import os
14
+
15
+ # Additional import for gradio
16
+ import gradio as gr
17
+ import open3d as o3d
18
+ import plotly.graph_objects as go
19
+ import objaverse
20
+ import time
21
+
22
+ import logging
23
+
24
+
25
+ def change_input_method(input_method):
26
+ if input_method == 'File':
27
+ result = [gr.update(visible=True),
28
+ gr.update(visible=False)]
29
+ elif input_method == 'Object ID':
30
+ result = [gr.update(visible=False),
31
+ gr.update(visible=True)]
32
+ return result
33
+
34
+ def init_model(args):
35
+ # Model
36
+ disable_torch_init()
37
+ model_name = os.path.expanduser(args.model_name)
38
+
39
+ # * print the model_name (get the basename)
40
+ print(f'[INFO] Model name: {os.path.basename(model_name)}')
41
+ logging.warning(f'Model name: {os.path.basename(model_name)}')
42
+
43
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
44
+ model = PointLLMLlamaForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=False, use_cache=True).cuda()
45
+ model.initialize_tokenizer_point_backbone_config_wo_embedding(tokenizer)
46
+
47
+ model.eval()
48
+
49
+ mm_use_point_start_end = getattr(model.config, "mm_use_point_start_end", False)
50
+ # Add special tokens ind to model.point_config
51
+ point_backbone_config = model.get_model().point_backbone_config
52
+
53
+ conv = conv_templates["vicuna_v1_1"].copy()
54
+
55
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
56
+ keywords = [stop_str]
57
+
58
+ return model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv
59
+
60
+ def start_conversation(args, model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv):
61
+ point_token_len = point_backbone_config['point_token_len']
62
+ default_point_patch_token = point_backbone_config['default_point_patch_token']
63
+ default_point_start_token = point_backbone_config['default_point_start_token']
64
+ default_point_end_token = point_backbone_config['default_point_end_token']
65
+
66
+ # The while loop will keep running until the user decides to quit
67
+ print("[INFO] Starting conversation...")
68
+ logging.warning("Starting conversation...")
69
+ while True:
70
+ print("-" * 80)
71
+ logging.warning("-" * 80)
72
+
73
+ # Reset the conversation template
74
+ conv.reset()
75
+
76
+ def confirm_point_cloud(input_choice, object_id_input, point_cloud_input, chatbot, answer_time, conv):
77
+ objects = None
78
+ data = None
79
+ object_id_input = object_id_input.strip()
80
+
81
+ print("%" * 80)
82
+ logging.warning("%" * 80)
83
+
84
+ if input_choice == 'File':
85
+ file = point_cloud_input.name
86
+ print(f"Uploading file: {file}.")
87
+ logging.warning(f"Uploading file: {file}.")
88
+ elif input_choice == 'Object ID':
89
+ file = os.path.join(args.data_path, "{}_8192.npy".format(object_id_input))
90
+ print(f"Object_id: {object_id_input}")
91
+ logging.warning(f"Object_id: {object_id_input}")
92
+
93
+ object_uids = [object_id_input]
94
+ objects = objaverse.load_objects(uids=object_uids)
95
+ print("%" * 80)
96
+ logging.warning("%" * 80)
97
+
98
+ manual_no_color = "no_color" in file
99
+
100
+ try:
101
+ if '.ply' in file:
102
+ pcd = o3d.io.read_point_cloud(file)
103
+ points = np.asarray(pcd.points) # xyz
104
+ colors = np.asarray(pcd.colors) # rgb, if available
105
+ # * if no colors actually, empty array
106
+ if colors.size == 0:
107
+ colors = None
108
+ elif '.npy' in file:
109
+ data = np.load(file)
110
+ if data.shape[1] >= 3:
111
+ points = data[:, :3]
112
+ else:
113
+ raise ValueError("Input array has the wrong shape. Expected: [N, 3]. Got: {}.".format(data.shape))
114
+ colors = None if data.shape[1] < 6 else data[:, 3:6]
115
+ else:
116
+ raise ValueError("Not supported data format.")
117
+ # error
118
+ except Exception as e:
119
+ print(f"[ERROR] {e}")
120
+ logging.warning(f"[ERROR] {e}")
121
+
122
+ chatbot_system_message = "Sorry, the Objaverse id is not supported or there is something wrong with the uploaded file!"
123
+ print(f"[ChatBot System Message]: {chatbot_system_message}")
124
+ logging.warning(f"[ChatBot System Message]: {chatbot_system_message}")
125
+
126
+ outputs = f"<span style='color: red;'>[System] {chatbot_system_message}</span>"
127
+ chatbot = chatbot + [[None, outputs]]
128
+
129
+ return None, None, chatbot, answer_time, None
130
+
131
+ if manual_no_color:
132
+ colors = None
133
+
134
+ if colors is not None:
135
+ # * if colors in range(0-1)
136
+ if np.max(colors) <= 1:
137
+ color_data = np.multiply(colors, 255).astype(int) # Convert float values (0-1) to integers (0-255)
138
+ # * if colors in range(0-255)
139
+ elif np.max(colors) <= 255:
140
+ color_data = colors.astype(int)
141
+ else:
142
+ color_data = np.zeros_like(points).astype(int) # Default to black color if RGB information is not available
143
+ colors = color_data.astype(np.float32) / 255 # model input is (0-1)
144
+
145
+ # Convert the RGB color data to a list of RGB strings in the format 'rgb(r, g, b)'
146
+ color_strings = ['rgb({},{},{})'.format(r, g, b) for r, g, b in color_data]
147
+
148
+ fig = go.Figure(
149
+ data=[
150
+ go.Scatter3d(
151
+ x=points[:, 0], y=points[:, 1], z=points[:, 2],
152
+ mode='markers',
153
+ marker=dict(
154
+ size=1.2,
155
+ color=color_strings, # Use the list of RGB strings for the marker colors
156
+ )
157
+ )
158
+ ],
159
+ layout=dict(
160
+ scene=dict(
161
+ xaxis=dict(visible=False),
162
+ yaxis=dict(visible=False),
163
+ zaxis=dict(visible=False)
164
+ ),
165
+ paper_bgcolor='rgb(255,255,255)' # Set the background color to white
166
+ ),
167
+ )
168
+
169
+ points = np.concatenate((points, colors), axis=1)
170
+ if 8192 < points.shape[0]:
171
+ points = farthest_point_sample(points, 8192)
172
+ point_clouds = pc_norm(points)
173
+ point_clouds = torch.from_numpy(point_clouds).unsqueeze_(0).to(torch.float32).cuda()
174
+
175
+ answer_time = 0
176
+ conv.reset()
177
+
178
+ outputs = "<span style='color: red;'>[System] New Point Cloud</span>"
179
+ chatbot = chatbot + [[None, outputs]]
180
+
181
+ return fig, list(objects.values())[0] if objects is not None else None, chatbot, answer_time, point_clouds
182
+
183
+ def answer_generate(history, answer_time, point_clouds, conv):
184
+ if point_clouds is None:
185
+ outputs = "<span style='color: red;'>[System] Please input point cloud! </span>"
186
+ history[-1][1] = outputs
187
+ yield history
188
+ else:
189
+ print(f"Answer Time: {answer_time}")
190
+ logging.warning(f"Answer Time: {answer_time}")
191
+ input_text = history[-1][0]
192
+ qs = input_text
193
+
194
+ if answer_time == 0:
195
+ if mm_use_point_start_end:
196
+ qs = default_point_start_token + default_point_patch_token * point_token_len + default_point_end_token + '\n' + qs
197
+ else:
198
+ qs = default_point_patch_token * point_token_len + '\n' + qs
199
+
200
+ # Append the new message to the conversation history
201
+ conv.append_message(conv.roles[0], qs)
202
+ conv.append_message(conv.roles[1], None)
203
+ prompt = conv.get_prompt()
204
+ print("#" * 80)
205
+ print(f'{prompt.replace("<point_patch>" * point_token_len, f"<point_patch> * {point_token_len}")}') # for concise printing
206
+ print("#" * 80)
207
+
208
+ logging.warning("#" * 80)
209
+ logging.warning(f'{prompt.replace("<point_patch>" * point_token_len, f"<point_patch> * {point_token_len}")}') # for concise printing
210
+ logging.warning("#" * 80)
211
+ inputs = tokenizer([prompt])
212
+
213
+ input_ids = torch.as_tensor(inputs.input_ids).cuda()
214
+
215
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
216
+ stop_str = keywords[0]
217
+
218
+ try:
219
+ if input_ids.shape[1] >= 2047:
220
+ raise ValueError("Current context length exceeds the maximum context length (2048) of the model.")
221
+ with torch.inference_mode():
222
+ output_ids = model.generate(
223
+ input_ids,
224
+ point_clouds=point_clouds,
225
+ do_sample=True,
226
+ temperature=1.0,
227
+ top_k=50,
228
+ max_length=2048,
229
+ top_p=0.95,
230
+ stopping_criteria=[stopping_criteria])
231
+
232
+ input_token_len = input_ids.shape[1]
233
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
234
+ if n_diff_input_output > 0:
235
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
236
+ logging.warning(f'{n_diff_input_output} output_ids are not the same as the input_ids')
237
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
238
+ outputs = outputs.strip()
239
+ if outputs.endswith(stop_str):
240
+ outputs = outputs[:-len(stop_str)]
241
+ outputs = outputs.strip()
242
+
243
+ # Append the model's response to the conversation history
244
+ conv.pop_last_none_message()
245
+ conv.append_message(conv.roles[1], outputs)
246
+ print(f'{conv.roles[1]}: {outputs}\n')
247
+ logging.warning(f'{conv.roles[1]}: {outputs}\n')
248
+ answer_time += 1
249
+ history[-1][1] = ""
250
+ for character in outputs:
251
+ history[-1][1] += character
252
+ yield history
253
+ # error
254
+ except Exception as e:
255
+ print(f"[ERROR] {e}")
256
+ logging.warning(f"[ERROR] {e}")
257
+
258
+ if input_ids.shape[1] >= 2047:
259
+ chatbot_system_message = "Current context length exceeds the maximum context length (2048) of the model. Please press 'Clear' to restart."
260
+ else:
261
+ chatbot_system_message = "Sorry, something went wrong during generation. Please check your uploaded point cloud or the Objaverse id, and \
262
+ confirm the point cloud again."
263
+ print(f"[ChatBot System Message]: {chatbot_system_message}")
264
+ logging.warning(f"[ChatBot System Message]: {chatbot_system_message}")
265
+
266
+ outputs = f"<span style='color: red;'>[System] {chatbot_system_message}</span>"
267
+ history[-1][1] = outputs
268
+ yield history
269
+
270
+ with gr.Blocks() as demo:
271
+ answer_time = gr.State(value=0)
272
+ point_clouds = gr.State(value=None)
273
+ conv_state = gr.State(value=conv.copy())
274
+ gr.Markdown(
275
+ """
276
+ # PointLLM: Empowering Large Language Models to Understand Point Clouds. 🚀
277
+ If you find this demo interesting, please consider starring 🌟 our github repo. :)
278
+ [[Project Page](https://runsenxu.com/projects/PointLLM)] [[Paper](https://arxiv.org/abs/2308.16911)] [[Code](https://github.com/OpenRobotLab/PointLLM)]
279
+ """
280
+ )
281
+ with gr.Row():
282
+ with gr.Column():
283
+ input_choice = gr.Radio(['File', 'Object ID'], value='Object ID', interactive=True, label='Input Method', info="How do you want to load point clouds?")
284
+ object_id_input = gr.Textbox(visible=True, lines=1, label='Object ID Input')
285
+ point_cloud_input = gr.File(visible=False, label="Upload Point Cloud File (PLY, NPY)")
286
+ output = gr.Plot()
287
+ btn = gr.Button(value="Confirm Point Cloud")
288
+ model3D = gr.Model3D()
289
+ with gr.Column():
290
+ chatbot = gr.Chatbot([], elem_id="chatbot", height=560) # ,color_map=("green", "pink")
291
+
292
+ def user(user_message, history):
293
+ return "", history + [[user_message, None]]
294
+
295
+ def clear_conv(history, conv):
296
+ conv.reset()
297
+ return None, 0
298
+
299
+ with gr.Row():
300
+ text_input = gr.Textbox(
301
+ show_label=False,
302
+ placeholder="Enter text and press enter",
303
+ container=False,
304
+ )
305
+ run_button = gr.Button("Send")
306
+
307
+ clear = gr.Button("Clear")
308
+ text_input.submit(user, [text_input, chatbot], [text_input, chatbot], queue=False).then(answer_generate, [chatbot, answer_time, point_clouds, conv_state], chatbot).then(lambda x : x+1,answer_time, answer_time)
309
+ clear.click(clear_conv, inputs=[chatbot, conv_state], outputs=[chatbot, answer_time], queue=False)
310
+
311
+ btn.click(confirm_point_cloud, inputs=[input_choice, object_id_input, point_cloud_input, chatbot, answer_time, conv_state], outputs=[output, model3D, chatbot, answer_time, point_clouds])
312
+
313
+ input_choice.change(change_input_method, input_choice, [point_cloud_input, object_id_input])
314
+ run_button.click(user, [text_input, chatbot], [text_input, chatbot], queue=False).then(answer_generate, [chatbot, answer_time, point_clouds, conv_state], chatbot).then(lambda x : x+1, answer_time, answer_time)
315
+
316
+ gr.Markdown(
317
+ """
318
+ ### Usage:
319
+ 1. Upload your point cloud file (ply, npy only) or input the supported [Objaverse object id (uid)](https://drive.google.com/file/d/1gLwA7aHfy1KCrGeXlhICG9rT2387tWY8/view?usp=sharing) (currently 660K objects only, you may try the example object ids below).
320
+ 2. If your point cloud file does not contain colors, make sure the file name contains 'no_color' (e.g., 'xxx_no_color.npy'); black will then be assigned.
321
+ 3. If uploading your own point cloud file with color in npy format, the first three dimensions should be xyz, and the next three dimensions should be rgb. The rgb values should range from **0 to 1**.
322
+ 4. Click **Confirm Point Cloud**.
323
+ 5. As we use FPS sampling to downsample the point cloud to 8192 points, it may take a long time to confirm the point cloud if the point cloud has too many points. You may use random sampling to downsample the point cloud before uploading.
324
+ 6. Once '[System] New Point Cloud' appears in the dialogue box, a new conversation with PointLLM is initialized.
325
+ 7. The 'Clear' button will clear the conversation history.
326
+ """)
327
+ with gr.Accordion("Example Objaverse object ids in the validation set!", open=False):
328
+ example_object_ids = [ ["b4bbf2116b1a41a5a3b9d3622b07074c", "0b8da82a3d7a436f9b585436c4b72f56", "650c53d68d374c18886aab91bcf8bb54"],
329
+ ["983fa8b23a084f5dacd157e6c9ceba97", "8fe23dd4bf8542b49c3a574b33e377c3", "83cb2a9e9afb47cd9f45461613796645"],
330
+ ["3d679a3888c548afb8cf889915af7fd2", "7bcf8626eaca40e592ffd0aed08aa30b", "69865c89fc7344be8ed5c1a54dbddc20"],
331
+ ["252f3b3f5cd64698826fc1ab42614677", "e85ebb729b02402bbe3b917e1196f8d3", "97367c4740f64935b7a5e34ae1398035"],
332
+ ["fc8dd5a2fc9f4dd19ad6a64a8a6e89e9", "8257772b0e2f408ba269264855dfea00", "d6a3520486bb474f9b5e72eda8408974"],
333
+ ["3d10918e6a9a4ad395a7280c022ad2b9", "00002bcb84af4a4781174e62619f14e2", "76ba80230d454de996878c2763fe7e5c"]]
334
+ gr.DataFrame(
335
+ type="array",
336
+ headers=["Example Object IDs"] * 3,
337
+ row_count=6,
338
+ col_count=3,
339
+ value=example_object_ids
340
+ )
341
+ gr.Markdown(
342
+ """
343
+ #### Terms of use
344
+ By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
345
+ """
346
+ )
347
+ demo.queue()
348
+ demo.launch(server_name="0.0.0.0", server_port=args.port, share=False) # server_port=7832, share=True
349
+
350
+ if __name__ == "__main__":
351
+ # ! To release this demo in public, make sure to start in a place where no important data is stored.
352
+ # ! Please check 1. the launch dir 2. the tmp dir (GRADIO_TEMP_DIR)
353
+ # ! refer to https://www.gradio.app/guides/sharing-your-app#security-and-file-access
354
+ parser = argparse.ArgumentParser()
355
+ parser.add_argument("--model-name", type=str, \
356
+ default="RunsenXu/PointLLM_7B_v1.2")
357
+
358
+
359
+ parser.add_argument("--data_path", type=str, default="data/objaverse_data", required=False)
360
+ parser.add_argument("--pointnum", type=int, default=8192)
361
+
362
+ parser.add_argument("--log_file", type=str, default="serving_workdirs/serving_log.txt")
363
+ parser.add_argument("--tmp_dir", type=str, default="serving_workdirs/tmp")
364
+
365
+ # For gradio
366
+ parser.add_argument("--port", type=int, default=7810)
367
+
368
+ args = parser.parse_args()
369
+
370
+ # * make serving dirs
371
+ os.makedirs(os.path.dirname(args.log_file), exist_ok=True)
372
+ os.makedirs(args.tmp_dir, exist_ok=True)
373
+
374
+ # * add the current time for log name
375
+ args.log_file = args.log_file.replace(".txt", f"_{time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())}.txt")
376
+
377
+ logging.basicConfig(
378
+ filename=args.log_file,
379
+ level=logging.WARNING, # * default gradio is info, so use warning
380
+ format='%(asctime)s - %(message)s',
381
+ datefmt='%Y-%m-%d %H:%M:%S'
382
+ )
383
+
384
+ logging.warning("-----New Run-----")
385
+ logging.warning(f"args: {args}")
386
+
387
+ print("-----New Run-----")
388
+ print(f"[INFO] Args: {args}")
389
+
390
+ # * set env variable GRADIO_TEMP_DIR to args.tmp_dir
391
+ os.environ["GRADIO_TEMP_DIR"] = args.tmp_dir
392
+
393
+ model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv = init_model(args)
394
+ start_conversation(args, model, tokenizer, point_backbone_config, keywords, mm_use_point_start_end, conv)
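For readability, the color handling inside confirm_point_cloud (accepting 0-1 floats or 0-255 integers and falling back to black when colors are missing) amounts to the following helper; it is a restatement of the logic above, not code used by the demo.

import numpy as np

def normalize_colors(colors, num_points):
    # Returns (0-255 ints for plotting, 0-1 floats for the model); black if colors are missing.
    if colors is None:
        color_data = np.zeros((num_points, 3), dtype=int)
    elif np.max(colors) <= 1:
        color_data = np.multiply(colors, 255).astype(int)
    else:
        color_data = colors.astype(int)
    return color_data, color_data.astype(np.float32) / 255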