diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..15a45ca35101f815b14831e73f8ef255895a9ea0 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c9d60913b5aee2408c17a596bc65d10ad84444a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+__pycache__
+.idea
+*.pyc
+.DS_Store
+checkpoints
+results
+venv
+*.egg-info
+build
+dist
+*.eg
+checkpoints_test
+logs
+third_party
\ No newline at end of file
diff --git a/DockerfileAPI b/DockerfileAPI
new file mode 100644
index 0000000000000000000000000000000000000000..ccf2440704f98fc27f1d83de1dfb73bb0848b681
--- /dev/null
+++ b/DockerfileAPI
@@ -0,0 +1,7 @@
+FROM shaoguo/faster_liveportrait:v3
+USER root
+# WORKDIR creates the directory if missing and COPY writes files as root (UID/GID 0), so no mkdir/chown step is needed.
+WORKDIR /root/FasterLiveportrait
+COPY . /root/FasterLiveportrait
+# Exec form takes one array element per argument; a single "a && b" string would be looked up as one executable name.
+CMD ["/bin/bash", "scripts/start_api.sh"]
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..510d38feced882e28adafaf80d4fc473a87a9ccc
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,33 @@
+MIT License
+
+Copyright (c) 2025 warmshao
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+---
+
+ADDITIONAL NOTICE FOR MODELS:
+
+This repository may contain or reference machine learning models. These models
+are subject to their respective licenses, which may differ from the MIT license
+applied to the code in this repository. Users are responsible for complying
+with the license terms of any models they use. This repository and its
+maintainers assume no responsibility for model licensing compliance.
+
+Please check the original source and license of each model before use.
diff --git a/README.md b/README.md
index 4193dac335c6da4994b4d8ef4945537d2d387ea3..dfc8901e548b485c0b7cf9e8a6192ff2b4b497ec 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,183 @@
----
-title: FasterLivepotrait
-emoji: 💻
-colorFrom: purple
-colorTo: gray
-sdk: docker
-pinned: false
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## FasterLivePortrait: Bring portraits to life in Real Time!
+<a href="README.md">English</a> | <a href="README_ZH.md">中文</a>
+
+**Original repository: [LivePortrait](https://github.com/KwaiVGI/LivePortrait), thanks to the authors for sharing**
+
+**New features:**
+* Achieved real-time running of LivePortrait on RTX 3090 GPU using TensorRT, reaching speeds of 30+ FPS. This is the speed for rendering a single frame, including pre- and post-processing, not just the model inference speed.
+* Seamless support for native gradio app, with several times faster speed and support for simultaneous inference on multiple faces and Animal Model.
+* Added support for [JoyVASA](https://github.com/jdh-algo/JoyVASA), which can drive videos or images with audio.
+
+**If you find this project useful, please give it a star ✨✨**
+
+### Demo (Explore more features)
+* Anyone want this? Feel free to contact me.
+
+<video src="https://github.com/user-attachments/assets/554c37fc-d098-4938-a638-1660d85d222e" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+
+
+* Text-driven video, based on kokoro-82M:
+
+<video src="https://github.com/user-attachments/assets/04e962e2-6c57-4d01-ae4a-2f6d2d501c5a" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+
+* Audio-driven video (real-time):
+
+<video src="https://github.com/user-attachments/assets/98bb5ff7-0796-42db-9d7b-e04ddd2c3c14" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+
+* Animal-driven:
+
+<video src="https://github.com/user-attachments/assets/dada0a92-593a-480b-a034-cbcce16e38b9" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+
+* Multiple faces driven simultaneously:
+
+<video src="https://github.com/KwaiVGI/LivePortrait/assets/138360003/b37de35d-6feb-4100-b73f-58ac23121483" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+
+
+### Environment Setup
+* Option 1 (recommended): If you are a Windows user, you can directly download the [integrated package](https://github.com/warmshao/FasterLivePortrait/releases/tag/v1.8).
+    * You need to install [git](https://git-scm.com/downloads) first, then double-click `update.bat` to update the code.
+    * Double-click `scripts/all_onnx2trt.bat` to convert onnx files to tensorrt files.
+    * Double-click `webui.bat` to open the webpage, or double-click `camera.bat` to open the camera for real-time operation.
+* Option 2: Docker. A Docker image is provided, eliminating the need to install onnxruntime-gpu and TensorRT manually.
+  * Install [Docker](https://docs.docker.com/desktop/install/windows-install/) according to your system
+  * Download the image: `docker pull shaoguo/faster_liveportrait:v3`
+  * Execute the command, replace `$FasterLivePortrait_ROOT` with the local directory where you downloaded FasterLivePortrait:
+  ```shell
+  docker run -it --gpus=all \
+  --name faster_liveportrait \
+  -v $FasterLivePortrait_ROOT:/root/FasterLivePortrait \
+  --restart=always \
+  -p 9870:9870 \
+  shaoguo/faster_liveportrait:v3 \
+  /bin/bash
+  ```
+* Option 3: Create a new Python virtual environment and install the necessary Python packages manually.
+  * First, install [ffmpeg](https://www.ffmpeg.org/download.html)
+  * Run `pip install -r requirements.txt`
+  * Then follow the tutorials below to install onnxruntime-gpu or TensorRT. Note that this has only been tested on Linux systems.
+
+### Usage
+#### 1. TensorRT Inference(Recommended)
+* (Ignored in Docker) Install TensorRT 8.x (versions >=10.x are not compatible). Remember the installation path of [TensorRT](https://developer.nvidia.com/tensorrt).
+* (Ignored in Docker) Install the grid_sample TensorRT plugin, as the model uses grid sample that requires 5D input, which is not supported by the native grid_sample operator.
+  * `git clone https://github.com/SeanWangJS/grid-sample3d-trt-plugin`
+  * Modify line 30 in `CMakeLists.txt` to: `set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "60;70;75;80;86")`
+  * `export PATH=/usr/local/cuda/bin:$PATH`
+  * `mkdir build && cd build`
+  * `cmake .. -DTensorRT_ROOT=$TENSORRT_HOME`, replace $TENSORRT_HOME with your own TensorRT root directory.
+  * `make`, remember the address of the .so file, replace `/opt/grid-sample3d-trt-plugin/build/libgrid_sample_3d_plugin.so` in `scripts/onnx2trt.py` and `src/models/predictor.py` with your own .so file path
+* Download ONNX model files:`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`. Convert all ONNX models to TensorRT, run `sh scripts/all_onnx2trt.sh` and `sh scripts/all_onnx2trt_animal.sh`
+* Test the pipeline using tensorrt:
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video assets/examples/driving/d14.mp4 --cfg configs/trt_infer.yaml
+  ```
+* To run in real-time using a camera:
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video 0 \
+   --cfg configs/trt_infer.yaml \
+   --realtime
+  ```
+  
+#### 2. Onnxruntime Inference
+* First, download the converted onnx model files:`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`.
+* (Ignored in Docker)If you want to use onnxruntime cpu inference, simply `pip install onnxruntime`. However, cpu inference is extremely slow and not recommended. The latest onnxruntime-gpu still doesn't support grid_sample cuda, but I found a branch that supports it. Follow these steps to install `onnxruntime-gpu` from source:
+  * `git clone https://github.com/microsoft/onnxruntime`
+  * `git checkout liqun/ImageDecoder-cuda`. Thanks to liqun for the grid_sample with cuda implementation!
+  * Run the following commands to compile, changing `cuda_version` and `CMAKE_CUDA_ARCHITECTURES` according to your machine (your cuDNN version must be 8.x, 9.x is not compatible):
+  ```shell
+  ./build.sh --parallel \
+  --build_shared_lib --use_cuda \
+  --cuda_version 11.8 \
+  --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda/ \
+  --config Release --build_wheel --skip_tests \
+  --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="60;70;75;80;86" \
+  --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+  --disable_contrib_ops \
+  --allow_running_as_root
+  ```
+  * `pip install build/Linux/Release/dist/onnxruntime_gpu-1.17.0-cp310-cp310-linux_x86_64.whl`
+* Test the pipeline using onnxruntime:
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video assets/examples/driving/d14.mp4 \
+   --cfg configs/onnx_infer.yaml
+  ```
+
+
+### Gradio WebUI
+* onnxruntime: `python webui.py --mode onnx`
+* tensorrt: `python webui.py --mode trt`
+* The default port is 9870. Open the webpage: `http://localhost:9870/`
+
+Hotkeys for webcam mode (when the render window is in focus)\
+Q > exit\
+S > Stitching\
+Z > RelativeMotion\
+X > AnimationRegion\
+C > CropDrivingVideo\
+K,L > AdjustSourceScale\
+N,M > AdjustDriverScale
+
+## License
+
+- **Code**: This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+- **Models**: Any machine learning models used in this project are subject to their respective licenses. Please refer to the original model sources for license information. We do not take responsibility for model license compliance.
+
+
+**Changelog**
+- [x] **2025/06/29:** LivePortrait animal v1.1 onnx models are available. Download from [this](https://huggingface.co/warmshao/FasterLivePortrait/tree/main/liveportrait_animal_onnx_v1.1).
+- [x] **2024/12/22:** Add API Deployment `python api.py`, For more information, please refer to the [tutorial](assets/docs/API.md).
+- [x] **2024/12/21:** Added support for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M), enabling text-driven video or image generation.
+  - Update the code: `git pull origin master` and install the latest Python dependencies with `pip install -r requirements.txt`, or simply double-click `update.bat` on Windows.
+  - Download the model: `huggingface-cli download hexgrad/Kokoro-82M --local-dir .\checkpoints\Kokoro-82M`.
+  - For Linux, install `espeak-ng`: `apt-get -qq -y install espeak-ng > /dev/null 2>&1`
+  - For Windows, refer to [manual installation instructions](https://huggingface.co/hexgrad/Kokoro-82M/discussions/12) and configure the `espeak-ng` environment variables.  The current read location is [here](src/pipelines/gradio_live_portrait_pipeline.py:437); modify it if your installation path differs.
+  -  Now you can use it normally in the "Drive Text" tab.
+- [x] **2024/12/16:** Added support for [JoyVASA](https://github.com/jdh-algo/JoyVASA), which can drive videos or images with audio. Very cool!
+  - Update code, then download the models: `huggingface-cli download TencentGameMate/chinese-hubert-base --local-dir .\checkpoints\chinese-hubert-base` and `huggingface-cli download jdh-algo/JoyVASA --local-dir ./checkpoints/JoyVASA`
+  - After launching the webui, follow the tutorial below. When the source is a video, it's recommended to only drive the mouth movements.
+  
+  <video src="https://github.com/user-attachments/assets/42fb24be-0cde-4138-9671-e52eec95e7f5" controls="controls" width="500" height="400">Your browser does not support this video!</video>
+
+- [x] **2024/12/14:** Added pickle and image driving, as well as region driving animation_region.
+  - Please update the latest code. Windows users can directly double-click `update.bat` to update, but note that your local code will be overwritten.
+  - Running `python run.py` now automatically saves the corresponding pickle to the same directory as the driving video, allowing for direct reuse.
+  - After opening webui, you can experience the new pickle and image driving, as well as the region driving animation_region features. Note that for image driving, remember to disable `relative motion`.
+- [x] **2024/08/11:** Optimized paste_back speed and fixed some bugs.
+  - Used torchgeometry + cuda to optimize the paste_back function, significantly improving speed. Example: `python run.py --src_image assets/examples/source/s39.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_infer.yaml --paste_back --animal`
+  - Fixed issues with Xpose ops causing errors on some GPUs and other bugs. Please use the latest docker image: `docker pull shaoguo/faster_liveportrait:v3`
+- [x] **2024/08/11:** Optimized paste_back speed and fixed some bugs.
+  - Used torchgeometry + cuda to optimize the paste_back function, significantly improving speed. Example: `python run.py --src_image assets/examples/source/s39.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_infer.yaml --paste_back --animal`
+  - Fixed issues with Xpose ops causing errors on some GPUs and other bugs. Please use the latest docker image: `docker pull shaoguo/faster_liveportrait:v3`
+- [x] **2024/08/07:** Added support for animal models and MediaPipe models, so you no longer need to worry about copyright issues.
+  - Added support for animal models.
+    - Download the animal ONNX file: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`, then convert it to TRT format.
+    - Update the Docker image: `docker pull shaoguo/faster_liveportrait:v3`. Using animal model:`python run.py --src_image assets/examples/source/s39.jpg --dri_video 0 --cfg configs/trt_infer.yaml --realtime --animal`
+    - Windows users can download the latest [Windows all-in-one package](https://github.com/warmshao/FasterLivePortrait/releases) from the release page, then unzip and use it.
+    - Simple usage tutorial:
+    
+    <video src="https://github.com/user-attachments/assets/dc37e2dd-551a-43b0-8929-fc5d5fe16ec5" controls="controls" width="500" height="300">Your browser does not support this video!</video>
+    
+  - Using MediaPipe model to replace InsightFace
+    - For web usage: `python webui.py --mode trt --mp` or `python webui.py --mode onnx --mp`
+    - For local webcam: `python run.py --src_image assets/examples/source/s12.jpg --dri_video 0 --cfg configs/trt_mp_infer.yaml`
+- [x] **2024/07/24:** Windows integration package, no installation required, one-click run, supports TensorRT and OnnxruntimeGPU. Thanks to @zhanghongyong123456 for their contribution in this [issue](https://github.com/warmshao/FasterLivePortrait/issues/22).
+  - [Optional] If you have already installed CUDA and cuDNN on your Windows computer, please skip this step. I have only verified on CUDA 12.2. If you haven't installed CUDA or encounter CUDA-related errors, you need to follow these steps:
+    - Download [CUDA 12.2](https://developer.nvidia.com/cuda-12-2-0-download-archive?target_os=Windows&target_arch=x86_64), double-click the exe and install following the default settings step by step.
+    - Download the [cuDNN](https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip) zip file, extract it, and copy the lib, bin, and include folders from the cuDNN folder to the CUDA 12.2 folder (default is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2)
+  - Download the installation-free [Windows integration package](https://github.com/warmshao/FasterLivePortrait/releases) from the release page and extract it.
+  - Enter `FasterLivePortrait-windows` and double-click `scripts/all_onnx2trt.bat` to convert onnx files, which will take some time.
+  - For web demo: Double-click `webui.bat`, open the webpage: `http://localhost:9870/`
+  - For real-time camera operation, double-click `camera.bat`, press `q` to stop. If you want to change the target image, run from the command line: `camera.bat assets/examples/source/s9.jpg`
+- [x] **2024/07/18:** macOS support added (no Docker needed; Python is enough). M1/M2 chips are faster, but it's still quite slow 😟
+  - Install ffmpeg: `brew install ffmpeg`
+  - Set up a Python 3.10 virtual environment. Recommend using [miniforge](https://github.com/conda-forge/miniforge): `conda create -n flip python=3.10 && conda activate flip`
+  - Install requirements: `pip install -r requirements_macos.txt`
+  - Download ONNX files: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`
+  - Test: `python webui.py --mode onnx`
+- [x] **2024/07/17:** Added support for Docker environment, providing a runnable image.
diff --git a/README_ZH.md b/README_ZH.md
new file mode 100644
index 0000000000000000000000000000000000000000..2000b3896de7e2b6e02a33b4ccf23163c5bda5e0
--- /dev/null
+++ b/README_ZH.md
@@ -0,0 +1,173 @@
+## FasterLivePortrait：Bring portrait to life in Real Time!
+<a href="README.md">English</a> | <a href="README_ZH.md">中文</a>
+
+**原仓库: [LivePortrait](https://github.com/KwaiVGI/LivePortrait)，感谢作者的分享**
+
+**新增功能：**
+* 通过TensorRT实现在RTX 3090显卡上**实时**运行LivePortrait，速度达到 30+ FPS. 这个速度是实测渲染出一帧的速度，而不仅仅是模型的推理时间。
+* 无缝支持原生的gradio app, 速度快了好几倍，同时支持多张人脸、Animal模型。
+* 增加[JoyVASA](https://github.com/jdh-algo/JoyVASA)的支持，可以用音频驱动视频或图片。
+
+**如果你觉得这个项目有用，帮我点个star吧✨✨**
+
+### Demo(还有很多功能等你探索)
+* 文本驱动视频，基于kokoro-82M:
+
+<video src="https://github.com/user-attachments/assets/04e962e2-6c57-4d01-ae4a-2f6d2d501c5a" controls="controls" width="500" height="300">您的浏览器不支持播放该视频！</video>
+* 声音驱动视频(可以实时):
+
+<video src="https://github.com/user-attachments/assets/98bb5ff7-0796-42db-9d7b-e04ddd2c3c14" controls="controls" width="500" height="300">您的浏览器不支持播放该视频！</video>
+* 动物驱动:
+
+<video src="https://github.com/user-attachments/assets/dada0a92-593a-480b-a034-cbcce16e38b9" controls="controls" width="500" height="300">您的浏览器不支持播放该视频！</video>
+* 多张人脸同时驱动:
+
+<video src="https://github.com/KwaiVGI/LivePortrait/assets/138360003/b37de35d-6feb-4100-b73f-58ac23121483" controls="controls" width="500" height="300">您的浏览器不支持播放该视频！</video>
+
+
+### 环境安装
+* 方式1：如果你是Windows用户，推荐可以直接下载[整合包](https://github.com/warmshao/FasterLivePortrait/releases/tag/v1.8)。
+  * 需要先安装好[git](https://git-scm.com/downloads), 双击`update.bat`更新代码。
+  * 双击`scripts/all_onnx2trt.bat`转换onnx文件为tensorrt文件。
+  * 双击`webui.bat`打开网页，或者双击`camera.bat`打开摄像头实时运行。
+* 方式2：Docker，提供了一个镜像，不用再自己安装onnxruntime-gpu和TensorRT。
+  * 根据自己的系统安装[docker](https://docs.docker.com/desktop/install/windows-install/)
+  * 下载镜像：`docker pull shaoguo/faster_liveportrait:v3`
+  * 执行命令, `$FasterLivePortrait_ROOT`要替换成你下载的FasterLivePortrait在本地的目录:
+  ```shell
+  docker run -it --gpus=all \
+  --name faster_liveportrait \
+  -v $FasterLivePortrait_ROOT:/root/FasterLivePortrait \
+  --restart=always \
+  -p 9870:9870 \
+  shaoguo/faster_liveportrait:v3 \
+  /bin/bash
+  ```
+  * 然后可以根据下面Onnxruntime 推理和TensorRT 推理教程进行使用。
+  
+* 方式3：新建一个python虚拟环境，自己安装必要的python包
+  * 请先安装[ffmpeg](https://www.ffmpeg.org/download.html)
+  * `pip install -r requirements.txt`
+  * 再根据以下教程安装onnxruntime-gpu或TensorRT。
+
+### 使用方法
+#### 1. TensorRT 推理(推荐, 可以实时)
+* (Docker环境可忽略）安装TensorRT，请记住[TensorRT](https://developer.nvidia.com/tensorrt)安装的路径。
+* (Docker环境可忽略）安装 grid_sample的tensorrt插件，因为模型用到的grid sample需要有5d的输入,原生的grid_sample 算子不支持。
+  * `git clone https://github.com/SeanWangJS/grid-sample3d-trt-plugin`
+  * 修改`CMakeLists.txt`中第30行为:`set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "60;70;75;80;86")`
+  * `export PATH=/usr/local/cuda/bin:$PATH`
+  * `mkdir build && cd build`
+  * `cmake .. -DTensorRT_ROOT=$TENSORRT_HOME`,$TENSORRT_HOME 替换成你自己TensorRT的根目录。
+  * `make`，记住so文件的地址，将`scripts/onnx2trt.py`和`src/models/predictor.py`里`/opt/grid-sample3d-trt-plugin/build/libgrid_sample_3d_plugin.so`替换成自己的so路径
+* 下载Onnx文件：`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`。将onnx模型转为tensorrt，运行`sh scripts/all_onnx2trt.sh`和`sh scripts/all_onnx2trt_animal.sh`
+* 用tensorrt测试pipeline：
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video assets/examples/driving/d14.mp4 \
+   --cfg configs/trt_infer.yaml
+  ```
+  如果要使用摄像头实时运行：
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video 0 \
+   --cfg configs/trt_infer.yaml \
+   --realtime
+  ```
+#### 2. Onnxruntime 推理
+* 首先下载我转换好的[模型onnx文件](https://huggingface.co/warmshao/FasterLivePortrait): `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`。
+* (Docker环境可忽略）如果你要用onnxruntime cpu推理的话，直接`pip install onnxruntime`即可，但是cpu推理超级慢。但是最新的onnxruntime-gpu仍然无法支持grid_sample cuda，好在我看到一位大佬在分支上支持了，按照以下步骤源码安装`onnxruntime-gpu`:
+  * `git clone https://github.com/microsoft/onnxruntime`
+  * `git checkout liqun/ImageDecoder-cuda`. Thanks for liqun's grid_sample with cuda implementation!
+  * 运行以下命令编译,`cuda_version`和`CMAKE_CUDA_ARCHITECTURES`根据自己的机器更改:
+  ```shell
+  ./build.sh --parallel \
+  --build_shared_lib --use_cuda \
+  --cuda_version 11.8 \
+  --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda/ \
+  --config Release --build_wheel --skip_tests \
+  --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="60;70;75;80;86" \
+  --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+  --disable_contrib_ops \
+  --allow_running_as_root
+  ```
+  * `pip install build/Linux/Release/dist/onnxruntime_gpu-1.17.0-cp310-cp310-linux_x86_64.whl`就可以了
+* 用onnxruntime测试pipeline：
+  ```shell
+   python run.py \
+   --src_image assets/examples/source/s10.jpg \
+   --dri_video assets/examples/driving/d14.mp4 \
+   --cfg configs/onnx_infer.yaml
+  ```
+
+### Gradio WebUI
+* onnxruntime: `python webui.py --mode onnx`
+* tensorrt: `python webui.py --mode trt`
+* 默认端口在9870，打开网页：`http://localhost:9870/`
+
+Hotkeys for webcam mode (when the render window is in focus)\
+Q > exit\
+S > Stitching\
+Z > RelativeMotion\
+X > AnimationRegion\
+C > CropDrivingVideo\
+K,L > AdjustSourceScale\
+N,M > AdjustDriverScale
+
+## 许可证
+
+- **代码**: 本项目采用 MIT 许可证 - 详细信息请查看 [LICENSE](LICENSE) 文件。
+- **模型**: 本项目中使用的任何机器学习模型均遵循其各自的许可证。请参考原始模型来源获取许可证信息。我们不承担模型许可证合规性的责任。
+
+
+**日志**
+- [x] **2025/06/29:** [LivePortrait animal v1.1 onnx模型](https://huggingface.co/warmshao/FasterLivePortrait/tree/main/liveportrait_animal_onnx_v1.1)。
+- [x] **2024/12/22:** 增加api部署`python api.py`, 其他参考[教程](assets/docs/API_ZH.md)使用。
+- [x] **2024/12/21:** 增加[Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)的支持，可以用文本驱动视频或图片。
+  - 更新代码, `git pull origin master`并安装最新的python依赖 `pip install -r requirements.txt`， 或者 windows下直接双击 `update.bat`.
+  - 然后下载模型: `huggingface-cli download hexgrad/Kokoro-82M --local-dir .\checkpoints\Kokoro-82M`. 
+  - 如果是Linux请安装`apt-get -qq -y install espeak-ng > /dev/null 2>&1`
+  - 如果是windows请参考[自行安装](https://huggingface.co/hexgrad/Kokoro-82M/discussions/12)并配置好`espeak-ng`环境变量。我是在[这里](src/pipelines/gradio_live_portrait_pipeline.py:437)读取，如果你的位置变了，请自行修改。
+  - 然后就可以在Drive Text的标签页正常使用了。
+- [x] **2024/12/16:** 增加[JoyVASA](https://github.com/jdh-algo/JoyVASA)的支持，可以用音频驱动视频或图片。非常酷！
+  - 更新代码，然后下载模型: `huggingface-cli download TencentGameMate/chinese-hubert-base --local-dir .\checkpoints\chinese-hubert-base` 和 ` huggingface-cli download jdh-algo/JoyVASA --local-dir ./checkpoints/JoyVASA`
+  - 启动webui后根据以下教程使用即可，建议source 是视频的情况下只驱动嘴部
+
+   <video src="https://github.com/user-attachments/assets/42fb24be-0cde-4138-9671-e52eec95e7f5" controls="controls" width="500" height="400">您的浏览器不支持播放该视频！</video>
+  
+- [x] **2024/12/14:** 增加pickle和image驱动以及区域驱动`animation_region`。
+  - 请更新最新的代码，windows用户可以直接双击`update.bat`更新，但请注意本地的代码将会被覆盖。
+  - `python run.py ` 现在运行 `driving video`会自动保存对应的pickle到跟`driving video`一样的目录，可以直接复用。
+  - 打开`webui`后即可体验新的pickle和image驱动以及区域驱动`animation_region`等功能。注意image驱动记得把`relative motion`取消掉。
+- [x] **2024/08/11:** 优化paste_back的速度，修复一些bug。
+  - 用torchgeometry + cuda优化paste_back函数，现在速度提升了很多。示例：`python run.py --src_image assets/examples/source/s39.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_infer.yaml --paste_back --animal`
+  - 修复Xpose的ops在一些显卡运行报错的问题等bug。请使用最新的镜像:`docker pull shaoguo/faster_liveportrait:v3`
+- [x] **2024/08/07:** 增加animal模型的支持，同时支持mediapipe模型，现在你不用再担心版权的问题。
+  - 增加对animal模型的支持。
+    - 需要下载animal的onnx文件：`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`，然后转换成trt文件。
+    - 更新镜像`docker pull shaoguo/faster_liveportrait:v3`, 使用animal模型的示例:`python run.py --src_image assets/examples/source/s39.jpg --dri_video 0 --cfg configs/trt_infer.yaml --realtime --animal`
+    - windows系统可以从release页下载最新的[windows 整合包](https://github.com/warmshao/FasterLivePortrait/releases)，解压后使用。
+    - 简单的使用教程：
+    
+    <video src="https://github.com/user-attachments/assets/dc37e2dd-551a-43b0-8929-fc5d5fe16ec5" controls="controls" width="500" height="300">您的浏览器不支持播放该视频！</video>
+    
+  - 使用mediapipe模型替代insight_face
+    - 网页端使用: `python webui.py --mode trt --mp` 或 `python webui.py --mode onnx --mp`
+    - 本地摄像头运行: `python run.py --src_image assets/examples/source/s12.jpg --dri_video 0 --cfg configs/trt_mp_infer.yaml`
+- [x] **2024/07/24:** Windows的整合包, 免安装一键运行，支持TensorRT和OnnxruntimeGPU。感谢@zhanghongyong123456在[issue](https://github.com/warmshao/FasterLivePortrait/issues/22)的贡献。
+  - 【可选】如果你的windows电脑已经装过cuda和cudnn，请忽略这一步。我只在cuda12.2上验证过，如果没安装cuda或报cuda相关的错，你需要按照以下步骤进行安装：
+    - 下载[cuda12.2](https://developer.nvidia.com/cuda-12-2-0-download-archive?target_os=Windows&target_arch=x86_64), 双击exe后按照默认设置一步步安装即可。
+    - 下载[cudnn](https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip) 压缩包，解压后将cudnn 文件夹下的lib、bin、include 文件夹复制到 CUDA12.2 文件夹下（默认为C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2）
+  - 从release页下载免安装[windows 整合包](https://github.com/warmshao/FasterLivePortrait/releases)并解压。
+  - 进入`FasterLivePortrait-windows`后双击`scripts/all_onnx2trt.bat`对onnx文件进行转换，这会等上一段时间。
+  - 网页端demo：双击`webui.bat`, 打开网页：`http://localhost:9870/`
+  - 摄像头实时运行，双击`camera.bat`，按`q`停止。如果你想更换目标图像，命令行运行:`camera.bat assets/examples/source/s9.jpg`。
+- [x] **2024/07/18:** MacOS支持(不需要Docker，python就可以了），M1/M2的速度比较快，但还是很慢😟
+  - 安装ffmpeg: `brew install ffmpeg`
+  - 安装python=3.10的虚拟环境，推荐可以用[miniforge](https://github.com/conda-forge/miniforge).`conda create -n flip python=3.10 && conda activate flip`
+  - `pip install -r requirements_macos.txt`
+  - 下载onnx文件: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`
+  - 测试: `python webui.py --mode onnx`
+- [x] **2024/07/17:** 增加docker环境的支持，提供可运行的镜像。
diff --git a/api.py b/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a1c7157edeeb8c1c3ec243f778f16fcda376a61
--- /dev/null
+++ b/api.py
@@ -0,0 +1,479 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/9/13 0:23
+# @Project : FasterLivePortrait
+# @FileName: api.py
+import pdb
+import shutil
+from typing import Optional, Dict, Any
+import io
+import os
+import subprocess
+import uvicorn
+import cv2
+import time
+import numpy as np
+import os
+import datetime
+import platform
+import pickle
+from tqdm import tqdm
+from pydantic import BaseModel
+from fastapi import APIRouter, Depends, FastAPI, Request, Response, UploadFile
+from fastapi import File, Body, Form
+from omegaconf import OmegaConf
+from fastapi.responses import StreamingResponse
+from zipfile import ZipFile
+from src.pipelines.faster_live_portrait_pipeline import FasterLivePortraitPipeline
+from src.utils.utils import video_has_audio
+from src.utils import logger
+
+# Working directories: checkpoints (overridable via FLIP_CHECKPOINT_DIR), logs and results.
+project_dir = os.path.dirname(__file__)
+checkpoints_dir = os.environ.get("FLIP_CHECKPOINT_DIR", os.path.join(project_dir, "checkpoints"))
+log_dir = os.path.join(project_dir, "logs")
+os.makedirs(log_dir, exist_ok=True)
+result_dir = os.path.join(project_dir, "results")
+os.makedirs(result_dir, exist_ok=True)
+# Module logger used throughout this file; get_logger is given logs/log_run.log as its log file.
+logger_f = logger.get_logger("faster_liveportrait_api", log_file=os.path.join(log_dir, "log_run.log"))
+
+app = FastAPI()
+# `global` is a no-op at module level; `pipe` is actually assigned in startup_event().
+global pipe
+
+if platform.system().lower() == 'windows':
+    FFMPEG = "third_party/ffmpeg-7.0.1-full_build/bin/ffmpeg.exe"
+else:
+    FFMPEG = "ffmpeg"
+
+
+def check_all_checkpoints_exist(infer_cfg):
+    """
+    check whether all checkpoints exist
+    :return:
+    """
+    ret = True
+    for name in infer_cfg.models:
+        if not isinstance(infer_cfg.models[name].model_path, str):
+            for i in range(len(infer_cfg.models[name].model_path)):
+                infer_cfg.models[name].model_path[i] = infer_cfg.models[name].model_path[i].replace("./checkpoints",
+                                                                                                    checkpoints_dir)
+                if not os.path.exists(infer_cfg.models[name].model_path[i]) and not os.path.exists(
+                        infer_cfg.models[name].model_path[i][:-4] + ".onnx"):
+                    return False
+        else:
+            infer_cfg.models[name].model_path = infer_cfg.models[name].model_path.replace("./checkpoints",
+                                                                                          checkpoints_dir)
+            if not os.path.exists(infer_cfg.models[name].model_path) and not os.path.exists(
+                    infer_cfg.models[name].model_path[:-4] + ".onnx"):
+                return False
+    for name in infer_cfg.animal_models:
+        if not isinstance(infer_cfg.animal_models[name].model_path, str):
+            for i in range(len(infer_cfg.animal_models[name].model_path)):
+                infer_cfg.animal_models[name].model_path[i] = infer_cfg.animal_models[name].model_path[i].replace(
+                    "./checkpoints",
+                    checkpoints_dir)
+                if not os.path.exists(infer_cfg.animal_models[name].model_path[i]) and not os.path.exists(
+                        infer_cfg.animal_models[name].model_path[i][:-4] + ".onnx"):
+                    return False
+        else:
+            infer_cfg.animal_models[name].model_path = infer_cfg.animal_models[name].model_path.replace("./checkpoints",
+                                                                                                        checkpoints_dir)
+            if not os.path.exists(infer_cfg.animal_models[name].model_path) and not os.path.exists(
+                    infer_cfg.animal_models[name].model_path[:-4] + ".onnx"):
+                return False
+
+    # XPOSE
+    xpose_model_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/xpose.pth")
+    if not os.path.exists(xpose_model_path):
+        return False
+    embeddings_cache_9_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/clip_embedding_9.pkl")
+    if not os.path.exists(embeddings_cache_9_path):
+        return False
+    embeddings_cache_68_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/clip_embedding_68.pkl")
+    if not os.path.exists(embeddings_cache_68_path):
+        return False
+    return ret
+
+
+def convert_onnx_to_trt_models(infer_cfg):
+    ret = True
+    for name in infer_cfg.models:
+        if not isinstance(infer_cfg.models[name].model_path, str):
+            for i in range(len(infer_cfg.models[name].model_path)):
+                trt_path = infer_cfg.models[name].model_path[i]
+                onnx_path = trt_path[:-4] + ".onnx"
+                if not os.path.exists(trt_path):
+                    convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
+                    logger_f.info(f"convert onnx model: {onnx_path}")
+                    result = subprocess.run(convert_cmd, shell=True, check=True)
+                    # 检查结果
+                    if result.returncode == 0:
+                        logger_f.info(f"convert onnx model: {onnx_path} successful")
+                    else:
+                        logger_f.error(f"convert onnx model: {onnx_path} failed")
+                        return False
+        else:
+            trt_path = infer_cfg.models[name].model_path
+            onnx_path = trt_path[:-4] + ".onnx"
+            if not os.path.exists(trt_path):
+                convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
+                logger_f.info(f"convert onnx model: {onnx_path}")
+                result = subprocess.run(convert_cmd, shell=True, check=True)
+                # 检查结果
+                if result.returncode == 0:
+                    logger_f.info(f"convert onnx model: {onnx_path} successful")
+                else:
+                    logger_f.error(f"convert onnx model: {onnx_path} failed")
+                    return False
+
+    for name in infer_cfg.animal_models:
+        if not isinstance(infer_cfg.animal_models[name].model_path, str):
+            for i in range(len(infer_cfg.animal_models[name].model_path)):
+                trt_path = infer_cfg.animal_models[name].model_path[i]
+                onnx_path = trt_path[:-4] + ".onnx"
+                if not os.path.exists(trt_path):
+                    convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
+                    logger_f.info(f"convert onnx model: {onnx_path}")
+                    result = subprocess.run(convert_cmd, shell=True, check=True)
+                    # 检查结果
+                    if result.returncode == 0:
+                        logger_f.info(f"convert onnx model: {onnx_path} successful")
+                    else:
+                        logger_f.error(f"convert onnx model: {onnx_path} failed")
+                        return False
+        else:
+            trt_path = infer_cfg.animal_models[name].model_path
+            onnx_path = trt_path[:-4] + ".onnx"
+            if not os.path.exists(trt_path):
+                convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
+                logger_f.info(f"convert onnx model: {onnx_path}")
+                result = subprocess.run(convert_cmd, shell=True, check=True)
+                # 检查结果
+                if result.returncode == 0:
+                    logger_f.info(f"convert onnx model: {onnx_path} successful")
+                else:
+                    logger_f.error(f"convert onnx model: {onnx_path} failed")
+                    return False
+    return ret
+
+
+@app.on_event("startup")
+async def startup_event():
+    global pipe
+    # default use trt model
+    cfg_file = os.path.join(project_dir, "configs/trt_infer.yaml")
+    infer_cfg = OmegaConf.load(cfg_file)
+    checkpoints_exist = check_all_checkpoints_exist(infer_cfg)
+
+    # first: download model if not exist
+    if not checkpoints_exist:
+        download_cmd = f"huggingface-cli download warmshao/FasterLivePortrait --local-dir {checkpoints_dir}"
+        logger_f.info(f"download model: {download_cmd}")
+        result = subprocess.run(download_cmd, shell=True, check=True)
+        # 检查结果
+        if result.returncode == 0:
+            logger_f.info(f"Download checkpoints to {checkpoints_dir} successful")
+        else:
+            logger_f.error(f"Download checkpoints to {checkpoints_dir} failed")
+            exit(1)
+    # second: convert onnx model to trt
+    convert_ret = convert_onnx_to_trt_models(infer_cfg)
+    if not convert_ret:
+        logger_f.error(f"convert onnx model to trt failed")
+        exit(1)
+
+    infer_cfg.infer_params.flag_pasteback = True
+    pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=True)
+
+
+def run_with_video(source_image_path, driving_video_path, save_dir):
+    global pipe
+    ret = pipe.prepare_source(source_image_path, realtime=False)
+    if not ret:
+        logger_f.warning(f"no face in {source_image_path}! exit!")
+        return
+    vcap = cv2.VideoCapture(driving_video_path)
+    fps = int(vcap.get(cv2.CAP_PROP_FPS))
+    h, w = pipe.src_imgs[0].shape[:2]
+
+    # render output video
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    vsave_crop_path = os.path.join(save_dir,
+                                   f"{os.path.basename(source_image_path)}-{os.path.basename(driving_video_path)}-crop.mp4")
+    vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512 * 2, 512))
+    vsave_org_path = os.path.join(save_dir,
+                                  f"{os.path.basename(source_image_path)}-{os.path.basename(driving_video_path)}-org.mp4")
+    vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+    infer_times = []
+    motion_lst = []
+    c_eyes_lst = []
+    c_lip_lst = []
+
+    frame_ind = 0
+    while vcap.isOpened():
+        ret, frame = vcap.read()
+        if not ret:
+            break
+        t0 = time.time()
+        first_frame = frame_ind == 0
+        dri_crop, out_crop, out_org, dri_motion_info = pipe.run(frame, pipe.src_imgs[0], pipe.src_infos[0],
+                                                                first_frame=first_frame)
+        frame_ind += 1
+        if out_crop is None:
+            logger_f.warning(f"no face in driving frame:{frame_ind}")
+            continue
+
+        motion_lst.append(dri_motion_info[0])
+        c_eyes_lst.append(dri_motion_info[1])
+        c_lip_lst.append(dri_motion_info[2])
+
+        infer_times.append(time.time() - t0)
+        # print(time.time() - t0)
+        dri_crop = cv2.resize(dri_crop, (512, 512))
+        out_crop = np.concatenate([dri_crop, out_crop], axis=1)
+        out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+        vout_crop.write(out_crop)
+        out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+        vout_org.write(out_org)
+    vcap.release()
+    vout_crop.release()
+    vout_org.release()
+    if video_has_audio(driving_video_path):
+        vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
+        subprocess.call(
+            [FFMPEG, "-i", vsave_crop_path, "-i", driving_video_path,
+             "-b:v", "10M", "-c:v",
+             "libx264", "-map", "0:v", "-map", "1:a",
+             "-c:a", "aac",
+             "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
+        vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
+        subprocess.call(
+            [FFMPEG, "-i", vsave_org_path, "-i", driving_video_path,
+             "-b:v", "10M", "-c:v",
+             "libx264", "-map", "0:v", "-map", "1:a",
+             "-c:a", "aac",
+             "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
+
+        logger_f.info(vsave_crop_path_new)
+        logger_f.info(vsave_org_path_new)
+    else:
+        logger_f.info(vsave_crop_path)
+        logger_f.info(vsave_org_path)
+
+    logger_f.info(
+        "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
+                                                                            np.mean(infer_times) * 1000))
+    # save driving motion to pkl
+    template_dct = {
+        'n_frames': len(motion_lst),
+        'output_fps': fps,
+        'motion': motion_lst,
+        'c_eyes_lst': c_eyes_lst,
+        'c_lip_lst': c_lip_lst,
+    }
+    template_pkl_path = os.path.join(save_dir,
+                                     f"{os.path.basename(driving_video_path)}.pkl")
+    with open(template_pkl_path, "wb") as fw:
+        pickle.dump(template_dct, fw)
+    logger_f.info(f"save driving motion pkl file at : {template_pkl_path}")
+
+
+def run_with_pkl(source_image_path, driving_pickle_path, save_dir):
+    global pipe
+    ret = pipe.prepare_source(source_image_path, realtime=False)
+    if not ret:
+        logger_f.warning(f"no face in {source_image_path}! exit!")
+        return
+
+    with open(driving_pickle_path, "rb") as fin:
+        dri_motion_infos = pickle.load(fin)
+
+    fps = int(dri_motion_infos["output_fps"])
+    h, w = pipe.src_imgs[0].shape[:2]
+
+    # render output video
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    vsave_crop_path = os.path.join(save_dir,
+                                   f"{os.path.basename(source_image_path)}-{os.path.basename(driving_pickle_path)}-crop.mp4")
+    vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512, 512))
+    vsave_org_path = os.path.join(save_dir,
+                                  f"{os.path.basename(source_image_path)}-{os.path.basename(driving_pickle_path)}-org.mp4")
+    vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+    infer_times = []
+    motion_lst = dri_motion_infos["motion"]
+    c_eyes_lst = dri_motion_infos["c_eyes_lst"] if "c_eyes_lst" in dri_motion_infos else dri_motion_infos[
+        "c_d_eyes_lst"]
+    c_lip_lst = dri_motion_infos["c_lip_lst"] if "c_lip_lst" in dri_motion_infos else dri_motion_infos["c_d_lip_lst"]
+
+    frame_num = len(motion_lst)
+    for frame_ind in tqdm(range(frame_num)):
+        t0 = time.time()
+        first_frame = frame_ind == 0
+        dri_motion_info_ = [motion_lst[frame_ind], c_eyes_lst[frame_ind], c_lip_lst[frame_ind]]
+        out_crop, out_org = pipe.run_with_pkl(dri_motion_info_, pipe.src_imgs[0], pipe.src_infos[0],
+                                              first_frame=first_frame)
+        if out_crop is None:
+            logger_f.warning(f"no face in driving frame:{frame_ind}")
+            continue
+
+        infer_times.append(time.time() - t0)
+        # print(time.time() - t0)
+        out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+        vout_crop.write(out_crop)
+        out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+        vout_org.write(out_org)
+
+    vout_crop.release()
+    vout_org.release()
+    logger_f.info(vsave_crop_path)
+    logger_f.info(vsave_org_path)
+    logger_f.info(
+        "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
+                                                                            np.mean(infer_times) * 1000))
+
+
+class LivePortraitParams(BaseModel):  # request parameters collected by the /predict/ handler
+    flag_pickle: bool = False  # True: drive from an uploaded pickle; False: from an uploaded video
+    flag_relative_input: bool = True  # passed to pipe.update_cfg as 'flag_relative_motion'
+    flag_do_crop_input: bool = True  # passed to pipe.update_cfg as 'flag_do_crop'
+    flag_remap_input: bool = True  # passed to pipe.update_cfg as 'flag_pasteback'
+    driving_multiplier: float = 1.0  # passed to pipe.update_cfg as 'driving_multiplier'
+    flag_stitching: bool = True  # passed to pipe.update_cfg as 'flag_stitching'
+    flag_crop_driving_video_input: bool = True  # passed to pipe.update_cfg as 'flag_crop_driving_video'
+    flag_video_editing_head_rotation: bool = False  # passed to pipe.update_cfg under the same name
+    flag_is_animal: bool = True  # compared with pipe.is_animal; a mismatch re-initialises the models
+    scale: float = 2.3  # passed to pipe.update_cfg as 'src_scale'
+    vx_ratio: float = 0.0  # passed to pipe.update_cfg as 'src_vx_ratio'
+    vy_ratio: float = -0.125  # passed to pipe.update_cfg as 'src_vy_ratio'
+    scale_crop_driving_video: float = 2.2  # passed to pipe.update_cfg as 'dri_scale'
+    vx_ratio_crop_driving_video: float = 0.0  # passed to pipe.update_cfg as 'dri_vx_ratio'
+    vy_ratio_crop_driving_video: float = -0.1  # passed to pipe.update_cfg as 'dri_vy_ratio'
+    driving_smooth_observation_variance: float = 1e-7  # accepted, but not forwarded by upload_files
+
+
+@app.post("/predict/")
+async def upload_files(
+        source_image: Optional[UploadFile] = File(None),
+        driving_video: Optional[UploadFile] = File(None),
+        driving_pickle: Optional[UploadFile] = File(None),
+        flag_is_animal: bool = Form(...),
+        flag_pickle: bool = Form(...),
+        flag_relative_input: bool = Form(...),
+        flag_do_crop_input: bool = Form(...),
+        flag_remap_input: bool = Form(...),
+        driving_multiplier: float = Form(...),
+        flag_stitching: bool = Form(...),
+        flag_crop_driving_video_input: bool = Form(...),
+        flag_video_editing_head_rotation: bool = Form(...),
+        scale: float = Form(...),
+        vx_ratio: float = Form(...),
+        vy_ratio: float = Form(...),
+        scale_crop_driving_video: float = Form(...),
+        vx_ratio_crop_driving_video: float = Form(...),
+        vy_ratio_crop_driving_video: float = Form(...),
+        driving_smooth_observation_variance: float = Form(...)
+):
+    # 根据传入的表单参数构建 infer_params
+    infer_params = LivePortraitParams(
+        flag_is_animal=flag_is_animal,
+        flag_pickle=flag_pickle,
+        flag_relative_input=flag_relative_input,
+        flag_do_crop_input=flag_do_crop_input,
+        flag_remap_input=flag_remap_input,
+        driving_multiplier=driving_multiplier,
+        flag_stitching=flag_stitching,
+        flag_crop_driving_video_input=flag_crop_driving_video_input,
+        flag_video_editing_head_rotation=flag_video_editing_head_rotation,
+        scale=scale,
+        vx_ratio=vx_ratio,
+        vy_ratio=vy_ratio,
+        scale_crop_driving_video=scale_crop_driving_video,
+        vx_ratio_crop_driving_video=vx_ratio_crop_driving_video,
+        vy_ratio_crop_driving_video=vy_ratio_crop_driving_video,
+        driving_smooth_observation_variance=driving_smooth_observation_variance
+    )
+
+    global pipe
+    pipe.init_vars()
+    if infer_params.flag_is_animal != pipe.is_animal:
+        pipe.init_models(is_animal=infer_params.flag_is_animal)
+
+    args_user = {
+        'flag_relative_motion': infer_params.flag_relative_input,
+        'flag_do_crop': infer_params.flag_do_crop_input,
+        'flag_pasteback': infer_params.flag_remap_input,
+        'driving_multiplier': infer_params.driving_multiplier,
+        'flag_stitching': infer_params.flag_stitching,
+        'flag_crop_driving_video': infer_params.flag_crop_driving_video_input,
+        'flag_video_editing_head_rotation': infer_params.flag_video_editing_head_rotation,
+        'src_scale': infer_params.scale,
+        'src_vx_ratio': infer_params.vx_ratio,
+        'src_vy_ratio': infer_params.vy_ratio,
+        'dri_scale': infer_params.scale_crop_driving_video,
+        'dri_vx_ratio': infer_params.vx_ratio_crop_driving_video,
+        'dri_vy_ratio': infer_params.vy_ratio_crop_driving_video,
+    }
+    # update config from user input
+    update_ret = pipe.update_cfg(args_user)
+
+    # 保存 source_image 到指定目录
+    temp_dir = os.path.join(result_dir, f"temp-{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
+    os.makedirs(temp_dir, exist_ok=True)
+    if source_image and source_image.filename:
+        source_image_path = os.path.join(temp_dir, source_image.filename)
+        with open(source_image_path, "wb") as buffer:
+            buffer.write(await source_image.read())  # 将内容写入文件
+    else:
+        source_image_path = None
+
+    if driving_video and driving_video.filename:
+        driving_video_path = os.path.join(temp_dir, driving_video.filename)
+        with open(driving_video_path, "wb") as buffer:
+            buffer.write(await driving_video.read())  # 将内容写入文件
+    else:
+        driving_video_path = None
+
+    if driving_pickle and driving_pickle.filename:
+        driving_pickle_path = os.path.join(temp_dir, driving_pickle.filename)
+        with open(driving_pickle_path, "wb") as buffer:
+            buffer.write(await driving_pickle.read())  # 将内容写入文件
+    else:
+        driving_pickle_path = None
+
+    save_dir = os.path.join(result_dir, f"{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
+    os.makedirs(save_dir, exist_ok=True)
+
+    if infer_params.flag_pickle:
+        if source_image_path and driving_pickle_path:
+            run_with_pkl(source_image_path, driving_pickle_path, save_dir)
+    else:
+        if source_image_path and driving_video_path:
+            run_with_video(source_image_path, driving_video_path, save_dir)
+    # zip all files and return
+    # 使用 BytesIO 在内存中创建一个字节流
+    zip_buffer = io.BytesIO()
+
+    # 使用 ZipFile 将文件夹内容压缩到 zip_buffer 中
+    with ZipFile(zip_buffer, "w") as zip_file:
+        for root, dirs, files in os.walk(save_dir):
+            for file in files:
+                file_path = os.path.join(root, file)
+                # 添加文件到 ZIP 文件中
+                zip_file.write(file_path, arcname=os.path.relpath(file_path, save_dir))
+
+    # 确保缓冲区指针在开始位置，以便读取整个内容
+    zip_buffer.seek(0)
+    shutil.rmtree(temp_dir)
+    shutil.rmtree(save_dir)
+    # 通过 StreamingResponse 返回 zip 文件
+    return StreamingResponse(zip_buffer, media_type="application/zip",
+                             headers={"Content-Disposition": "attachment; filename=output.zip"})
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host=os.environ.get("FLIP_IP", "127.0.0.1"), port=os.environ.get("FLIP_PORT", 9871))
diff --git a/assets/.gitignore b/assets/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..892dfa4274b60c1629e26719bbd1e462fcce33e8
--- /dev/null
+++ b/assets/.gitignore
@@ -0,0 +1,2 @@
+examples/driving/*.pkl
+examples/driving/*_crop.mp4
diff --git a/assets/docs/API.md b/assets/docs/API.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e4238b0d57438b37ae73131da3241645074be51
--- /dev/null
+++ b/assets/docs/API.md
@@ -0,0 +1,41 @@
+## FasterLivePortrait API Usage Guide
+
+### Building the Image
+* Decide on an image name, for example `shaoguo/faster_liveportrait_api:v1.0`. Replace the `-t` parameter in the following command with your chosen name.
+* Run `docker build -t shaoguo/faster_liveportrait_api:v1.0 -f DockerfileAPI .`
+
+### Running the Image
+Ensure that your machine has Nvidia GPU drivers installed. CUDA version should be 12.0 or higher. Two scenarios are described below.
+
+* Running on a Local Machine (typically for self-testing)
+  * Modify the image name according to what you defined above.
+  * Confirm the service port number, default is `9871`. You can define your own by changing the `SERVER_PORT` environment variable in the command below. Remember to also change `-p 9871:9871` to map the port.
+  * Set the model path environment variable `CHECKPOINT_DIR`. If you've previously downloaded FasterLivePortrait's onnx model and converted it to trt, I recommend mapping the model files into the container using `-v`, for example `-v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints`. This avoids re-downloading the onnx model and doing trt conversion. Otherwise, I will check if `CHECKPOINT_DIR` has models, and if not, I will automatically download (ensure network connectivity) and do trt conversion, which will take considerable time.
+  * Run command (note: modify the following command according to your settings):
+    ```shell
+    docker run -d --gpus=all \
+    --name faster_liveportrait_api \
+    -v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints \
+    -e CHECKPOINT_DIR=/root/FasterLivePortrait/checkpoints \
+    -e SERVER_PORT=9871 \
+    -p 9871:9871 \
+    --restart=always \
+    shaoguo/faster_liveportrait_api:v1.0 \
+    /bin/bash
+    ```
+  * When the service starts normally, `docker logs $container_id` should show the following output. The runtime log is saved to `/root/FasterLivePortrait/logs/log_run.log`:
+    ```shell
+    INFO:     Application startup complete.
+    INFO:     Uvicorn running on http://0.0.0.0:9871 (Press CTRL+C to quit)
+    ```
+
+* Running on Cloud GPU Cluster (production environment)
+  * This needs to be configured according to different clusters, but the core is the configuration of docker image and environment variables.
+  * Load balancing may need to be set up.
+
+### API Call Testing
+Refer to `tests/test_api.py`. The default is the Animal model, but now it also supports the Human model.
+The return is a compressed package, by default unzipped to `./results/api_*`. Confirm according to the actual printed log.
+* `test_with_video_animal()`, image and video driving. Set `flag_pickle=False`. It will additionally return the driving video's pkl file, which can be called directly next time.
+* `test_with_pkl_animal()`, image and pkl driving.
+* `test_with_video_human()`, image and video driving under the Human model, set `flag_is_animal=False`
\ No newline at end of file
diff --git a/assets/docs/API_ZH.md b/assets/docs/API_ZH.md
new file mode 100644
index 0000000000000000000000000000000000000000..19e24a9dc9d138950116c8d907c3207f3af43b4b
--- /dev/null
+++ b/assets/docs/API_ZH.md
@@ -0,0 +1,47 @@
+## FasterLivePortrait API使用教程
+
+### 构建镜像
+
+* 确定镜像的名字，比如 `shaoguo/faster_liveportrait_api:v1.0`。确认后替换为下面命令 `-t` 的参数。
+* 运行 `docker build -t shaoguo/faster_liveportrait_api:v1.0 -f DockerfileAPI .`
+
+### 运行镜像
+
+请确保你的机器已经装了Nvidia显卡的驱动。CUDA的版本在cuda12.0及以上。以下分两种情况介绍。
+
+* 本地机器运行(一般自己测试使用)
+    * 镜像名称根据上面你自己定义的更改。
+    * 确认服务的端口号，默认为`9871`，你可以自己定义，更改下面命令里环境变量`SERVER_PORT`。同时要记得更改`-p 9871:9871`,
+      将端口映射出来。
+    * 设置模型路径环境变量 `CHECKPOINT_DIR`。如果你之前下载过FasterLivePortrait的onnx模型并做过trt的转换，我建议
+      是可以通过 `-v`把
+      模型文件映射进入容器，比如 `-v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints`,
+      这样就避免重新下载onnx模型和做trt的转换。否则我将会检测`CHECKPOINT_DIR`是否有模型，没有的话，我将自动下载（确保有网络）和做trt的转换，这将耗时比较久的时间。
+    * 运行命令(注意你要根据自己的设置更改以下命令的信息）：
+  ```shell
+    docker run -d --gpus=all \
+    --name faster_liveportrait_api \
+    -v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints \
+    -e CHECKPOINT_DIR=/root/FasterLivePortrait/checkpoints \
+    -e SERVER_PORT=9871 \
+    -p 9871:9871 \
+    --restart=always \
+    shaoguo/faster_liveportrait_api:v1.0
+  ```
+    * 正常运行应该会显示以下信息(docker logs container_id), 运行的日志保存在`/root/FasterLivePortrait/logs/log_run.log`:
+  ```shell
+    INFO:     Application startup complete.
+    INFO:     Uvicorn running on http://0.0.0.0:9871 (Press CTRL+C to quit)
+  ```
+* 云端GPU集群运行（生产环境）
+    * 这需要根据不同的集群做配置，但核心就是镜像和环境变量的配置。
+    * 可能要设置负载均衡。
+
+### API调用测试
+
+可以参考`tests/test_api.py`, 默认是Animal的模型，但现在同时也支持Human的模型了。
+返回的是压缩包，默认解压在`./results/api_*`, 根据实际打印出来的日志确认。
+
+* `test_with_video_animal()`, 图像和视频的驱动。设置`flag_pickle=False`。会额外返回driving video的pkl文件，下次可以直接调用。
+* `test_with_pkl_animal()`, 图像和pkl的驱动。
+* `test_with_video_human()`, Human模型下图像和视频的驱动，设置`flag_is_animal=False`
\ No newline at end of file
diff --git a/assets/gradio/gradio_description_animate_clear.md b/assets/gradio/gradio_description_animate_clear.md
new file mode 100644
index 0000000000000000000000000000000000000000..96d5fee236a75a418911512eaefd2b830ff03acf
--- /dev/null
+++ b/assets/gradio/gradio_description_animate_clear.md
@@ -0,0 +1,6 @@
+<div style="font-size: 1.2em; text-align: center;">
+    Step 3: Click the <strong>🚀 Animate</strong> button below to generate, or click <strong>🧹 Clear</strong> to erase the results
+</div>
+<!-- <div style="font-size: 1.1em; text-align: center;">
+    <strong style="color: red;">Note:</strong>  If both <strong>Source Image</strong> and <strong>Video</strong> are uploaded, the <strong>Source Image</strong> will be used. Please click the <strong>🧹 Clear</strong> button, then re-upload the <strong>Source Image</strong> or <strong>Video</strong>.
+</div> -->
diff --git a/assets/gradio/gradio_description_animation.md b/assets/gradio/gradio_description_animation.md
new file mode 100644
index 0000000000000000000000000000000000000000..126c4ce710212159279160aaf6c789315293c0d3
--- /dev/null
+++ b/assets/gradio/gradio_description_animation.md
@@ -0,0 +1,19 @@
+<span style="font-size: 1.2em;">🔥 To animate the source image or video with the driving video, please follow these steps:</span>
+<div style="font-size: 1.2em; margin-left: 20px;">
+1. In the <strong>Animation Options for Source Image or Video</strong> section, we recommend enabling the <code>do crop (source)</code> option if faces occupy a small portion of your source image or video.
+</div>
+<div style="font-size: 1.2em; margin-left: 20px;">
+2. In the <strong>Animation Options for Driving Video</strong> section, the <code>relative head rotation</code> and <code>smooth strength</code> options only take effect if the source input is a video.
+</div>
+<div style="font-size: 1.2em; margin-left: 20px;">
+3. Press the <strong>🚀 Animate</strong> button and wait for a moment. Your animated video will appear in the result block. This may take a few moments. If the input is a source video, the length of the animated video is the minimum of the length of the source video and the driving video.
+</div>
+<div style="font-size: 1.2em; margin-left: 20px;">
+4. If you want to upload your own driving video, <strong>the best practice</strong>:
+
+ - Crop it to a 1:1 aspect ratio (e.g., 512x512 or 256x256 pixels), or enable auto-driving by checking `do crop (driving video)`.
+ - Focus on the head area, similar to the example videos.
+ - Minimize shoulder movement.
+ - Make sure the first frame of driving video is a frontal face with **neutral expression**.
+
+</div>
diff --git a/assets/gradio/gradio_description_retargeting.md b/assets/gradio/gradio_description_retargeting.md
new file mode 100644
index 0000000000000000000000000000000000000000..64f1a7c1a791e33bbe39de03d910180edfb1b795
--- /dev/null
+++ b/assets/gradio/gradio_description_retargeting.md
@@ -0,0 +1,14 @@
+<br>
+
+<!-- ## Retargeting -->
+<!-- <span style="font-size: 1.2em;">🔥 To edit the eyes and lip open ratio of the source portrait, drag the sliders and click the <strong>🚗 Retargeting</strong> button. You can try running it multiple times. <strong>😊 Set both ratios to 0.8 to see what's going on!</strong> </span> -->
+
+
+<div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 1.2em;">
+  <div>
+    <h2>Retargeting</h2>
+    <p>Upload a Source Portrait as Retargeting Input, then drag the sliders and click the <strong>🚗 Retargeting</strong> button. You can try running it multiple times.
+    <br>
+    <strong>😊 Set both ratios to 0.8 to see what's going on!</strong></p>
+  </div>
+</div>
diff --git a/assets/gradio/gradio_description_upload.md b/assets/gradio/gradio_description_upload.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5a018afa9ccacbc0a0b84c420c839567d87d628
--- /dev/null
+++ b/assets/gradio/gradio_description_upload.md
@@ -0,0 +1,16 @@
+<br>
+<div style="font-size: 1.2em; display: flex; justify-content: space-between;">
+    <div style="flex: 1; text-align: center; margin-right: 20px;">
+        <div style="display: inline-block;">
+            Step 1: Upload a <strong>Source Image</strong> or <strong>Video</strong> (any aspect ratio) ⬇️
+        </div>
+    </div>
+    <div style="flex: 1; text-align: center; margin-left: 20px;">
+        <div style="display: inline-block;">
+            Step 2: Upload a <strong>Driving Video</strong> (any aspect ratio) ⬇️
+        </div>
+        <div style="display: inline-block; font-size: 0.8em;">
+            <strong>Tips:</strong> Focus on the head, minimize shoulder movement, <strong>neutral expression</strong> in first frame.
+        </div>
+    </div>
+</div>
diff --git a/assets/gradio/gradio_title.md b/assets/gradio/gradio_title.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1dd90e8e94f7d4c70c76d3e0cef7e090baffd40
--- /dev/null
+++ b/assets/gradio/gradio_title.md
@@ -0,0 +1,19 @@
+<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
+  <div>
+    <h1>FasterLivePortrait: Bring Portraits to Life in Real Time</h1>
+    <span>Built on <a href="https://github.com/KwaiVGI/LivePortrait">LivePortrait</a></span>
+    <div style="display: flex; justify-content: center; align-items: center; text-align: center; margin-top: 10px;">
+      <a href="https://huggingface.co/warmshao/FasterLivePortrait">
+        <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Hugging Face Spaces">
+      </a>
+      &nbsp;
+      <a href="https://github.com/warmshao/FasterLivePortrait">
+        <img src="https://img.shields.io/badge/Github-Code-blue" alt="GitHub Code">
+      </a>
+      &nbsp;
+      <a href="https://github.com/warmshao/FasterLivePortrait">
+        <img src="https://img.shields.io/github/stars/warmshao/FasterLivePortrait" alt="GitHub Stars">
+      </a>
+    </div>
+  </div>
+</div>
\ No newline at end of file
diff --git a/assets/mask_template.png b/assets/mask_template.png
new file mode 100644
index 0000000000000000000000000000000000000000..bca6ca5977ba820d0d2c05b3793c6231cc82e715
Binary files /dev/null and b/assets/mask_template.png differ
diff --git a/camera.bat b/camera.bat
new file mode 100644
index 0000000000000000000000000000000000000000..81400ab574b0c39010a4717a13c7e47006526c89
--- /dev/null
+++ b/camera.bat
@@ -0,0 +1,32 @@
+@echo off
+setlocal enabledelayedexpansion
+REM Run FasterLivePortrait in realtime mode, passing 0 as the driving video.
+REM Default source image path; override it with --src_image PATH
+set "default_src_image=assets\examples\source\s12.jpg"
+set "src_image=%default_src_image%"
+set "animal_param="
+set "paste_back="
+
+REM Parse named arguments: --src_image PATH, --animal, --paste_back
+:parse_args
+if "%~1"=="" goto end_parse_args
+if /i "%~1"=="--src_image" (
+    set "src_image=%~2"
+    shift
+) else if /i "%~1"=="--animal" (
+    set "animal_param=--animal"
+) else if /i "%~1"=="--paste_back" (
+    set "paste_back=--paste_back"
+)
+shift
+goto parse_args
+:end_parse_args
+
+echo source image: [!src_image!]
+echo use animal: [!animal_param!]
+echo paste_back: [!paste_back!]
+
+REM Run the Python command; the image path is quoted so a path containing spaces stays one argument
+.\venv\python.exe .\run.py --cfg configs/trt_infer.yaml --realtime --dri_video 0 --src_image "!src_image!" !animal_param! !paste_back!
+
+endlocal
\ No newline at end of file
diff --git a/configs/onnx_infer.yaml b/configs/onnx_infer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ad99e954c7ca2bb7006799957b75fa3f8eebc5b
--- /dev/null
+++ b/configs/onnx_infer.yaml
@@ -0,0 +1,114 @@
+models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/warping_spade.onnx"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/motion_extractor.onnx"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
+  face_analysis:
+    name: "FaceAnalysisModel"
+    predict_type: "ort"
+    model_path:
+      - "./checkpoints/liveportrait_onnx/retinaface_det_static.onnx"
+      - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching.onnx"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_eye.onnx"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_lip.onnx"
+
+animal_models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade.onnx"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor.onnx"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor.onnx"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching.onnx"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye.onnx"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip.onnx"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
+  face_analysis:
+    name: "FaceAnalysisModel"
+    predict_type: "ort"
+    model_path:
+      - "./checkpoints/liveportrait_onnx/retinaface_det_static.onnx"
+      - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx"
+
+joyvasa_models:
+  motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
+  audio_model_path: "checkpoints/chinese-hubert-base"
+  motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
+
+crop_params:
+  src_dsize: 512
+  src_scale: 2.3
+  src_vx_ratio: 0.0
+  src_vy_ratio: -0.125
+  dri_scale: 2.2
+  dri_vx_ratio: 0.0
+  dri_vy_ratio: -0.1
+
+
+infer_params:
+  flag_crop_driving_video: False
+  flag_normalize_lip: True
+  flag_source_video_eye_retargeting: False
+  flag_video_editing_head_rotation: False
+  flag_eye_retargeting: False
+  flag_lip_retargeting: False
+  flag_stitching: True
+  flag_relative_motion: True
+  flag_pasteback: True
+  flag_do_crop: True
+  flag_do_rot: True
+
+  # NOT EXPORTED PARAMS
+  lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
+  source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+  driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+  anchor_frame: 0 # TO IMPLEMENT
+  mask_crop_path: "./assets/mask_template.png"
+  driving_multiplier: 1.0
+  animation_region: "all"
+
+  cfg_mode: "incremental"
+  cfg_scale: 1.2
+
+  source_max_dim: 1280 # the max dim of height and width of source image
+  source_division: 2 # make sure the height and width of source image can be divided by this number
\ No newline at end of file
diff --git a/configs/onnx_mp_infer.yaml b/configs/onnx_mp_infer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26e529a9f5bf8055ab2284635f9c3cedb98a1d33
--- /dev/null
+++ b/configs/onnx_mp_infer.yaml
@@ -0,0 +1,108 @@
+models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/warping_spade.onnx"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/motion_extractor.onnx"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
+  face_analysis:
+    name: "MediaPipeFaceModel"
+    predict_type: "mp"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching.onnx"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_eye.onnx"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_lip.onnx"
+
+animal_models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade.onnx"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor.onnx"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor.onnx"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching.onnx"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye.onnx"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip.onnx"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "ort"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
+  face_analysis:
+    name: "MediaPipeFaceModel"
+    predict_type: "mp"
+
+joyvasa_models:
+  motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
+  audio_model_path: "checkpoints/chinese-hubert-base"
+  motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
+
+crop_params:
+  src_dsize: 512
+  src_scale: 2.3
+  src_vx_ratio: 0.0
+  src_vy_ratio: -0.125
+  dri_scale: 2.2
+  dri_vx_ratio: 0.0
+  dri_vy_ratio: -0.1
+
+
+infer_params:
+  flag_crop_driving_video: False
+  flag_normalize_lip: True
+  flag_source_video_eye_retargeting: False
+  flag_video_editing_head_rotation: False
+  flag_eye_retargeting: False
+  flag_lip_retargeting: False
+  flag_stitching: True
+  flag_relative_motion: True
+  flag_pasteback: True
+  flag_do_crop: True
+  flag_do_rot: True
+
+  # NOT EXPORTED PARAMS
+  lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
+  source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+  driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+  anchor_frame: 0 # TO IMPLEMENT
+  mask_crop_path: "./assets/mask_template.png"
+  driving_multiplier: 1.0
+  animation_region: "all"
+
+  cfg_mode: "incremental"
+  cfg_scale: 1.2
+
+  source_max_dim: 1280 # the max dim of height and width of source image
+  source_division: 2 # make sure the height and width of source image can be divided by this number
\ No newline at end of file
diff --git a/configs/trt_infer.yaml b/configs/trt_infer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f00f0a4a98552bc54e4589020d2f7ccd9841830a
--- /dev/null
+++ b/configs/trt_infer.yaml
@@ -0,0 +1,114 @@
+models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/warping_spade-fix.trt"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/motion_extractor.trt"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
+  face_analysis:
+    name: "FaceAnalysisModel"
+    predict_type: "trt"
+    model_path:
+      - "./checkpoints/liveportrait_onnx/retinaface_det_static.trt"
+      - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.trt"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.trt"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching.trt"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_eye.trt"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_lip.trt"
+
+animal_models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade-fix-v1.1.trt"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor-v1.1.trt"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor-v1.1.trt"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching-v1.1.trt"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye-v1.1.trt"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip-v1.1.trt"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
+  face_analysis:
+    name: "FaceAnalysisModel"
+    predict_type: "trt"
+    model_path:
+      - "./checkpoints/liveportrait_onnx/retinaface_det_static.trt"
+      - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.trt"
+
+joyvasa_models:
+  motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
+  audio_model_path: "checkpoints/chinese-hubert-base"
+  motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
+
+crop_params:
+  src_dsize: 512
+  src_scale: 2.3
+  src_vx_ratio: 0.0
+  src_vy_ratio: -0.125
+  dri_scale: 2.2
+  dri_vx_ratio: 0.0
+  dri_vy_ratio: -0.1
+
+
+infer_params:
+  flag_crop_driving_video: False
+  flag_normalize_lip: True
+  flag_source_video_eye_retargeting: False
+  flag_video_editing_head_rotation: False
+  flag_eye_retargeting: False
+  flag_lip_retargeting: False
+  flag_stitching: True
+  flag_relative_motion: True
+  flag_pasteback: True
+  flag_do_crop: True
+  flag_do_rot: True
+
+  # NOT EXPORTED PARAMS
+  lip_normalize_threshold: 0.1 # threshold for flag_normalize_lip
+  source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+  driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+  anchor_frame: 0 # TO IMPLEMENT
+  mask_crop_path: "./assets/mask_template.png"
+  driving_multiplier: 1.0
+  animation_region: "all"
+
+  cfg_mode: "incremental"
+  cfg_scale: 1.2
+
+  source_max_dim: 1280 # the max dim of height and width of source image
+  source_division: 2 # make sure the height and width of source image can be divided by this number
\ No newline at end of file
diff --git a/configs/trt_mp_infer.yaml b/configs/trt_mp_infer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8af46389933b8e69617ef30ccf4133cde49c3fde
--- /dev/null
+++ b/configs/trt_mp_infer.yaml
@@ -0,0 +1,108 @@
+models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/warping_spade-fix.trt"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/motion_extractor.trt"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
+  face_analysis:
+    name: "MediaPipeFaceModel"
+    predict_type: "mp"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.trt"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching.trt"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_eye.trt"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/stitching_lip.trt"
+
+animal_models:
+  warping_spade:
+    name: "WarpingSpadeModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade-fix-v1.1.trt"
+  motion_extractor:
+    name: "MotionExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor-v1.1.trt"
+  app_feat_extractor:
+    name: "AppearanceFeatureExtractorModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor-v1.1.trt"
+  stitching:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching-v1.1.trt"
+  stitching_eye_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye-v1.1.trt"
+  stitching_lip_retarget:
+    name: "StitchingModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip-v1.1.trt"
+  landmark:
+    name: "LandmarkModel"
+    predict_type: "trt"
+    model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
+  face_analysis:
+    name: "MediaPipeFaceModel"
+    predict_type: "mp"
+
+joyvasa_models:
+  motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
+  audio_model_path: "checkpoints/chinese-hubert-base"
+  motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
+
+crop_params:
+  src_dsize: 512
+  src_scale: 2.3
+  src_vx_ratio: 0.0
+  src_vy_ratio: -0.125
+  dri_scale: 2.2
+  dri_vx_ratio: 0.0
+  dri_vy_ratio: -0.1
+
+
+infer_params:
+  flag_crop_driving_video: False
+  flag_normalize_lip: True
+  flag_source_video_eye_retargeting: False
+  flag_video_editing_head_rotation: False
+  flag_eye_retargeting: False
+  flag_lip_retargeting: False
+  flag_stitching: True
+  flag_relative_motion: True
+  flag_pasteback: True
+  flag_do_crop: True
+  flag_do_rot: True
+  animation_region: "all"
+
+  # NOT EXPORTED PARAMS
+  lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
+  source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
+  driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
+  anchor_frame: 0 # TO IMPLEMENT
+  mask_crop_path: "./assets/mask_template.png"
+  driving_multiplier: 1.0
+
+  cfg_mode: "incremental"
+  cfg_scale: 1.2
+
+  source_max_dim: 1280 # the max dim of height and width of source image
+  source_division: 2 # make sure the height and width of source image can be divided by this number
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..52a0b3acb5fad81a365a2ff0ac78e3a3e1ae7b17
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,18 @@
+ffmpeg-python
+omegaconf
+onnx
+pycuda
+numpy
+opencv-python
+gradio
+scikit-image
+insightface
+huggingface_hub[cli]
+mediapipe
+torchgeometry
+soundfile
+munch
+phonemizer
+kokoro>=0.3.4
+misaki[ja]
+misaki[zh]
\ No newline at end of file
diff --git a/requirements_macos.txt b/requirements_macos.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e4d210654902d41d896f515674da6be9c82a417
--- /dev/null
+++ b/requirements_macos.txt
@@ -0,0 +1,18 @@
+ffmpeg-python
+omegaconf
+onnx
+onnxruntime
+numpy
+opencv-python
+gradio
+scikit-image
+insightface
+huggingface_hub[cli]
+mediapipe
+torchgeometry
+soundfile
+munch
+phonemizer
+kokoro>=0.3.4
+misaki[ja]
+misaki[zh]
\ No newline at end of file
diff --git a/requirements_win.txt b/requirements_win.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2d142da6eb8062aa8192b4228e28e8096a19c763
--- /dev/null
+++ b/requirements_win.txt
@@ -0,0 +1,17 @@
+ffmpeg-python
+omegaconf
+onnx
+numpy
+opencv-python
+gradio
+scikit-image
+insightface
+huggingface_hub[cli]
+mediapipe
+torchgeometry
+soundfile
+munch
+phonemizer
+kokoro>=0.3.4
+misaki[ja]
+misaki[zh]
\ No newline at end of file
diff --git a/run.py b/run.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ed8d1f725f8d9984b95a508ddb3fbbff26f2b7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,322 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: run.py
+
+"""
+# video
+ python run.py \
+ --src_image assets/examples/driving/d13.mp4 \
+ --dri_video assets/examples/driving/d11.mp4 \
+ --cfg configs/trt_infer.yaml \
+ --paste_back \
+ --animal
+# pkl
+ python run.py \
+ --src_image assets/examples/source/s12.jpg \
+ --dri_video ./results/2024-09-13-081710/d0.mp4.pkl \
+ --cfg configs/trt_infer.yaml \
+ --paste_back \
+ --animal
+"""
+import os
+import argparse
+import pdb
+import subprocess
+import ffmpeg
+import cv2
+import time
+import numpy as np
+import os
+import datetime
+import platform
+import pickle
+from omegaconf import OmegaConf
+from tqdm import tqdm
+from colorama import Fore, Back, Style
+from src.pipelines.faster_live_portrait_pipeline import FasterLivePortraitPipeline
+from src.utils.utils import video_has_audio
+
+if platform.system().lower() == 'windows':
+    FFMPEG = "third_party/ffmpeg-7.0.1-full_build/bin/ffmpeg.exe"
+else:
+    FFMPEG = "ffmpeg"
+
+
+def run_with_video(args):
+    print(Fore.RED+'Render,  Q > exit,  S > Stitching,  Z > RelativeMotion,  X > AnimationRegion,  C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale,  Space > Webcamassource,  R > SwitchRealtimeWebcamUpdate'+Style.RESET_ALL)
+    infer_cfg = OmegaConf.load(args.cfg)
+    infer_cfg.infer_params.flag_pasteback = args.paste_back
+
+    pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=args.animal)
+    ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
+    if not ret:
+        print(f"no face in {args.src_image}! exit!")
+        exit(1)
+    if not args.dri_video or not os.path.exists(args.dri_video):
+        # read frame from camera if no driving video input
+        vcap = cv2.VideoCapture(0)
+        if not vcap.isOpened():
+            print("no camera found! exit!")
+            exit(1)
+    else:
+        vcap = cv2.VideoCapture(args.dri_video)
+    fps = int(vcap.get(cv2.CAP_PROP_FPS))
+    h, w = pipe.src_imgs[0].shape[:2]
+    save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
+    os.makedirs(save_dir, exist_ok=True)
+
+    # render output video
+    if not args.realtime:
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        vsave_crop_path = os.path.join(save_dir,
+                                       f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-crop.mp4")
+        vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512 * 2, 512))
+        vsave_org_path = os.path.join(save_dir,
+                                      f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-org.mp4")
+        vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+    infer_times = []
+    motion_lst = []
+    c_eyes_lst = []
+    c_lip_lst = []
+
+    frame_ind = 0
+    while vcap.isOpened():
+        ret, frame = vcap.read()
+        if not ret:
+            break
+        t0 = time.time()
+        first_frame = frame_ind == 0
+        dri_crop, out_crop, out_org, dri_motion_info = pipe.run(frame, pipe.src_imgs[0], pipe.src_infos[0],
+                                                                first_frame=first_frame)
+        frame_ind += 1
+        if out_crop is None:
+            print(f"no face in driving frame:{frame_ind}")
+            continue
+
+        motion_lst.append(dri_motion_info[0])
+        c_eyes_lst.append(dri_motion_info[1])
+        c_lip_lst.append(dri_motion_info[2])
+
+        infer_times.append(time.time() - t0)
+        # print(time.time() - t0)
+        dri_crop = cv2.resize(dri_crop, (512, 512))
+        out_crop = np.concatenate([dri_crop, out_crop], axis=1)
+        out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+        if not args.realtime:
+            vout_crop.write(out_crop)
+            out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+            vout_org.write(out_org)
+        else:
+            if infer_cfg.infer_params.flag_pasteback:
+                out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+                cv2.imshow('Render', out_org)
+            else:
+                # image show in realtime mode
+                cv2.imshow('Render', out_crop)
+            # 按下'q'键退出循环
+            if cv2.waitKey(1) & 0xFF == ord('q'):
+                break
+    vcap.release()
+    if not args.realtime:
+        vout_crop.release()
+        vout_org.release()
+        if video_has_audio(args.dri_video):
+            vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
+            subprocess.call(
+                [FFMPEG, "-i", vsave_crop_path, "-i", args.dri_video,
+                 "-b:v", "10M", "-c:v",
+                 "libx264", "-map", "0:v", "-map", "1:a",
+                 "-c:a", "aac",
+                 "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
+            vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
+            subprocess.call(
+                [FFMPEG, "-i", vsave_org_path, "-i", args.dri_video,
+                 "-b:v", "10M", "-c:v",
+                 "libx264", "-map", "0:v", "-map", "1:a",
+                 "-c:a", "aac",
+                 "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
+
+            print(vsave_crop_path_new)
+            print(vsave_org_path_new)
+        else:
+            print(vsave_crop_path)
+            print(vsave_org_path)
+    else:
+        cv2.destroyAllWindows()
+
+    print(
+        "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
+                                                                            np.mean(infer_times) * 1000))
+    # save driving motion to pkl
+    template_dct = {
+        'n_frames': len(motion_lst),
+        'output_fps': fps,
+        'motion': motion_lst,
+        'c_eyes_lst': c_eyes_lst,
+        'c_lip_lst': c_lip_lst,
+    }
+    template_pkl_path = os.path.join(save_dir,
+                                     f"{os.path.basename(args.dri_video)}.pkl")
+    with open(template_pkl_path, "wb") as fw:
+        pickle.dump(template_dct, fw)
+    print(f"save driving motion pkl file at : {template_pkl_path}")
+
+
+def run_with_pkl(args):
+    infer_cfg = OmegaConf.load(args.cfg)
+    infer_cfg.infer_params.flag_pasteback = args.paste_back
+
+    pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=args.animal)
+    ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
+    if not ret:
+        print(f"no face in {args.src_image}! exit!")
+        return
+    with open(args.dri_video, "rb") as fin:
+        dri_motion_infos = pickle.load(fin)
+
+    fps = int(dri_motion_infos["output_fps"])
+    h, w = pipe.src_imgs[0].shape[:2]
+    save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
+    os.makedirs(save_dir, exist_ok=True)
+
+    # render output video
+    if not args.realtime:
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        vsave_crop_path = os.path.join(save_dir,
+                                       f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-crop.mp4")
+        vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512, 512))
+        vsave_org_path = os.path.join(save_dir,
+                                      f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-org.mp4")
+        vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+    infer_times = []
+    motion_lst = dri_motion_infos["motion"]
+    c_eyes_lst = dri_motion_infos["c_eyes_lst"] if "c_eyes_lst" in dri_motion_infos else dri_motion_infos[
+        "c_d_eyes_lst"]
+    c_lip_lst = dri_motion_infos["c_lip_lst"] if "c_lip_lst" in dri_motion_infos else dri_motion_infos["c_d_lip_lst"]
+
+    frame_num = len(motion_lst)
+    for frame_ind in tqdm(range(frame_num)):
+        t0 = time.time()
+        first_frame = frame_ind == 0
+        dri_motion_info_ = [motion_lst[frame_ind], c_eyes_lst[frame_ind], c_lip_lst[frame_ind]]
+        out_crop, out_org = pipe.run_with_pkl(dri_motion_info_, pipe.src_imgs[0], pipe.src_infos[0],
+                                              first_frame=first_frame)
+        if out_crop is None:
+            print(f"no face in driving frame:{frame_ind}")
+            continue
+
+        infer_times.append(time.time() - t0)
+        # print(time.time() - t0)
+        out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+        if not args.realtime:
+            vout_crop.write(out_crop)
+            out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+            vout_org.write(out_org)
+        else:
+            if infer_cfg.infer_params.flag_pasteback:
+                out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+                cv2.imshow('Render,  Q > exit,  S > Stitching,  Z > RelativeMotion,  X > AnimationRegion,  C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale,  Space > Webcamassource,  R > SwitchRealtimeWebcamUpdate',out_org)
+            else:
+                # image show in realtime mode
+                cv2.imshow('Render,  Q > exit,  S > Stitching,  Z > RelativeMotion,  X > AnimationRegion,  C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale,  Space > Webcamassource,  R > SwitchRealtimeWebcamUpdate', out_crop)
+            # Press the 'q' key to exit the loop, r to switch realtime src_webcam update, spacebar to switch sourceisWebcam
+            k = cv2.waitKey(1) & 0xFF
+            if k == ord('q'):
+                break
+            # Key for Interesting Params    
+            if k == ord('s'):
+                infer_cfg.infer_params.flag_stitching = not infer_cfg.infer_params.flag_stitching
+                print('flag_stitching:'+str(infer_cfg.infer_params.flag_stitching))
+            if k == ord('z'):
+                infer_cfg.infer_params.flag_relative_motion = not infer_cfg.infer_params.flag_relative_motion
+                print('flag_relative_motion:'+str(infer_cfg.infer_params.flag_relative_motion))                
+            if k == ord('x'):
+                if infer_cfg.infer_params.animation_region == "all": infer_cfg.infer_params.animation_region = "exp", print('animation_region = "exp"')
+                else:infer_cfg.infer_params.animation_region = "all", print('animation_region = "all"')
+            if k == ord('c'):
+                infer_cfg.infer_params.flag_crop_driving_video = not infer_cfg.infer_params.flag_crop_driving_video
+                print('flag_crop_driving_video:'+str(infer_cfg.infer_params.flag_crop_driving_video))  
+            if k == ord('v'):
+                infer_cfg.infer_params.flag_pasteback = not infer_cfg.infer_params.flag_pasteback
+                print('flag_pasteback:'+str(infer_cfg.infer_params.flag_pasteback)) 
+                
+            if k == ord('a'):
+                infer_cfg.infer_params.flag_normalize_lip = not infer_cfg.infer_params.flag_normalize_lip
+                print('flag_normalize_lip:'+str(infer_cfg.infer_params.flag_normalize_lip))  
+            if k == ord('d'):
+                infer_cfg.infer_params.flag_source_video_eye_retargeting = not infer_cfg.infer_params.flag_source_video_eye_retargeting
+                print('flag_source_video_eye_retargeting:'+str(infer_cfg.infer_params.flag_source_video_eye_retargeting))  
+            if k == ord('f'):
+                infer_cfg.infer_params.flag_video_editing_head_rotation = not infer_cfg.infer_params.flag_video_editing_head_rotation
+                print('flag_video_editing_head_rotation:'+str(infer_cfg.infer_params.flag_video_editing_head_rotation))                 
+            if k == ord('g'):
+                infer_cfg.infer_params.flag_eye_retargeting = not infer_cfg.infer_params.flag_eye_retargeting
+                print('flag_eye_retargeting:'+str(infer_cfg.infer_params.flag_eye_retargeting)) 
+                
+            if k == ord('k'):
+                infer_cfg.crop_params.src_scale -= 0.1
+                ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
+                print('src_scale:'+str(infer_cfg.crop_params.src_scale))                
+            if k == ord('l'):
+                infer_cfg.crop_params.src_scale += 0.1
+                ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
+                print('src_scale:'+str(infer_cfg.crop_params.src_scale))  
+            if k == ord('n'):
+                infer_cfg.crop_params.dri_scale -= 0.1
+                print('dri_scale:'+str(infer_cfg.crop_params.dri_scale))                
+            if k == ord('m'):
+                infer_cfg.crop_params.dri_scale += 0.1
+                print('dri_scale:'+str(infer_cfg.crop_params.dri_scale))
+
+    if not args.realtime:
+        vout_crop.release()
+        vout_org.release()
+        if video_has_audio(args.dri_video):
+            vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
+            subprocess.call(
+                [FFMPEG, "-i", vsave_crop_path, "-i", args.dri_video,
+                 "-b:v", "10M", "-c:v",
+                 "libx264", "-map", "0:v", "-map", "1:a",
+                 "-c:a", "aac",
+                 "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
+            vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
+            subprocess.call(
+                [FFMPEG, "-i", vsave_org_path, "-i", args.dri_video,
+                 "-b:v", "10M", "-c:v",
+                 "libx264", "-map", "0:v", "-map", "1:a",
+                 "-c:a", "aac",
+                 "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
+
+            print(vsave_crop_path_new)
+            print(vsave_org_path_new)
+        else:
+            print(vsave_crop_path)
+            print(vsave_org_path)
+    else:
+        cv2.destroyAllWindows()
+
+    print(
+        "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
+                                                                            np.mean(infer_times) * 1000))
+
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the options, then dispatch on the driving input.
+    parser = argparse.ArgumentParser(description='Faster Live Portrait Pipeline')
+
+    # Path-valued options: (flag, default, help text).
+    for flag, default, help_text in (
+            ('--src_image', "assets/examples/source/s12.jpg", 'source image'),
+            ('--dri_video', "assets/examples/driving/d14.mp4", 'driving video'),
+            ('--cfg', "configs/onnx_infer.yaml", 'inference config'),
+    ):
+        parser.add_argument(flag, required=False, type=str, default=default, help=help_text)
+
+    # Boolean switches; each one is False unless given on the command line.
+    for flag, help_text in (
+            ('--realtime', 'realtime inference'),
+            ('--animal', 'use animal model'),
+            ('--paste_back', 'paste back to origin image'),
+    ):
+        parser.add_argument(flag, action='store_true', default=False, help=help_text)
+
+    # Unrecognised arguments are collected in `unknown` instead of aborting the run.
+    args, unknown = parser.parse_known_args()
+
+    # A driving input ending in ".pkl" goes to run_with_pkl; anything else to run_with_video.
+    runner = run_with_pkl if args.dri_video.endswith(".pkl") else run_with_video
+    runner(args)
diff --git a/scripts/all_onnx2trt.bat b/scripts/all_onnx2trt.bat
new file mode 100644
index 0000000000000000000000000000000000000000..660fc1edae2c9a289baffd366926e4d03383615d
--- /dev/null
+++ b/scripts/all_onnx2trt.bat
@@ -0,0 +1,29 @@
+@echo off
+REM Convert every ONNX checkpoint (human and animal models) to a TensorRT engine
+REM with scripts\onnx2trt.py, using the interpreter in .\venv.
+setlocal
+set "TRT_CONVERT=.\venv\python.exe scripts\onnx2trt.py -o"
+set "HUMAN=.\checkpoints\liveportrait_onnx"
+set "ANIMAL=.\checkpoints\liveportrait_animal_onnx"
+
+REM NOTE(review): the animal file names here have no "-v1.1" suffix, unlike
+REM scripts/all_onnx2trt_animal.sh - confirm which names the Windows checkpoints use.
+
+REM warping+spade model
+%TRT_CONVERT% %HUMAN%\warping_spade-fix.onnx
+%TRT_CONVERT% %ANIMAL%\warping_spade-fix.onnx
+
+REM landmark model
+%TRT_CONVERT% %HUMAN%\landmark.onnx
+
+REM motion_extractor model (built in fp32 instead of the converter's default precision)
+%TRT_CONVERT% %HUMAN%\motion_extractor.onnx -p fp32
+%TRT_CONVERT% %ANIMAL%\motion_extractor.onnx -p fp32
+
+REM face_analysis model
+%TRT_CONVERT% %HUMAN%\retinaface_det_static.onnx
+%TRT_CONVERT% %HUMAN%\face_2dpose_106_static.onnx
+
+REM appearance_extractor model
+%TRT_CONVERT% %HUMAN%\appearance_feature_extractor.onnx
+%TRT_CONVERT% %ANIMAL%\appearance_feature_extractor.onnx
+
+REM stitching model
+%TRT_CONVERT% %HUMAN%\stitching.onnx
+%TRT_CONVERT% %HUMAN%\stitching_eye.onnx
+%TRT_CONVERT% %HUMAN%\stitching_lip.onnx
+
+%TRT_CONVERT% %ANIMAL%\stitching.onnx
+%TRT_CONVERT% %ANIMAL%\stitching_eye.onnx
+%TRT_CONVERT% %ANIMAL%\stitching_lip.onnx
diff --git a/scripts/all_onnx2trt.sh b/scripts/all_onnx2trt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0bb0a7aef0377e1c630e9093d9f46e3c7b0c8840
--- /dev/null
+++ b/scripts/all_onnx2trt.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Build TensorRT engines for the human-model ONNX checkpoints.
+
+onnx_dir=./checkpoints/liveportrait_onnx
+
+# Thin wrapper so each line below only names the model (plus optional converter flags).
+to_trt() {
+    python scripts/onnx2trt.py -o "$@"
+}
+
+# warping+spade model
+to_trt "$onnx_dir/warping_spade-fix.onnx"
+# landmark model
+to_trt "$onnx_dir/landmark.onnx"
+# motion_extractor model (built in fp32 instead of the converter's default precision)
+to_trt "$onnx_dir/motion_extractor.onnx" -p fp32
+# face_analysis model
+to_trt "$onnx_dir/retinaface_det_static.onnx"
+to_trt "$onnx_dir/face_2dpose_106_static.onnx"
+# appearance_extractor model
+to_trt "$onnx_dir/appearance_feature_extractor.onnx"
+# stitching model
+to_trt "$onnx_dir/stitching.onnx"
+to_trt "$onnx_dir/stitching_eye.onnx"
+to_trt "$onnx_dir/stitching_lip.onnx"
diff --git a/scripts/all_onnx2trt_animal.sh b/scripts/all_onnx2trt_animal.sh
new file mode 100644
index 0000000000000000000000000000000000000000..741f43bb403feb3767aca836acb481cae0bddf8c
--- /dev/null
+++ b/scripts/all_onnx2trt_animal.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Build TensorRT engines for the animal-model (v1.1) ONNX checkpoints.
+
+onnx_dir=./checkpoints/liveportrait_animal_onnx
+
+# Thin wrapper so each line below only names the model (plus optional converter flags).
+to_trt() {
+    python scripts/onnx2trt.py -o "$@"
+}
+
+# warping+spade model
+to_trt "$onnx_dir/warping_spade-fix-v1.1.onnx"
+# motion_extractor model (built in fp32 instead of the converter's default precision)
+to_trt "$onnx_dir/motion_extractor-v1.1.onnx" -p fp32
+# appearance_extractor model
+to_trt "$onnx_dir/appearance_feature_extractor-v1.1.onnx"
+# stitching model
+to_trt "$onnx_dir/stitching-v1.1.onnx"
+to_trt "$onnx_dir/stitching_eye-v1.1.onnx"
+to_trt "$onnx_dir/stitching_lip-v1.1.onnx"
diff --git a/scripts/onnx2trt.py b/scripts/onnx2trt.py
new file mode 100644
index 0000000000000000000000000000000000000000..86f482a46839006885380bc207f9af46d40b4a7c
--- /dev/null
+++ b/scripts/onnx2trt.py
@@ -0,0 +1,161 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import pdb
+import sys
+import logging
+import argparse
+import platform
+
+import tensorrt as trt
+import ctypes
+import numpy as np
+
+logging.basicConfig(level=logging.INFO)
+logging.getLogger("EngineBuilder").setLevel(logging.INFO)
+log = logging.getLogger("EngineBuilder")
+
+
+def load_plugins(logger: trt.Logger):
+    """
+    Load the grid_sample_3d plugin shared library and initialise TensorRT's plugin registry.
+
+    The library is looked up under ./checkpoints/liveportrait_onnx relative to the
+    current working directory: the ``.so`` on Linux, the ``.dll`` on any other platform.
+
+    :param logger: TensorRT logger handed to ``trt.init_libnvinfer_plugins``.
+    """
+    # Pick the platform-specific library; only the non-Linux branch passes ``winmode``.
+    if platform.system().lower() == 'linux':
+        plugin_path = "./checkpoints/liveportrait_onnx/libgrid_sample_3d_plugin.so"
+        extra_kwargs = {}
+    else:
+        plugin_path = "./checkpoints/liveportrait_onnx/grid_sample_3d_plugin.dll"
+        extra_kwargs = {"winmode": 0}
+    # Load the plugin library
+    ctypes.CDLL(plugin_path, mode=ctypes.RTLD_GLOBAL, **extra_kwargs)
+    # Initialise TensorRT's plugin library
+    trt.init_libnvinfer_plugins(logger, "")
+
+
+class EngineBuilder:
+    """
+    Parses an ONNX graph and builds a TensorRT engine from it.
+
+    Usage: construct, call ``create_network(onnx_path)``, then
+    ``create_engine(engine_path, precision)``.
+    """
+
+    def __init__(self, verbose=False):
+        """
+        :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger.
+        """
+        self.trt_logger = trt.Logger(trt.Logger.INFO)
+        if verbose:
+            self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE
+
+        trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
+
+        self.builder = trt.Builder(self.trt_logger)
+        self.config = self.builder.create_builder_config()
+        self.config.max_workspace_size = 12 * (2 ** 30)  # 12 GB
+
+        # The profile is registered without any shapes; the commented calls below show
+        # how fixed input shapes were supplied for two specific models.
+        profile = self.builder.create_optimization_profile()
+
+        # for face_2dpose_106.onnx
+        # profile.set_shape("data", (1, 3, 192, 192), (1, 3, 192, 192), (1, 3, 192, 192))
+        # for retinaface_det.onnx
+        # profile.set_shape("input.1", (1, 3, 512, 512), (1, 3, 512, 512), (1, 3, 512, 512))
+
+        self.config.add_optimization_profile(profile)
+        # Strict type constraints
+        self.config.set_flag(trt.BuilderFlag.STRICT_TYPES)
+
+        self.batch_size = None
+        self.network = None
+        self.parser = None
+
+        # Load the custom plugin
+        load_plugins(self.trt_logger)
+
+    def create_network(self, onnx_path):
+        """
+        Parse the ONNX graph and create the corresponding TensorRT network definition.
+        Exits the process with status 1 if the ONNX file cannot be parsed.
+        :param onnx_path: The path to the ONNX graph to load.
+        """
+        network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+        self.network = self.builder.create_network(network_flags)
+        self.parser = trt.OnnxParser(self.network, self.trt_logger)
+
+        onnx_path = os.path.realpath(onnx_path)
+        with open(onnx_path, "rb") as f:
+            if not self.parser.parse(f.read()):
+                log.error("Failed to load ONNX file: {}".format(onnx_path))
+                for error in range(self.parser.num_errors):
+                    log.error(self.parser.get_error(error))
+                sys.exit(1)
+
+        inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
+        outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
+
+        log.info("Network Description")
+        for input in inputs:
+            # batch_size ends up as the leading dimension of the last input listed
+            self.batch_size = input.shape[0]
+            log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
+        for output in outputs:
+            log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
+        # assert self.batch_size > 0
+        self.builder.max_batch_size = 1
+
+    def create_engine(
+            self,
+            engine_path,
+            precision
+    ):
+        """
+        Build the TensorRT engine and serialize it to disk.
+        Exits the process with status 1 if TensorRT fails to build the engine.
+        :param engine_path: The path where to serialize the engine to.
+        :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'.
+            Only 'fp16' changes the builder configuration; 'int8' is logged as
+            unsupported and the engine is built without an INT8 flag.
+        """
+        engine_path = os.path.realpath(engine_path)
+        engine_dir = os.path.dirname(engine_path)
+        os.makedirs(engine_dir, exist_ok=True)
+        log.info("Building {} Engine in {}".format(precision, engine_path))
+
+        if precision == "fp16":
+            if not self.builder.platform_has_fast_fp16:
+                log.warning("FP16 is not supported natively on this platform/device")
+            else:
+                self.config.set_flag(trt.BuilderFlag.FP16)
+        elif precision == "int8":
+            # No INT8 flag or calibrator is configured here, so say so instead of
+            # silently producing an engine that is not INT8.
+            log.warning("INT8 was requested but is not configured by this script; "
+                        "building without the INT8 flag")
+
+        engine = self.builder.build_engine(self.network, self.config)
+        if engine is None:
+            # A failed build yields None; report it like a parse failure instead of
+            # crashing with an unrelated error inside the `with` statement below.
+            log.error("Failed to build TensorRT engine: {}".format(engine_path))
+            sys.exit(1)
+
+        with engine, open(engine_path, "wb") as f:
+            log.info("Serializing engine to file: {:}".format(engine_path))
+            f.write(engine.serialize())
+
+
+def main(args):
+    """
+    Convert one ONNX model to a serialized TensorRT engine.
+
+    :param args: Parsed command-line namespace; reads ``verbose``, ``onnx``,
+        ``engine`` and ``precision``.
+    """
+    engine_builder = EngineBuilder(verbose=args.verbose)
+    engine_builder.create_network(onnx_path=args.onnx)
+    engine_builder.create_engine(engine_path=args.engine, precision=args.precision)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, derive the engine path if needed, build.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-o", "--onnx", required=True, help="The input ONNX model file to load")
+    parser.add_argument("-e", "--engine", help="The output path for the TRT engine")
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default="fp16",
+        choices=["fp32", "fp16", "int8"],
+        help="The precision mode to build in, either 'fp32', 'fp16' or 'int8', default: 'fp16'",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="Enable more verbose log output")
+    args = parser.parse_args()
+    if args.engine is None:
+        # Default to "<model>.trt" next to the ONNX file. Only the final extension is
+        # swapped: a plain str.replace(".onnx", ".trt") would also rewrite ".onnx" inside
+        # directory names, and would leave the path unchanged (so the engine overwrites
+        # the input model) when the file does not end in a lowercase ".onnx".
+        args.engine = os.path.splitext(args.onnx)[0] + ".trt"
+    main(args)
diff --git a/scripts/start_api.sh b/scripts/start_api.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ba92ae0ce13c14e96e042498f65d741cad16f815
--- /dev/null
+++ b/scripts/start_api.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Start the API server after loading the interactive shell profile.
+source ~/.bashrc
+# exec replaces this shell with the Python process, so signals sent to the script
+# (e.g. SIGTERM when a container is stopped) reach the server directly.
+exec python api.py
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9085a37e4abb3b69ea913c0919667ff6ca3c8a
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/JoyVASA/__init__.py b/src/models/JoyVASA/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..be72427b1ff842b6de4fa76ff06a60c71650f309
--- /dev/null
+++ b/src/models/JoyVASA/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/12/15
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/JoyVASA/common.py b/src/models/JoyVASA/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ebbb510ec54d2a35790e0993156001f7e015a1
--- /dev/null
+++ b/src/models/JoyVASA/common.py
@@ -0,0 +1,46 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PositionalEncoding(nn.Module):
+    """
+    Sinusoidal positional encoding followed by dropout.
+
+    A ``(1, max_len, d_model)`` table is precomputed and kept as the buffer ``pe``:
+    even channels hold sines, odd channels hold cosines.
+    """
+
+    def __init__(self, d_model, dropout=0.1, max_len=600):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        # vanilla sinusoidal encoding: angle = position * per-channel frequency
+        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        table = torch.zeros(max_len, d_model)
+        table[:, 0::2] = torch.sin(positions * freqs)
+        table[:, 1::2] = torch.cos(positions * freqs)
+        self.register_buffer('pe', table.unsqueeze(0))
+
+    def forward(self, x):
+        # NOTE(review): the index below selects the single table row at position
+        # x.shape[1] and broadcasts it over every time step; it is not the slice
+        # pe[:, :x.shape[1]]. Kept as-is because changing it would change the outputs
+        # of already-trained checkpoints - confirm before touching.
+        seq_len = x.shape[1]
+        shifted = x + self.pe[:, seq_len, :]
+        return self.dropout(shifted)
+
+
+def enc_dec_mask(T, S, frame_width=2, expansion=0, device='cuda'):
+    """
+    Build a ``(T, S)`` boolean mask in which ``False`` marks each row's window.
+
+    Row ``i`` is ``False`` on columns
+    ``[max(0, (i - expansion) * frame_width), (i + expansion + 1) * frame_width)``
+    and ``True`` everywhere else. The mask is built on the CPU and then moved to ``device``.
+    """
+    outside_window = torch.ones(T, S, dtype=torch.bool)
+    for row in range(T):
+        first = max(0, (row - expansion) * frame_width)
+        stop = (row + expansion + 1) * frame_width
+        outside_window[row, first:stop] = False
+    return outside_window.to(device=device)
+
+
+def pad_audio(audio, audio_unit=320, pad_threshold=80):
+    """
+    Pad a 2-D ``(batch, samples)`` audio tensor symmetrically along the sample axis.
+
+    The per-side amount is ``ceil((audio_unit * n_units + pad_threshold - audio_len) / 2)``
+    with ``n_units = audio_len // audio_unit``. When that amount is negative the input
+    is returned unchanged. Otherwise each side gets two reflect pads of ``side_len // 2``
+    samples, plus one replicated sample when ``side_len`` is odd.
+    """
+    batch_size, audio_len = audio.shape  # also enforces a 2-D input
+    whole_units = audio_len // audio_unit
+    side_len = math.ceil((audio_unit * whole_units + pad_threshold - audio_len) / 2)
+    if side_len < 0:
+        return audio
+
+    reflect_len, replicate_len = divmod(side_len, 2)
+    if reflect_len > 0:
+        # Two passes of reflect_len each, as a single reflect pad cannot be wider
+        # than the signal it mirrors.
+        for _ in range(2):
+            audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
+    if replicate_len > 0:
+        audio = F.pad(audio, (1, 1), mode='replicate')
+    return audio
diff --git a/src/models/JoyVASA/dit_talking_head.py b/src/models/JoyVASA/dit_talking_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc97977d536ed4539d420de4cbf3805695053142
--- /dev/null
+++ b/src/models/JoyVASA/dit_talking_head.py
@@ -0,0 +1,538 @@
+import pdb
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import platform
+from .common import PositionalEncoding, enc_dec_mask, pad_audio
+from tqdm import tqdm
+
+
+class DiffusionSchedule(nn.Module):
+    """
+    Precomputed diffusion noise schedule, stored as buffers indexed by step ``0..num_steps``.
+
+    Index 0 is a padding step with ``beta = 0``. Buffers: ``betas``, ``alphas``
+    (``1 - betas``), ``alpha_bars`` (running product of ``alphas``), ``sigmas_flex``
+    (``sqrt(betas)``) and ``sigmas_inflex``.
+    """
+
+    def __init__(self, num_steps, mode='linear', beta_1=1e-4, beta_T=0.02, s=0.008):
+        super().__init__()
+
+        step_betas = self._betas_for_mode(num_steps, mode, beta_1, beta_T, s)
+        betas = torch.cat([torch.zeros(1), step_betas], dim=0)  # Padding beta_0 = 0
+
+        alphas = 1 - betas
+        # Running product of alphas, accumulated in log space.
+        log_alpha_bars = torch.log(alphas)
+        for step in range(1, log_alpha_bars.shape[0]):  # 1 to T
+            log_alpha_bars[step] += log_alpha_bars[step - 1]
+        alpha_bars = log_alpha_bars.exp()
+
+        sigmas_flex = torch.sqrt(betas)
+        # sigma_inflex[i]^2 = (1 - alpha_bar[i-1]) / (1 - alpha_bar[i]) * beta[i]; entry 0 stays 0.
+        inflex_sq = torch.zeros_like(sigmas_flex)
+        inflex_sq[1:] = ((1 - alpha_bars[:-1]) / (1 - alpha_bars[1:])) * betas[1:]
+        sigmas_inflex = torch.sqrt(inflex_sq)
+
+        self.num_steps = num_steps
+        for buffer_name, values in (
+                ('betas', betas),
+                ('alphas', alphas),
+                ('alpha_bars', alpha_bars),
+                ('sigmas_flex', sigmas_flex),
+                ('sigmas_inflex', sigmas_inflex),
+        ):
+            self.register_buffer(buffer_name, values)
+
+    @staticmethod
+    def _betas_for_mode(num_steps, mode, beta_1, beta_T, s):
+        """Return the ``num_steps`` betas for steps 1..T; raise ValueError for an unknown mode."""
+        if mode == 'linear':
+            return torch.linspace(beta_1, beta_T, num_steps)
+        if mode == 'quadratic':
+            return torch.linspace(beta_1 ** 0.5, beta_T ** 0.5, num_steps) ** 2
+        if mode == 'sigmoid':
+            return torch.sigmoid(torch.linspace(-5, 5, num_steps)) * (beta_T - beta_1) + beta_1
+        if mode == 'cosine':
+            grid = torch.linspace(0, num_steps, num_steps + 1)
+            bars = torch.cos(((grid / num_steps) + s) / (1 + s) * torch.pi * 0.5) ** 2
+            bars = bars / bars[0]
+            return torch.clip(1 - (bars[1:] / bars[:-1]), 0.0001, 0.999)
+        raise ValueError(f'Unknown diffusion schedule {mode}!')
+
+    def uniform_sample_t(self, batch_size):
+        """Draw ``batch_size`` steps uniformly from ``1..num_steps`` and return them as a list of ints."""
+        return torch.randint(1, self.num_steps + 1, (batch_size,)).tolist()
+
+    def get_sigmas(self, t, flexibility=0):
+        """Blend ``sigmas_flex[t]`` and ``sigmas_inflex[t]``; ``flexibility`` in [0, 1] weights the former."""
+        assert 0 <= flexibility <= 1
+        return self.sigmas_flex[t] * flexibility + self.sigmas_inflex[t] * (1 - flexibility)
+
+
+class DitTalkingHead(nn.Module):
+    def __init__(self, device='cuda', target="sample", architecture="decoder",
+                 motion_feat_dim=76, fps=25, n_motions=100, n_prev_motions=10,
+                 audio_model="hubert", feature_dim=512, n_diff_steps=500, diff_schedule="cosine",
+                 cfg_mode="incremental", guiding_conditions="audio,", audio_encoder_path=''):
+        """
+        Build the audio encoder, the denoising network and the diffusion schedule.
+
+        Args:
+            device: device the whole module is moved to at the end of construction
+            target: stored as ``self.target`` (predict the original sample or the noise)
+            architecture: only 'decoder' is accepted
+            motion_feat_dim: motion feature dimension
+            fps: frame rate, stored as ``self.fps``
+            n_motions: number of motions in the current window (window_length, T_w)
+            n_prev_motions: number of preceding motions (T_p)
+            audio_model: 'wav2vec2', 'wav2vec2_ori', 'hubert', 'hubert_zh' or 'hubert_zh_ori'
+            feature_dim: width the 768-wide encoder output is projected to
+            n_diff_steps: number of diffusion steps
+            diff_schedule: schedule mode handed to DiffusionSchedule
+            cfg_mode: stored as ``self.cfg_mode``
+            guiding_conditions: comma-separated condition names; only 'audio' is kept
+            audio_encoder_path: value handed to the encoder's ``from_pretrained``
+
+        Raises:
+            ValueError: for an unknown ``audio_model`` or ``architecture``.
+        """
+        super().__init__()
+
+        # Model parameters
+        self.target = target  # predict the original sample or predict the noise
+        self.architecture = architecture
+        self.motion_feat_dim = motion_feat_dim  # motion feature dimension
+        self.fps = fps
+        self.n_motions = n_motions  # motions in the current window (100 by default), window_length, T_w
+        self.n_prev_motions = n_prev_motions  # preceding motions (10 by default), T_p
+        self.feature_dim = feature_dim
+
+        # Audio encoder
+        self.audio_model = audio_model
+        # Rows: (name, startup message, encoder family, also freeze projection + layers 0/1).
+        # Original author's note: empirically, the hubert feature extractor works better.
+        encoder_table = (
+            ('wav2vec2', "using wav2vec2 audio encoder ...", 'wav2vec2', True),
+            ('wav2vec2_ori', None, 'wav2vec2', False),
+            ('hubert', None, 'hubert', True),
+            ('hubert_zh', "using hubert chinese", 'hubert', True),
+            ('hubert_zh_ori', "using hubert chinese ori", 'hubert', False),
+        )
+        for known_name, banner, family, freeze_early_layers in encoder_table:
+            if self.audio_model == known_name:
+                break
+        else:
+            raise ValueError(f'Unknown audio model {self.audio_model}!')
+
+        if banner is not None:
+            print(banner)
+        if family == 'wav2vec2':
+            from .wav2vec2 import Wav2Vec2Model as encoder_cls
+        else:
+            from .hubert import HubertModel as encoder_cls
+        self.audio_encoder = encoder_cls.from_pretrained(audio_encoder_path)
+        # The convolutional feature extractor is frozen for every encoder variant.
+        self.audio_encoder.feature_extractor._freeze_parameters()
+        if freeze_early_layers:
+            frozen_layers = (0, 1)
+            for name, param in self.audio_encoder.named_parameters():
+                if name.startswith("feature_projection"):
+                    param.requires_grad = False
+                elif name.startswith("encoder.layers") and int(name.split(".")[2]) in frozen_layers:
+                    param.requires_grad = False
+
+        if architecture != 'decoder':
+            raise ValueError(f'Unknown architecture {architecture}!')
+        self.audio_feature_map = nn.Linear(768, feature_dim)
+        # Learned stand-ins used when no previous audio / motion context is supplied.
+        self.start_audio_feat = nn.Parameter(torch.randn(1, self.n_prev_motions, feature_dim))
+
+        self.start_motion_feat = nn.Parameter(
+            torch.randn(1, self.n_prev_motions, self.motion_feat_dim))  # 1, 10, 76 with the defaults
+
+        # Diffusion model
+        self.denoising_net = DenoisingNetwork(device=device, n_motions=self.n_motions,
+                                              n_prev_motions=self.n_prev_motions,
+                                              motion_feat_dim=self.motion_feat_dim, feature_dim=feature_dim)
+        # diffusion schedule
+        self.diffusion_sched = DiffusionSchedule(n_diff_steps, diff_schedule)
+
+        # Classifier-free settings
+        self.cfg_mode = cfg_mode
+        requested_conditions = guiding_conditions.split(',') if guiding_conditions else []
+        self.guiding_conditions = [cond for cond in requested_conditions if cond in ['audio']]
+        if 'audio' in self.guiding_conditions:
+            # Learned "no audio" feature substituted for the real one under guidance.
+            self.null_audio_feat = nn.Parameter(torch.randn(1, 1, feature_dim))  # 1, 1, 512 with the defaults
+
+        self.to(device)
+
+    @property
+    def device(self):
+        """Device of the first parameter yielded by ``self.parameters()``."""
+        first_param = next(self.parameters())
+        return first_param.device
+
+    def forward(self, motion_feat, audio_or_feat, prev_motion_feat=None, prev_audio_feat=None, time_step=None,
+                indicator=None):
+        """
+        Noise the motion sequence at a diffusion step and run the denoising network on it.
+
+        Args:
+            motion_feat: (N, L, d_coef) motion coefficients or features
+            audio_or_feat: (N, L_audio) raw audio or audio feature
+            prev_motion_feat: (N, n_prev_motions, d_motion) previous motion coefficients or feature
+            prev_audio_feat: (N, n_prev_motions, d_audio) previous audio features
+            time_step: (N,) diffusion steps; sampled uniformly when None
+            indicator: (N, L) 0/1 indicator of real (unpadded) motion coefficients
+
+        Returns:
+            A 4-tuple: the sampled noise ``eps``, the denoising network's output,
+            ``motion_feat`` detached, and the (un-masked) audio features detached.
+        """
+        batch_size = motion_feat.shape[0]
+
+        # Resolve the audio condition: 2-D input is raw audio, 3-D input is precomputed features.
+        n_audio_dims = audio_or_feat.ndim
+        if n_audio_dims == 2:
+            # Extract audio features
+            assert audio_or_feat.shape[1] == 16000 * self.n_motions / self.fps, \
+                f'Incorrect audio length {audio_or_feat.shape[1]}'
+            audio_feat_saved = self.extract_audio_feature(audio_or_feat)  # (N, L, feature_dim)
+        elif n_audio_dims == 3:
+            assert audio_or_feat.shape[1] == self.n_motions, f'Incorrect audio feature length {audio_or_feat.shape[1]}'
+            audio_feat_saved = audio_or_feat
+        else:
+            raise ValueError(f'Incorrect audio input shape {audio_or_feat.shape}')
+        # Work on a copy so the returned features are not affected by guidance masking.
+        audio_feat = audio_feat_saved.clone()
+
+        # Fall back to the learned start features when no previous context is given.
+        if prev_motion_feat is None:
+            prev_motion_feat = self.start_motion_feat.expand(batch_size, -1, -1)  # (N, n_prev_motions, d_motion)
+        if prev_audio_feat is None:
+            prev_audio_feat = self.start_audio_feat.expand(batch_size, -1, -1)  # (N, n_prev_motions, feature_dim)
+
+        # Classifier-free guidance: per sample, randomly swap the audio features for the null feature.
+        n_conditions = len(self.guiding_conditions)
+        if n_conditions > 0:
+            assert n_conditions <= 2, 'Only support 1 or 2 CFG conditions!'
+            audio_is_guided = 'audio' in self.guiding_conditions
+            drop_audio = None
+            if n_conditions == 1 or self.cfg_mode == 'independent':
+                null_cond_prob = 0.1 if n_conditions < 2 else 0.5
+                if audio_is_guided:
+                    drop_audio = torch.rand(batch_size, device=self.device) < null_cond_prob
+            else:
+                # len(self.guiding_conditions) > 1 and self.cfg_mode == 'incremental'
+                # full (0.45), w/o style (0.45), w/o style or audio (0.1)
+                mask_flag = torch.rand(batch_size, device=self.device)
+                if audio_is_guided:
+                    drop_audio = mask_flag > 0.9
+            if drop_audio is not None:
+                audio_feat = torch.where(drop_audio.view(-1, 1, 1),
+                                         self.null_audio_feat.expand(batch_size, self.n_motions, -1),
+                                         audio_feat)
+
+        if time_step is None:
+            # Sample time step
+            time_step = self.diffusion_sched.uniform_sample_t(batch_size)  # (N,)
+
+        # The forward diffusion process: x_t = sqrt(alpha_bar) * x_0 + sqrt(1 - alpha_bar) * eps
+        alpha_bar = self.diffusion_sched.alpha_bars[time_step]  # (N,)
+        signal_scale = torch.sqrt(alpha_bar).view(-1, 1, 1)  # (N, 1, 1)
+        noise_scale = torch.sqrt(1 - alpha_bar).view(-1, 1, 1)  # (N, 1, 1)
+
+        eps = torch.randn_like(motion_feat)  # (N, L, d_motion)
+        motion_feat_noisy = signal_scale * motion_feat + noise_scale * eps
+
+        # The reverse diffusion process
+        motion_feat_target = self.denoising_net(motion_feat_noisy, audio_feat,
+                                                prev_motion_feat, prev_audio_feat, time_step, indicator)
+
+        return eps, motion_feat_target, motion_feat.detach(), audio_feat_saved.detach()
+
+    def extract_audio_feature(self, audio, frame_num=None):
+        """
+        Encode raw audio into per-frame audio features.
+
+        Args:
+            audio: raw audio, padded with ``pad_audio`` before encoding
+            frame_num: number of output frames; ``self.n_motions`` when None or 0
+
+        Returns:
+            Audio features of shape (N, frame_num, feature_dim).
+        """
+        if not frame_num:
+            frame_num = self.n_motions
+
+        # # Strategy 1: resample during audio feature extraction
+        # hidden_states = self.audio_encoder(pad_audio(audio), self.fps, frame_num=frame_num).last_hidden_state  # (N, L, 768)
+
+        # Strategy 2: resample after audio feature extraction (BackResample).
+        # The encoder is asked for twice as many frames, which are then linearly
+        # interpolated down to frame_num along the time axis.
+        encoder_out = self.audio_encoder(pad_audio(audio), self.fps, frame_num=frame_num * 2)
+        channels_first = encoder_out.last_hidden_state.transpose(1, 2)  # (N, 768, 2L)
+        resampled = F.interpolate(channels_first, size=frame_num, align_corners=False, mode='linear')  # (N, 768, L)
+        per_frame = resampled.transpose(1, 2)  # (N, L, 768)
+
+        return self.audio_feature_map(per_frame)  # (N, L, feature_dim)
+
+    @torch.no_grad()
+    def sample(self, audio_or_feat, prev_motion_feat=None, prev_audio_feat=None,
+               motion_at_T=None, indicator=None, cfg_mode=None, cfg_cond=None, cfg_scale=1.15, flexibility=0,
+               dynamic_threshold=None, ret_traj=False):
+        """Run the reverse diffusion loop; returns (traj if ret_traj else final motion, motion_at_T, audio_feat)."""
+        batch_size = audio_or_feat.shape[0]
+
+        # Resolve the classifier-free guidance (CFG) settings, falling back to the model defaults
+        if cfg_mode is None:  # Use default CFG mode
+            cfg_mode = self.cfg_mode
+        if cfg_cond is None:  # Use default CFG conditions
+            cfg_cond = self.guiding_conditions
+        cfg_cond = [c for c in cfg_cond if c in ['audio', ]]  # only 'audio' guidance is kept
+
+        if not isinstance(cfg_scale, list):
+            cfg_scale = [cfg_scale] * len(cfg_cond)  # one scale per guiding condition
+
+        # sort cfg_cond and cfg_scale
+        if len(cfg_cond) > 0:
+            cfg_cond, cfg_scale = zip(*sorted(zip(cfg_cond, cfg_scale), key=lambda x: ['audio', ].index(x[0])))
+        else:
+            cfg_cond, cfg_scale = [], []
+
+        if audio_or_feat.ndim == 2:
+            # 2-D input is raw audio: its length must equal 16000 * n_motions / fps samples
+            assert audio_or_feat.shape[1] == 16000 * self.n_motions / self.fps, \
+                f'Incorrect audio length {audio_or_feat.shape[1]}'
+            audio_feat = self.extract_audio_feature(audio_or_feat)  # (N, L, feature_dim)
+        elif audio_or_feat.ndim == 3:
+            assert audio_or_feat.shape[1] == self.n_motions, f'Incorrect audio feature length {audio_or_feat.shape[1]}'
+            audio_feat = audio_or_feat
+        else:
+            raise ValueError(f'Incorrect audio input shape {audio_or_feat.shape}')
+
+        if prev_motion_feat is None:
+            prev_motion_feat = self.start_motion_feat.expand(batch_size, -1, -1)  # (N, n_prev_motions, d_motion)
+        if prev_audio_feat is None:
+            prev_audio_feat = self.start_audio_feat.expand(batch_size, -1, -1)  # (N, n_prev_motions, feature_dim)
+
+        if motion_at_T is None:
+            motion_at_T = torch.randn((batch_size, self.n_motions, self.motion_feat_dim)).to(self.device)
+
+        # Prepare input for the reverse diffusion process (including optional classifier-free guidance)
+        if 'audio' in cfg_cond:
+            audio_feat_null = self.null_audio_feat.expand(batch_size, self.n_motions, -1)
+        else:
+            audio_feat_null = audio_feat
+
+        audio_feat_in = [audio_feat_null]
+        for cond in cfg_cond:
+            if cond == 'audio':
+                audio_feat_in.append(audio_feat)
+
+        n_entries = len(audio_feat_in)
+        audio_feat_in = torch.cat(audio_feat_in, dim=0)
+        prev_motion_feat_in = torch.cat([prev_motion_feat] * n_entries, dim=0)
+        prev_audio_feat_in = torch.cat([prev_audio_feat] * n_entries, dim=0)
+        indicator_in = torch.cat([indicator] * n_entries, dim=0) if indicator is not None else None
+
+        traj = {self.diffusion_sched.num_steps: motion_at_T}
+        for t in tqdm(range(self.diffusion_sched.num_steps, 0, -1)):
+            if t > 1:
+                z = torch.randn_like(motion_at_T)
+            else:
+                z = torch.zeros_like(motion_at_T)
+
+            alpha = self.diffusion_sched.alphas[t]
+            alpha_bar = self.diffusion_sched.alpha_bars[t]
+            alpha_bar_prev = self.diffusion_sched.alpha_bars[t - 1]
+            sigma = self.diffusion_sched.get_sigmas(t, flexibility)
+
+            motion_at_t = traj[t]
+            motion_in = torch.cat([motion_at_t] * n_entries, dim=0)
+            step_in = torch.tensor([t] * batch_size, device=self.device)
+            step_in = torch.cat([step_in] * n_entries, dim=0)
+
+            results = self.denoising_net(motion_in, audio_feat_in, prev_motion_feat_in,
+                                         prev_audio_feat_in, step_in, indicator_in)
+
+            # Apply thresholding if specified
+            if dynamic_threshold:
+                dt_ratio, dt_min, dt_max = dynamic_threshold
+                abs_results = results[:, -self.n_motions:].reshape(batch_size * n_entries, -1).abs()
+                s = torch.quantile(abs_results, dt_ratio, dim=1)
+                s = torch.clamp(s, min=dt_min, max=dt_max)
+                s = s[..., None, None]
+                results = torch.clamp(results, min=-s, max=s)
+
+            results = results.chunk(n_entries)
+
+            # Unconditional target (CFG) or the conditional target (non-CFG)
+            target_theta = results[0][:, -self.n_motions:]
+            # Classifier-free Guidance (optional); accumulate out of place, target_theta starts as a view of results[0]
+            for i in range(0, n_entries - 1):
+                if cfg_mode == 'independent':
+                    target_theta = target_theta + cfg_scale[i] * (
+                            results[i + 1][:, -self.n_motions:] - results[0][:, -self.n_motions:])
+                elif cfg_mode == 'incremental':
+                    target_theta = target_theta + cfg_scale[i] * (
+                            results[i + 1][:, -self.n_motions:] - results[i][:, -self.n_motions:])
+                else:
+                    raise NotImplementedError(f'Unknown cfg_mode {cfg_mode}')
+
+            if self.target == 'noise':
+                c0 = 1 / torch.sqrt(alpha)
+                c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)
+                motion_next = c0 * (motion_at_t - c1 * target_theta) + sigma * z
+            elif self.target == 'sample':
+                c0 = (1 - alpha_bar_prev) * torch.sqrt(alpha) / (1 - alpha_bar)
+                c1 = (1 - alpha) * torch.sqrt(alpha_bar_prev) / (1 - alpha_bar)
+                motion_next = c0 * motion_at_t + c1 * target_theta + sigma * z
+            else:
+                raise ValueError('Unknown target type: {}'.format(self.target))
+
+            traj[t - 1] = motion_next.detach()  # Stop gradient and save trajectory.
+            if ret_traj:
+                traj[t] = traj[t].cpu()  # Move the finished step to CPU memory only when it is kept.
+            else:
+                del traj[t]
+
+        if ret_traj:
+            return traj, motion_at_T, audio_feat
+        else:
+            return traj[0], motion_at_T, audio_feat
+
+
+class DenoisingNetwork(nn.Module):
+    """Transformer-decoder denoiser: motion tokens attend to audio features, with a diffusion-step embedding added."""
+    def __init__(self, device='cuda', motion_feat_dim=76,
+                 use_indicator=None, architecture="decoder", feature_dim=512, n_heads=8,
+                 n_layers=8, mlp_ratio=4, align_mask_width=1, no_use_learnable_pe=True, n_prev_motions=10,
+                 n_motions=100, n_diff_steps=500, ):
+        super().__init__()
+
+        # Model parameters
+        self.motion_feat_dim = motion_feat_dim
+        self.use_indicator = use_indicator
+
+        # Transformer
+        self.architecture = architecture
+        self.feature_dim = feature_dim
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.mlp_ratio = mlp_ratio
+        self.align_mask_width = align_mask_width
+        self.use_learnable_pe = not no_use_learnable_pe  # constructor flag is negated; stored in positive form
+
+        # sequence length
+        self.n_prev_motions = n_prev_motions
+        self.n_motions = n_motions
+
+        # Temporal embedding for the diffusion time step
+        self.TE = PositionalEncoding(self.feature_dim, max_len=n_diff_steps + 1)  # looked up by `step` in forward()
+        self.diff_step_map = nn.Sequential(
+            nn.Linear(self.feature_dim, self.feature_dim),
+            nn.GELU(),
+            nn.Linear(self.feature_dim, self.feature_dim)
+        )
+
+        if self.use_learnable_pe:
+            # Learnable positional encoding. NOTE(review): sized 1 + L_p + L, but forward() adds it to L_p + L tokens
+            self.PE = nn.Parameter(torch.randn(1, 1 + self.n_prev_motions + self.n_motions, self.feature_dim))
+        else:
+            self.PE = PositionalEncoding(self.feature_dim)
+
+        # Transformer decoder
+        if self.architecture == 'decoder':
+            self.feature_proj = nn.Linear(self.motion_feat_dim + (1 if self.use_indicator else 0),
+                                          self.feature_dim)
+            decoder_layer = nn.TransformerDecoderLayer(
+                d_model=self.feature_dim, nhead=self.n_heads, dim_feedforward=self.mlp_ratio * self.feature_dim,
+                activation='gelu', batch_first=True
+            )
+            self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=self.n_layers)
+            if self.align_mask_width > 0:
+                motion_len = self.n_prev_motions + self.n_motions
+                alignment_mask = enc_dec_mask(motion_len, motion_len, frame_width=1,
+                                              expansion=self.align_mask_width - 1)
+                # alignment_mask = F.pad(alignment_mask, (0, 0, 1, 0), value=False)
+                self.register_buffer('alignment_mask', alignment_mask)
+            else:
+                self.alignment_mask = None  # plain attribute: no cross-attention mask is applied
+        else:
+            raise ValueError(f'Unknown architecture: {self.architecture}')
+
+        # Motion decoder
+        self.motion_dec = nn.Sequential(
+            nn.Linear(self.feature_dim, self.feature_dim // 2),
+            nn.GELU(),
+            nn.Linear(self.feature_dim // 2, self.motion_feat_dim),
+            # nn.Tanh() # added a tanh
+            # nn.Softmax()
+        )
+
+        self.to(device)
+
+    @property
+    def device(self):
+        return next(self.parameters()).device  # device of the first registered parameter
+
+    def forward(self, motion_feat, audio_feat, prev_motion_feat, prev_audio_feat, step, indicator=None):
+        """
+        Args:
+            motion_feat: (N, L, d_motion). Noisy motion feature
+            audio_feat: (N, L, feature_dim)
+            prev_motion_feat: (N, L_p, d_motion). Padded previous motion coefficients or feature
+            prev_audio_feat: (N, L_p, feature_dim). Audio feature of the previous frames, prepended to audio_feat
+            step: (N,). Diffusion step indices used to look up the step embedding
+            indicator: (N, L). 0/1 indicator for the real (unpadded) motion feature
+
+        Returns:
+            motion_feat_target: (N, L_p + L, d_motion)
+        """
+        motion_feat = motion_feat.to(audio_feat.dtype)  # match the audio feature dtype
+        # Diffusion time step embedding
+        diff_step_embedding = self.diff_step_map(self.TE.pe[0, step]).unsqueeze(1)  # (N, 1, feature_dim)
+
+        if indicator is not None:
+            indicator = torch.cat([torch.zeros((indicator.shape[0], self.n_prev_motions), device=indicator.device),
+                                   indicator], dim=1)  # (N, L_p + L)
+            indicator = indicator.unsqueeze(-1)  # (N, L_p + L, 1)
+
+        # Concat features and embeddings
+        if self.architecture == 'decoder':
+            # Prepend the previous motion frames to the noisy ones along the time axis
+            feats_in = torch.cat([prev_motion_feat, motion_feat], dim=1)  # (N, L_p + L, d_motion)
+        else:
+            raise ValueError(f'Unknown architecture: {self.architecture}')
+        if self.use_indicator:  # NOTE(review): fails in torch.cat if `indicator` was left as None — confirm callers
+            feats_in = torch.cat([feats_in, indicator], dim=-1)  # (N, L_p + L, d_motion + 1)
+        feats_in = self.feature_proj(feats_in)  # (N, L_p + L, feature_dim)
+        # feats_in = torch.cat([person_feat, feats_in], dim=1)  # (N, 1 + L_p + L, feature_dim)
+
+        if self.use_learnable_pe:
+            # feats_in = feats_in + self.PE
+            feats_in = feats_in + self.PE + diff_step_embedding
+        else:
+            # feats_in = self.PE(feats_in)
+            feats_in = self.PE(feats_in) + diff_step_embedding
+
+        # Transformer
+        if self.architecture == 'decoder':
+            audio_feat_in = torch.cat([prev_audio_feat, audio_feat], dim=1)  # (N, L_p + L, feature_dim)
+            # Motion tokens are the decoder target and audio is the memory; the mask is None if align_mask_width <= 0
+            feat_out = self.transformer(feats_in, audio_feat_in, memory_mask=self.alignment_mask)
+        else:
+            raise ValueError(f'Unknown architecture: {self.architecture}')
+
+        # Decode predicted motion feature noise / sample
+        # motion_feat_target = self.motion_dec(feat_out[:, 1:])  # (N, L_p + L, d_motion)
+        motion_feat_target = self.motion_dec(feat_out)  # (N, L_p + L, d_motion)
+
+        return motion_feat_target
+
+
+if __name__ == "__main__":
+    # Smoke test: feed all-ones dummy inputs through DitTalkingHead and print two of the output shapes.
+    device = "cuda"
+    N = 5
+
+    n_motions, n_prev_motions = 100, 10  # L, L_p
+    motion_feat_dim, feature_dim, d_audio = 76, 512, 768
+    L_audio = int(16000 * n_motions / 25)  # 64000
+
+    # Motion tensors for the current window and the preceding frames.
+    motion_feat = torch.ones((N, n_motions, motion_feat_dim), device=device)
+    prev_motion_feat = torch.ones((N, n_prev_motions, motion_feat_dim), device=device)
+
+    # A 2-D audio tensor with L_audio values per item, plus features for the preceding frames.
+    audio_or_feat = torch.ones((N, L_audio), device=device)
+    prev_audio_feat = torch.ones((N, n_prev_motions, d_audio), device=device)
+
+    time_step = torch.ones(N, dtype=torch.long, device=device)
+
+    model = DitTalkingHead().to(device)
+
+    # The prev_* tensors and time_step built above are not passed on; the model receives None for them.
+    outputs = model(motion_feat, audio_or_feat, prev_motion_feat=None,
+                    prev_audio_feat=None, time_step=None, indicator=None)
+    traj, motion_at_T, audio_feat = outputs[0], outputs[1], outputs[2]
+    print(motion_at_T.shape, audio_feat.shape)
diff --git a/src/models/JoyVASA/helper.py b/src/models/JoyVASA/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fb168f10ce6b83f5c3d885028cb30a4e63b6cb9
--- /dev/null
+++ b/src/models/JoyVASA/helper.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/12/15
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: helper.py
+import os.path as osp
+
+
+class NullableArgs:
+    """Copies every attribute of `namespace` onto itself; options that were never set read as None."""
+    def __init__(self, namespace):
+        for key, value in namespace.__dict__.items():
+            setattr(self, key, value)
+
+    def __getattr__(self, key):
+        # Called only when normal lookup fails. Dunder probes (copy/pickle look for __setstate__ etc.)
+        # must raise: answering None makes copy.copy() and unpickling try to call None.
+        if key.startswith('__') and key.endswith('__'):
+            raise AttributeError(key)
+        if key == 'align_mask_width':  # derived from `use_alignment_mask`; 0 when that option is absent
+            return 1 if self.__dict__.get('use_alignment_mask') else 0
+        if key == 'no_head_pose':  # negation of `predict_head_pose` (True when that option is absent)
+            return not self.predict_head_pose
+        if key == 'no_use_learnable_pe':  # negation of `use_learnable_pe` (True when that option is absent)
+            return not self.use_learnable_pe
+        return None
+
+
+def make_abs_path(fn):
+    module_dir = osp.dirname(osp.realpath(__file__))  # directory of this module, symlinks resolved
+    return osp.abspath(osp.join(module_dir, fn))
diff --git a/src/models/JoyVASA/hubert.py b/src/models/JoyVASA/hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..c98c8f040ae9905f8646c612bc63b5968f3737e5
--- /dev/null
+++ b/src/models/JoyVASA/hubert.py
@@ -0,0 +1,51 @@
+from transformers import HubertModel
+from transformers.modeling_outputs import BaseModelOutput
+
+from .wav2vec2 import linear_interpolation
+
+_CONFIG_FOR_DOC = 'HubertConfig'
+
+
+class HubertModel(HubertModel):
+    """HuBERT variant that resamples the 50 fps feature-extractor output to `output_fps` before encoding."""
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, frame_num=None):
+        """Encode `input_values`; with `frame_num` set, features are resampled to exactly that many frames."""
+        cfg = self.config
+        cfg.output_attentions = True  # note: written back to the model config on every call
+
+        # Fill unspecified output switches from the (just updated) config.
+        if output_attentions is None:
+            output_attentions = cfg.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = cfg.output_hidden_states
+        if return_dict is None:
+            return_dict = cfg.use_return_dict
+
+        feats = self.feature_extractor(input_values)  # (N, C, L)
+        if frame_num is not None:
+            # Keep only the 50 fps frames that map onto `frame_num` frames at `output_fps`.
+            feats = feats[:, :, :round(frame_num * 50 / output_fps)]
+        # Resample the audio feature @ 50 fps to `output_fps`, then move time to axis 1.
+        feats = linear_interpolation(feats, 50, output_fps, output_len=frame_num).transpose(1, 2)  # (N, L, C)
+
+        if attention_mask is not None:
+            # Reduce the input-level mask to one entry per feature vector.
+            attention_mask = self._get_feature_vector_attention_mask(feats.shape[1], attention_mask)
+
+        projected = self._mask_hidden_states(self.feature_projection(feats))
+        encoder_outputs = self.encoder(projected, attention_mask=attention_mask,
+                                       output_attentions=output_attentions,
+                                       output_hidden_states=output_hidden_states, return_dict=return_dict)
+        last_hidden = encoder_outputs[0]
+
+        if return_dict:
+            # Dict-style result mirrors the encoder's hidden states and attentions.
+            return BaseModelOutput(last_hidden_state=last_hidden, hidden_states=encoder_outputs.hidden_states,
+                                   attentions=encoder_outputs.attentions)
+        # Tuple-style result: last hidden state first, then whatever else the encoder returned.
+        return (last_hidden,) + encoder_outputs[1:]
diff --git a/src/models/JoyVASA/wav2vec2.py b/src/models/JoyVASA/wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..499140bbe90d147d07ba180b261ec8ea6f752df2
--- /dev/null
+++ b/src/models/JoyVASA/wav2vec2.py
@@ -0,0 +1,119 @@
+from packaging import version
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from transformers import Wav2Vec2Model
+from transformers.modeling_outputs import BaseModelOutput
+
+_CONFIG_FOR_DOC = 'Wav2Vec2Config'
+
+
+# the implementation of Wav2Vec2Model is borrowed from
+# https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
+# initialize our encoder with the pre-trained wav2vec 2.0 weights.
+def _compute_mask_indices(shape: Tuple[int, int], mask_prob: float, mask_length: int,
+                          attention_mask: Optional[torch.Tensor] = None, min_masks: int = 0, ) -> np.ndarray:
+    """Sample a (bsz, all_sz) boolean mask built from random spans of `mask_length`; all rows mask equally many."""
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    all_num_mask = max(min_masks, int(mask_prob * all_sz / float(mask_length) + np.random.rand()))
+    mask_idcs = []
+    padding_mask = attention_mask.ne(1) if attention_mask is not None else None
+    for i in range(bsz):
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()  # unpadded length of this row
+            num_mask = max(min_masks, int(mask_prob * sz / float(mask_length) + np.random.rand()))
+        else:
+            sz = all_sz
+            num_mask = all_num_mask
+
+        if num_mask == 0:
+            # No span requested for this row: record an empty selection instead of indexing `lengths[0]` below.
+            mask_idcs.append(np.array([], dtype=np.int64))
+            continue
+        lengths = np.full(num_mask, mask_length)
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+        min_len = min(lengths)
+        if sz - min_len <= num_mask:
+            min_len = sz - num_mask - 1  # leave enough distinct start positions for every span
+        mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+        mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+    # Trim every row to the smallest selection so each row masks the same number of positions.
+    min_len = min((len(m) for m in mask_idcs), default=0)
+    for i, mask_idc in enumerate(mask_idcs):
+        if len(mask_idc) > min_len:
+            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+    return mask
+
+
+# linear interpolation layer
+def linear_interpolation(features, input_fps, output_fps, output_len=None):
+    """Linearly resample `features` (N, C, L) along its last axis from `input_fps` to `output_fps`.
+    With `output_len` unset, the target length is int(L / input_fps * output_fps).
+    """
+    duration = features.shape[2] / float(input_fps)  # L frames at input_fps
+    target_len = int(duration * output_fps) if output_len is None else output_len
+    return F.interpolate(features, size=target_len, align_corners=False, mode='linear')
+
+
+class Wav2Vec2Model(Wav2Vec2Model):  # subclass reuses (and shadows) the imported base-class name
+    def __init__(self, config):
+        super().__init__(config)
+        self.is_old_version = version.parse(transformers.__version__) < version.parse('4.7.0')  # see forward()
+
+    def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, frame_num=None):
+        """Encode `input_values`, resampling the 50 fps feature-extractor output to `output_fps` first."""
+        self.config.output_attentions = True  # NOTE: mutates the config, so the default below is always True
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        hidden_states = self.feature_extractor(input_values)  # (N, C, L)
+        # Resample the audio feature @ 50 fps to `output_fps`.
+        if frame_num is not None:
+            hidden_states_len = round(frame_num * 50 / output_fps)  # number of 50 fps frames to keep
+            hidden_states = hidden_states[:, :, :hidden_states_len]
+        hidden_states = linear_interpolation(hidden_states, 50, output_fps, output_len=frame_num)
+        hidden_states = hidden_states.transpose(1, 2)  # (N, L, C)
+
+        if attention_mask is not None:
+            # NOTE(review): these lengths presumably describe the pre-resampling features — confirm they stay in range
+            output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+            attention_mask = torch.zeros(hidden_states.shape[:2], dtype=hidden_states.dtype,
+                                         device=hidden_states.device)
+            attention_mask[(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)] = 1
+            attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()  # True up to the marked index
+
+        if self.is_old_version:
+            hidden_states = self.feature_projection(hidden_states)
+        else:
+            hidden_states = self.feature_projection(hidden_states)[0]  # first element of the returned sequence
+
+        if self.config.apply_spec_augment and self.training:  # masking is applied in training mode only
+            batch_size, sequence_length, hidden_size = hidden_states.size()
+            if self.config.mask_time_prob > 0:
+                mask_time_indices = _compute_mask_indices((batch_size, sequence_length), self.config.mask_time_prob,
+                                                          self.config.mask_time_length, attention_mask=attention_mask,
+                                                          min_masks=2, )
+                # Overwrite the selected time steps with `masked_spec_embed`
+                hidden_states[torch.from_numpy(mask_time_indices)] = self.masked_spec_embed.to(hidden_states.dtype)
+            if self.config.mask_feature_prob > 0:
+                mask_feature_indices = _compute_mask_indices((batch_size, hidden_size), self.config.mask_feature_prob,
+                                                             self.config.mask_feature_length, )
+                mask_feature_indices = torch.from_numpy(mask_feature_indices).to(hidden_states.device)
+                hidden_states[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0  # zero whole channels
+        encoder_outputs = self.encoder(hidden_states, attention_mask=attention_mask,
+                                       output_attentions=output_attentions, output_hidden_states=output_hidden_states,
+                                       return_dict=return_dict, )
+        hidden_states = encoder_outputs[0]
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
+                               attentions=encoder_outputs.attentions, )
diff --git a/src/models/XPose/__init__.py b/src/models/XPose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3239d927e0762a4952006a55a8596998e0ac03
--- /dev/null
+++ b/src/models/XPose/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/8/5 21:58
+# @Author  : shaoguowen
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/XPose/config_model/UniPose_SwinT.py b/src/models/XPose/config_model/UniPose_SwinT.py
new file mode 100644
index 0000000000000000000000000000000000000000..707b359fc414b525db5a11a9bc505105f6f66741
--- /dev/null
+++ b/src/models/XPose/config_model/UniPose_SwinT.py
@@ -0,0 +1,125 @@
+_base_ = ['coco_transformer.py']  # presumably merged in by the config loader as the parent config — confirm
+
+use_label_enc = True
+
+num_classes=2
+
+lr = 0.0001
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 12
+lr_drop = 11
+save_checkpoint_interval = 100
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = False
+lr_drop_list = [33, 45]  # NOTE(review): beyond epochs = 12; presumably only read when multi_step_lr is True
+
+
+modelname = 'UniPose'
+frozen_weights = None
+backbone = 'swin_T_224_1k'
+
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+unic_layers = 0
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+pdetr3_bbox_embed_diff_each_layer = False
+pdetr3_refHW = -1
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dabdetr_yolo_like_anchor_update = False
+dabdetr_deformable_encoder = False
+dabdetr_deformable_decoder = False
+use_deformable_box_attn = False
+box_attn_type = 'roi_align'
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+decoder_layer_noise = False
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+add_channel_attention = False
+add_pos_value = False
+two_stage_type = 'standard'
+two_stage_pat_embed = 0
+two_stage_add_query_num = 0
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+masks = False
+
+decoder_sa_type = 'sa' # one of: 'sa', 'ca_label', 'ca_content'
+matcher_type = 'HungarianMatcher' # alternative: 'SimpleMinsumMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1  # presumably a negative value disables NMS — confirm in the post-processor
+
+dec_pred_bbox_embed_share = True
+dec_pred_class_embed_share = True
+
+
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef=1.0
+dn_bbox_coef=1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+
+match_unstable_error = True
+
+# EMA (exponential moving average) settings
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+use_detached_boxes_dec_out = False
+
+max_text_len = 256
+shuffle_type = None
+
+use_text_enhancer = True
+use_fusion_layer = True
+
+use_checkpoint = False # alternative value noted in the original config: True
+use_transformer_ckpt = True
+text_encoder_type = 'bert-base-uncased'
+
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+
+num_body_points=68  # presumably the number of keypoints predicted per instance — confirm
+binary_query_selection = False
+use_cdn = True
+ffn_extra_layernorm = False
+
+fix_size=False
diff --git a/src/models/XPose/config_model/__init__.py b/src/models/XPose/config_model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3239d927e0762a4952006a55a8596998e0ac03
--- /dev/null
+++ b/src/models/XPose/config_model/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/8/5 21:58
+# @Author  : shaoguowen
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/XPose/config_model/coco_transformer.py b/src/models/XPose/config_model/coco_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7b3feeaef9cc890891d3e1733e4fec91ccba426
--- /dev/null
+++ b/src/models/XPose/config_model/coco_transformer.py
@@ -0,0 +1,8 @@
+data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]  # presumably resize targets — confirm
+data_aug_max_size = 1333  # presumably the upper bound on the resized image side — confirm
+data_aug_scales2_resize = [400, 500, 600]
+data_aug_scales2_crop = [384, 600]
+
+
+data_aug_scale_overlap = None
+
diff --git a/src/models/XPose/models/UniPose/__init__.py b/src/models/XPose/models/UniPose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..07659639dd12c66e36689df0a0456a6af3d4f96d
--- /dev/null
+++ b/src/models/XPose/models/UniPose/__init__.py
@@ -0,0 +1,10 @@
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+from .unipose import build_unipose
diff --git a/src/models/XPose/models/UniPose/attention.py b/src/models/XPose/models/UniPose/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..103cf175204e05a74c4d4dd20d0a9ed485a783a7
--- /dev/null
+++ b/src/models/XPose/models/UniPose/attention.py
@@ -0,0 +1,373 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from codes in torch.nn
+# ------------------------------------------------------------------------
+
+"""
+MultiheadAttention that support query, key, and value to have different dimensions.
+Query, key, and value projections are removed.
+
+Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
+and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
+"""
+
+import warnings
+import torch
+from torch.nn.modules.linear import Linear
+from torch.nn.init import constant_
+from torch.nn.modules.module import Module
+from torch._jit_internal import Optional, Tuple
+try:
+    from torch.overrides import has_torch_function, handle_torch_function
+except:
+    from torch._overrides import has_torch_function, handle_torch_function
+from torch.nn.functional import linear, pad, softmax, dropout
+Tensor = torch.Tensor
+
+class MultiheadAttention(Module):
+    r"""Multi-head attention whose query/key/value input projections are removed.
+
+    Callers pass already-projected query, key and value tensors; this module only
+    owns the output projection ``out_proj`` (a ``Linear(vdim, vdim)``) and forwards
+    everything else to :func:`multi_head_attention_forward` in this file.
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+    Args:
+        embed_dim: feature size of the query; must be divisible by num_heads.
+        num_heads: parallel attention heads.
+        dropout: dropout probability applied to attn_output_weights. Default: 0.0.
+        bias: accepted for signature compatibility; not read by ``__init__``.
+        add_bias_kv: accepted for signature compatibility; not read by ``__init__``
+                     (``bias_k`` and ``bias_v`` are always None).
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        kdim: total number of features in key. Default: None (uses embed_dim).
+        vdim: total number of features in value, also the output size.
+              Default: None (uses embed_dim).
+    Examples::
+        >>> attn_output, attn_output_weights = MultiheadAttention(256, 8)(query, key, value)
+    """
+    bias_k: Optional[torch.Tensor]
+    bias_v: Optional[torch.Tensor]
+
+    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
+        super(MultiheadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        vdim = vdim if vdim is not None else embed_dim  # same value as self.vdim
+        self.out_proj = Linear(vdim , vdim)  # the only layer created here
+
+        self.in_proj_bias = None  # input projections are removed: these all stay None
+        self.in_proj_weight = None
+        self.bias_k = self.bias_v = None
+        self.q_proj_weight = None
+        self.k_proj_weight = None
+        self.v_proj_weight = None
+
+        self.add_zero_attn = add_zero_attn
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.out_proj.bias, 0.)  # zero the output-projection bias; its weight is left as created
+
+    def __setstate__(self, state):
+        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+        if '_qkv_same_embed_dim' not in state:
+            state['_qkv_same_embed_dim'] = True
+
+        super(MultiheadAttention, self).__setstate__(state)
+
+    def forward(self, query, key, value, key_padding_mask=None,
+                need_weights=True, attn_mask=None):
+        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
+        r"""Run attention on already-projected inputs via multi_head_attention_forward.
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. When given a binary mask and a value is True,
+            the corresponding value on the attention layer will be ignored. When given
+            a byte mask and a value is non-zero, the corresponding value on the attention
+            layer will be ignored
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+    Shape:
+        - Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - value: :math:`(S, N, E_v)` where S is the source sequence length, N is the batch size, E_v is
+          the value feature dimension (``vdim``).
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a ByteTensor is provided, the non-zero positions will be ignored while the position
+          with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - Outputs:
+        - attn_output: :math:`(L, N, E_v)` where L is the target sequence length, N is the batch size,
+          E_v is the value feature dimension (``vdim``).
+        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+          L is the target sequence length, S is the source sequence length.
+        """
+        if not self._qkv_same_embed_dim:  # kdim or vdim differs from embed_dim
+            return multi_head_attention_forward(
+                query, key, value, self.embed_dim, self.num_heads,
+                self.in_proj_weight, self.in_proj_bias,
+                self.bias_k, self.bias_v, self.add_zero_attn,
+                self.dropout, self.out_proj.weight, self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, use_separate_proj_weight=True,
+                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+                v_proj_weight=self.v_proj_weight, out_dim=self.vdim)
+        else:  # same call without the separate projection weights (which are None anyway)
+            return multi_head_attention_forward(
+                query, key, value, self.embed_dim, self.num_heads,
+                self.in_proj_weight, self.in_proj_bias,
+                self.bias_k, self.bias_v, self.add_zero_attn,
+                self.dropout, self.out_proj.weight, self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, out_dim=self.vdim)
+
+
+def multi_head_attention_forward(query: Tensor,
+                                 key: Tensor,
+                                 value: Tensor,
+                                 embed_dim_to_check: int,
+                                 num_heads: int,
+                                 in_proj_weight: Optional[Tensor],
+                                 in_proj_bias: Optional[Tensor],
+                                 bias_k: Optional[Tensor],
+                                 bias_v: Optional[Tensor],
+                                 add_zero_attn: bool,
+                                 dropout_p: float,
+                                 out_proj_weight: Tensor,
+                                 out_proj_bias: Tensor,
+                                 training: bool = True,
+                                 key_padding_mask: Optional[Tensor] = None,
+                                 need_weights: bool = True,
+                                 attn_mask: Optional[Tensor] = None,
+                                 use_separate_proj_weight: bool = False,
+                                 q_proj_weight: Optional[Tensor] = None,
+                                 k_proj_weight: Optional[Tensor] = None,
+                                 v_proj_weight: Optional[Tensor] = None,
+                                 static_k: Optional[Tensor] = None,
+                                 static_v: Optional[Tensor] = None,
+                                 out_dim: Optional[int] = None
+                                 ) -> Tuple[Tensor, Optional[Tensor]]:
+    r"""
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        embed_dim_to_check: total dimension of the model.
+        num_heads: parallel attention heads.
+        in_proj_weight, in_proj_bias: unused here (input projections are removed).
+        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        dropout_p: probability of an element to be zeroed.
+        out_proj_weight, out_proj_bias: the output projection weight and bias.
+        training: apply dropout if is ``True``.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. This is an binary mask. When the value is True,
+            the corresponding value on the attention layer will be filled with -inf.
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+        use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight: unused here
+            (input projections are removed); kept for signature compatibility.
+        static_k, static_v: static key and value used for attention operators.
+        out_dim: feature size of ``value`` and of the attention output fed to the output
+            projection; expected to be divisible by num_heads. Default: None, meaning ``value.size(-1)``.
+    Shape:
+        Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - value: :math:`(S, N, E_v)` where S is the source sequence length, N is the batch size, E_v is
+          the value feature dimension (``out_dim``).
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
+          will be unchanged. If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+        - static_v: :math:`(N*num_heads, S, E_v/num_heads)`, where S is the source sequence length,
+          N is the batch size, E_v is the value feature dimension. E_v/num_heads is the value head dimension.
+        Outputs:
+        - attn_output: :math:`(L, N, E_v)` where L is the target sequence length, N is the batch size,
+          E_v is the value feature dimension (``out_dim``).
+        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+          L is the target sequence length, S is the source sequence length.
+    """
+    if not torch.jit.is_scripting():
+        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
+                    out_proj_weight, out_proj_bias)
+        if any(type(t) is not Tensor for t in tens_ops) and has_torch_function(tens_ops):
+            return handle_torch_function(
+                multi_head_attention_forward, tens_ops, query, key, value,
+                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
+                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
+                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
+                need_weights=need_weights, attn_mask=attn_mask,
+                use_separate_proj_weight=use_separate_proj_weight,
+                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight, v_proj_weight=v_proj_weight,
+                static_k=static_k, static_v=static_v, out_dim=out_dim)
+    tgt_len, bsz, embed_dim = query.size()
+    assert embed_dim == embed_dim_to_check
+    # allow MHA to have different sizes for the feature dimension
+    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
+    # Without an explicit out_dim, fall back to the feature size of ``value`` instead of failing on None.
+    out_dim = value.size(-1) if out_dim is None else out_dim
+
+    head_dim = embed_dim // num_heads
+    v_head_dim = out_dim // num_heads
+    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+    scaling = float(head_dim) ** -0.5
+
+    q = query * scaling
+    k = key
+    v = value
+
+    if attn_mask is not None:
+        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
+            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
+            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
+        if attn_mask.dtype == torch.uint8:
+            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+            attn_mask = attn_mask.to(torch.bool)
+
+        if attn_mask.dim() == 2:
+            attn_mask = attn_mask.unsqueeze(0)
+            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 2D attn_mask is not correct.')
+        elif attn_mask.dim() == 3:
+            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 3D attn_mask is not correct.')
+        else:
+            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
+        # attn_mask's dim is 3 now.
+
+    # convert ByteTensor key_padding_mask to bool
+    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+        key_padding_mask = key_padding_mask.to(torch.bool)
+
+    if bias_k is not None and bias_v is not None:
+        if static_k is None and static_v is None:
+            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = pad(attn_mask, (0, 1))
+            if key_padding_mask is not None:
+                key_padding_mask = pad(key_padding_mask, (0, 1))
+        else:
+            assert static_k is None, "bias cannot be added to static key."
+            assert static_v is None, "bias cannot be added to static value."
+    else:
+        assert bias_k is None
+        assert bias_v is None
+
+    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+    if k is not None:
+        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+    if v is not None:
+        v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)
+
+    if static_k is not None:
+        assert static_k.size(0) == bsz * num_heads
+        assert static_k.size(2) == head_dim
+        k = static_k
+
+    if static_v is not None:
+        assert static_v.size(0) == bsz * num_heads
+        assert static_v.size(2) == v_head_dim
+        v = static_v
+
+    src_len = k.size(1)
+
+    if key_padding_mask is not None:
+        assert key_padding_mask.size(0) == bsz
+        assert key_padding_mask.size(1) == src_len
+
+    if add_zero_attn:
+        src_len += 1
+        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+        if attn_mask is not None:
+            attn_mask = pad(attn_mask, (0, 1))
+        if key_padding_mask is not None:
+            key_padding_mask = pad(key_padding_mask, (0, 1))
+
+    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+        else:
+            attn_output_weights += attn_mask
+
+    if key_padding_mask is not None:
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        attn_output_weights = attn_output_weights.masked_fill(
+            key_padding_mask.unsqueeze(1).unsqueeze(2),
+            float('-inf'),
+        )
+        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+
+    # Subtract the row maximum before softmax (mathematically a no-op) to limit overflow in exp.
+    attn_output_weights = softmax(
+            attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
+    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
+
+    attn_output = torch.bmm(attn_output_weights, v)
+    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
+    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
+    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+    if need_weights:
+        # average attention weights over heads
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        return attn_output, attn_output_weights.sum(dim=1) / num_heads
+    else:
+        return attn_output, None
+
+
diff --git a/src/models/XPose/models/UniPose/backbone.py b/src/models/XPose/models/UniPose/backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..c393d5d1a22d248bb3e3abb339a819233154ed8c
--- /dev/null
+++ b/src/models/XPose/models/UniPose/backbone.py
@@ -0,0 +1,211 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+"""
+Backbone modules.
+"""
+
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+from typing import Dict, List
+
+from ...util.misc import NestedTensor, is_main_process
+
+from .position_encoding import build_position_encoding
+from .swin_transformer import build_swin_transformer
+
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d whose batch statistics and affine parameters are fixed buffers.
+
+    Adapted from torchvision.misc.ops with an eps added before the rsqrt; without
+    it, models other than torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        # Buffers rather than parameters: nothing here is exposed to an optimizer.
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        # This module registers no "num_batches_tracked" buffer, so discard that entry
+        # (presumably present in regular BatchNorm checkpoints) before delegating.
+        tracked_key = prefix + "num_batches_tracked"
+        state_dict.pop(tracked_key, None)
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        """Scale and shift ``x`` per channel using the stored statistics."""
+        eps = 1e-5
+        # Reshape first (fuser-friendly), then fold everything into one scale and shift.
+        gamma = self.weight.reshape(1, -1, 1, 1)
+        beta = self.bias.reshape(1, -1, 1, 1)
+        var = self.running_var.reshape(1, -1, 1, 1)
+        mean = self.running_mean.reshape(1, -1, 1, 1)
+        scale = gamma * torch.rsqrt(var + eps)
+        shift = beta - mean * scale
+        return x * scale + shift
+
+
+class BackboneBase(nn.Module):
+    """Wraps a backbone in an IntermediateLayerGetter that returns selected ``layerN`` outputs."""
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        train_backbone: bool,
+        num_channels: int,
+        return_interm_indices: list,
+    ):
+        super().__init__()
+        trainable_stages = ("layer2", "layer3", "layer4")
+        for param_name, param in backbone.named_parameters():
+            # Freeze everything when not training; otherwise only layer2-4 stay trainable.
+            in_trainable_stage = any(stage in param_name for stage in trainable_stages)
+            if not (train_backbone and in_trainable_stage):
+                param.requires_grad_(False)
+
+        # Map the last len(return_interm_indices) stages (ending at layer4) to output names.
+        first_stage = 5 - len(return_interm_indices)
+        return_layers = {
+            "layer{}".format(first_stage + offset): "{}".format(layer_index)
+            for offset, layer_index in enumerate(return_interm_indices)
+        }
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+
+    def forward(self, tensor_list: NestedTensor):
+        """Run the backbone and pair every feature map with the mask resized to its spatial size."""
+        features = self.body(tensor_list.tensors)
+        result: Dict[str, NestedTensor] = {}
+        for level_name, feature in features.items():
+            padding_mask = tensor_list.mask
+            assert padding_mask is not None
+            resized = F.interpolate(padding_mask[None].float(), size=feature.shape[-2:])
+            result[level_name] = NestedTensor(feature, resized.to(torch.bool)[0])
+        return result
+
+
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm; arguments are validated before the model is built."""
+
+    def __init__(
+        self,
+        name: str,
+        train_backbone: bool,
+        dilation: bool,
+        return_interm_indices: list,
+        batch_norm=FrozenBatchNorm2d,
+    ):
+        if name not in ["resnet18", "resnet34", "resnet50", "resnet101"]:
+            raise NotImplementedError("Why you can get here with name {}".format(name))
+        # Reject unsupported settings before constructing (and possibly fetching weights for) the net.
+        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
+        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=is_main_process(),
+            norm_layer=batch_norm,
+        )
+        # Per-stage output widths (resnet50/101 values); keep the trailing stages that are returned.
+        num_channels_all = [256, 512, 1024, 2048]
+        num_channels = num_channels_all[4 - len(return_interm_indices) :]
+        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
+
+
+class Joiner(nn.Sequential):
+    """Sequential pair: module 0 is the backbone, module 1 the position embedding."""
+
+    def __init__(self, backbone, position_embedding):
+        super().__init__(backbone, position_embedding)
+
+    def forward(self, tensor_list: NestedTensor):
+        """Return ``(features, encodings)``: backbone outputs and one position encoding each."""
+        backbone, position_embedding = self[0], self[1]
+        features: List[NestedTensor] = list(backbone(tensor_list).values())
+        # Every encoding is cast to the dtype of the feature tensor it was computed from.
+        encodings = [position_embedding(feat).to(feat.tensors.dtype) for feat in features]
+
+        return features, encodings
+
+
+def build_backbone(args):
+    """
+    Build the feature backbone joined with its position encoding.
+
+    Attributes read directly from ``args``:
+        - backbone: "resnet50"/"resnet101" or one of the swin_* names listed below
+        - dilation: forwarded to the ResNet backbone only
+        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
+        - use_checkpoint: optional (default False), for swin only for now
+    ``lr_backbone`` and ``backbone_freeze_keywords`` are not consulted here.
+    Returns the ``Joiner``; its ``num_channels`` holds one entry per returned stage.
+    Raises NotImplementedError for an unknown backbone name.
+    """
+    position_embedding = build_position_encoding(args)
+    # Always built as trainable; BackboneBase still freezes parameters outside layer2-4.
+    train_backbone = True
+    return_interm_indices = args.return_interm_indices
+    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+    use_checkpoint = getattr(args, "use_checkpoint", False)
+
+    if args.backbone in ["resnet50", "resnet101"]:
+        backbone = Backbone(
+            args.backbone,
+            train_backbone,
+            args.dilation,
+            return_interm_indices,
+            batch_norm=FrozenBatchNorm2d,
+        )
+        bb_num_channels = backbone.num_channels
+    elif args.backbone in [
+        "swin_T_224_1k",
+        "swin_B_224_22k",
+        "swin_B_384_22k",
+        "swin_L_224_22k",
+        "swin_L_384_22k",
+    ]:
+        pretrain_img_size = int(args.backbone.split("_")[-2])
+        backbone = build_swin_transformer(
+            args.backbone,
+            pretrain_img_size=pretrain_img_size,
+            out_indices=tuple(return_interm_indices),
+            dilation=False,
+            use_checkpoint=use_checkpoint,
+        )
+
+        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
+    else:
+        raise NotImplementedError("Unknown backbone {}".format(args.backbone))
+
+    assert len(bb_num_channels) == len(
+        return_interm_indices
+    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
+
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = bb_num_channels
+    assert isinstance(
+        bb_num_channels, List
+    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
+    return model
diff --git a/src/models/XPose/models/UniPose/deformable_transformer.py b/src/models/XPose/models/UniPose/deformable_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aade408228f5e73839712ba15ce29b5b17d9176c
--- /dev/null
+++ b/src/models/XPose/models/UniPose/deformable_transformer.py
@@ -0,0 +1,1230 @@
+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# DINO
+# Copyright (c) 2022 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+import math
+import copy
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn, Tensor
+from typing import Optional
+from ...util.misc import inverse_sigmoid
+
+from .transformer_vanilla import TransformerEncoderLayer
+from .fuse_modules import BiAttentionBlock
+from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
+from .ops.modules import MSDeformAttn
+
+
+class DeformableTransformer(nn.Module):
+
+    def __init__(self, d_model=256, nhead=8,
+                 num_queries=300,
+                 num_encoder_layers=6,
+                 num_unicoder_layers=0,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048, dropout=0.0,
+                 activation="relu", normalize_before=False,
+                 return_intermediate_dec=False, query_dim=4,
+                 num_patterns=0,
+                 modulate_hw_attn=False,
+                 # for deformable encoder
+                 deformable_encoder=False,
+                 deformable_decoder=False,
+                 num_feature_levels=1,
+                 enc_n_points=4,
+                 dec_n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 # init query
+                 learnable_tgt_init=False,
+                 decoder_query_perturber=None,
+                 add_channel_attention=False,
+                 add_pos_value=False,
+                 random_refpoints_xy=False,
+                 # two stage
+                 two_stage_type='no',
+                 two_stage_pat_embed=0,
+                 two_stage_add_query_num=0,
+                 two_stage_learn_wh=False,
+                 two_stage_keep_all_tokens=False,
+                 # evo of #anchors
+                 dec_layer_number=None,
+                 rm_enc_query_scale=True,
+                 rm_dec_query_scale=True,
+                 rm_self_attn_layers=None,
+                 key_aware_type=None,
+                 # layer share
+                 layer_share_type=None,
+                 # for detach
+                 rm_detach=None,
+                 decoder_sa_type='ca',
+                 module_seq=['sa', 'ca', 'ffn'],
+                 # for dn
+                 embed_init_tgt=False,
+
+                 use_detached_boxes_dec_out=False,
+                 use_text_enhancer=False,
+                 use_fusion_layer=False,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 use_text_cross_attention=False,
+                 text_dropout=0.1,
+                 fusion_dropout=0.1,
+                 fusion_droppath=0.0,
+
+                 binary_query_selection=False,
+                 ffn_extra_layernorm=False,
+                 ):
+        """Assemble the deformable encoder/decoder stack and the query embeddings.
+
+        Only the deformable paths exist: ``deformable_encoder`` and ``deformable_decoder``
+        must both be truthy or ``NotImplementedError`` is raised. The asserts below also
+        require ``query_dim == 4``, ``normalize_before`` falsy, ``layer_share_type is None``,
+        ``learnable_tgt_init`` truthy and ``decoder_sa_type`` in ('sa', 'ca_label', 'ca_content').
+        """
+        import warnings  # local import so this block does not rely on the file header
+        super().__init__()
+        self.num_feature_levels = num_feature_levels
+        self.num_encoder_layers = num_encoder_layers
+        self.num_unicoder_layers = num_unicoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.deformable_encoder = deformable_encoder
+        self.deformable_decoder = deformable_decoder
+        self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
+        self.num_queries = num_queries
+        self.random_refpoints_xy = random_refpoints_xy
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+        self.ffn_extra_layernorm = ffn_extra_layernorm
+        assert query_dim == 4
+
+        self.binary_query_selection = binary_query_selection
+        if self.binary_query_selection:
+            self.binary_query_selection_layer = nn.Linear(d_model, 1)
+
+        if num_feature_levels > 1:
+            assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
+        if use_deformable_box_attn:
+            assert deformable_encoder or deformable_decoder
+
+        assert layer_share_type in [None, 'encoder', 'decoder', 'both']
+        enc_layer_share = layer_share_type in ['encoder', 'both']
+        dec_layer_share = layer_share_type in ['decoder', 'both']
+        assert layer_share_type is None  # sharing is not supported yet, so both flags end up False
+
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']  # NOTE: the signature default 'ca' is rejected
+
+        # choose encoder layer type
+        if deformable_encoder:
+            encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, enc_n_points,
+                                                              add_channel_attention=add_channel_attention,
+                                                              use_deformable_box_attn=use_deformable_box_attn,
+                                                              box_attn_type=box_attn_type)
+        else:
+            raise NotImplementedError
+
+        if use_text_enhancer:
+            text_enhance_layer = TransformerEncoderLayer(
+                d_model=d_model,
+                nhead=nhead // 2,
+                dim_feedforward=dim_feedforward // 2,
+                dropout=text_dropout
+            )
+        else:
+            text_enhance_layer = None
+
+        if use_fusion_layer:
+            feature_fusion_layer = BiAttentionBlock(
+                v_dim=d_model,
+                l_dim=d_model,
+                embed_dim=dim_feedforward // 2,
+                num_heads=nhead // 2,
+                dropout=fusion_dropout,
+                drop_path=fusion_droppath
+            )
+        else:
+            feature_fusion_layer = None
+
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        assert encoder_norm is None
+        self.encoder = TransformerEncoder(
+            encoder_layer, num_encoder_layers, d_model=d_model,
+            num_queries=num_queries,
+            enc_layer_share=enc_layer_share,
+            text_enhance_layer=text_enhance_layer,
+            feature_fusion_layer=feature_fusion_layer,
+            use_checkpoint=use_checkpoint,
+            use_transformer_ckpt=use_transformer_ckpt,
+        )
+
+        # choose decoder layer type
+        if deformable_decoder:
+            decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, dec_n_points,
+                                                              use_text_cross_attention=use_text_cross_attention,
+                                                              ffn_extra_layernorm=ffn_extra_layernorm, )
+
+        else:
+            raise NotImplementedError
+
+        decoder_norm = nn.LayerNorm(d_model)
+        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
+                                          return_intermediate=return_intermediate_dec,
+                                          d_model=d_model, query_dim=query_dim,
+                                          modulate_hw_attn=modulate_hw_attn,
+                                          num_feature_levels=num_feature_levels,
+                                          deformable_decoder=deformable_decoder,
+                                          decoder_query_perturber=decoder_query_perturber,
+                                          dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
+                                          dec_layer_share=dec_layer_share,
+                                          use_detached_boxes_dec_out=use_detached_boxes_dec_out
+                                          )
+
+        self.d_model = d_model
+        self.nhead = nhead
+        self.dec_layers = num_decoder_layers
+        self.num_queries = num_queries  # useful for single stage model only
+        self.num_patterns = num_patterns
+        if not isinstance(num_patterns, int):
+            warnings.warn("num_patterns should be int but {}".format(type(num_patterns)))
+            self.num_patterns = 0
+
+        if num_feature_levels > 1:
+            if self.num_encoder_layers > 0:
+                self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+            else:
+                self.level_embed = None
+
+        self.learnable_tgt_init = learnable_tgt_init
+        assert learnable_tgt_init, "why not learnable_tgt_init"
+        self.embed_init_tgt = embed_init_tgt
+        if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
+            self.tgt_embed = nn.Embedding(self.num_queries, d_model)
+            nn.init.normal_(self.tgt_embed.weight.data)
+        else:
+            self.tgt_embed = None
+
+        # for two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_pat_embed = two_stage_pat_embed
+        self.two_stage_add_query_num = two_stage_add_query_num
+        self.two_stage_learn_wh = two_stage_learn_wh
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type == 'standard':
+            # anchor selection at the output of encoder
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+
+            if two_stage_pat_embed > 0:
+                self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
+                nn.init.normal_(self.pat_embed_for_2stage)
+
+            if two_stage_add_query_num > 0:
+                self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)
+
+            if two_stage_learn_wh:
+                self.two_stage_wh_embedding = nn.Embedding(1, 2)
+            else:
+                self.two_stage_wh_embedding = None
+
+        if two_stage_type == 'no':
+            self.init_ref_points(num_queries)  # init self.refpoint_embed
+
+        self.enc_out_class_embed = None
+        self.enc_out_bbox_embed = None
+
+        # evolution of anchors
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            if self.two_stage_type != 'no' or num_patterns == 0:
+                assert dec_layer_number[
+                           0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
+            else:
+                assert dec_layer_number[
+                           0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"
+
+        self._reset_parameters()  # NOTE(review): its xavier pass also re-inits tgt_embed / refpoint_embed made above
+
+        self.rm_self_attn_layers = rm_self_attn_layers
+        if rm_self_attn_layers is not None:
+            # assert len(rm_self_attn_layers) == num_decoder_layers
+            print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
+            for lid, dec_layer in enumerate(self.decoder.layers):
+                if lid in rm_self_attn_layers:
+                    dec_layer.rm_self_attn_modules()
+
+        self.rm_detach = rm_detach
+        if self.rm_detach:
+            assert isinstance(rm_detach, list)
+            assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])  # NOTE(review): any(), not all() - confirm
+        self.decoder.rm_detach = rm_detach
+
+    def _reset_parameters(self):
+        """Xavier-initialise every parameter with more than one dim, then redo the special cases."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()  # re-run the module's own init after the blanket xavier pass
+        if self.num_feature_levels > 1 and self.level_embed is not None:
+            nn.init.normal_(self.level_embed)
+        # two_stage_wh_embedding is only created for two_stage_type == 'standard'; skip it otherwise.
+        if self.two_stage_learn_wh and getattr(self, "two_stage_wh_embedding", None) is not None:
+            nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))
+
+    def get_valid_ratio(self, mask):
+        """Return, per sample, the (w, h) fraction of unmasked cells along the first row / column."""
+        _, height, width = mask.shape
+        rows_valid = (~mask[:, :, 0]).sum(1)  # unmasked entries down the first column
+        cols_valid = (~mask[:, 0, :]).sum(1)  # unmasked entries along the first row
+        ratio_h = rows_valid.float() / height
+        ratio_w = cols_valid.float() / width
+        return torch.stack([ratio_w, ratio_h], -1)
+
+    def init_ref_points(self, use_num_queries):
+        """Create ``self.refpoint_embed`` (use_num_queries x 4) and optionally randomise its xy columns."""
+        self.refpoint_embed = nn.Embedding(use_num_queries, 4)
+        if self.random_refpoints_xy:
+            xy = self.refpoint_embed.weight.data[:, :2]  # view sharing storage with the weight
+            xy.uniform_(0, 1)
+            xy.copy_(inverse_sigmoid(xy))
+            xy.requires_grad = False  # NOTE(review): set on a temporary view only; the parameter stays trainable
+
+    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
+                dn_meta=None,targets=None,kpt_embed=None):
+        """Encode multi-level image features with text, select queries, then decode.
+
+        Input:
+            - srcs / masks / pos_embeds: per-level lists shaped [bs, ci, hi, wi],
+              [bs, hi, wi] and [bs, ci, hi, wi] respectively
+            - refpoint_embed: [bs, num_dn, 4]; tgt: [bs, num_dn, d_model]. Both None in infer
+            - text_dict: required despite the None default (it is indexed unconditionally);
+              its 'encoded_text' entry is overwritten with the encoder's text output
+        Output:
+            (hs, references, hs_enc, ref_enc, init_box_proposal), see the notes after return
+        """
+
+        # flatten every level to [bs, hi*wi, c] and concatenate along the token axis
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape  # NOTE: bs is reused after this loop when building tgt_ / refpoint_embed_
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+
+            src = src.flatten(2).transpose(1, 2)  # bs, hw, c
+            mask = mask.flatten(1)  # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c
+            if self.num_feature_levels > 1 and self.level_embed is not None:
+                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            else:
+                lvl_pos_embed = pos_embed
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)  # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \sum{hxw}, c
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+
+        # placeholders; neither name is read again in this method
+        enc_topk_proposals = enc_refpoint_embed = None
+
+        #########################################################
+        # Begin Encoder
+        #########################################################
+        memory, memory_text = self.encoder(
+            src_flatten,
+            pos=lvl_pos_embed_flatten,
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios,
+            key_padding_mask=mask_flatten,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            # we ~ the mask . False means use the token; True means pad the token
+            position_ids=text_dict['position_ids'],
+            text_self_attention_masks=text_dict['text_self_attention_masks'],
+        )
+        #########################################################
+        # End Encoder
+        # - memory: bs, \sum{hw}, c
+        # - mask_flatten: bs, \sum{hw}
+        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
+        # - memory_text: the encoder's text output; written back into text_dict below,
+        #   so the caller's dict is mutated
+        #########################################################
+        text_dict['encoded_text'] = memory_text
+
+        if self.two_stage_type == 'standard':
+            if self.two_stage_learn_wh:
+                input_hw = self.two_stage_wh_embedding.weight[0]
+            else:
+                input_hw = None
+            output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
+                                                                           input_hw)
+            output_memory = self.enc_output_norm(self.enc_output(output_memory))
+
+            if self.two_stage_pat_embed > 0:
+                bs, nhw, _ = output_memory.shape
+                # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
+                output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
+                _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
+                output_memory = output_memory + _pats
+                output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)
+
+            if self.two_stage_add_query_num > 0:
+                assert refpoint_embed is not None
+                output_memory = torch.cat((output_memory, tgt), dim=1)
+                output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)
+
+            if self.binary_query_selection:
+                topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
+            else:
+                if text_dict is not None:  # always true here: text_dict was already indexed above
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
+                else:
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
+
+                topk_logits = enc_outputs_class_unselected.max(-1)[0]
+            enc_outputs_coord_unselected = self.enc_out_bbox_embed(
+                output_memory) + output_proposals  # (bs, \sum{hw}, 4) unsigmoid
+            topk = self.num_queries
+
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
+
+            # gather boxes
+            refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
+                                                   topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid
+            refpoint_embed_ = refpoint_embed_undetach.detach()
+            init_box_proposal = torch.gather(output_proposals, 1,
+                                             topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid
+
+            # gather tgt
+            tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+            if self.embed_init_tgt:
+                tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model (after the transpose)
+            else:
+                tgt_ = tgt_undetach.detach()
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+        elif self.two_stage_type == 'no':
+            tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, d_model (after the transpose)
+            refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # bs, nq, 4 (after the transpose)
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+            if self.num_patterns > 0:  # NOTE(review): self.patterns is never assigned in this class's __init__
+                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
+                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
+                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
+                                                                             1)  # 1, n_q*n_pat, d_model
+                tgt = tgt_embed + tgt_pat
+
+            init_box_proposal = refpoint_embed_.sigmoid()
+
+        else:
+            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
+        #########################################################
+        # End preparing tgt
+        # - tgt: bs, NQ, d_model
+        # - refpoint_embed(unsigmoid): bs, NQ, 4
+        #########################################################
+        # The decoder call below is sequence-first: tgt, memory, pos and the reference
+        # points are transposed so the batch sits on dim 1, while mask_flatten is passed
+        # through unchanged.
+        # NOTE(review): attn_mask / attn_mask2 are forwarded as tgt_mask / tgt_mask2 without
+        # being inspected here; their expected layout is not visible in this method.
+
+        #########################################################
+        # Begin Decoder
+        #########################################################
+        hs, references = self.decoder(
+            tgt=tgt.transpose(0, 1),
+            memory=memory.transpose(0, 1),
+            memory_key_padding_mask=mask_flatten,
+            pos=lvl_pos_embed_flatten.transpose(0, 1),
+            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios, tgt_mask=attn_mask,
+            tgt_mask2=attn_mask2,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            text_dict=text_dict,
+            dn_meta=dn_meta,
+            targets=targets,
+            kpt_embed=kpt_embed
+            # text_attention_mask above is inverted: False means use the token; True means pad the token
+        )
+        #########################################################
+        # End Decoder
+        # hs: n_dec, bs, nq, d_model
+        # references: n_dec+1, bs, nq, query_dim
+        #########################################################
+
+        #########################################################
+        # Begin postprocess
+        #########################################################
+        if self.two_stage_type == 'standard':
+            if self.two_stage_keep_all_tokens:
+                hs_enc = output_memory.unsqueeze(0)
+                ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
+                init_box_proposal = output_proposals
+                # NOTE(review): no .sigmoid() is applied in this branch, unlike the else branch - confirm
+            else:
+                hs_enc = tgt_undetach.unsqueeze(0)
+                ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
+        else:
+            hs_enc = ref_enc = None
+        #########################################################
+        # End postprocess
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
+        # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
+        #########################################################
+
+        return hs, references, hs_enc, ref_enc, init_box_proposal
+        # hs: (n_dec, bs, nq, d_model)
+        # references: sigmoid coordinates. (n_dec+1, bs, nq, 4)
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
+        # ref_enc: sigmoid coordinates. \
+        #           (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
+
+
+class TransformerEncoder(nn.Module):
+
+    def __init__(self,
+                 encoder_layer, num_layers, d_model=256,
+                 num_queries=300,
+                 enc_layer_share=False,
+                 text_enhance_layer=None,
+                 feature_fusion_layer=None,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 ):
+        """Stack the image encoder layers with optional text-enhance and fusion layers.
+
+        Args:
+            encoder_layer: prototype image layer, cloned ``num_layers`` times.
+            num_layers (int): depth of the stack; 0 leaves every layer list empty.
+            d_model (int, optional): stored on the instance. Defaults to 256.
+            num_queries (int, optional): stored on the instance. Defaults to 300.
+            enc_layer_share (bool, optional): forwarded to ``_get_clones`` as ``layer_share``.
+            text_enhance_layer: optional prototype for the per-layer text encoder.
+            feature_fusion_layer: optional prototype for the per-layer image/text fusion.
+            use_checkpoint (bool, optional): checkpoint the fusion layers in ``forward``.
+            use_transformer_ckpt (bool, optional): checkpoint the image layers in ``forward``.
+        """
+        super().__init__()
+        # With zero layers nothing is cloned and all three containers stay empty lists.
+        build = num_layers > 0
+
+        def clone_stack(proto):
+            return _get_clones(proto, num_layers, layer_share=enc_layer_share)
+
+        self.layers = clone_stack(encoder_layer) if build else []
+        self.text_layers = (
+            clone_stack(text_enhance_layer)
+            if build and text_enhance_layer is not None
+            else []
+        )
+        self.fusion_layers = (
+            clone_stack(feature_fusion_layer)
+            if build and feature_fusion_layer is not None
+            else []
+        )
+
+        # Bookkeeping attributes.
+        self.query_scale = None
+        self.num_queries = num_queries
+        self.num_layers = num_layers
+        self.d_model = d_model
+
+        # Activation-checkpointing switches read by forward().
+        self.use_checkpoint = use_checkpoint
+        self.use_transformer_ckpt = use_transformer_ckpt
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """Build cell-centre (x, y) reference points for every level, scaled by the valid ratios."""
+        per_level = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            rows = torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device)
+            cols = torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)
+            grid_y, grid_x = torch.meshgrid(rows, cols)
+            norm_y = grid_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            norm_x = grid_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            per_level.append(torch.stack((norm_x, norm_y), -1))
+        stacked = torch.cat(per_level, 1)
+        return stacked[:, :, None] * valid_ratios[:, None]
+
+    def forward(self,
+                # for images
+                src: Tensor,
+                pos: Tensor,
+                spatial_shapes: Tensor,
+                level_start_index: Tensor,
+                valid_ratios: Tensor,
+                key_padding_mask: Tensor,
+                # for texts
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                pos_text: Tensor = None,
+                text_self_attention_masks: Tensor = None,
+                position_ids: Tensor = None,
+                ):
+        """Run the image layers, interleaved with the optional fusion and text layers.
+
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - key_padding_mask: [bs, sum(hi*wi)]
+            - memory_text: bs, n_text, 256
+            - text_attention_mask: bs, n_text
+                False for no padding; True for padding
+            - pos_text: bs, n_text, 256; overridden from position_ids if those are given
+            - text_self_attention_masks: inverted (~) before reaching the text layers
+            - position_ids: bs, n_text
+        Intermediate:
+            - reference_points: [bs, sum(hi*wi), num_level, 2]
+        Output:
+            - (output, memory_text): output is [bs, sum(hi*wi), 256]
+        """
+
+        output = src
+
+        # preparation and reshape
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+
+        if self.text_layers:
+            # generate pos_text
+            bs, n_text, text_dim = memory_text.shape
+            if pos_text is None and position_ids is None:
+                pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs,
+                                                                                                                     1,
+                                                                                                                     1)
+                pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)
+            if position_ids is not None:
+                pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)
+
+        # main process
+        for layer_id, layer in enumerate(self.layers):
+            # Per layer: fusion (if any), then text enhancement (if any), then the image layer.
+            # fusion_layers and text_layers are indexed by layer_id, so when present they
+            # need one entry per image layer.
+            if self.fusion_layers:
+                if self.use_checkpoint:
+                    output, memory_text = checkpoint.checkpoint(
+                        self.fusion_layers[layer_id],
+                        output,
+                        memory_text,
+                        key_padding_mask,
+                        text_attention_mask
+                    )
+                else:
+                    output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
+                                                                       attention_mask_v=key_padding_mask,
+                                                                       attention_mask_l=text_attention_mask)
+
+            if self.text_layers:
+                memory_text = self.text_layers[layer_id](
+                    src=memory_text.transpose(0, 1),
+                    src_mask=~text_self_attention_masks,  # note we use ~ for mask here
+                    src_key_padding_mask=text_attention_mask,
+                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
+                ).transpose(0, 1)
+
+            # image encoder layer, optionally under activation checkpointing
+            if self.use_transformer_ckpt:
+                output = checkpoint.checkpoint(
+                    layer,
+                    output,
+                    pos,
+                    reference_points,
+                    spatial_shapes,
+                    level_start_index,
+                    key_padding_mask
+                )
+            else:
+                output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
+                               level_start_index=level_start_index, key_padding_mask=key_padding_mask)
+
+        return output, memory_text
+
+
+class TransformerDecoder(nn.Module):
+
+    def __init__(self, decoder_layer, num_layers, norm=None,
+                 return_intermediate=False,
+                 d_model=256, query_dim=4,
+                 modulate_hw_attn=False,
+                 num_feature_levels=1,
+                 deformable_decoder=False,
+                 decoder_query_perturber=None,
+                 dec_layer_number=None,  # number of queries each layer in decoder
+                 rm_dec_query_scale=False,
+                 dec_layer_share=False,
+                 dec_layer_dropout_prob=None,
+                 use_detached_boxes_dec_out=False,
+                 num_box_decoder_layers=2,
+                 num_body_points=68,
+                 ):
+        super().__init__()
+        if num_layers > 0:
+            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
+        else:
+            self.layers = []
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+        assert return_intermediate, "support return_intermediate only"
+        self.query_dim = query_dim
+        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
+        self.num_feature_levels = num_feature_levels
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+
+        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
+        if not deformable_decoder:
+            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
+        else:
+            self.query_pos_sine_scale = None
+
+        if rm_dec_query_scale:
+            self.query_scale = None
+        else:
+            raise NotImplementedError
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+        self.bbox_embed = None
+        self.class_embed = None
+        self.pose_embed = None
+        self.pose_hw_embed = None
+        self.d_model = d_model
+        self.modulate_hw_attn = modulate_hw_attn
+        self.deformable_decoder = deformable_decoder
+
+        if not deformable_decoder and modulate_hw_attn:
+            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
+        else:
+            self.ref_anchor_head = None
+
+        self.decoder_query_perturber = decoder_query_perturber
+        self.box_pred_damping = None
+
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            assert isinstance(dec_layer_number, list)
+            assert len(dec_layer_number) == num_layers
+            # assert dec_layer_number[0] ==
+
+        self.dec_layer_dropout_prob = dec_layer_dropout_prob
+        if dec_layer_dropout_prob is not None:
+            assert isinstance(dec_layer_dropout_prob, list)
+            assert len(dec_layer_dropout_prob) == num_layers
+            for i in dec_layer_dropout_prob:
+                assert 0.0 <= i <= 1.0
+
+        self.rm_detach = None
+        self.num_body_points = num_body_points
+
+        self.hw = nn.Embedding(17, 2)
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        self.hw_append = nn.Embedding(self.num_body_points-17, 2)
+
+    def forward(self, tgt, memory,
+                tgt_mask: Optional[Tensor] = None,
+                tgt_mask2: Optional[Tensor] = None,
+                memory_mask: Optional[Tensor] = None,
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                memory_key_padding_mask: Optional[Tensor] = None,
+                pos: Optional[Tensor] = None,
+                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
+                # for memory
+                level_start_index: Optional[Tensor] = None,  # num_levels
+                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                valid_ratios: Optional[Tensor] = None,
+                # for text
+                memory_text: Optional[Tensor] = None,
+                text_attention_mask: Optional[Tensor] = None,
+                text_dict: Optional[Tensor] = None,
+                dn_meta: Optional[Tensor] = None,
+                targets: Optional[Tensor] = None,
+                kpt_embed: Optional[Tensor] = None
+                ):
+        """
+        Input:
+            - tgt: nq, bs, d_model
+            - memory: hw, bs, d_model
+            - pos: hw, bs, d_model
+            - refpoints_unsigmoid: nq, bs, 2/4
+            - valid_ratios/spatial_shapes: bs, nlevel, 2
+        """
+
+        output = tgt
+        output += self.hw.weight[0, 0] * 0.0
+
+
+        intermediate = []
+        reference_points = refpoints_unsigmoid.sigmoid()
+        ref_points = [reference_points]
+        effect_num_dn = dn_meta['pad_size'] if self.training else 0
+        inter_select_number = 50
+        for layer_id, layer in enumerate(self.layers):
+
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
+            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2
+
+            # conditional query
+            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
+            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
+            query_pos = pos_scale * raw_query_pos
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if query_pos.isnan().any() | query_pos.isinf().any():
+            #         import ipdb; ipdb.set_trace()
+
+            # main process
+            output = layer(
+                tgt=output,
+                tgt_query_pos=query_pos,
+                tgt_query_sine_embed=query_sine_embed,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                tgt_reference_points=reference_points_input,
+
+                memory_text=memory_text,
+                text_attention_mask=text_attention_mask,
+
+                memory=memory,
+                memory_key_padding_mask=memory_key_padding_mask,
+                memory_level_start_index=level_start_index,
+                memory_spatial_shapes=spatial_shapes,
+                memory_pos=pos,
+
+                self_attn_mask=tgt_mask,
+                cross_attn_mask=memory_mask
+            )
+            if output.isnan().any() | output.isinf().any():
+                print(f"output layer_id {layer_id} is nan")
+                try:
+                    num_nan = output.isnan().sum().item()
+                    num_inf = output.isinf().sum().item()
+                    print(f"num_nan {num_nan}, num_inf {num_inf}")
+                except Exception as e:
+                    print(e)
+
+
+
+
+            intermediate.append(self.norm(output))
+            # iter update
+            if layer_id < self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                delta_unsig = self.bbox_embed[layer_id](output)
+                outputs_unsig = delta_unsig + reference_before_sigmoid
+                new_reference_points = outputs_unsig.sigmoid()
+
+            # select # ref points as anchors
+            if layer_id == self.num_box_decoder_layers - 1:
+                dn_output = output[:effect_num_dn]
+                dn_new_reference_points = new_reference_points[:effect_num_dn]
+                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
+                                   effect_num_dn:].transpose(0, 1)
+                topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]
+                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
+                                                            topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+                new_output_for_box = torch.gather(output[effect_num_dn:], 0,
+                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+                keypoint_embed=kpt_embed.transpose(0, 1)
+
+                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)
+                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
+                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
+                num_queries, _, bs, _ = keypoint_xy.shape
+                aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)
+                keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
+                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
+                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
+                new_reference_points = torch.cat(
+                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
+                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
+                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
+                output = torch.cat((dn_output, output), dim=0)
+                tgt_mask = tgt_mask2
+
+            if layer_id >= self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                output_bbox_dn = output[:effect_num_dn]
+                output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]
+                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
+                reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][
+                                                     0::(self.num_body_points + 1)]
+                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
+                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
+                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
+                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
+                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
+                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
+                output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))
+                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,
+                                                                                                      device=output.device)).clone()  ##
+                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
+                outputs_unsig[..., 2:] += delta_hw_unsig
+                new_reference_points_for_keypoint = outputs_unsig.sigmoid()
+                bs = new_reference_points_for_box_norm.shape[1]
+                new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),
+                                                       new_reference_points_for_keypoint.view(-1, self.num_body_points,
+                                                                                              bs, 4)), dim=1).flatten(0,
+                                                                                                                      1)
+                new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)
+
+            if self.rm_detach and 'dec' in self.rm_detach:
+                reference_points = new_reference_points
+            else:
+                reference_points = new_reference_points.detach()
+
+            # if layer_id != self.num_layers - 1:
+            if self.use_detached_boxes_dec_out:
+                ref_points.append(reference_points)
+            else:
+                ref_points.append(new_reference_points)
+
+        return [
+            [itm_out.transpose(0, 1) for itm_out in intermediate],
+            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
+        ]
+
+
+class DeformableTransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+
+        # self attention
+        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # channel attention
+        self.add_channel_attention = add_channel_attention
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        # self attention
+        # import ipdb; ipdb.set_trace()
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index,
+                              key_padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+
+        # ffn
+        src = self.forward_ffn(src)
+
+        # channel attn
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+
+        return src
+
+
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_text_feat_guide=False,
+                 use_text_cross_attention=False,
+                 ffn_extra_layernorm=False
+                 ):
+        super().__init__()
+
+        # cross attention
+        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # cross attention text
+        if use_text_cross_attention:
+            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+            self.catext_norm = nn.LayerNorm(d_model)
+
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm3 = nn.LayerNorm(d_model)
+        if ffn_extra_layernorm:
+            raise NotImplementedError('ffn_extra_layernorm not implemented')
+            self.norm_ext = nn.LayerNorm(d_ffn)
+        else:
+            self.norm_ext = None
+
+        self.key_aware_proj = None
+        self.use_text_feat_guide = use_text_feat_guide
+        assert not use_text_feat_guide
+        self.use_text_cross_attention = use_text_cross_attention
+
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt, ipdb_flag=False):
+
+        with torch.cuda.amp.autocast(enabled=False):
+            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4
+
+                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
+                text_attention_mask: Optional[Tensor] = None,  # bs, num_token
+
+                # for memory
+                memory: Optional[Tensor] = None,  # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None,  # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None,  # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
+                ):
+        """
+        Input:
+            - tgt/tgt_query_pos: nq, bs, d_model
+            -
+        """
+        assert cross_attn_mask is None
+
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            q = k = self.with_pos_embed(tgt, tgt_query_pos)
+            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+            tgt = tgt + self.dropout2(tgt2)
+            tgt = self.norm2(tgt)
+
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if tgt.isnan().any() | tgt.isinf().any() :
+            #         import ipdb; ipdb.set_trace()
+
+        if self.use_text_cross_attention:
+            tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),
+                                memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
+            tgt = tgt + self.catext_dropout(tgt2)
+            tgt = self.catext_norm(tgt)
+
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+            #         import ipdb; ipdb.set_trace()
+
+            # if tgt.isnan().any() | tgt.isinf().any() :
+            #     import ipdb; ipdb.set_trace()
+
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
+                               memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     tgtk = tgt.clone()
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         import ipdb; ipdb.set_trace()
+
+        # ffn
+        tgt = self.forward_ffn(tgt)
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         tgtk = self.forward_ffn(tgtk, ipdb_flag=True)
+        #         import ipdb; ipdb.set_trace()
+
+        return tgt
+
+
+def _get_clones(module, N, layer_share=False):
+    # import ipdb; ipdb.set_trace()
+    if layer_share:
+        return nn.ModuleList([module for i in range(N)])
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def build_deformable_transformer(args):
+    decoder_query_perturber = None
+    if args.decoder_layer_noise:
+        from .utils import RandomBoxPerturber
+        decoder_query_perturber = RandomBoxPerturber(
+            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
+            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)
+
+    use_detached_boxes_dec_out = False
+    try:
+        use_detached_boxes_dec_out = args.use_detached_boxes_dec_out
+    except:
+        use_detached_boxes_dec_out = False
+
+    binary_query_selection = False
+    try:
+        binary_query_selection = args.binary_query_selection
+    except:
+        binary_query_selection = False
+
+    ffn_extra_layernorm = False
+    try:
+        ffn_extra_layernorm = args.ffn_extra_layernorm
+    except:
+        print('ffn_extra_layernorm not found, set to False')
+        ffn_extra_layernorm = False
+
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        dropout=args.dropout,
+        nhead=args.nheads,
+        num_queries=args.num_queries,
+        dim_feedforward=args.dim_feedforward,
+        num_encoder_layers=args.enc_layers,
+        num_unicoder_layers=args.unic_layers,
+        num_decoder_layers=args.dec_layers,
+        normalize_before=args.pre_norm,
+        return_intermediate_dec=True,
+        query_dim=args.query_dim,
+        activation=args.transformer_activation,
+        num_patterns=args.num_patterns,
+        modulate_hw_attn=True,
+
+        deformable_encoder=True,
+        deformable_decoder=True,
+        num_feature_levels=args.num_feature_levels,
+        enc_n_points=args.enc_n_points,
+        dec_n_points=args.dec_n_points,
+        use_deformable_box_attn=args.use_deformable_box_attn,
+        box_attn_type=args.box_attn_type,
+
+        learnable_tgt_init=True,
+        decoder_query_perturber=decoder_query_perturber,
+
+        add_channel_attention=args.add_channel_attention,
+        add_pos_value=args.add_pos_value,
+        random_refpoints_xy=args.random_refpoints_xy,
+
+        # two stage
+        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
+        two_stage_pat_embed=args.two_stage_pat_embed,
+        two_stage_add_query_num=args.two_stage_add_query_num,
+        two_stage_learn_wh=args.two_stage_learn_wh,
+        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
+        dec_layer_number=args.dec_layer_number,
+        rm_self_attn_layers=None,
+        key_aware_type=None,
+        layer_share_type=None,
+
+        rm_detach=None,
+        decoder_sa_type=args.decoder_sa_type,
+        module_seq=args.decoder_module_seq,
+
+        embed_init_tgt=args.embed_init_tgt,
+        use_detached_boxes_dec_out=use_detached_boxes_dec_out,
+        use_text_enhancer=args.use_text_enhancer,
+        use_fusion_layer=args.use_fusion_layer,
+        use_checkpoint=args.use_checkpoint,
+        use_transformer_ckpt=args.use_transformer_ckpt,
+        use_text_cross_attention=args.use_text_cross_attention,
+
+        text_dropout=args.text_dropout,
+        fusion_dropout=args.fusion_dropout,
+        fusion_droppath=args.fusion_droppath,
+
+        binary_query_selection=binary_query_selection,
+        ffn_extra_layernorm=ffn_extra_layernorm,
+    )
diff --git a/src/models/XPose/models/UniPose/fuse_modules.py b/src/models/XPose/models/UniPose/fuse_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d9e330fcb4764fa9b4c1f54936562708cc7a90f
--- /dev/null
+++ b/src/models/XPose/models/UniPose/fuse_modules.py
@@ -0,0 +1,276 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# from timm.models.layers import DropPath
+from src.models.util import DropPath
+
+
+class FeatureResizer(nn.Module):
+    """
+    This class takes as input a set of embeddings of dimension C1 and outputs a set of
+    embedding of dimension C2, after a linear transformation, dropout and normalization (LN).
+    """
+
+    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
+        super().__init__()
+        self.do_ln = do_ln
+        # Object feature encoding
+        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
+        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, encoder_features):
+        x = self.fc(encoder_features)
+        if self.do_ln:
+            x = self.layer_norm(x)
+        output = self.dropout(x)
+        return output
+
+
+def l1norm(X, dim, eps=1e-8):
+    """L1-normalize columns of X
+    """
+    norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
+    X = torch.div(X, norm)
+    return X
+
+
+def l2norm(X, dim, eps=1e-8):
+    """L2-normalize columns of X
+    """
+    norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
+    X = torch.div(X, norm)
+    return X
+
+
+def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
+    """
+    query: (n_context, queryL, d)
+    context: (n_context, sourceL, d)
+    """
+    batch_size_q, queryL = query.size(0), query.size(1)
+    batch_size, sourceL = context.size(0), context.size(1)
+
+    # Get attention
+    # --> (batch, d, queryL)
+    queryT = torch.transpose(query, 1, 2)
+
+    # (batch, sourceL, d)(batch, d, queryL)
+    # --> (batch, sourceL, queryL)
+    attn = torch.bmm(context, queryT)
+    if raw_feature_norm == "softmax":
+        # --> (batch*sourceL, queryL)
+        attn = attn.view(batch_size * sourceL, queryL)
+        attn = nn.Softmax()(attn)
+        # --> (batch, sourceL, queryL)
+        attn = attn.view(batch_size, sourceL, queryL)
+    elif raw_feature_norm == "l2norm":
+        attn = l2norm(attn, 2)
+    elif raw_feature_norm == "clipped_l2norm":
+        attn = nn.LeakyReLU(0.1)(attn)
+        attn = l2norm(attn, 2)
+    else:
+        raise ValueError("unknown first norm type:", raw_feature_norm)
+    # --> (batch, queryL, sourceL)
+    attn = torch.transpose(attn, 1, 2).contiguous()
+    # --> (batch*queryL, sourceL)
+    attn = attn.view(batch_size * queryL, sourceL)
+    attn = nn.Softmax()(attn * smooth)
+    # --> (batch, queryL, sourceL)
+    attn = attn.view(batch_size, queryL, sourceL)
+    # --> (batch, sourceL, queryL)
+    attnT = torch.transpose(attn, 1, 2).contiguous()
+
+    # --> (batch, d, sourceL)
+    contextT = torch.transpose(context, 1, 2)
+    # (batch x d x sourceL)(batch x sourceL x queryL)
+    # --> (batch, d, queryL)
+    weightedContext = torch.bmm(contextT, attnT)
+    # --> (batch, queryL, d)
+    weightedContext = torch.transpose(weightedContext, 1, 2)
+
+    return weightedContext, attnT
+
+
+class BiMultiHeadAttention(nn.Module):
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
+        super(BiMultiHeadAttention, self).__init__()
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.v_dim = v_dim
+        self.l_dim = l_dim
+
+        assert (
+                self.head_dim * self.num_heads == self.embed_dim
+        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+        self.scale = self.head_dim ** (-0.5)
+        self.dropout = dropout
+
+        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
+        self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
+
+        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
+        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
+
+        self.stable_softmax_2d = True
+        self.clamp_min_for_underflow = True
+        self.clamp_max_for_overflow = True
+
+        self._reset_parameters()
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def _reset_parameters(self):
+        nn.init.xavier_uniform_(self.v_proj.weight)
+        self.v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.l_proj.weight)
+        self.l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_v_proj.weight)
+        self.values_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_l_proj.weight)
+        self.values_l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_v_proj.weight)
+        self.out_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_l_proj.weight)
+        self.out_l_proj.bias.data.fill_(0)
+
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
+        """Attend vision -> language and language -> vision from one shared score matrix.
+
+        Args:
+            v: vision features, (bs, n_img, dim).
+            l: language features, (bs, n_text, dim).
+            attention_mask_v (optional): (bs, n_img); positions set in it are filled with -inf before the language-side softmax.
+            attention_mask_l (optional): (bs, n_text); positions set in it are filled with -inf before the vision-side softmax.
+
+        Returns:
+            (attn_output_v, attn_output_l): out_v_proj / out_l_proj applied to (bs, n_img, embed_dim) / (bs, n_text, embed_dim) tensors.
+        """
+        # tgt_len counts the vision tokens (queries); src_len, read further down,
+        # counts the language tokens (keys).
+        bsz, tgt_len, _ = v.size()
+
+        query_states = self.v_proj(v) * self.scale
+        key_states = self._shape(self.l_proj(l), -1, bsz)
+        value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
+        value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_v_states = value_v_states.view(*proj_shape)
+        value_l_states = value_l_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))  # bs*nhead, nimg, ntxt
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        if self.stable_softmax_2d:
+            attn_weights = attn_weights - attn_weights.max()  # shift by the global max over the whole tensor
+
+        if self.clamp_min_for_underflow:
+            attn_weights = torch.clamp(attn_weights,
+                                       min=-50000)  # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights = torch.clamp(attn_weights,
+                                       max=50000)  # Do not increase 50000, data type half has quite limited range
+
+        attn_weights_T = attn_weights.transpose(1, 2)  # bs*nhead, ntxt, nimg
+        attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
+            0])  # subtract each row's max before the language-side softmax
+        if self.clamp_min_for_underflow:
+            attn_weights_l = torch.clamp(attn_weights_l,
+                                         min=-50000)  # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights_l = torch.clamp(attn_weights_l,
+                                         max=50000)  # Do not increase 50000, data type half has quite limited range
+
+        # mask vision for language
+        if attention_mask_v is not None:
+            attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))
+
+        attn_weights_l = attn_weights_l.softmax(dim=-1)
+
+        # mask language for vision
+        if attention_mask_l is not None:
+            attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(attention_mask_l, float('-inf'))
+        attn_weights_v = attn_weights.softmax(dim=-1)
+
+        attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
+        attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
+
+        attn_output_v = torch.bmm(attn_probs_v, value_l_states)  # vision queries gather language values
+        attn_output_l = torch.bmm(attn_probs_l, value_v_states)  # language tokens gather vision values
+
+        if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
+            )
+
+        if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
+            )
+
+        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output_v = attn_output_v.transpose(1, 2)
+        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
+        attn_output_l = attn_output_l.transpose(1, 2)
+        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
+
+        attn_output_v = self.out_v_proj(attn_output_v)
+        attn_output_l = self.out_l_proj(attn_output_l)
+
+        return attn_output_v, attn_output_l
+
+
+# Bi-Direction MHA (text->image, image->text)
+class BiAttentionBlock(nn.Module):
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
+                 drop_path=.0, init_values=1e-4, cfg=None):
+        """
+        Inputs:
+            v_dim / l_dim - Feature sizes of the vision / language tokens (LayerNorm and layer-scale width)
+            embed_dim, num_heads, dropout - Forwarded unchanged to BiMultiHeadAttention
+            drop_path - DropPath rate for the residual branches; 0 disables it (nn.Identity)
+            init_values - Constant that fills the frozen layer-scale vectors gamma_v / gamma_l
+            cfg - Accepted but not used by this block
+        """
+        super().__init__()
+
+        # Pre-norm: each modality is normalised before the bi-directional attention.
+        self.layer_norm_v = nn.LayerNorm(v_dim)
+        self.layer_norm_l = nn.LayerNorm(l_dim)
+        attn_kwargs = dict(v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim,
+                           num_heads=num_heads, dropout=dropout)
+        self.attn = BiMultiHeadAttention(**attn_kwargs)
+
+        # Layer scale on the attention deltas (requires_grad=False, so the scales are not trained).
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=False)
+        self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=False)
+
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
+        # Note: the residual is added to the normalised features, not to the raw inputs.
+        normed_v, normed_l = self.layer_norm_v(v), self.layer_norm_l(l)
+        upd_v, upd_l = self.attn(normed_v, normed_l,
+                                 attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l)
+        out_v = normed_v + self.drop_path(self.gamma_v * upd_v)
+        return out_v, normed_l + self.drop_path(self.gamma_l * upd_l)
diff --git a/src/models/XPose/models/UniPose/mask_generate.py b/src/models/XPose/models/UniPose/mask_generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed79e74d724b11b761e9a762099017e105d87df1
--- /dev/null
+++ b/src/models/XPose/models/UniPose/mask_generate.py
@@ -0,0 +1,56 @@
+import torch
+
+
+def prepare_for_mask(kpt_mask):
+    """Build the boolean mask over 50 query groups of 69 slots each.
+
+    Args:
+        kpt_mask: (bs, length) tensor. The layout below presumably expects
+            length == 69, i.e. one entry per slot of a group (TODO confirm with caller).
+
+    Returns:
+        (None, None, None, attn_mask2, None). attn_mask2 is a bool tensor of shape
+        (bs * 8, 3450, 3450) on kpt_mask's device; it is True for pairs from different
+        groups and, inside each window, for pairs whose kpt_mask entries differ.
+    """
+    num_heads = 8
+    group_bbox_kpt = 69
+    num_group = 50
+    tgt_size2 = num_group * group_bbox_kpt
+    bs, length = kpt_mask.shape
+
+    # Mark every pair of slots that belong to different groups. A single broadcast
+    # comparison replaces the per-row Python loop, and the mask is allocated on the
+    # input's device rather than on a hard-coded 'cuda'.
+    group_id = torch.arange(tgt_size2, device=kpt_mask.device) // group_bbox_kpt
+    cross_group = group_id[:, None] != group_id[None, :]
+    attn_mask2 = cross_group[None, None].repeat(bs, num_heads, 1, 1)
+
+    # Overwrite the square window [idx * length, (idx + 1) * length) of every group
+    # index with the kpt_mask mismatch pattern (True where the two entries differ).
+    mismatch = (kpt_mask[:, :, None] != kpt_mask[:, None, :]).unsqueeze(1)
+    mismatch = mismatch.expand(-1, num_heads, -1, -1)
+    for idx in range(num_group):
+        window = slice(idx * length, (idx + 1) * length)
+        attn_mask2[:, :, window, window] = mismatch
+
+    return None, None, None, attn_mask2.flatten(0, 1), None
+
+
+def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
+    """Split off the leading dn_meta['pad_size'] queries and stash them in dn_meta.
+
+    Without dn_meta, or when pad_size is not positive, both inputs are returned unchanged.
+    """
+    if not (dn_meta and dn_meta['pad_size'] > 0):
+        return outputs_class, outputs_coord
+    pad = dn_meta['pad_size']
+    known_class = [cls_i[:, :pad, :] for cls_i in outputs_class]
+    known_coord = [box_i[:, :pad, :] for box_i in outputs_coord]
+    known = {'pred_logits': known_class[-1], 'pred_boxes': known_coord[-1]}
+    if aux_loss:
+        known['aux_outputs'] = _set_aux_loss(known_class, known_coord)
+    dn_meta['output_known_lbs_bboxes'] = known
+    return [cls_i[:, pad:, :] for cls_i in outputs_class], [box_i[:, pad:, :] for box_i in outputs_coord]
+
+
diff --git a/src/models/XPose/models/UniPose/ops/__init__.py b/src/models/XPose/models/UniPose/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3239d927e0762a4952006a55a8596998e0ac03
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/8/5 21:58
+# @Author  : shaoguowen
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/XPose/models/UniPose/ops/functions/__init__.py b/src/models/XPose/models/UniPose/ops/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a2197bda3199aa32cafc5b9d396479609853dd2
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/functions/__init__.py
@@ -0,0 +1,10 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn_func import MSDeformAttnFunction
+
diff --git a/src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py b/src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c5df8cf5d23aca963eec6c1133c180b37289607
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py
@@ -0,0 +1,61 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+import MultiScaleDeformableAttention as MSDA
+
+
+class MSDeformAttnFunction(Function):
+    """Autograd bridge to the compiled MultiScaleDeformableAttention (MSDA) kernels."""
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        # Stash the step size and the tensors that backward() hands back to the kernel.
+        ctx.im2col_step = im2col_step
+        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        return MSDA.ms_deform_attn_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step)
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        # Gradients flow to value, sampling_locations and attention_weights only; the rest get None.
+        saved = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
+            *saved, grad_output, ctx.im2col_step)
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    """Pure-PyTorch multi-scale deformable attention (debug/test stand-in for the CUDA op).
+
+    Shapes read below: value (N, S, M, D); sampling_locations (N, Lq, M, L, P, 2);
+    result (N, Lq, M*D).
+    """
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    per_level_values = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    grids = 2 * sampling_locations - 1  # maps 0..1 locations onto the -1..1 range grid_sample expects
+    sampled = []
+    for level, (H_, W_) in enumerate(value_spatial_shapes):
+        # (N, H*W, M, D) -> (N*M, D, H, W) feature map of this level
+        feat = per_level_values[level].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # (N, Lq, M, P, 2) -> (N*M, Lq, P, 2) sampling grid of this level
+        grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
+        # bilinear lookup -> (N*M, D, Lq, P)
+        sampled.append(F.grid_sample(feat, grid, mode='bilinear', padding_mode='zeros', align_corners=False))
+    weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    stacked = torch.stack(sampled, dim=-2).flatten(-2)  # (N*M, D, Lq, L*P)
+    return (stacked * weights).sum(-1).view(N_, M_*D_, Lq_).transpose(1, 2).contiguous()
diff --git a/src/models/XPose/models/UniPose/ops/modules/__init__.py b/src/models/XPose/models/UniPose/ops/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f82cb1ad9d634a87b54ba6a71b58a230bcade5fe
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/modules/__init__.py
@@ -0,0 +1,9 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn import MSDeformAttn
diff --git a/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn.py b/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad74ca501825b5fa975d6169ed883aca9372b3fd
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn.py
@@ -0,0 +1,142 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math, os
+import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+from src.models.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction
+
+
+def _is_power_of_2(n):  # True iff n is a positive power of two; non-int or negative n raises ValueError
+    if not (isinstance(n, int) and n >= 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return n != 0 and n & (n - 1) == 0
+
+
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :param use_4D_normalizer  for 4-d reference boxes, divide offsets by each level's (W, H)
+                                  instead of by n_points (see forward)
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        # step size handed to the compiled kernel on every call
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self.use_4D_normalizer = use_4D_normalizer
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Zero the offset/attention layers, Xavier-init the projections, seed the offset bias."""
+        constant_(self.sampling_offsets.weight.data, 0.)
+        # one direction per head, evenly spaced in angle and normalised by its max-abs component
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        # the i-th sampling point starts (i + 1) steps along its head's direction
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        """
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, sum_l H_l * W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, sum_l H_l * W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+
+        # sampling_locations: N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+
+        # The kernel call below is fed float32 whenever the values are half precision.
+        # attention_weights is cast as well: under autocast softmax already yields
+        # float32, but in a plain .half() model it stays float16 and would otherwise
+        # reach the kernel with a dtype different from value.
+        run_in_fp32 = value.dtype == torch.float16
+        if run_in_fp32:
+            value = value.to(torch.float32)
+            sampling_locations = sampling_locations.to(torch.float32)
+            attention_weights = attention_weights.to(torch.float32)
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        if run_in_fp32:
+            output = output.to(torch.float16)
+        return self.output_proj(output)
diff --git a/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py b/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a4c9a0d5dc7e9e8c80120e8a1cb10d5e9402408
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py
@@ -0,0 +1,130 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math, os
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+try:
+    from src.models.XPose.models.UniPose.ops.functions import MSDeformAttnFunction
+except:
+    warnings.warn('Failed to import MSDeformAttnFunction.')
+
+
+def _is_power_of_2(n):  # True iff n is a positive power of two; non-int or negative n raises ValueError
+    if not (isinstance(n, int) and n >= 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return n != 0 and n & (n - 1) == 0
+
+
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """Multi-Scale Deformable Attention Module (this variant's forward() also accepts a key)
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :param use_4D_normalizer  with 4-d reference boxes, divide offsets by each level's (W, H) instead of by n_points
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self.use_4D_normalizer = use_4D_normalizer
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)  # one angle per head
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1  # point i starts (i + 1) steps along its head's direction
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        """
+        :param query                       (N, Length_{query}, C)
+        :param key                         (N, 1, C); accepted but never read in this method
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, sum_l H_l * W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, sum_l H_l * W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+
+        # Sampling locations = reference point + offset. For 2-d references the offset is divided by
+        # the (W, H) of each level; for 4-d references it is additionally scaled by half the reference (w, h).
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        return output
diff --git a/src/models/XPose/models/UniPose/ops/setup.py b/src/models/XPose/models/UniPose/ops/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..681f5f0668c1752a4387c73a34d403197747bc0f
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/setup.py
@@ -0,0 +1,83 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+"""
+python setup.py build install
+"""
+import os
+import glob
+
+import torch
+
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+from setuptools import find_packages
+from setuptools import setup
+
+requirements = ["torch", "torchvision"]
+
+def get_extensions():
+    """Describe the MultiScaleDeformableAttention extension module for setup().
+
+    Sources are the *.cpp files in ./src and ./src/cpu plus the *.cu files in
+    ./src/cuda, all relative to this file.
+
+    Returns:
+        A one-element list holding the CUDAExtension.
+
+    Raises:
+        NotImplementedError: torch reports no CUDA device, or CUDA_HOME is None.
+    """
+    here = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(here, "src")
+
+    def find(*parts):
+        return glob.glob(os.path.join(extensions_dir, *parts))
+
+    # Order: top-level sources, then cpu/, then cuda/.
+    found = find("*.cpp") + find("cpu", "*.cpp") + find("cuda", "*.cu")
+
+    if not (torch.cuda.is_available() and CUDA_HOME is not None):
+        raise NotImplementedError('Cuda is not availabel')
+
+    nvcc_flags = [
+        "-DCUDA_HAS_FP16=1",
+        "-D__CUDA_NO_HALF_OPERATORS__",
+        "-D__CUDA_NO_HALF_CONVERSIONS__",
+        "-D__CUDA_NO_HALF2_OPERATORS__",
+        # The following lines target several CUDA architectures at once.
+        "-gencode=arch=compute_60,code=sm_60",
+        "-gencode=arch=compute_70,code=sm_70",
+        "-gencode=arch=compute_75,code=sm_75",
+        "-gencode=arch=compute_80,code=sm_80",
+        "-gencode=arch=compute_86,code=sm_86",
+        "-gencode=arch=compute_89,code=sm_89",
+        "-gencode=arch=compute_90,code=sm_90",
+    ]
+    # glob returned absolute paths already, so this join leaves them unchanged.
+    sources = [os.path.join(extensions_dir, path) for path in found]
+    module = CUDAExtension(
+        "MultiScaleDeformableAttention",
+        sources,
+        include_dirs=[extensions_dir],
+        define_macros=[("WITH_CUDA", None)],
+        extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
+    )
+    return [module]
+
+setup(
+    name="MultiScaleDeformableAttention",  # same name that ms_deform_attn_func.py imports as MSDA
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),  # raises NotImplementedError when CUDA is unavailable
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp b/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1bf854de1f3860d20b6fef5c1a17817c268e70a
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp
@@ -0,0 +1,41 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+/*!
+ * CPU forward entry point for multi-scale deformable attention.
+ *
+ * No CPU implementation exists: the function ignores all of its arguments and
+ * always throws a c10::Error carrying the message "Not implement on cpu".
+ */
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // TORCH_CHECK(false, ...) is the supported replacement for the deprecated
+    // AT_ERROR macro; it raises the same exception type with the same message.
+    TORCH_CHECK(false, "Not implement on cpu");
+}
+
+/*!
+ * CPU backward entry point for multi-scale deformable attention.
+ *
+ * No CPU implementation exists: the function ignores all of its arguments and
+ * always throws a c10::Error carrying the message "Not implement on cpu".
+ */
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // TORCH_CHECK(false, ...) is the supported replacement for the deprecated
+    // AT_ERROR macro; it raises the same exception type with the same message.
+    TORCH_CHECK(false, "Not implement on cpu");
+}
+
diff --git a/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h b/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..81b7b58a3d9502bbb684dc84687a526dedf94cae
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,33 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+// CPU forward entry point of multi-scale deformable attention.
+// The definition in cpu/ms_deform_attn_cpu.cpp is a stub that always raises
+// "Not implement on cpu"; it never returns a tensor.
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+// CPU backward entry point of multi-scale deformable attention.
+// The definition in cpu/ms_deform_attn_cpu.cpp is a stub that always raises
+// "Not implement on cpu"; it never returns gradients.
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
+
diff --git a/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6d583647cce987196d5ad1968a8a365a379e774
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,153 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+/*!
+ * Forward pass of multi-scale deformable attention on CUDA.
+ *
+ * Every tensor argument must be a contiguous CUDA tensor. Dimensions are read as:
+ *   value:             size(0)=batch, size(1)=spatial_size, size(2)=num_heads, size(3)=channels
+ *   spatial_shapes:    size(0)=num_levels, elements read as int64
+ *   level_start_index: elements read as int64
+ *   sampling_loc:      size(1)=num_query, size(4)=num_point
+ *   attn_weight:       same scalar type as value
+ *
+ * The batch is processed in chunks of min(batch, im2col_step) samples; that
+ * chunk size must be positive and must divide batch evenly.
+ *
+ * Returns a tensor of shape {batch, num_query, num_heads * channels}.
+ * Throws c10::Error when any of the checks above fails.
+ */
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    // Reject a zero/negative chunk size before it is used as a divisor.
+    TORCH_CHECK(im2col_step_ > 0,
+                "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style placeholders are not expanded.
+    TORCH_CHECK(batch % im2col_step_ == 0,
+                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    const int batch_n = im2col_step_;
+    const int num_steps = batch / im2col_step_;
+    // View the output as {num_steps, batch_n, ...} so each chunk can be selected by index.
+    auto output_n = output.view({num_steps, batch_n, num_query, num_heads, channels});
+    // Number of elements one sample occupies in each input tensor.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < num_steps; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data_ptr<int64_t>(),
+                level_start_index.data_ptr<int64_t>(),
+                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data_ptr<scalar_t>());
+
+        }));
+    }
+
+    // Merge the head and channel dimensions for the caller.
+    output = output.view({batch, num_query, num_heads*channels});
+
+    return output;
+}
+
+
+/*!
+ * Backward pass of multi-scale deformable attention on CUDA.
+ *
+ * Every tensor argument must be a contiguous CUDA tensor. Dimensions are read as:
+ *   value:             size(0)=batch, size(1)=spatial_size, size(2)=num_heads, size(3)=channels
+ *   spatial_shapes:    size(0)=num_levels, elements read as int64
+ *   level_start_index: elements read as int64
+ *   sampling_loc:      size(1)=num_query, size(4)=num_point
+ *   grad_output:       viewed as {batch / step, step, num_query, num_heads, channels}
+ *
+ * The batch is processed in chunks of step = min(batch, im2col_step) samples;
+ * that chunk size must be positive and must divide batch evenly.
+ *
+ * Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each zero-initialised
+ * with the shape of the corresponding input and filled in by the col2im launcher.
+ * Throws c10::Error when any of the checks above fails.
+ */
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    // Reject a zero/negative chunk size before it is used as a divisor.
+    TORCH_CHECK(im2col_step_ > 0,
+                "batch(", batch, ") and im2col_step(", im2col_step, ") must both be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style placeholders are not expanded.
+    TORCH_CHECK(batch % im2col_step_ == 0,
+                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    const int num_steps = batch / im2col_step_;
+    // Number of elements one sample occupies in each tensor.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({num_steps, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < num_steps; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data_ptr<scalar_t>(),
+                                    value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data_ptr<int64_t>(),
+                                    level_start_index.data_ptr<int64_t>(),
+                                    sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data_ptr<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
\ No newline at end of file
diff --git a/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7ae53f99c820ce6193b608ad344550348a0b42c
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,30 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+// CUDA forward pass of multi-scale deformable attention.
+// The definition in cuda/ms_deform_attn_cuda.cu asserts that all five tensors
+// are contiguous CUDA tensors, reads value as {batch, spatial_size, num_heads,
+// channels}, processes the batch in chunks of min(batch, im2col_step), and
+// returns a tensor viewed as {batch, num_query, num_heads * channels}.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+// CUDA backward pass of multi-scale deformable attention.
+// The definition in cuda/ms_deform_attn_cuda.cu asserts that all six tensors
+// are contiguous CUDA tensors and returns, in this order, the gradients
+// {grad_value, grad_sampling_loc, grad_attn_weight}, each allocated with
+// zeros_like of the matching input.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
diff --git a/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6bc2acb7aea0eab2e9e91e769a16861e1652c284
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh
@@ -0,0 +1,1327 @@
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+
+// Grid-stride loop header: variable `i` starts at the thread's global linear
+// index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
+// number of launched threads (blockDim.x * gridDim.x) while it is below `n`.
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)
+
+// Thread-count constant; presumably the default/maximum block size used by the
+// kernel launchers further down in this header (not visible here) - TODO confirm.
+const int CUDA_NUM_THREADS = 1024;
+// Number of blocks of `num_threads` threads needed to cover N work items
+// (integer ceiling of N / num_threads for non-negative N).
+inline int GET_BLOCKS(const int N, const int num_threads)
+{
+  const int padded_items = N + num_threads - 1;
+  return padded_items / num_threads;
+}
+
+
+// Bilinearly samples `bottom_data` at the fractional position (h, w) for head
+// `m` and channel `c`.
+//
+// Elements are addressed as ((row * width + col) * nheads + m) * channels + c.
+// A corner whose row or column falls outside [0, height-1] x [0, width-1]
+// contributes zero and is never read.
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  // Integer grid coordinates of the four surrounding corners.
+  const int row0 = floor(h);
+  const int col0 = floor(w);
+  const int row1 = row0 + 1;
+  const int col1 = col0 + 1;
+
+  // Fractional distance from the low corner, and its complement.
+  const scalar_t frac_h = h - row0;
+  const scalar_t frac_w = w - col0;
+  const scalar_t rem_h = 1 - frac_h;
+  const scalar_t rem_w = 1 - frac_w;
+
+  // Flat offsets of the rows/columns plus the (head, channel) lane.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int row0_off = row0 * row_stride;
+  const int row1_off = row0_off + row_stride;
+  const int col0_off = col0 * col_stride;
+  const int col1_off = col0_off + col_stride;
+  const int lane = m * channels + c;
+
+  const bool row0_ok = row0 >= 0;
+  const bool col0_ok = col0 >= 0;
+  const bool row1_ok = row1 <= height - 1;
+  const bool col1_ok = col1 <= width - 1;
+
+  // Out-of-range corners stay at zero; the load only happens when in range.
+  const scalar_t v00 = (row0_ok && col0_ok) ? bottom_data[row0_off + col0_off + lane] : scalar_t(0);
+  const scalar_t v01 = (row0_ok && col1_ok) ? bottom_data[row0_off + col1_off + lane] : scalar_t(0);
+  const scalar_t v10 = (row1_ok && col0_ok) ? bottom_data[row1_off + col0_off + lane] : scalar_t(0);
+  const scalar_t v11 = (row1_ok && col1_ok) ? bottom_data[row1_off + col1_off + lane] : scalar_t(0);
+
+  const scalar_t k00 = rem_h * rem_w;
+  const scalar_t k01 = rem_h * frac_w;
+  const scalar_t k10 = frac_h * rem_w;
+  const scalar_t k11 = frac_h * frac_w;
+
+  return (k00 * v00 + k01 * v01 + k10 * v10 + k11 * v11);
+}
+
+
+// Backward counterpart of the bilinear sampler for one sampling point.
+//
+// For every in-range corner around (h, w) it atomically adds
+// corner_weight * top_grad * attn_weight into grad_value, and accumulates the
+// partial derivatives of the sample with respect to h and w.
+// It then OVERWRITES (plain stores, no atomics):
+//   *grad_attn_weight        = top_grad * sampled_value
+//   grad_sampling_loc[0]     = width  * d(sample)/dw * top_grad * attn_weight
+//   grad_sampling_loc[1]     = height * d(sample)/dh * top_grad * attn_weight
+// so the two output pointers must refer to storage owned by the calling thread.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  // Integer corners surrounding (h, w).
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  // Fractional offsets from the low corner and their complements.
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  // Flat offsets: element = ((row * width + col) * nheads + m) * channels + c.
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  // Corner (h_low, w_low).
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  // Corner (h_low, w_high).
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  // Corner (h_high, w_low).
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  // Corner (h_high, w_high).
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  // Plain stores: the destination slots are overwritten, not accumulated.
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
+
+// Variant of the bilinear backward step whose three outputs are accumulated
+// with atomicAdd instead of being overwritten.
+//
+// For every in-range corner around (h, w) it atomically adds
+// corner_weight * top_grad * attn_weight into grad_value, then atomically adds:
+//   *grad_attn_weight    += top_grad * sampled_value
+//   grad_sampling_loc[0] += width  * d(sample)/dw * top_grad * attn_weight
+//   grad_sampling_loc[1] += height * d(sample)/dh * top_grad * attn_weight
+// NOTE(review): the "_gm" suffix presumably means the outputs live in global
+// memory shared between threads - confirm against the calling kernel.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  // Integer corners surrounding (h, w).
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  // Fractional offsets from the low corner and their complements.
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  // Flat offsets: element = ((row * width + col) * nheads + m) * channels + c.
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  // Corner (h_low, w_low).
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  // Corner (h_low, w_high).
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  // Corner (h_high, w_low).
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  // Corner (h_high, w_high).
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  // Atomic accumulation: several threads may target the same output slots.
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  atomicAdd(grad_attn_weight, top_grad * val); 
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+
+// Forward kernel: each loop index produces one element of data_col.
+//
+// The flat index is decomposed with channel as the fastest-varying component,
+// followed by head, query and batch. For that (batch, query, head) triple the
+// kernel walks every level and sampling point, bilinearly samples data_value at
+// (loc_h * spatial_h - 0.5, loc_w * spatial_w - 0.5) and accumulates the sample
+// multiplied by its attention weight. Points outside the open range
+// (-1, spatial_h) x (-1, spatial_w) are skipped.
+//
+// data_sampling_loc stores (w, h) pairs; data_spatial_shapes stores (h, w)
+// pairs per level. `batch_size` is accepted for signature symmetry but unused.
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // index / channels identifies the (batch, query, head) triple.
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    // The query index itself is not needed; only strip it to reach the batch index.
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    scalar_t *data_col_ptr = data_col + index;
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    // Two location components per weight, hence the doubled offset.
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    scalar_t col = 0;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+
+// Backward kernel with a compile-time block size and a serial shared-memory
+// reduction.
+//
+// Each loop index handles one (batch, query, head, channel) element of
+// grad_col. For every level and sampling point, each thread writes its
+// contribution to the location/weight gradients into shared memory; thread 0
+// then sums all blockSize slots sequentially and stores the totals to
+// grad_sampling_loc / grad_attn_weight. grad_value is updated atomically by
+// ms_deform_attn_col2im_bilinear.
+//
+// NOTE(review): the reduction assumes blockDim.x == blockSize, that all threads
+// of a block share the same (batch, query, head) triple, and that every thread
+// of a block enters the loop body together (the body calls __syncthreads()).
+// The launch configuration is not visible here - confirm against the launcher.
+// `batch_size` is accepted for signature symmetry but unused.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // index / channels identifies the (batch, query, head) triple.
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    // The query index itself is not needed; only strip it to reach the batch index.
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Derive the output cursors from the untouched base pointers on every loop
+    // iteration; advancing the kernel parameters in place would carry the
+    // previous iteration's offset into the next one.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so skipped points contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        // All per-thread slots must be written before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int t = 1; t < blockSize; ++t)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[t];
+            sid += 2;
+          }
+
+
+          *grad_sampling_loc_ptr = _grad_w;
+          *(grad_sampling_loc_ptr + 1) = _grad_h;
+          *grad_attn_weight_ptr = _grad_a;
+        }
+        // Keep the shared slots intact until thread 0 has finished reading.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+// Backward kernel with a compile-time block size and a pairwise (halving)
+// shared-memory reduction.
+//
+// Each loop index handles one (batch, query, head, channel) element of
+// grad_col. For every level and sampling point, each thread writes its
+// contribution to the location/weight gradients into shared memory; the block
+// then folds the upper half of the slots onto the lower half repeatedly, and
+// thread 0 stores slot 0 to grad_sampling_loc / grad_attn_weight. grad_value is
+// updated atomically by ms_deform_attn_col2im_bilinear.
+//
+// NOTE(review): the halving loop visits every slot only when blockSize is a
+// power of two, and the kernel assumes blockDim.x == blockSize, one
+// (batch, query, head) triple per block, and that every thread of a block
+// enters the loop body together (the body calls __syncthreads()). The launch
+// configuration is not visible here - confirm against the launcher.
+// `batch_size` is accepted for signature symmetry but unused.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // index / channels identifies the (batch, query, head) triple.
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    // The query index itself is not needed; only strip it to reach the batch index.
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Derive the output cursors from the untouched base pointers on every loop
+    // iteration; advancing the kernel parameters in place would carry the
+    // previous iteration's offset into the next one.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so skipped points contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        // All per-thread slots must be written before the reduction starts.
+        __syncthreads();
+
+        // Fold slot (tid + s) onto slot tid, halving the active range each round.
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc_ptr = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc_ptr + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight_ptr = cache_grad_attn_weight[0];
+        }
+        // Keep the shared slots intact until thread 0 has finished reading.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+// Backward (col2im) kernel that stages per-thread partial gradients in
+// dynamically sized shared memory and lets thread 0 sum them serially.
+//
+// Each flat element index in [0, n) is decomposed, fastest to slowest, into
+// channel, head, query and batch. For every (level, point) pair the thread
+// hands its shared-memory slots to ms_deform_attn_col2im_bilinear (defined
+// elsewhere in this file); thread 0 then adds up the slots of all blockDim.x
+// threads and stores the totals in grad_sampling_loc / grad_attn_weight.
+//
+// Shared memory layout (in scalar_t units): [0, 2 * blockDim.x) holds the
+// sampling-location pairs, followed by blockDim.x attention-weight slots, so
+// the launch has to provide at least 3 * blockDim.x * sizeof(scalar_t) bytes.
+//
+// NOTE(review): thread 0 writes the block total through its own output
+// cursor, which looks like it relies on every thread of a block sharing the
+// same sampling_index (index / channels) -- confirm against the launcher.
+//
+// Parameters:
+//   n                       number of flat element indices to process
+//   grad_col                incoming gradient, read at [index]
+//   data_value              forward value tensor (read only)
+//   data_spatial_shapes     (height, width) pair per level
+//   data_level_start_index  start offset of each level inside data_value
+//   data_sampling_loc       sampling locations, two scalars (w, h) per point
+//   data_attn_weight        attention weights, one scalar per point
+//   batch_size              not read by this kernel
+//   spatial_size, num_heads, channels, num_levels, num_query, num_point
+//                           extents used for the index arithmetic
+//   grad_value              output gradient w.r.t. data_value
+//   grad_sampling_loc       output gradient w.r.t. data_sampling_loc
+//   grad_attn_weight        output gradient w.r.t. data_attn_weight
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+
+    // Decompose the flat index: channel, head, (query), batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are rebuilt from the untouched kernel arguments on every
+    // pass of CUDA_KERNEL_LOOP. Advancing the arguments themselves would carry
+    // the offset of one pass into the next and address the wrong slots.
+    scalar_t *grad_sampling_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so out-of-range samples contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        // Every slot must be written before thread 0 starts reading them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          // Serial sum over the remaining threads' slots, in thread order.
+          for (unsigned int t = 1; t < blockDim.x; ++t)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[t];
+            sid += 2;
+          }
+
+          *grad_sampling_loc_out = _grad_w;
+          *(grad_sampling_loc_out + 1) = _grad_h;
+          *grad_attn_weight_out = _grad_a;
+        }
+        // Keep the slots intact until thread 0 is done; the next point zeroes them.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_out += 1;
+        grad_sampling_loc_out += 2;
+      }
+    }
+  }
+}
+
+// Backward (col2im) kernel that stages per-thread partial gradients in
+// dynamically sized shared memory and combines them with a tree reduction.
+//
+// Each flat element index in [0, n) is decomposed, fastest to slowest, into
+// channel, head, query and batch. For every (level, point) pair the thread
+// hands its shared-memory slots to ms_deform_attn_col2im_bilinear (defined
+// elsewhere in this file); the block then reduces all slots into slot 0 and
+// thread 0 stores the totals in grad_sampling_loc / grad_attn_weight.
+//
+// Shared memory layout (in scalar_t units): [0, 2 * blockDim.x) holds the
+// sampling-location pairs, followed by blockDim.x attention-weight slots, so
+// the launch has to provide at least 3 * blockDim.x * sizeof(scalar_t) bytes.
+//
+// NOTE(review): thread 0 writes the block total through its own output
+// cursor, which looks like it relies on every thread of a block sharing the
+// same sampling_index (index / channels) -- confirm against the launcher.
+//
+// Parameters:
+//   n                       number of flat element indices to process
+//   grad_col                incoming gradient, read at [index]
+//   data_value              forward value tensor (read only)
+//   data_spatial_shapes     (height, width) pair per level
+//   data_level_start_index  start offset of each level inside data_value
+//   data_sampling_loc       sampling locations, two scalars (w, h) per point
+//   data_attn_weight        attention weights, one scalar per point
+//   batch_size              not read by this kernel
+//   spatial_size, num_heads, channels, num_levels, num_query, num_point
+//                           extents used for the index arithmetic
+//   grad_value              output gradient w.r.t. data_value
+//   grad_sampling_loc       output gradient w.r.t. data_sampling_loc
+//   grad_attn_weight        output gradient w.r.t. data_attn_weight
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+
+    // Decompose the flat index: channel, head, (query), batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are rebuilt from the untouched kernel arguments on every
+    // pass of CUDA_KERNEL_LOOP. Advancing the arguments themselves would carry
+    // the offset of one pass into the next and address the wrong slots.
+    scalar_t *grad_sampling_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so out-of-range samples contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        // Every slot must be written before the reduction reads them.
+        __syncthreads();
+
+        // Tree reduction into slot 0. `spre` is the width of the previous
+        // step and `s` is half of it; when `spre` is odd the element at
+        // index 2 * s has no partner, so the inner branch folds it in as well.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc_out = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight_out = cache_grad_attn_weight[0];
+        }
+        // Keep slot 0 intact until thread 0 has stored it; the next point zeroes it.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_out += 1;
+        grad_sampling_loc_out += 2;
+      }
+    }
+  }
+}
+
+// Backward (col2im) kernel that stages per-thread partial gradients in
+// dynamically sized shared memory, combines them with a tree reduction and
+// adds the block total to the outputs with atomicAdd.
+//
+// Each flat element index in [0, n) is decomposed, fastest to slowest, into
+// channel, head, query and batch. For every (level, point) pair the thread
+// hands its shared-memory slots to ms_deform_attn_col2im_bilinear (defined
+// elsewhere in this file); the block reduces all slots into slot 0 and
+// thread 0 atomically accumulates the result into grad_sampling_loc /
+// grad_attn_weight.
+//
+// Shared memory layout (in scalar_t units): [0, 2 * blockDim.x) holds the
+// sampling-location pairs, followed by blockDim.x attention-weight slots, so
+// the launch has to provide at least 3 * blockDim.x * sizeof(scalar_t) bytes.
+//
+// NOTE(review): the atomic accumulation presumably exists because several
+// blocks contribute to the same output slot when channels exceeds the block
+// size, and it presumably expects the outputs to be zero-initialised by the
+// caller -- confirm against the launcher.
+//
+// Parameters:
+//   n                       number of flat element indices to process
+//   grad_col                incoming gradient, read at [index]
+//   data_value              forward value tensor (read only)
+//   data_spatial_shapes     (height, width) pair per level
+//   data_level_start_index  start offset of each level inside data_value
+//   data_sampling_loc       sampling locations, two scalars (w, h) per point
+//   data_attn_weight        attention weights, one scalar per point
+//   batch_size              not read by this kernel
+//   spatial_size, num_heads, channels, num_levels, num_query, num_point
+//                           extents used for the index arithmetic
+//   grad_value              output gradient w.r.t. data_value
+//   grad_sampling_loc       output gradient w.r.t. data_sampling_loc
+//   grad_attn_weight        output gradient w.r.t. data_attn_weight
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+
+    // Decompose the flat index: channel, head, (query), batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are rebuilt from the untouched kernel arguments on every
+    // pass of CUDA_KERNEL_LOOP. Advancing the arguments themselves would carry
+    // the offset of one pass into the next and address the wrong slots.
+    scalar_t *grad_sampling_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so out-of-range samples contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        // Every slot must be written before the reduction reads them.
+        __syncthreads();
+
+        // Tree reduction into slot 0. `spre` is the width of the previous
+        // step and `s` is half of it; when `spre` is odd the element at
+        // index 2 * s has no partner, so the inner branch folds it in as well.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          // Accumulate rather than overwrite: see the NOTE(review) in the header.
+          atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
+        }
+        // Keep slot 0 intact until thread 0 has consumed it; the next point zeroes it.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_out += 1;
+        grad_sampling_loc_out += 2;
+      }
+    }
+  }
+}
+
+
+// Backward (col2im) kernel variant that uses no shared-memory staging: the
+// output pointers are handed straight to ms_deform_attn_col2im_bilinear_gm
+// (defined elsewhere in this file), which presumably accumulates into them
+// directly -- verify against that helper.
+//
+// Each flat element index in [0, n) is decomposed, fastest to slowest, into
+// channel, head, query and batch; the helper is invoked once per
+// (level, point) pair whose sampling position passes the bounds test below.
+//
+// Parameters:
+//   n                       number of flat element indices to process
+//   grad_col                incoming gradient, read at [index]
+//   data_value              forward value tensor (read only)
+//   data_spatial_shapes     (height, width) pair per level
+//   data_level_start_index  start offset of each level inside data_value
+//   data_sampling_loc       sampling locations, two scalars (w, h) per point
+//   data_attn_weight        attention weights, one scalar per point
+//   batch_size              not read by this kernel
+//   spatial_size, num_heads, channels, num_levels, num_query, num_point
+//                           extents used for the index arithmetic
+//   grad_value              output gradient w.r.t. data_value
+//   grad_sampling_loc       output gradient w.r.t. data_sampling_loc
+//   grad_attn_weight        output gradient w.r.t. data_attn_weight
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decompose the flat index: channel, head, (query), batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query index itself is not needed below
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are rebuilt from the untouched kernel arguments on every
+    // pass of CUDA_KERNEL_LOOP. Advancing the arguments themselves would carry
+    // the offset of one pass into the next and address the wrong slots.
+    scalar_t *grad_sampling_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples outside (-1, spatial_h) x (-1, spatial_w) are skipped entirely.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_sampling_loc_out, grad_attn_weight_out);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_out += 1;
+        grad_sampling_loc_out += 2;
+      }
+    }
+  }
+}
+
+
+// Host-side launcher for ms_deformable_im2col_gpu_kernel.
+//
+// Enqueues the kernel on `stream` with CUDA_NUM_THREADS threads per block and
+// GET_BLOCKS(...) blocks, passing batch_size * num_query * num_heads * channels
+// as the element count. No dynamic shared memory is requested. If
+// cudaGetLastError() reports a failure after the launch, the error string is
+// printed with printf; nothing is returned or thrown to the caller.
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  // The same count sizes the grid and is handed to the kernel as its bound.
+  const int total_elements = batch_size * num_query * num_heads * channels;
+  const int threads_per_block = CUDA_NUM_THREADS;
+
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(total_elements, threads_per_block), threads_per_block, 0, stream>>>(
+          total_elements,
+          data_value,
+          data_spatial_shapes,
+          data_level_start_index,
+          data_sampling_loc,
+          data_attn_weight,
+          batch_size,
+          spatial_size,
+          num_heads,
+          channels,
+          num_levels,
+          num_query,
+          num_point,
+          data_col);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+}
+
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size, 
+                              const int spatial_size, 
+                              const int num_heads,
+                              const int channels, 
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point, 
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+    }
+    else
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 64:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      default:
+        if (channels < 64)
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+        else
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
\ No newline at end of file
diff --git a/src/models/XPose/models/UniPose/ops/src/ms_deform_attn.h b/src/models/XPose/models/UniPose/ops/src/ms_deform_attn.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac0ef2ec25f7d0ee51ca2d807b159ddf85652017
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/ms_deform_attn.h
@@ -0,0 +1,62 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+
+#include "cpu/ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+
+
+/*!
+ * \brief Forward entry point of multi-scale deformable attention.
+ *
+ * Forwards all arguments unchanged to ms_deform_attn_cuda_forward when `value`
+ * is a CUDA tensor and the extension was built with WITH_CUDA. Every other
+ * combination raises a c10::Error:
+ *   - CUDA tensor, extension built without WITH_CUDA: "Not compiled with GPU support"
+ *   - non-CUDA tensor:                                "Not implemented on the CPU"
+ *
+ * The device check uses Tensor::is_cuda() directly; the former
+ * value.type().is_cuda() relied on the deprecated Tensor::type() accessor.
+ * The function is `inline` because it is defined in a header.
+ */
+inline at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    // Only the CUDA path is dispatched from here.
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}
+
+/*!
+ * \brief Backward entry point of multi-scale deformable attention.
+ *
+ * Forwards all arguments unchanged to ms_deform_attn_cuda_backward when `value`
+ * is a CUDA tensor and the extension was built with WITH_CUDA, and returns
+ * whatever vector of tensors that function produces. Every other combination
+ * raises a c10::Error:
+ *   - CUDA tensor, extension built without WITH_CUDA: "Not compiled with GPU support"
+ *   - non-CUDA tensor:                                "Not implemented on the CPU"
+ *
+ * The device check uses Tensor::is_cuda() directly; the former
+ * value.type().is_cuda() relied on the deprecated Tensor::type() accessor.
+ * The function is `inline` because it is defined in a header.
+ */
+inline std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    // Only the CUDA path is dispatched from here.
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}
+
+
diff --git a/src/models/XPose/models/UniPose/ops/src/vision.cpp b/src/models/XPose/models/UniPose/ops/src/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2201f63a51dca16d0b31148ed2c9e8e47ec15bdc
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/src/vision.cpp
@@ -0,0 +1,16 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "ms_deform_attn.h"
+
+// Python bindings: each dispatcher is exported under its C++ name, and the
+// same string doubles as its docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("ms_deform_attn_forward",
+        &ms_deform_attn_forward,
+        "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward",
+        &ms_deform_attn_backward,
+        "ms_deform_attn_backward");
+}
diff --git a/src/models/XPose/models/UniPose/ops/test.py b/src/models/XPose/models/UniPose/ops/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dbf6d5547d131f01a8c5c28b76557bd27a9334b
--- /dev/null
+++ b/src/models/XPose/models/UniPose/ops/test.py
@@ -0,0 +1,89 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+
+
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+S = sum([(H*W).item() for H, W in shapes])
+
+
+torch.manual_seed(3)
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)
+
+
+
diff --git a/src/models/XPose/models/UniPose/position_encoding.py b/src/models/XPose/models/UniPose/position_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed620b0417e326cc40e9b4a324b61e7da08d331
--- /dev/null
+++ b/src/models/XPose/models/UniPose/position_encoding.py
@@ -0,0 +1,157 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+
+"""
+Various positional encodings for the transformer.
+"""
+import math
+import torch
+from torch import nn
+
+from ...util.misc import NestedTensor
+
+
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            # if os.environ.get("SHILONG_AMP", None) == '1':
+            #     eps = 1e-4
+            # else:
+            #     eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+class PositionEmbeddingSineHW(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperatureH = temperatureH
+        self.temperatureW = temperatureW
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+
+        # import ipdb; ipdb.set_trace()
+
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_tx
+
+        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
+        pos_y = y_embed[:, :, :, None] / dim_ty
+
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+
+        # import ipdb; ipdb.set_trace()
+
+        return pos
+
+class PositionEmbeddingLearned(nn.Module):
+    """
+    Absolute pos embedding, learned.
+    """
+    def __init__(self, num_pos_feats=256):
+        super().__init__()
+        self.row_embed = nn.Embedding(50, num_pos_feats)
+        self.col_embed = nn.Embedding(50, num_pos_feats)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.uniform_(self.row_embed.weight)
+        nn.init.uniform_(self.col_embed.weight)
+
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        h, w = x.shape[-2:]
+        i = torch.arange(w, device=x.device)
+        j = torch.arange(h, device=x.device)
+        x_emb = self.col_embed(i)
+        y_emb = self.row_embed(j)
+        pos = torch.cat([
+            x_emb.unsqueeze(0).repeat(h, 1, 1),
+            y_emb.unsqueeze(1).repeat(1, w, 1),
+        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+        return pos
+
+
+def build_position_encoding(args):
+    N_steps = args.hidden_dim // 2
+    if args.position_embedding in ('v2', 'sine'):
+        # TODO find a better way of exposing other arguments
+        position_embedding = PositionEmbeddingSineHW(
+            N_steps, 
+            temperatureH=args.pe_temperatureH,
+            temperatureW=args.pe_temperatureW,
+            normalize=True
+        )
+    elif args.position_embedding in ('v3', 'learned'):
+        position_embedding = PositionEmbeddingLearned(N_steps)
+    else:
+        raise ValueError(f"not supported {args.position_embedding}")
+
+    return position_embedding
diff --git a/src/models/XPose/models/UniPose/swin_transformer.py b/src/models/XPose/models/UniPose/swin_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e6c5b88df694016bee93a05d50075228e59f54e
--- /dev/null
+++ b/src/models/XPose/models/UniPose/swin_transformer.py
@@ -0,0 +1,701 @@
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import numpy as np
+
+from ...util.misc import NestedTensor
+# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from src.models.util import DropPath, to_2tuple, trunc_normal_
+
+
+
+class Mlp(nn.Module):
+    """ Multilayer perceptron."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    """ Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        # FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        x = x.view(B, H, W, C)
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate; a sequence is indexed per block. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks: even-indexed blocks get shift_size 0, odd-indexed blocks get window_size // 2
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
+                norm_layer=norm_layer)
+            for i in range(depth)])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns:
+            (x, H, W, x_down, Wh, Ww): the stage output with its resolution, followed by the downsampled
+            feature with its resolution; without a downsample layer the first three are repeated.
+        """
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = h_slices  # the width is cut into the same three bands as the height
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Module):
+    """Split an image into patches and project every patch to an embedding.
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.in_chans, self.embed_dim = in_chans, embed_dim
+
+        # non-overlapping patches: the kernel and the stride are both the patch size
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
+        # the optional norm is applied to the flattened tokens in forward()
+        self.norm = norm_layer(embed_dim) if norm_layer is not None else None
+
+    def forward(self, x):
+        """Pad the input up to a multiple of the patch size, then embed it.
+        Args:
+            x: Input tensor; its size is unpacked into four dimensions, the last two read as H, W.
+        Returns:
+            Output of the projection conv, passed through the norm layer when one is set.
+        """
+        patch_h, patch_w = self.patch_size
+        _, _, H, W = x.size()
+        # padding is appended at the end of the last two dimensions only
+        if W % patch_w != 0:
+            x = F.pad(x, (0, patch_w - W % patch_w))
+        if H % patch_h != 0:
+            x = F.pad(x, (0, 0, 0, patch_h - H % patch_h))
+
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is None:
+            return x
+        Wh, Ww = x.size(2), x.size(3)
+        tokens = self.norm(x.flatten(2).transpose(1, 2))
+        return tokens.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
+
+
+class SwinTransformer(nn.Module):
+    """ Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute position embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate. Default: 0.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        dilation (bool): If True, the second-to-last stage gets no PatchMerging, i.e. one downsample less.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 dilation=False,
+                 use_checkpoint=False):
+        super().__init__()
+
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.dilation = dilation
+
+        # use_checkpoint is not stored on the backbone itself;
+        # it is only forwarded to every BasicLayer built below.
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
+
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        # prepare downsample list
+        downsamplelist = [PatchMerging for i in range(self.num_layers)]
+        downsamplelist[-1] = None
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        if self.dilation:
+            downsamplelist[-2] = None
+            num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                # channel width of this stage (the last entry is halved when dilation is set)
+                dim=num_features[i_layer],
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                # PatchMerging after the stage, or None where the list above disabled it
+                downsample=downsamplelist[i_layer],
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+
+        # per-stage channel widths; they also size the output norm layers below
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        """Freeze the parts of the backbone selected by ``frozen_stages``.
+
+        ``>= 0`` freezes the patch embedding, ``>= 1`` also stops the gradient of the absolute
+        position embedding (only when ``ape`` is set) and ``>= 2`` also puts the position dropout
+        in eval mode and freezes the first ``frozen_stages - 1`` layers.
+        """
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+
+
+    def forward_raw(self, x):
+        """Run the backbone on a plain image tensor.
+        Args:
+            x: Image batch; it is passed straight to the patch embedding.
+        Returns:
+            tuple with one (B, C, H, W) feature map for every stage listed in ``out_indices``.
+        """
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            # x_out is the output of this stage; x (with Wh, Ww) is what the next stage receives
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs.append(out)
+        # in:
+        #   torch.Size([2, 3, 1024, 1024])
+        # outs:
+        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
+        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
+        return tuple(outs)
+
+
+    def forward(self, tensor_list: NestedTensor):
+        """Run the backbone on a NestedTensor and resize its mask to every output.
+        Args:
+            tensor_list: NestedTensor whose ``tensors`` are fed to ``forward_raw`` and whose
+                ``mask`` (asserted to be not None) is interpolated to the size of each feature map.
+        Returns:
+            dict mapping the position of every feature map in the ``forward_raw`` tuple to a
+            NestedTensor holding that feature map and the boolean mask resized to it.
+        """
+        # the feature extraction is shared with forward_raw instead of being duplicated here
+        outs = self.forward_raw(tensor_list.tensors)
+        # in:
+        #   torch.Size([2, 3, 1024, 1024])
+        # out:
+        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
+        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
+
+        # collect for nesttensors
+        outs_dict = {}
+        for idx, out_i in enumerate(outs):
+            m = tensor_list.mask
+            assert m is not None
+            mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
+            outs_dict[idx] = NestedTensor(out_i, mask)
+
+        return outs_dict
+
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the frozen layers frozen.
+        Returns:
+            self, like ``nn.Module.train``, so that ``eval()`` and call chaining keep working.
+        """
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+        return self
+
+
+
+def build_swin_transformer(modelname, pretrain_img_size, **kw):
+    """Build a SwinTransformer from a named preset; entries of ``kw`` override the preset values."""
+    presets = {
+        'swin_T_224_1k': {
+            'embed_dim': 96,
+            'depths': [2, 2, 6, 2],
+            'num_heads': [3, 6, 12, 24],
+            'window_size': 7,
+        },
+        'swin_B_224_22k': {
+            'embed_dim': 128,
+            'depths': [2, 2, 18, 2],
+            'num_heads': [4, 8, 16, 32],
+            'window_size': 7,
+        },
+        'swin_B_384_22k': {
+            'embed_dim': 128,
+            'depths': [2, 2, 18, 2],
+            'num_heads': [4, 8, 16, 32],
+            'window_size': 12,
+        },
+        'swin_L_224_22k': {
+            'embed_dim': 192,
+            'depths': [2, 2, 18, 2],
+            'num_heads': [6, 12, 24, 48],
+            'window_size': 7,
+        },
+        'swin_L_384_22k': {
+            'embed_dim': 192,
+            'depths': [2, 2, 18, 2],
+            'num_heads': [6, 12, 24, 48],
+            'window_size': 12,
+        },
+    }
+    # the accepted names are exactly the keys of the preset table above
+    assert modelname in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']
+
+    merged = {**presets[modelname], **kw}
+    return SwinTransformer(pretrain_img_size=pretrain_img_size, **merged)
+
+if __name__ == "__main__":
+    # smoke test: push two input resolutions through the dilated Swin-L backbone and print the output shapes
+    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)
+    for size in (1024, 384):
+        x = torch.rand(2, 3, size, size)
+        y = model.forward_raw(x)
+        print(size, [tuple(out.shape) for out in y])
diff --git a/src/models/XPose/models/UniPose/transformer_deformable.py b/src/models/XPose/models/UniPose/transformer_deformable.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea1b5cbc231be610f13c7041f85a6581623f1c5d
--- /dev/null
+++ b/src/models/XPose/models/UniPose/transformer_deformable.py
@@ -0,0 +1,595 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+
+import copy
+import math
+import torch
+from torch import nn, Tensor
+from torch.nn.init import xavier_uniform_, constant_, normal_
+from typing import Optional
+
+from ...util.misc import inverse_sigmoid
+from .ops.modules import MSDeformAttn
+from .utils import MLP, _get_activation_fn, gen_sineembed_for_position
+
+class DeformableTransformer(nn.Module):
+    """Deformable transformer: builds the encoder and the decoder and prepares the decoder queries."""
+    def __init__(self, d_model=256, nhead=8,
+                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
+                 activation="relu", return_intermediate_dec=False,
+                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4,
+                 two_stage=False, two_stage_num_proposals=300,
+                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):
+        super().__init__()
+
+        self.d_model = d_model
+        self.nhead = nhead
+        self.two_stage = two_stage
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.use_dab = use_dab
+
+        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, enc_n_points)
+        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)
+
+        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, dec_n_points)
+        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,
+                                                            use_dab=use_dab, d_model=d_model, high_dim_query_update=high_dim_query_update, no_sine_embed=no_sine_embed)
+
+        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+
+        if two_stage:
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
+            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
+        else:
+            if not self.use_dab:
+                self.reference_points = nn.Linear(d_model, 2)
+
+        self.high_dim_query_update = high_dim_query_update
+        if high_dim_query_update:
+            # NOTE(review): the message used to say "must be True", the opposite of this check -- confirm the intent
+            assert not self.use_dab, "high_dim_query_update requires use_dab to be False"
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Xavier-init every parameter with more than one dim, then redo the module-specific initializations."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if not self.two_stage and not self.use_dab:
+            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
+            constant_(self.reference_points.bias.data, 0.)
+        normal_(self.level_embed)
+
+    def get_proposal_pos_embed(self, proposals):
+        """Sine/cosine embedding of the sigmoid of ``proposals``, flattened from dim 2 on."""
+        num_pos_feats = 128
+        temperature = 10000
+        scale = 2 * math.pi
+
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
+        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
+        # N, L, 4
+        proposals = proposals.sigmoid() * scale
+        # N, L, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # N, L, 4, 64, 2
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+        return pos
+
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
+        """Make one proposal per memory position; returns (projected memory, inverse-sigmoid proposals)."""
+        N_, S_, C_ = memory.shape
+        proposals = []
+        _cur = 0
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+            proposals.append(proposal)
+            _cur += (H_ * W_)
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))
+        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+
+        output_memory = memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+        output_memory = self.enc_output_norm(self.enc_output(output_memory))
+        return output_memory, output_proposals
+
+    def get_valid_ratio(self, mask):
+        """Fraction of unmasked columns / rows, counted on the first row / first column; last dim is (w, h)."""
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+
+    def forward(self, srcs, masks, pos_embeds, query_embed=None):
+        """
+        Input:
+            - srcs: List([bs, c, h, w])
+            - masks: List([bs, h, w])
+            - pos_embeds: List with one positional embedding per level, flattened the same way as srcs
+            - query_embed: required unless two_stage (asserted below)
+        Output: (hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact); last two are None unless two_stage
+        """
+        assert self.two_stage or query_embed is not None
+
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shapes.append((h, w))
+
+            src = src.flatten(2).transpose(1, 2)                # bs, hw, c
+            mask = mask.flatten(1)                              # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)     # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
+
+        # prepare input for decoder
+        bs, _, c = memory.shape
+        if self.two_stage:
+            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
+
+            # hack implementation for two-stage Deformable DETR: reads class_embed / bbox_embed from the decoder
+            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
+            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
+
+            topk_proposals = torch.topk(enc_outputs_class[..., 0], self.two_stage_num_proposals, dim=1)[1]
+            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
+        elif self.use_dab:
+            reference_points = query_embed[..., self.d_model:].sigmoid()
+            tgt = query_embed[..., :self.d_model]
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+        else:
+            query_embed, tgt = torch.split(query_embed, c, dim=1)
+            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            reference_points = self.reference_points(query_embed).sigmoid()
+            # bs, num_queries, 2
+        init_reference_out = reference_points  # every branch above reports its initial reference points
+
+        # decoder
+        hs, inter_references = self.decoder(tgt, reference_points, memory,
+                                            spatial_shapes, level_start_index, valid_ratios,
+                                            query_pos=query_embed if not self.use_dab else None,
+                                            src_padding_mask=mask_flatten)
+
+        if self.two_stage:
+            return hs, init_reference_out, inter_references, enc_outputs_class, enc_outputs_coord_unact
+        return hs, init_reference_out, inter_references, None, None
+
+
+class DeformableTransformerEncoderLayer(nn.Module):
+    """Encoder layer: MSDeformAttn self-attention, an FFN and an optional channel-attention step."""
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+
+        # self attention
+        if use_deformable_box_attn:
+            if 'MSDeformableBoxAttention' not in globals():  # not among the imports at the top of this file
+                raise NotImplementedError("use_deformable_box_attn=True requires MSDeformableBoxAttention, which is not available in this module")
+            self.self_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        else:
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # channel attention
+        self.add_channel_attention = add_channel_attention
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        return self.norm2(src + self.dropout3(src2))  # residual connection, then norm2
+
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        """Self-attention, then the FFN, then the optional channel attention; returns the updated src."""
+        # self attention: only the query carries the positional embedding
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, key_padding_mask)
+        src = self.norm1(src + self.dropout1(src2))
+
+        # ffn
+        src = self.forward_ffn(src)
+
+        # channel attn
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+
+        return src
+
+
+class DeformableTransformerEncoder(nn.Module):
+    """Stack of cloned encoder layers followed by an optional final norm."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        # with no layers requested the prototype layer is simply not kept
+        self.layers = _get_clones(encoder_layer, num_layers) if num_layers > 0 else []
+        self.num_layers = num_layers
+        self.norm = norm
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """Build the reference points handed to every encoder layer.
+
+        Cell centers of each level are divided by the valid extent of that level, concatenated
+        over the levels and finally multiplied by the valid ratios of all levels.
+        """
+        per_level = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ys = torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device)
+            xs = torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)
+            ref_y, ref_x = torch.meshgrid(ys, xs)
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            per_level.append(torch.stack((ref_x, ref_y), -1))
+        stacked = torch.cat(per_level, 1)
+        return stacked[:, :, None] * valid_ratios[:, None]
+
+    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
+        """
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - padding_mask: [bs, sum(hi*wi)]
+        Intermediate:
+            - reference_points: [bs, sum(hi*wi), num_level, 2]
+        """
+        output = src  # bs, sum(hi*wi), 256
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        for layer in self.layers:
+            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 key_aware_type=None,
+                 decoder_sa_type='sa',  # was 'ca', which the assert below always rejected -- NOTE(review): confirm 'sa'
+                 module_seq=['sa', 'ca', 'ffn'],
+                 ):
+        """Build the cross-attention, self-attention and FFN sub-modules of one decoder layer."""
+        super().__init__()
+        self.module_seq = module_seq
+        assert sorted(module_seq) == ['ca', 'ffn', 'sa']
+
+        # cross attention
+        if use_deformable_box_attn:
+            if 'MSDeformableBoxAttention' not in globals():  # not among the imports at the top of this file
+                raise NotImplementedError("use_deformable_box_attn=True requires MSDeformableBoxAttention, which is not available in this module")
+            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        else:
+            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+
+        self.key_aware_type, self.key_aware_proj = key_aware_type, None
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        if decoder_sa_type == 'ca_content':  # this mode replaces the nn.MultiheadAttention built above
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+
+
+
+
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):
+        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward_sa(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            if self.decoder_sa_type == 'sa':
+                q = k = self.with_pos_embed(tgt, tgt_query_pos)
+                tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_label':
+                # import ipdb; ipdb.set_trace()
+                # q = self.with_pos_embed(tgt, tgt_query_pos)
+                bs = tgt.shape[1]
+                k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)
+                tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_content':
+                tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                            tgt_reference_points.transpose(0, 1).contiguous(),
+                            memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            else:
+                raise NotImplementedError("Unknown decoder_sa_type {}".format(self.decoder_sa_type))
+
+        return tgt
+
+    def forward_ca(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # cross attention
+        # import ipdb; ipdb.set_trace()
+        if self.key_aware_type is not None:
+
+            if self.key_aware_type == 'mean':
+                tgt = tgt + memory.mean(0, keepdim=True)
+            elif self.key_aware_type == 'proj_mean':
+                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
+            else:
+                raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        return tgt
+
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+
+        for funcname in self.module_seq:
+            # if os.environ.get('IPDB_DEBUG_SHILONG') == 'INFO':
+            #     import ipdb; ipdb.set_trace()
+            if funcname == 'ffn':
+                tgt = self.forward_ffn(tgt)
+            elif funcname == 'ca':
+                tgt = self.forward_ca(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            elif funcname == 'sa':
+                tgt = self.forward_sa(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            else:
+                raise ValueError('unknown funcname {}'.format(funcname))
+
+        return tgt
+
+
+
+class DeformableTransformerDecoder(nn.Module):
+    """
+    Stack of ``num_layers`` deep copies of ``decoder_layer``.
+
+    ``forward`` returns the stacked output of every layer together with the
+    stacked reference points. ``bbox_embed`` and ``class_embed`` start as
+    ``None``; iterative refinement only runs once ``bbox_embed`` has been set.
+    """
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+        # Only the "return every layer" mode is supported.
+        assert return_intermediate
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+        self.use_dab = use_dab
+        self.d_model = d_model
+        self.query_dim = query_dim
+        if use_dab:
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)
+
+    def forward(self, tgt, reference_points, src, src_spatial_shapes,
+                src_level_start_index, src_valid_ratios,
+                query_pos=None, src_padding_mask=None):
+        """
+        Run every decoder layer and collect the per-layer results.
+
+        ``reference_points`` must have a last dimension of 4 or 2. With
+        ``use_dab`` the positional query is derived from the reference points,
+        so ``query_pos`` must be ``None``.
+
+        Returns:
+            ``(torch.stack(layer_outputs), torch.stack(reference_point_history))``
+        """
+        hidden = tgt
+        if self.use_dab:
+            assert query_pos is None
+
+        layer_outputs = []
+        ref_history = [reference_points]
+        for idx, decoder_layer in enumerate(self.layers):
+            # Scale the reference points by the valid ratios.
+            if reference_points.shape[-1] == 4:
+                doubled_ratios = torch.cat([src_valid_ratios, src_valid_ratios], -1)
+                reference_points_input = reference_points[:, :, None] * doubled_ratios[:, None] # bs, nq, 4, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
+
+            if self.use_dab:
+                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # bs, nq, 256*2
+                raw_query_pos = self.ref_point_head(query_sine_embed) # bs, nq, 256
+                # The first layer uses the raw positional query without scaling.
+                if idx != 0:
+                    pos_scale = self.query_scale(hidden)
+                else:
+                    pos_scale = 1
+                query_pos = pos_scale * raw_query_pos
+
+            # NOTE(review): if the layers are DeformableTransformerDecoderLayer
+            # instances, these positional arguments do not line up with the
+            # parameter order of its forward() -- confirm before relying on this path.
+            hidden = decoder_layer(hidden, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                box_holder = self.bbox_embed(hidden)
+                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)
+                refined = box_holder[..., :self.query_dim].sigmoid()
+                # The next layer sees a detached copy of the refined points.
+                reference_points = refined.detach()
+                if idx != self.num_layers - 1:
+                    ref_history.append(refined)
+
+            layer_outputs.append(hidden)
+
+        return torch.stack(layer_outputs), torch.stack(ref_history)
+
+
+def _get_clones(module, N):
+    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
+    clones = nn.ModuleList()
+    for _ in range(N):
+        clones.append(copy.deepcopy(module))
+    return clones
+
+
+def build_deforamble_transformer(args):
+    """
+    Build a ``DeformableTransformer`` from the hyper-parameters stored on ``args``.
+
+    ``activation`` is always ``"relu"`` and ``return_intermediate_dec`` is
+    always ``True``; every other setting is read from an attribute of ``args``.
+    """
+    transformer_kwargs = dict(
+        d_model=args.hidden_dim,
+        nhead=args.nheads,
+        num_encoder_layers=args.enc_layers,
+        num_decoder_layers=args.dec_layers,
+        dim_feedforward=args.dim_feedforward,
+        dropout=args.dropout,
+        activation="relu",
+        return_intermediate_dec=True,
+        num_feature_levels=args.ddetr_num_feature_levels,
+        dec_n_points=args.ddetr_dec_n_points,
+        enc_n_points=args.ddetr_enc_n_points,
+        two_stage=args.ddetr_two_stage,
+        two_stage_num_proposals=args.num_queries,
+        use_dab=args.ddetr_use_dab,
+        high_dim_query_update=args.ddetr_high_dim_query_update,
+        no_sine_embed=args.ddetr_no_sine_embed,
+    )
+    return DeformableTransformer(**transformer_kwargs)
diff --git a/src/models/XPose/models/UniPose/transformer_vanilla.py b/src/models/XPose/models/UniPose/transformer_vanilla.py
new file mode 100644
index 0000000000000000000000000000000000000000..450885a97323f6d68cfbed845a2a91c32e79b4ca
--- /dev/null
+++ b/src/models/XPose/models/UniPose/transformer_vanilla.py
@@ -0,0 +1,102 @@
+# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+DETR Transformer class.
+
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+import torch
+from torch import Tensor, nn
+from typing import List, Optional
+
+from .utils import  _get_activation_fn, _get_clones
+
+
+class TextTransformer(nn.Module):
+    """
+    Stack of ``num_layers`` identically configured ``TransformerEncoderLayer``
+    modules applied to text features. No final normalisation is used
+    (``self.norm`` is ``None``).
+    """
+    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
+        super().__init__()
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.norm = None
+
+        # One prototype layer is built and then deep-copied ``num_layers`` times.
+        prototype_layer = TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nheads,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+        )
+        self.layers = _get_clones(prototype_layer, num_layers)
+
+    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):
+        """
+        Encode the text features with every layer in turn.
+
+        Args:
+            text_attention_mask: bs, num_token
+            memory_text: bs, num_token, d_model
+
+        Returns:
+            output: bs, num_token, d_model
+        """
+        # Dims 0 and 1 are swapped for the layers and swapped back on return.
+        hidden = memory_text.transpose(0, 1)
+
+        for encoder_layer in self.layers:
+            hidden = encoder_layer(hidden, src_key_padding_mask=text_attention_mask)
+
+        if self.norm is not None:
+            hidden = self.norm(hidden)
+
+        return hidden.transpose(0, 1)
+
+
+
+
+class TransformerEncoderLayer(nn.Module):
+    """
+    Post-norm transformer encoder layer: multi-head self-attention followed by
+    a two-layer feed-forward network, each with dropout, a residual connection
+    and LayerNorm.
+
+    Parameters:
+        d_model: feature width.
+        nhead: number of attention heads.
+        dim_feedforward: hidden width of the feed-forward network.
+        dropout: dropout probability used by the attention and all dropout layers.
+        activation: name forwarded to ``_get_activation_fn``.
+        normalize_before: stored on the instance; ``forward`` does not read it.
+    """
+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        self.nhead = nhead
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
+        return tensor if pos is None else tensor + pos
+
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        """
+        Apply self-attention and the feed-forward network to ``src``.
+
+        Args:
+            src: input features; ``src.shape[1]`` is compared with the first
+                dimension of a 3-D ``src_mask``.
+            src_mask: optional attention mask. A 3-D mask whose first dimension
+                equals ``src.shape[1]`` is tiled ``nhead`` times along dim 0.
+            src_key_padding_mask: accepted for interface compatibility but not
+                passed to the attention module.
+            pos: optional positional embedding added to the queries and keys.
+
+        Returns:
+            Tensor with the same shape as ``src``.
+        """
+        # repeat attn mask
+        # ``src_mask`` is optional, so it is only inspected when it was given.
+        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
+            # bs, num_q, num_k
+            # NOTE(review): ``repeat`` tiles the whole batch per head; confirm this
+            # matches the (batch * heads) ordering nn.MultiheadAttention expects
+            # when masks differ between samples.
+            src_mask = src_mask.repeat(self.nhead, 1, 1)
+
+        q = k = self.with_pos_embed(src, pos)
+
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]
+
+        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+
diff --git a/src/models/XPose/models/UniPose/unipose.py b/src/models/XPose/models/UniPose/unipose.py
new file mode 100644
index 0000000000000000000000000000000000000000..b35af239f7d56c491820605b2ffdbd558357343c
--- /dev/null
+++ b/src/models/XPose/models/UniPose/unipose.py
@@ -0,0 +1,621 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# ------------------------------------------------------------------------
+import os
+import copy
+import torch
+import torch.nn.functional as F
+from torch import nn
+from typing import List
+
+from ...util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz
+from ...util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid
+
+from .utils import MLP
+from .backbone import build_backbone
+from ..registry import MODULE_BUILD_FUNCS
+from .mask_generate import prepare_for_mask, post_process
+from .deformable_transformer import build_deformable_transformer
+
+
+class UniPose(nn.Module):
+    """ This is the Cross-Attention Detector module that performs object detection """
+
+    def __init__(self, backbone, transformer, num_classes, num_queries,
+                 aux_loss=False, iter_update=False,
+                 query_dim=2,
+                 random_refpoints_xy=False,
+                 fix_refpoints_hw=-1,
+                 num_feature_levels=1,
+                 nheads=8,
+                 # two stage
+                 two_stage_type='no',  # ['no', 'standard']
+                 two_stage_add_query_num=0,
+                 dec_pred_class_embed_share=True,
+                 dec_pred_bbox_embed_share=True,
+                 two_stage_class_embed_share=True,
+                 two_stage_bbox_embed_share=True,
+                 decoder_sa_type='sa',
+                 num_patterns=0,
+                 dn_number=100,
+                 dn_box_noise_scale=0.4,
+                 dn_label_noise_ratio=0.5,
+                 dn_labelbook_size=100,
+                 use_label_enc=True,
+
+                 text_encoder_type='bert-base-uncased',
+
+                 binary_query_selection=False,
+                 use_cdn=True,
+                 sub_sentence_present=True,
+                 num_body_points=68,
+                 num_box_decoder_layers=2,
+                 ):
+        """ Initializes the model.
+        Parameters:
+            backbone: torch module of the backbone to be used. See backbone.py
+                      Its ``num_channels`` list is read to size the input projections.
+            transformer: torch module of the transformer architecture. See transformer.py
+                         Its ``d_model``, ``num_decoder_layers`` and ``decoder`` attributes are used here.
+            num_classes: number of object classes
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+            iter_update: must be truthy (asserted below).
+            query_dim: must be 4 (asserted below); note that the default of 2 does not satisfy this.
+            use_label_enc: must be truthy; otherwise NotImplementedError is raised.
+            two_stage_type: 'no' or 'standard'; must be 'no' unless num_feature_levels > 1.
+            decoder_sa_type: one of 'sa', 'ca_label', 'ca_content'.
+            num_body_points: stored on the model and on ``transformer.decoder``.
+            num_box_decoder_layers: subtracted from ``transformer.num_decoder_layers`` to size
+                                    the pose prediction head lists.
+
+            fix_refpoints_hw: -1(default): learn w and h for each box separately
+                                >0 : given fixed number
+                                -2 : learn a shared w and h
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.transformer = transformer
+        self.num_classes = num_classes
+        self.hidden_dim = hidden_dim = transformer.d_model
+        self.num_feature_levels = num_feature_levels
+        self.nheads = nheads
+        self.use_label_enc = use_label_enc
+        if use_label_enc:
+            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)
+        else:
+            raise NotImplementedError
+            # Unreachable: the raise above always fires first.
+            self.label_enc = None
+        self.max_text_len = 256
+        self.binary_query_selection = binary_query_selection
+        self.sub_sentence_present = sub_sentence_present
+
+        # setting query dim
+        self.query_dim = query_dim
+        assert query_dim == 4
+        self.random_refpoints_xy = random_refpoints_xy
+        self.fix_refpoints_hw = fix_refpoints_hw
+
+        # for dn training
+        self.num_patterns = num_patterns
+        self.dn_number = dn_number
+        self.dn_box_noise_scale = dn_box_noise_scale
+        self.dn_label_noise_ratio = dn_label_noise_ratio
+        self.dn_labelbook_size = dn_labelbook_size
+        self.use_cdn = use_cdn
+
+
+        # Two separate 3-layer MLPs map 512-d inputs to hidden_dim.
+        self.projection = MLP(512, hidden_dim, hidden_dim, 3)
+
+        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)
+
+
+        # NOTE(review): ``device`` is not used again in this method; it looks like
+        # a leftover from the commented-out CLIP loading below -- confirm.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        # model, _ = clip.load("ViT-B/32", device=device)
+        # self.clip_model = model
+        # visual_parameters = list(self.clip_model.visual.parameters())
+        # #
+        # for param in visual_parameters:
+        #     param.requires_grad = False
+
+        self.pos_proj = nn.Linear(hidden_dim, 768)
+        self.padding = nn.Embedding(1, 768)
+
+        # prepare input projection layers
+        if num_feature_levels > 1:
+            num_backbone_outs = len(backbone.num_channels)
+            input_proj_list = []
+            # One 1x1 conv + GroupNorm per backbone output.
+            for _ in range(num_backbone_outs):
+                in_channels = backbone.num_channels[_]
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+            # Extra levels use a stride-2 3x3 conv. The first one starts from
+            # ``in_channels`` as left by the loop above; later ones take hidden_dim.
+            for _ in range(num_feature_levels - num_backbone_outs):
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+                in_channels = hidden_dim
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            assert two_stage_type == 'no', "two_stage_type should be no if num_feature_levels=1 !!!"
+            # Single level: project only the last backbone output.
+            self.input_proj = nn.ModuleList([
+                nn.Sequential(
+                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                )])
+
+        self.backbone = backbone
+        self.aux_loss = aux_loss
+        self.box_pred_damping = box_pred_damping = None
+
+        self.iter_update = iter_update
+        assert iter_update, "Why not iter_update?"
+
+        # prepare pred layers
+        self.dec_pred_class_embed_share = dec_pred_class_embed_share
+        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
+        # prepare class & box embed
+        _class_embed = ContrastiveAssign()
+
+
+
+        # The last layer of the box head is zero-initialised.
+        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
+        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)
+
+        # Only ``_pose_embed`` gets the zero initialisation; ``_pose_hw_embed``
+        # keeps the initialisation it receives from MLP.
+        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)
+
+        # "Shared" lists repeat the same module object; otherwise each decoder
+        # layer gets its own deep copy.
+        if dec_pred_bbox_embed_share:
+            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]
+        if dec_pred_class_embed_share:
+            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]
+
+
+        # Sharing of the pose head is controlled by the bbox sharing flag.
+        if dec_pred_bbox_embed_share:
+
+            pose_embed_layerlist = [_pose_embed for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+        else:
+            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+
+        # The hw head is always shared and has one entry fewer than the pose head list.
+        pose_hw_embed_layerlist = [_pose_hw_embed for i in
+                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]
+
+
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
+        self.class_embed = nn.ModuleList(class_embed_layerlist)
+        self.num_body_points = num_body_points
+        self.pose_embed = nn.ModuleList(pose_embed_layerlist)
+        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)
+
+        # The decoder receives references to the same head lists held by this model.
+        self.transformer.decoder.bbox_embed = self.bbox_embed
+        self.transformer.decoder.class_embed = self.class_embed
+
+        self.transformer.decoder.pose_embed = self.pose_embed
+        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed
+
+        self.transformer.decoder.num_body_points = num_body_points
+
+
+        # two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_add_query_num = two_stage_add_query_num
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type != 'no':
+            # Encoder-output heads either reuse the decoder heads or get copies.
+            if two_stage_bbox_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_bbox_embed = _bbox_embed
+            else:
+                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)
+
+            if two_stage_class_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_class_embed = _class_embed
+            else:
+                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)
+
+            self.refpoint_embed = None
+            if self.two_stage_add_query_num > 0:
+                self.init_ref_points(two_stage_add_query_num)
+
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        # self.replace_sa_with_double_ca = replace_sa_with_double_ca
+        # Every decoder layer gets a ``label_embedding`` attribute: one shared
+        # embedding for 'ca_label', otherwise None.
+        if decoder_sa_type == 'ca_label':
+            self.label_embedding = nn.Embedding(num_classes, hidden_dim)
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = self.label_embedding
+        else:
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = None
+            self.label_embedding = None
+
+        self._reset_parameters()
+
+    def open_set_transfer_init(self):
+        """
+        Freeze every parameter whose name contains none of the substrings below.
+
+        Parameters whose name contains at least one of them are left untouched;
+        all others get ``requires_grad_(False)``.
+        """
+        kept_substrings = (
+            'fusion_layers',
+            'ca_text',
+            'catext_norm',
+            'catext_dropout',
+            "text_layers",
+            'bert',
+            'bbox_embed',
+            'label_enc.weight',
+            'feat_map',
+            'enc_output',
+        )
+        for name, param in self.named_parameters():
+            if not any(fragment in name for fragment in kept_substrings):
+                param.requires_grad_(False)
+
+    def _reset_parameters(self):
+        """Re-initialise the first module of every input projection: Xavier-uniform weight, zero bias."""
+        for projection in self.input_proj:
+            first_module = projection[0]
+            nn.init.xavier_uniform_(first_module.weight, gain=1)
+            nn.init.constant_(first_module.bias, 0)
+
+    def init_ref_points(self, use_num_queries):
+        """
+        Create ``self.refpoint_embed`` with ``use_num_queries`` rows and
+        initialise it according to ``random_refpoints_xy`` and ``fix_refpoints_hw``.
+
+        ``fix_refpoints_hw``:
+            >0 : the columns from index 2 on are set to inverse_sigmoid of that value
+            -1 : nothing further is done
+            -2 : the embedding is rebuilt with 2 columns and a 1x1 ``hw_embed`` is added
+
+        Raises:
+            NotImplementedError: for any other ``fix_refpoints_hw`` value.
+        """
+        def _randomise_xy(embedding):
+            # Draw the first two columns uniformly in (0, 1) and store them in logit space.
+            xy = embedding.weight.data[:, :2]
+            xy.uniform_(0, 1)
+            embedding.weight.data[:, :2] = inverse_sigmoid(xy)
+            # NOTE(review): the flag is set on a slice of ``.data`` rather than on
+            # the parameter itself -- confirm whether freezing was intended.
+            embedding.weight.data[:, :2].requires_grad = False
+
+        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
+
+        if self.random_refpoints_xy:
+            _randomise_xy(self.refpoint_embed)
+
+        if self.fix_refpoints_hw > 0:
+            print("fix_refpoints_hw: {}".format(self.fix_refpoints_hw))
+            assert self.random_refpoints_xy
+            hw_columns = self.refpoint_embed.weight.data[:, 2:]
+            hw_columns[...] = self.fix_refpoints_hw
+            self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(hw_columns)
+            # NOTE(review): same remark as above, the flag lands on a ``.data`` slice.
+            self.refpoint_embed.weight.data[:, 2:].requires_grad = False
+            return
+
+        hw_mode = int(self.fix_refpoints_hw)
+        if hw_mode == -1:
+            return
+        if hw_mode == -2:
+            print('learn a shared h and w')
+            assert self.random_refpoints_xy
+            self.refpoint_embed = nn.Embedding(use_num_queries, 2)
+            _randomise_xy(self.refpoint_embed)
+            self.hw_embed = nn.Embedding(1, 1)
+            return
+
+        raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))
+
+    def forward(self, samples: NestedTensor, targets: List = None, **kw):
+        """ The forward expects a NestedTensor, which consists of:
+               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
+               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
+
+            It returns a dict with the following elements:
+               - "pred_logits": the classification logits (including no-object) for all queries.
+                                Shape= [batch_size x num_queries x num_classes]
+               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
+                               (center_x, center_y, width, height). These values are normalized in [0, 1],
+                               relative to the size of each individual image (disregarding possible padding).
+                               See PostProcess for information on how to retrieve the unnormalized bounding box.
+               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
+                                dictionnaries containing the two above keys for each decoder layer.
+        """
+
+        captions = [t['instance_text_prompt'] for t in targets]
+        bs=len(captions)
+        tensor_list = [tgt["object_embeddings_text"] for tgt in targets]
+        max_size = 350
+        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]
+        object_embeddings_text = torch.stack(padded_tensors)
+
+        kpts_embeddings_text = torch.stack([tgt["kpts_embeddings_text"] for tgt in targets])[:, :self.num_body_points]
+        encoded_text=self.projection(object_embeddings_text) # bs, 81, 101, 256
+        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text) # bs, 81, 101, 256
+
+
+        kpt_vis = torch.stack([tgt["kpt_vis_text"] for tgt in targets])[:, :self.num_body_points]
+        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)
+
+
+        num_classes = encoded_text.shape[1] # bs, 81, 101, 256
+        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)
+        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0
+        for i in range(bs):
+            text_token_mask[i,:len(captions[i])]=True
+
+        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)
+
+        for i in range(bs):
+            position_ids[i,:len(captions[i])]= 1
+
+
+        text_dict = {
+            'encoded_text': encoded_text, # bs, 195, d_model
+            'text_token_mask': text_token_mask, # bs, 195
+            'position_ids': position_ids, # bs, 195
+            'text_self_attention_masks': text_self_attention_masks # bs, 195,195
+        }
+
+
+        # import ipdb; ipdb.set_trace()
+
+        if isinstance(samples, (list, torch.Tensor)):
+            samples = nested_tensor_from_tensor_list(samples)
+        features, poss = self.backbone(samples)
+        if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            import ipdb;
+            ipdb.set_trace()
+
+
+        srcs = []
+        masks = []
+        for l, feat in enumerate(features):
+            src, mask = feat.decompose()
+            srcs.append(self.input_proj[l](src))
+            masks.append(mask)
+            assert mask is not None
+
+        if self.num_feature_levels > len(srcs):
+            _len_srcs = len(srcs)
+            for l in range(_len_srcs, self.num_feature_levels):
+                if l == _len_srcs:
+                    src = self.input_proj[l](features[-1].tensors)
+                else:
+                    src = self.input_proj[l](srcs[-1])
+                m = samples.mask
+                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
+                srcs.append(src)
+                masks.append(mask)
+                poss.append(pos_l)
+
+        if self.label_enc is not None:
+            label_enc = self.label_enc
+        else:
+            raise NotImplementedError
+            label_enc = encoded_text
+        if self.dn_number > 0 or targets is not None:
+            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \
+                prepare_for_mask(kpt_mask=kpt_mask)
+        else:
+            assert targets is None
+            input_query_bbox = input_query_label = attn_mask = attn_mask2 = dn_meta = None
+
+
+        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,
+                                                                                 input_query_label, attn_mask, attn_mask2,
+                                                                                 text_dict, dn_meta,targets,kpt_embeddings_specific)
+
+        # In case num object=0
+        if self.label_enc is not None:
+            hs[0] += self.label_enc.weight[0, 0] * 0.0
+
+        hs[0] += self.pos_proj.weight[0, 0] * 0.0
+        hs[0] += self.pos_proj.bias[0] * 0.0
+        hs[0] += self.padding.weight[0, 0] * 0.0
+
+        num_group = 50
+        effective_dn_number = dn_meta['pad_size'] if self.training else 0
+        outputs_coord_list = []
+        outputs_class = []
+
+
+        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(
+                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):
+
+
+            if dec_lid < self.num_box_decoder_layers:
+                layer_delta_unsig = layer_bbox_embed(layer_hs)
+                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
+                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
+                layer_cls = layer_cls_embed(layer_hs, text_dict)
+                outputs_coord_list.append(layer_outputs_unsig)
+                outputs_class.append(layer_cls)
+
+
+            else:
+
+                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]
+                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]
+                bs = layer_ref_sig.shape[0]
+                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]
+                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,
+                                                     0::(self.num_body_points + 1), :]
+                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)
+                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)
+                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)
+                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()
+                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)
+                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()
+                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)
+                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)
+                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)
+                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)
+                outputs_class.append(layer_cls)
+                outputs_coord_list.append(layer_outputs_unsig)
+
+        # update keypoints
+        outputs_keypoints_list = []
+        outputs_keypoints_hw = []
+        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):
+            if dec_lid < self.num_box_decoder_layers:
+                assert isinstance(layer_hs, torch.Tensor)
+                bs = layer_hs.shape[0]
+                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))
+                outputs_keypoints_list.append(layer_res)
+            else:
+                bs = layer_ref_sig.shape[0]
+                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                 device=layer_hs.device))
+                delta_xy_unsig = self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)
+                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                           device=layer_hs.device))
+                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])
+                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,
+                                               device=layer_outputs_unsig_keypoints.device)
+                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)
+                xyv = xyv.sigmoid()
+                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)
+                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)
+                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)
+                outputs_keypoints_list.append(layer_res)
+                outputs_keypoints_hw.append(layer_hw)
+
+
+        if self.dn_number > 0 and dn_meta is not None:
+            outputs_class, outputs_coord_list = \
+                post_process(outputs_class, outputs_coord_list,
+                                dn_meta, self.aux_loss, self._set_aux_loss)
+        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],
+               'pred_keypoints': outputs_keypoints_list[-1]}
+
+        return out
+
+
+@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')
+def build_unipose(args):
+
+    num_classes = args.num_classes
+    device = torch.device(args.device)
+
+    backbone = build_backbone(args)
+
+    transformer = build_deformable_transformer(args)
+
+    try:
+        match_unstable_error = args.match_unstable_error
+        dn_labelbook_size = args.dn_labelbook_size
+    except:
+        match_unstable_error = True
+        dn_labelbook_size = num_classes
+
+    try:
+        dec_pred_class_embed_share = args.dec_pred_class_embed_share
+    except:
+        dec_pred_class_embed_share = True
+    try:
+        dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share
+    except:
+        dec_pred_bbox_embed_share = True
+
+    binary_query_selection = False
+    try:
+        binary_query_selection = args.binary_query_selection
+    except:
+        binary_query_selection = False
+
+    use_cdn = True
+    try:
+        use_cdn = args.use_cdn
+    except:
+        use_cdn = True
+
+    sub_sentence_present = True
+    try:
+        sub_sentence_present = args.sub_sentence_present
+    except:
+        sub_sentence_present = True
+    # print('********* sub_sentence_present', sub_sentence_present)
+
+    model = UniPose(
+        backbone,
+        transformer,
+        num_classes=num_classes,
+        num_queries=args.num_queries,
+        aux_loss=True,
+        iter_update=True,
+        query_dim=4,
+        random_refpoints_xy=args.random_refpoints_xy,
+        fix_refpoints_hw=args.fix_refpoints_hw,
+        num_feature_levels=args.num_feature_levels,
+        nheads=args.nheads,
+        dec_pred_class_embed_share=dec_pred_class_embed_share,
+        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,
+        # two stage
+        two_stage_type=args.two_stage_type,
+        # box_share
+        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
+        two_stage_class_embed_share=args.two_stage_class_embed_share,
+        decoder_sa_type=args.decoder_sa_type,
+        num_patterns=args.num_patterns,
+        dn_number=args.dn_number if args.use_dn else 0,
+        dn_box_noise_scale=args.dn_box_noise_scale,
+        dn_label_noise_ratio=args.dn_label_noise_ratio,
+        dn_labelbook_size=dn_labelbook_size,
+        use_label_enc=args.use_label_enc,
+
+        text_encoder_type=args.text_encoder_type,
+
+        binary_query_selection=binary_query_selection,
+        use_cdn=use_cdn,
+        sub_sentence_present=sub_sentence_present
+    )
+
+    return model
+
+
+class ContrastiveAssign(nn.Module):
+    def __init__(self, project=False, cal_bias=None, max_text_len=256):
+        """
+        :param x: query
+        :param y: text embed
+        :param proj:
+        :return:
+        """
+        super().__init__()
+        self.project = project
+        self.cal_bias = cal_bias
+        self.max_text_len = max_text_len
+
+    def forward(self, x, text_dict):
+        """_summary_
+
+        Args:
+            x (_type_): _description_
+            text_dict (_type_): _description_
+            {
+                'encoded_text': encoded_text, # bs, 195, d_model
+                'text_token_mask': text_token_mask, # bs, 195
+                        # True for used tokens. False for padding tokens
+            }
+        Returns:
+            _type_: _description_
+        """
+        assert isinstance(text_dict, dict)
+
+        y = text_dict['encoded_text']
+
+
+        max_text_len = y.shape[1]
+
+
+
+        text_token_mask = text_dict['text_token_mask']
+
+        if self.cal_bias is not None:
+            raise NotImplementedError
+            return x @ y.transpose(-1, -2) + self.cal_bias.weight.repeat(x.shape[0], x.shape[1], 1)
+        res = x @ y.transpose(-1, -2)
+        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
+
+        # padding to max_text_len
+        new_res = torch.full((*res.shape[:-1], max_text_len), float('-inf'), device=res.device)
+        new_res[..., :res.shape[-1]] = res
+
+        return new_res
diff --git a/src/models/XPose/models/UniPose/utils.py b/src/models/XPose/models/UniPose/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..350d8316ae353434b6baca449d0ecd1d4dd9c813
--- /dev/null
+++ b/src/models/XPose/models/UniPose/utils.py
@@ -0,0 +1,348 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+
+import copy
+import torch
+import random
+from torch import nn, Tensor
+import os
+import numpy as np
+import math
+import torch.nn.functional as F
+from torch import nn
+
+
+def _get_clones(module, N, layer_share=False):
+    # import ipdb; ipdb.set_trace()
+    if layer_share:
+        return nn.ModuleList([module for i in range(N)])
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def get_sine_pos_embed(
+        pos_tensor: torch.Tensor,
+        num_pos_feats: int = 128,
+        temperature: int = 10000,
+        exchange_xy: bool = True,
+):
+    """generate sine position embedding from a position tensor
+    Args:
+        pos_tensor (torch.Tensor): shape: [..., n].
+        num_pos_feats (int): projected shape for each float in the tensor.
+        temperature (int): temperature in the sine/cosine function.
+        exchange_xy (bool, optional): exchange pos x and pos y. \
+            For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True.
+    Returns:
+        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+
+    def sine_func(x: torch.Tensor):
+        sin_x = x * scale / dim_t
+        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2)
+        return sin_x
+
+    pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    pos_res = torch.cat(pos_res, dim=-1)
+    return pos_res
+
+
+def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):
+    """
+    Input:
+        - memory: bs, \sum{hw}, d_model
+        - memory_padding_mask: bs, \sum{hw}
+        - spatial_shapes: nlevel, 2
+        - learnedwh: 2
+    Output:
+        - output_memory: bs, \sum{hw}, d_model
+        - output_proposals: bs, \sum{hw}, 4
+    """
+    N_, S_, C_ = memory.shape
+    base_scale = 4.0
+    proposals = []
+    _cur = 0
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+        # import ipdb; ipdb.set_trace()
+
+        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2
+
+        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+
+        if learnedwh is not None:
+            # import ipdb; ipdb.set_trace()
+            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
+        else:
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+
+        # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1)
+        # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+        # wh = torch.ones_like(grid) / scale
+        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+        proposals.append(proposal)
+        _cur += (H_ * W_)
+    # import ipdb; ipdb.set_trace()
+    output_proposals = torch.cat(proposals, 1)
+    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
+    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+
+    output_memory = memory
+    output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+
+    # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+    # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf'))
+
+    return output_memory, output_proposals
+
+
+class RandomBoxPerturber():
+    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
+        self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])
+
+    def __call__(self, refanchors: Tensor) -> Tensor:
+        nq, bs, query_dim = refanchors.shape
+        device = refanchors.device
+
+        noise_raw = torch.rand_like(refanchors)
+        noise_scale = self.noise_scale.to(device)[:query_dim]
+
+        new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale)
+        return new_refanchors.clamp_(0, 1)
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = -1 (no weighting).
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    if no_reduction:
+        return loss
+
+    return loss.mean(1).sum() / num_boxes
+
+
+class MLP(nn.Module):
+    """ Very simple multi-layer perceptron (also called FFN)"""
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+def _get_activation_fn(activation, d_model=256, batch_dim=0):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    if activation == "prelu":
+        return nn.PReLU()
+    if activation == "selu":
+        return F.selu
+
+    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+
+
+def gen_sineembed_for_position(pos_tensor):
+    # n_query, bs, _ = pos_tensor.size()
+    # sineembed_tensor = torch.zeros(n_query, bs, 256)
+    scale = 2 * math.pi
+    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = 10000 ** (2 * (dim_t // 2) / 128)
+    x_embed = pos_tensor[:, :, 0] * scale
+    y_embed = pos_tensor[:, :, 1] * scale
+    pos_x = x_embed[:, :, None] / dim_t
+    pos_y = y_embed[:, :, None] / dim_t
+    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+    if pos_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=2)
+    elif pos_tensor.size(-1) == 4:
+        w_embed = pos_tensor[:, :, 2] * scale
+        pos_w = w_embed[:, :, None] / dim_t
+        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
+
+        h_embed = pos_tensor[:, :, 3] * scale
+        pos_h = h_embed[:, :, None] / dim_t
+        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
+
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
+    else:
+        raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
+    return pos
+
+
+def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
+    sigmas = kpt_preds.new_tensor(sigmas)
+    variances = (sigmas * 2) ** 2
+
+    assert kpt_preds.size(0) == kpt_gts.size(0)
+    kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)
+    kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)
+
+    squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \
+                       (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2
+    # import pdb
+    # pdb.set_trace()
+    # assert (kpt_valids.sum(-1) > 0).all()
+    squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2)
+    squared_distance1 = torch.exp(-squared_distance0)
+    squared_distance1 = squared_distance1 * kpt_valids
+    oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)
+
+    return oks
+
+
+def oks_loss(pred,
+             target,
+             valid=None,
+             area=None,
+             linear=False,
+             sigmas=None,
+             eps=1e-6):
+    """Oks loss.
+    Computing the oks loss between a set of predicted poses and target poses.
+    The loss is calculated as negative log of oks.
+    Args:
+        pred (torch.Tensor): Predicted poses of format (x1, y1, x2, y2, ...),
+            shape (n, 2K).
+        target (torch.Tensor): Corresponding gt poses, shape (n, 2K).
+        linear (bool, optional): If True, use linear scale of loss instead of
+            log scale. Default: False.
+        eps (float): Eps to avoid log(0).
+    Return:
+        torch.Tensor: Loss tensor.
+    """
+    oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
+    if linear:
+        loss = 1 - oks
+    else:
+        loss = -oks.log()
+    return loss
+
+
+class OKSLoss(nn.Module):
+    """IoULoss.
+    Computing the oks loss between a set of predicted poses and target poses.
+    Args:
+        linear (bool): If True, use linear scale of loss instead of log scale.
+            Default: False.
+        eps (float): Eps to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 linear=False,
+                 num_keypoints=17,
+                 eps=1e-6,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(OKSLoss, self).__init__()
+        self.linear = linear
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        if num_keypoints == 68:
+            self.sigmas = np.array([
+                .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
+                1.07, .87, .87, .89, .89, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+            ], dtype=np.float32) / 10.0
+        else:
+            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
+
+    def forward(self,
+                pred,
+                target,
+                valid,
+                area,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            valid (torch.Tensor): The visible flag of the target pose.
+            area (torch.Tensor): The area of the target pose.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # iou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * oks_loss(
+            pred,
+            target,
+            valid=valid,
+            area=area,
+            linear=self.linear,
+            sigmas=self.sigmas,
+            eps=self.eps)
+        return loss
diff --git a/src/models/XPose/models/__init__.py b/src/models/XPose/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab6c39b38edc376198b02e3d63c5bfc538703530
--- /dev/null
+++ b/src/models/XPose/models/__init__.py
@@ -0,0 +1,16 @@
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .UniPose.unipose import build_unipose
+
+def build_model(args):
+    # we use register to maintain models from catdet6 on.
+    from .registry import MODULE_BUILD_FUNCS
+
+    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
+    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
+    model = build_func(args)
+    return model
diff --git a/src/models/XPose/models/registry.py b/src/models/XPose/models/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..f438c6e3918a84cc2004b5da9c1d79d18cfb3118
--- /dev/null
+++ b/src/models/XPose/models/registry.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+# @Author: Yihao Chen
+# @Date:   2021-08-16 16:03:17
+# @Last Modified by:   Shilong Liu
+# @Last Modified time: 2022-01-23 15:26
+# modified from mmcv
+
+import inspect
+from functools import partial
+
+
+class Registry(object):
+
+    def __init__(self, name):
+        self._name = name
+        self._module_dict = dict()
+
+    def __repr__(self):
+        format_str = self.__class__.__name__ + '(name={}, items={})'.format(
+            self._name, list(self._module_dict.keys()))
+        return format_str
+
+    def __len__(self):
+        return len(self._module_dict)
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def module_dict(self):
+        return self._module_dict
+
+    def get(self, key):
+        return self._module_dict.get(key, None)
+
+    def registe_with_name(self, module_name=None, force=False):
+        return partial(self.register, module_name=module_name, force=force)
+
+    def register(self, module_build_function, module_name=None, force=False):
+        """Register a module build function.
+        Args:
+            module (:obj:`nn.Module`): Module to be registered.
+        """
+        if not inspect.isfunction(module_build_function):
+            raise TypeError('module_build_function must be a function, but got {}'.format(
+                type(module_build_function)))
+        if module_name is None:
+            module_name = module_build_function.__name__
+        if not force and module_name in self._module_dict:
+            raise KeyError('{} is already registered in {}'.format(
+                module_name, self.name))
+        self._module_dict[module_name] = module_build_function
+
+        return module_build_function
+
+# Module-level registry instance that model build functions are registered in.
+MODULE_BUILD_FUNCS = Registry('model build functions')
+
diff --git a/src/models/XPose/predefined_keypoints.py b/src/models/XPose/predefined_keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32c5adb346783095b6dd192090cde30488f0194
--- /dev/null
+++ b/src/models/XPose/predefined_keypoints.py
@@ -0,0 +1,56 @@
+# Each dict pairs "keypoints" (names) with "skeleton" (index pairs, may be empty); the index base is not uniform (e.g. `person` is 1-based, `car` is 0-based).
+person = {"keypoints":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],"skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}
+# 68 facial landmark names; no skeleton.
+face = {"keypoints": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], "skeleton": []}
+# NOTE(review): the fifth hand entry uses a typographic apostrophe, unlike its neighbours; kept verbatim — confirm before normalizing.
+hand = {"keypoints":['wrist', 'thumb root', "thumb's third knuckle", "thumb's second knuckle", 'thumb’s first knuckle', "forefinger's root", "forefinger's third knuckle", "forefinger's second knuckle", "forefinger's first knuckle", "middle finger's root", "middle finger's third knuckle", "middle finger's second knuckle", "middle finger's first knuckle", "ring finger's root", "ring finger's third knuckle", "ring finger's second knuckle", "ring finger's first knuckle", "pinky finger's root", "pinky finger's third knuckle", "pinky finger's second knuckle", "pinky finger's first knuckle"],"skeleton": []}
+# Skeleton below is 0-based (indices 0..22 for 23 names). NOTE(review): the 'ankle left ' entry carries a trailing space; kept verbatim.
+animal_in_AnimalKindom = {"keypoints":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],"skeleton": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}
+
+animal_in_AP10K = {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+# `animal` repeats the keypoints and skeleton of `animal_in_AP10K` above.
+animal= {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+
+animal_face = {"keypoints": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], "skeleton": []}
+
+fly = {"keypoints": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], "skeleton": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}
+
+locust = {"keypoints": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip'],"skeleton": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}
+# Skeleton below is 0-based; the two 'none' entries look like unused placeholder slots (TODO confirm).
+car ={"keypoints": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],"skeleton": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}
+# Clothing categories from here through `shorts` (keypoint names only; every skeleton is empty).
+short_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+long_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+
+short_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+
+sling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+
+vest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+
+long_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+long_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+trousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}
+
+sling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+
+vest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+
+skirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}
+
+short_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+
+shorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}
+# Furniture categories from here on (keypoint names only; every skeleton is empty).
+table = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}
+
+chair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}
+
+bed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}
+
+sofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}
+
+swivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}
diff --git a/src/models/XPose/transforms.py b/src/models/XPose/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..9155913bc34afe0cf9c23495a1dac3d8225d2a94
--- /dev/null
+++ b/src/models/XPose/transforms.py
@@ -0,0 +1,394 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import os
+import sys
+import random
+
+import PIL
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
+
+
+def crop(image, target, region):
+    cropped_image = F.crop(image, *region)
+
+    if target is not None:
+        target = target.copy()
+        i, j, h, w = region
+        id2catname = target["id2catname"]
+        caption_list = target["caption_list"]
+        target["size"] = torch.tensor([h, w])
+
+        fields = ["labels", "area", "iscrowd", "positive_map","keypoints"]
+
+        if "boxes" in target:
+            boxes = target["boxes"]
+            max_size = torch.as_tensor([w, h], dtype=torch.float32)
+            cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+            cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+            cropped_boxes = cropped_boxes.clamp(min=0)
+            area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+            target["boxes"] = cropped_boxes.reshape(-1, 4)
+            target["area"] = area
+            fields.append("boxes")
+
+        if "masks" in target:
+            # FIXME should we update the area here if there are no boxes?
+            target['masks'] = target['masks'][:, i:i + h, j:j + w]
+            fields.append("masks")
+
+
+        # remove elements for which the boxes or masks that have zero area
+        if "boxes" in target or "masks" in target:
+            # favor boxes selection when defining which elements to keep
+            # this is compatible with previous implementation
+            if "boxes" in target:
+                cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+                keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+            else:
+                keep = target['masks'].flatten(1).any(1)
+
+            for field in fields:
+                if field in target:
+                    target[field] = target[field][keep]
+
+        if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+            # for debug and visualization only.
+            if 'strings_positive' in target:
+                target['strings_positive'] = [_i for _i, _j in zip(target['strings_positive'], keep) if _j]
+
+
+        if "keypoints" in target:
+            max_size = torch.as_tensor([w, h], dtype=torch.float32)
+            keypoints = target["keypoints"]
+            cropped_keypoints = keypoints.view(-1, 3)[:,:2] - torch.as_tensor([j, i])
+            cropped_keypoints = torch.min(cropped_keypoints, max_size)
+            cropped_keypoints = cropped_keypoints.clamp(min=0)
+            cropped_keypoints = torch.cat([cropped_keypoints, keypoints.view(-1, 3)[:,2].unsqueeze(1)], dim=1)
+            target["keypoints"] = cropped_keypoints.view(target["keypoints"].shape[0], target["keypoints"].shape[1], 3)
+
+        target["id2catname"] = id2catname
+        target["caption_list"] = caption_list
+
+    return cropped_image, target
+
+
+def hflip(image, target):
+    flipped_image = F.hflip(image)
+
+    w, h = image.size
+
+    if target is not None:
+        target = target.copy()
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
+            target["boxes"] = boxes
+
+        if "masks" in target:
+            target['masks'] = target['masks'].flip(-1)
+
+
+        if "keypoints" in target:
+            dataset_name=target["dataset_name"]
+            if dataset_name == "coco_person" or dataset_name == "macaque":
+                flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8],
+                                   [9, 10], [11, 12], [13, 14], [15, 16]]
+
+            elif dataset_name=="animalkindom_ak_P1_animal":
+                flip_pairs = [[1, 2], [4, 5],[7,8],[9,10],[11,12],[14,15],[16,17],[18,19]]
+
+            elif dataset_name=="animalweb_animal":
+                flip_pairs = [[0, 3], [1, 2], [5, 6]]
+
+            elif dataset_name=="face":
+                flip_pairs = [
+                                [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
+                                [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
+                                [31, 35], [32, 34],
+                                [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
+                                [48, 54], [49, 53], [50, 52],
+                                [55, 59], [56, 58],
+                                [60, 64], [61, 63],
+                                [65, 67]
+                            ]
+
+            elif dataset_name=="hand":
+                flip_pairs = []
+
+            elif dataset_name=="foot":
+                flip_pairs = []
+
+            elif dataset_name=="locust":
+                flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]]
+
+            elif dataset_name=="fly":
+                flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29], [30, 31]]
+
+            elif dataset_name == "ap_36k_animal" or dataset_name == "ap_10k_animal":
+                flip_pairs = [[0, 1],[5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]]
+
+
+
+            keypoints = target["keypoints"]
+            keypoints[:,:,0] = w - keypoints[:,:, 0]-1
+            for pair in flip_pairs:
+                keypoints[:,pair[0], :], keypoints[:,pair[1], :] = keypoints[:,pair[1], :], keypoints[:,pair[0], :].clone()
+            target["keypoints"] = keypoints
+    return flipped_image, target
+
+
+def resize(image, target, size, max_size=None):
+    # size can be min_size (scalar) or (w, h) tuple
+
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):
+        w, h = image_size
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(round(max_size * min_original_size / max_original_size))
+
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (h, w)
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+
+        return (oh, ow)
+
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+
+    if target is None:
+        return rescaled_image, None
+
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
+    ratio_width, ratio_height = ratios
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = scaled_boxes
+
+    if "area" in target:
+        area = target["area"]
+        scaled_area = area * (ratio_width * ratio_height)
+        target["area"] = scaled_area
+
+
+    if "keypoints" in target:
+        keypoints = target["keypoints"]
+        scaled_keypoints = keypoints * torch.as_tensor([ratio_width, ratio_height, 1])
+        target["keypoints"] = scaled_keypoints
+
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+
+    if "masks" in target:
+        target['masks'] = interpolate(
+            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
+
+    return rescaled_image, target
+
+
+def pad(image, target, padding):
+    # assumes that we only pad on the bottom right corners
+    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor(padded_image.size[::-1])
+    if "masks" in target:
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
+    return padded_image, target
+
+
+class ResizeDebug(object):
+    def __init__(self, size):
+        self.size = size
+    # Forwards to resize() with the stored size; max_size keeps its default (None).
+    def __call__(self, img, target):
+        return resize(img, target, self.size)
+
+
+class RandomCrop(object):
+    def __init__(self, size):
+        self.size = size
+    # Samples a window via T.RandomCrop.get_params(img, self.size) and applies crop().
+    def __call__(self, img, target):
+        region = T.RandomCrop.get_params(img, self.size)
+        return crop(img, target, region)
+
+
+class RandomSizeCrop(object):
+    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
+        # respect_boxes:    True to keep all boxes
+        #                   False to tolerence box filter
+        self.min_size = min_size
+        self.max_size = max_size
+        self.respect_boxes = respect_boxes
+
+    def __call__(self, img: PIL.Image.Image, target: dict):
+        init_boxes = len(target["boxes"]) if (target is not None and "boxes" in target) else 0
+        max_patience = 10
+        for i in range(max_patience):
+            w = random.randint(self.min_size, min(img.width, self.max_size))
+            h = random.randint(self.min_size, min(img.height, self.max_size))
+            region = T.RandomCrop.get_params(img, [h, w])
+            result_img, result_target = crop(img, target, region)
+            if target is not None:
+                if not self.respect_boxes or len(result_target["boxes"]) == init_boxes or i == max_patience - 1:
+                    return result_img, result_target
+        return result_img, result_target
+
+
+class CenterCrop(object):
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        image_width, image_height = img.size
+        crop_height, crop_width = self.size
+        crop_top = int(round((image_height - crop_height) / 2.))
+        crop_left = int(round((image_width - crop_width) / 2.))
+        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
+
+
+class RandomHorizontalFlip(object):
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return hflip(img, target)
+        return img, target
+
+
+class RandomResize(object):
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.sizes = sizes
+        self.max_size = max_size
+    # Picks one entry of self.sizes with random.choice and forwards it to resize().
+    def __call__(self, img, target=None):
+        size = random.choice(self.sizes)
+        return resize(img, target, size, self.max_size)
+
+
+class RandomPad(object):
+    def __init__(self, max_pad):
+        self.max_pad = max_pad
+    # Draws pad_x and pad_y independently from [0, max_pad] (inclusive) and calls pad().
+    def __call__(self, img, target):
+        pad_x = random.randint(0, self.max_pad)
+        pad_y = random.randint(0, self.max_pad)
+        return pad(img, target, (pad_x, pad_y))
+
+
+class RandomSelect(object):
+    """
+    Randomly selects between transforms1 and transforms2,
+    with probability p for transforms1 and (1 - p) for transforms2
+    """
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+        self.p = p
+
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return self.transforms1(img, target)
+        return self.transforms2(img, target)
+
+
+class ToTensor(object):  # applies F.to_tensor to the image; target is returned as-is
+    def __call__(self, img, target):
+        return F.to_tensor(img), target
+
+
+class RandomErasing(object):
+    # Wraps T.RandomErasing: constructor arguments are forwarded, target is untouched.
+    def __init__(self, *args, **kwargs):
+        self.eraser = T.RandomErasing(*args, **kwargs)
+
+    def __call__(self, img, target):
+        return self.eraser(img), target
+
+
+class Normalize(object):
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, image, target=None):
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        target = target.copy()
+        h, w = image.shape[-2:]
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = box_xyxy_to_cxcywh(boxes)
+            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
+            target["boxes"] = boxes
+
+        if "area" in target:
+            area = target["area"]
+            area = area / (torch.tensor(w, dtype=torch.float32)*torch.tensor(h, dtype=torch.float32))
+            target["area"] = area
+
+        if "keypoints" in target:
+            keypoints = target["keypoints"]
+            V = keypoints[:, :, 2]
+            V[V == 2] = 1
+            Z=keypoints[:, :, :2]
+            Z = Z.contiguous().view(-1, 2 * V.shape[-1])
+            Z = Z / torch.tensor([w, h] * V.shape[-1], dtype=torch.float32)
+            target["valid_kpt_num"] = V.shape[1]
+            Z_pad = torch.zeros(Z.shape[0],68 * 2 - Z.shape[1])
+            V_pad = torch.zeros(V.shape[0],68 - V.shape[1])
+            V=torch.cat([V, V_pad], dim=1)
+            Z=torch.cat([Z, Z_pad], dim=1)
+            all_keypoints = torch.cat([Z, V], dim=1)
+            target["keypoints"] = all_keypoints
+
+
+        return image, target
+
+
+class Compose(object):
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += "    {0}".format(t)
+        format_string += "\n)"
+        return format_string
diff --git a/src/models/XPose/util/__init__.py b/src/models/XPose/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3239d927e0762a4952006a55a8596998e0ac03
--- /dev/null
+++ b/src/models/XPose/util/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/8/5 21:58
+# @Author  : shaoguowen
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/XPose/util/addict.py b/src/models/XPose/util/addict.py
new file mode 100644
index 0000000000000000000000000000000000000000..55e02d1d17596c77a6f3642ba02eeb30971048bd
--- /dev/null
+++ b/src/models/XPose/util/addict.py
@@ -0,0 +1,159 @@
+import copy
+
+
+class Dict(dict):
+    """A ``dict`` subclass whose items can also be read and written as attributes.
+
+    Nested plain dicts (also inside lists/tuples) are converted to this class
+    on construction. Looking up a missing key returns a fresh, detached child
+    that attaches itself to its parent only once something is stored in it,
+    which is what makes ``d.a.b = 1`` work on an empty instance. An instance
+    can be frozen, after which missing keys raise ``KeyError``.
+    """
+
+    def __init__(__self, *args, **kwargs):
+        # Bookkeeping lives in the instance __dict__ (written through
+        # object.__setattr__ to bypass the overridden __setattr__) so it never
+        # shows up as a dictionary item.
+        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
+        object.__setattr__(__self, '__key', kwargs.pop('__key', None))
+        object.__setattr__(__self, '__frozen', False)
+        for arg in args:
+            if not arg:
+                # Falsy positional arguments (None, {}, [], ...) are ignored.
+                continue
+            elif isinstance(arg, dict):
+                for key, val in arg.items():
+                    __self[key] = __self._hook(val)
+            elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):
+                # A tuple whose first element is not a tuple is taken as one (key, value) pair.
+                __self[arg[0]] = __self._hook(arg[1])
+            else:
+                # Anything else is iterated as (key, value) pairs.
+                for key, val in iter(arg):
+                    __self[key] = __self._hook(val)
+
+        for key, val in kwargs.items():
+            __self[key] = __self._hook(val)
+
+    def __setattr__(self, name, value):
+        """Store ``value`` as item ``name``; names defined on the class are read-only."""
+        if hasattr(self.__class__, name):
+            raise AttributeError("'Dict' object attribute "
+                                 "'{0}' is read-only".format(name))
+        else:
+            self[name] = value
+
+    def __setitem__(self, name, value):
+        """Set an item, honouring the frozen flag and attaching to a pending parent."""
+        isFrozen = (hasattr(self, '__frozen') and
+                    object.__getattribute__(self, '__frozen'))
+        # A frozen instance accepts updates to existing keys only.
+        if isFrozen and name not in super(Dict, self).keys():
+                raise KeyError(name)
+        super(Dict, self).__setitem__(name, value)
+        try:
+            p = object.__getattribute__(self, '__parent')
+            key = object.__getattribute__(self, '__key')
+        except AttributeError:
+            # The bookkeeping attributes are absent (already deleted below, or never set).
+            p = None
+            key = None
+        if p is not None:
+            # First write into a child produced by __missing__: insert it into
+            # the parent, then drop the link so this happens only once.
+            p[key] = self
+            object.__delattr__(self, '__parent')
+            object.__delattr__(self, '__key')
+
+    def __add__(self, other):
+        """Return ``other`` when this instance is empty; otherwise raise ``TypeError``."""
+        if not self.keys():
+            return other
+        else:
+            self_type = type(self).__name__
+            other_type = type(other).__name__
+            msg = "unsupported operand type(s) for +: '{}' and '{}'"
+            raise TypeError(msg.format(self_type, other_type))
+
+    @classmethod
+    def _hook(cls, item):
+        """Recursively convert dicts (also inside lists/tuples) to ``cls``."""
+        if isinstance(item, dict):
+            return cls(item)
+        elif isinstance(item, (list, tuple)):
+            return type(item)(cls._hook(elem) for elem in item)
+        return item
+
+    def __getattr__(self, item):
+        """Attribute access falls back to item lookup (and thus to ``__missing__``)."""
+        return self.__getitem__(item)
+
+    def __missing__(self, name):
+        """Return a detached child for a missing key, or raise ``KeyError`` when frozen."""
+        if object.__getattribute__(self, '__frozen'):
+            raise KeyError(name)
+        # The child is not stored here; it inserts itself on its first __setitem__.
+        return self.__class__(__parent=self, __key=name)
+
+    def __delattr__(self, name):
+        """Delete the item called ``name``."""
+        del self[name]
+
+    def to_dict(self):
+        """Return a plain ``dict`` copy, recursing into nested instances and lists/tuples."""
+        base = {}
+        for key, value in self.items():
+            if isinstance(value, type(self)):
+                base[key] = value.to_dict()
+            elif isinstance(value, (list, tuple)):
+                base[key] = type(value)(
+                    item.to_dict() if isinstance(item, type(self)) else
+                    item for item in value)
+            else:
+                base[key] = value
+        return base
+
+    def copy(self):
+        """Return ``copy.copy(self)``."""
+        return copy.copy(self)
+
+    def deepcopy(self):
+        """Return ``copy.deepcopy(self)``."""
+        return copy.deepcopy(self)
+
+    def __deepcopy__(self, memo):
+        """Deep-copy keys and values into a new instance of the same class."""
+        other = self.__class__()
+        # Register the copy before recursing so self-references resolve to it.
+        memo[id(self)] = other
+        for key, value in self.items():
+            other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
+        return other
+
+    def update(self, *args, **kwargs):
+        """Merge at most one mapping plus keyword arguments into this instance.
+
+        When both the existing value and the incoming value are dicts they are
+        merged recursively; in every other case the incoming value replaces
+        the existing one.
+
+        Raises:
+            TypeError: if more than one positional argument is given.
+        """
+        other = {}
+        if args:
+            if len(args) > 1:
+                raise TypeError()
+            other.update(args[0])
+        other.update(kwargs)
+        for k, v in other.items():
+            if ((k not in self) or
+                (not isinstance(self[k], dict)) or
+                (not isinstance(v, dict))):
+                self[k] = v
+            else:
+                self[k].update(v)
+
+    # Pickle support: the items are exposed as constructor arguments and as
+    # state, and state is restored through update().
+    def __getnewargs__(self):
+        return tuple(self.items())
+
+    def __getstate__(self):
+        return self
+
+    def __setstate__(self, state):
+        self.update(state)
+
+    def __or__(self, other):
+        """``self | other``: a new ``Dict`` built from ``self`` and updated with ``other``."""
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(self)
+        new.update(other)
+        return new
+
+    def __ror__(self, other):
+        """``other | self``: a new ``Dict`` built from ``other`` and updated with ``self``."""
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(other)
+        new.update(self)
+        return new
+
+    def __ior__(self, other):
+        """``self |= other``: in-place :meth:`update`."""
+        self.update(other)
+        return self
+
+    def setdefault(self, key, default=None):
+        """Return ``self[key]`` if present; otherwise store and return ``default``."""
+        if key in self:
+            return self[key]
+        else:
+            self[key] = default
+            return default
+
+    def freeze(self, shouldFreeze=True):
+        """Set the frozen flag on this instance and on directly nested ``Dict`` values."""
+        object.__setattr__(self, '__frozen', shouldFreeze)
+        for key, val in self.items():
+            if isinstance(val, Dict):
+                val.freeze(shouldFreeze)
+
+    def unfreeze(self):
+        """Shortcut for ``freeze(False)``."""
+        self.freeze(False)
diff --git a/src/models/XPose/util/box_ops.py b/src/models/XPose/util/box_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff6624064ca10682f0da4c52073fd8006456a9b
--- /dev/null
+++ b/src/models/XPose/util/box_ops.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Utilities for bounding box manipulation and GIoU.
+"""
+import torch, os
+from torchvision.ops.boxes import box_area
+
+
+def box_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def box_xyxy_to_cxcywh(x):
+    x0, y0, x1, y1 = x.unbind(-1)
+    b = [(x0 + x1) / 2, (y0 + y1) / 2,
+         (x1 - x0), (y1 - y0)]
+    return torch.stack(b, dim=-1)
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    # import ipdb; ipdb.set_trace()
+    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / (union + 1e-6)
+    return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/
+
+    The boxes should be in [x0, y0, x1, y1] format
+
+    Returns a [N, M] pairwise matrix, where N = len(boxes1)
+    and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    # except:
+    #     import ipdb; ipdb.set_trace()
+    iou, union = box_iou(boxes1, boxes2)
+
+    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    wh = (rb - lt).clamp(min=0)  # [N,M,2]
+    area = wh[:, :, 0] * wh[:, :, 1]
+
+    return iou - (area - union) / (area + 1e-6)
+
+
+
+# modified from torchvision to also return the union
+def box_iou_pairwise(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
+    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]
+
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    inter = wh[:, 0] * wh[:, 1]  # [N]
+
+    union = area1 + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+def generalized_box_iou_pairwise(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/
+
+    Input:
+        - boxes1, boxes2: N,4
+    Output:
+        - giou: N, 4
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    assert boxes1.shape == boxes2.shape
+    iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4
+
+    lt = torch.min(boxes1[:, :2], boxes2[:, :2])
+    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])
+
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    area = wh[:, 0] * wh[:, 1]
+
+    return iou - (area - union) / area
+
+def masks_to_boxes(masks):
+    """Compute the bounding boxes around the provided masks
+
+    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+
+    Returns a [N, 4] tensors, with the boxes in xyxy format
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+
+    y = torch.arange(0, h, dtype=torch.float)
+    x = torch.arange(0, w, dtype=torch.float)
+    y, x = torch.meshgrid(y, x)
+
+    x_mask = (masks * x.unsqueeze(0))
+    x_max = x_mask.flatten(1).max(-1)[0]
+    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    y_mask = (masks * y.unsqueeze(0))
+    y_max = y_mask.flatten(1).max(-1)[0]
+    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+if __name__ == '__main__':
+    x = torch.rand(5, 4)
+    y = torch.rand(3, 4)
+    iou, union = box_iou(x, y)
+    import ipdb; ipdb.set_trace()
diff --git a/src/models/XPose/util/config.py b/src/models/XPose/util/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..8df911a9aacf8dcf8a0356bafcf2a1bb57ef60da
--- /dev/null
+++ b/src/models/XPose/util/config.py
@@ -0,0 +1,425 @@
+# ==========================================================
+# Modified from mmcv
+# ==========================================================
+import sys
+import os.path as osp
+import ast
+import tempfile
+import shutil
+from importlib import import_module
+from argparse import Action
+
+from .addict import Dict
+import os
+
+# Key under which a config file lists the base config file(s) it inherits from.
+BASE_KEY = '_base_'
+# Key that, when truthy inside a child dict, makes it replace the base dict instead of merging into it.
+DELETE_KEY = '_delete_'
+# Top-level config keys rejected by Config.__init__.
+RESERVED_KEYS = ['filename', 'text', 'pretty_text', 'get', 'dump', 'merge_from_dict']
+
+
+def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
+    if not osp.isfile(filename):
+        raise FileNotFoundError(msg_tmpl.format(filename))
+
+class ConfigDict(Dict):
+    """``Dict`` variant in which a missing key is an error rather than an auto-created child."""
+
+    def __missing__(self, name):
+        # Overrides Dict.__missing__, which would return a new child instance.
+        raise KeyError(name)
+
+    def __getattr__(self, name):
+        """Attribute-style item access that reports a missing key as ``AttributeError``."""
+        try:
+            value = super(ConfigDict, self).__getattr__(name)
+        except KeyError:
+            ex = AttributeError(f"'{self.__class__.__name__}' object has no "
+                                f"attribute '{name}'")
+        except Exception as e:
+            ex = e
+        else:
+            return value
+        # Raised after the try statement has finished, i.e. outside the except clauses.
+        raise ex
+
+
+class Config(object):
+    """
+    config files.
+    only support .py file as config now.
+
+    ref: mmcv.utils.config
+
+    Example:
+        >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1])))
+        >>> cfg.a
+        1
+        >>> cfg.b
+        {'b1': [0, 1]}
+        >>> cfg.b.b1
+        [0, 1]
+        >>> cfg = Config.fromfile('tests/data/config/a.py')
+        >>> cfg.filename
+        "/home/kchen/projects/mmcv/tests/data/config/a.py"
+        >>> cfg.item4
+        'test'
+        >>> cfg
+        "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: "
+        "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}"
+    """
+    @staticmethod
+    def _validate_py_syntax(filename):
+        with open(filename) as f:
+            content = f.read()
+        try:
+            ast.parse(content)
+        except SyntaxError:
+            raise SyntaxError('There are syntax errors in config '
+                              f'file {filename}')
+
+    @staticmethod
+    def _file2dict(filename):
+        filename = osp.abspath(osp.expanduser(filename))
+        check_file_exist(filename)
+        if filename.lower().endswith('.py'):
+            with tempfile.TemporaryDirectory() as temp_config_dir:
+                # 使用 mkstemp 代替 NamedTemporaryFile
+                fd, temp_path = tempfile.mkstemp(dir=temp_config_dir, suffix='.py')
+                os.close(fd)  # 立即关闭文件描述符
+                temp_config_name = os.path.basename(temp_path)
+                shutil.copyfile(filename, os.path.join(temp_config_dir, temp_config_name))
+                temp_module_name = os.path.splitext(temp_config_name)[0]
+                sys.path.insert(0, temp_config_dir)
+                Config._validate_py_syntax(filename)
+                mod = import_module(temp_module_name)
+                sys.path.pop(0)
+                cfg_dict = {
+                    name: value
+                    for name, value in mod.__dict__.items()
+                    if not name.startswith('__')
+                }
+                # delete imported module
+                del sys.modules[temp_module_name]
+        elif filename.lower().endswith(('.yml', '.yaml', '.json')):
+            from .slio import slload
+            cfg_dict = slload(filename)
+        else:
+            raise IOError('Only py/yml/yaml/json type are supported now!')
+
+        cfg_text = filename + '\n'
+        with open(filename, 'r') as f:
+            cfg_text += f.read()
+
+        # parse the base file
+        if BASE_KEY in cfg_dict:
+            cfg_dir = osp.dirname(filename)
+            base_filename = cfg_dict.pop(BASE_KEY)
+            base_filename = base_filename if isinstance(
+                base_filename, list) else [base_filename]
+
+            cfg_dict_list = list()
+            cfg_text_list = list()
+            for f in base_filename:
+                _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f))
+                cfg_dict_list.append(_cfg_dict)
+                cfg_text_list.append(_cfg_text)
+
+            base_cfg_dict = dict()
+            for c in cfg_dict_list:
+                if len(base_cfg_dict.keys() & c.keys()) > 0:
+                    raise KeyError('Duplicate key is not allowed among bases')
+                    # TODO Allow the duplicate key while warnning user
+                base_cfg_dict.update(c)
+
+            base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict)
+            cfg_dict = base_cfg_dict
+
+            # merge cfg_text
+            cfg_text_list.append(cfg_text)
+            cfg_text = '\n'.join(cfg_text_list)
+
+        return cfg_dict, cfg_text
+
+    @staticmethod
+    def _merge_a_into_b(a, b):
+        """merge dict `a` into dict `b` (non-inplace).
+            values in `a` will overwrite `b`.
+            copy first to avoid inplace modification
+            
+        Args:
+            a ([type]): [description]
+            b ([type]): [description]
+
+        Returns:
+            [dict]: [description]
+        """
+        # import ipdb; ipdb.set_trace()
+        if not isinstance(a, dict):
+            return a
+
+        b = b.copy()
+        for k, v in a.items():
+            if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False):
+            
+                if not isinstance(b[k], dict) and not isinstance(b[k], list):
+                    # if :
+                    # import ipdb; ipdb.set_trace()
+                    raise TypeError(
+                        f'{k}={v} in child config cannot inherit from base '
+                        f'because {k} is a dict in the child config but is of '
+                        f'type {type(b[k])} in base config. You may set '
+                        f'`{DELETE_KEY}=True` to ignore the base config')
+                b[k] = Config._merge_a_into_b(v, b[k])
+            elif isinstance(b, list):
+                try:
+                    _ = int(k)
+                except:
+                    raise TypeError(
+                        f'b is a list, '
+                        f'index {k} should be an int when input but {type(k)}'
+                    )
+                b[int(k)] = Config._merge_a_into_b(v, b[int(k)])
+            else:   
+                b[k] = v
+                
+        return b
+
+    @staticmethod
+    def fromfile(filename):
+        cfg_dict, cfg_text = Config._file2dict(filename)
+        return Config(cfg_dict, cfg_text=cfg_text, filename=filename)
+
+
+    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
+        if cfg_dict is None:
+            cfg_dict = dict()
+        elif not isinstance(cfg_dict, dict):
+            raise TypeError('cfg_dict must be a dict, but '
+                            f'got {type(cfg_dict)}')
+        for key in cfg_dict:
+            if key in RESERVED_KEYS:
+                raise KeyError(f'{key} is reserved for config file')
+
+        super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))
+        super(Config, self).__setattr__('_filename', filename)
+        if cfg_text:
+            text = cfg_text
+        elif filename:
+            with open(filename, 'r') as f:
+                text = f.read()
+        else:
+            text = ''
+        super(Config, self).__setattr__('_text', text)
+
+
+    @property
+    def filename(self):
+        return self._filename
+
+    @property
+    def text(self):
+        return self._text
+
+    @property
+    def pretty_text(self):
+
+        indent = 4
+
+        def _indent(s_, num_spaces):
+            s = s_.split('\n')
+            if len(s) == 1:
+                return s_
+            first = s.pop(0)
+            s = [(num_spaces * ' ') + line for line in s]
+            s = '\n'.join(s)
+            s = first + '\n' + s
+            return s
+
+        def _format_basic_types(k, v, use_mapping=False):
+            if isinstance(v, str):
+                v_str = f"'{v}'"
+            else:
+                v_str = str(v)
+
+            if use_mapping:
+                k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                attr_str = f'{k_str}: {v_str}'
+            else:
+                attr_str = f'{str(k)}={v_str}'
+            attr_str = _indent(attr_str, indent)
+
+            return attr_str
+
+        def _format_list(k, v, use_mapping=False):
+            # check if all items in the list are dict
+            if all(isinstance(_, dict) for _ in v):
+                v_str = '[\n'
+                v_str += '\n'.join(
+                    f'dict({_indent(_format_dict(v_), indent)}),'
+                    for v_ in v).rstrip(',')
+                if use_mapping:
+                    k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                    attr_str = f'{k_str}: {v_str}'
+                else:
+                    attr_str = f'{str(k)}={v_str}'
+                attr_str = _indent(attr_str, indent) + ']'
+            else:
+                attr_str = _format_basic_types(k, v, use_mapping)
+            return attr_str
+
+        def _contain_invalid_identifier(dict_str):
+            contain_invalid_identifier = False
+            for key_name in dict_str:
+                contain_invalid_identifier |= \
+                    (not str(key_name).isidentifier())
+            return contain_invalid_identifier
+
+        def _format_dict(input_dict, outest_level=False):
+            r = ''
+            s = []
+
+            use_mapping = _contain_invalid_identifier(input_dict)
+            if use_mapping:
+                r += '{'
+            for idx, (k, v) in enumerate(input_dict.items()):
+                is_last = idx >= len(input_dict) - 1
+                end = '' if outest_level or is_last else ','
+                if isinstance(v, dict):
+                    v_str = '\n' + _format_dict(v)
+                    if use_mapping:
+                        k_str = f"'{k}'" if isinstance(k, str) else str(k)
+                        attr_str = f'{k_str}: dict({v_str}'
+                    else:
+                        attr_str = f'{str(k)}=dict({v_str}'
+                    attr_str = _indent(attr_str, indent) + ')' + end
+                elif isinstance(v, list):
+                    attr_str = _format_list(k, v, use_mapping) + end
+                else:
+                    attr_str = _format_basic_types(k, v, use_mapping) + end
+
+                s.append(attr_str)
+            r += '\n'.join(s)
+            if use_mapping:
+                r += '}'
+            return r
+
+        cfg_dict = self._cfg_dict.to_dict()
+        text = _format_dict(cfg_dict, outest_level=True)
+        return text
+    
+
+    def __repr__(self):
+        return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}'
+
+    def __len__(self):
+        return len(self._cfg_dict)
+
+    def __getattr__(self, name):
+        # # debug
+        # print('+'*15)
+        # print('name=%s' % name)
+        # print("addr:", id(self))
+        # # print('type(self):', type(self))
+        # print(self.__dict__)
+        # print('+'*15)
+        # if self.__dict__ == {}:
+        #     raise ValueError
+
+        return getattr(self._cfg_dict, name)
+
+    def __getitem__(self, name):
+        return self._cfg_dict.__getitem__(name)
+
+    def __setattr__(self, name, value):
+        if isinstance(value, dict):
+            value = ConfigDict(value)
+        self._cfg_dict.__setattr__(name, value)
+
+    def __setitem__(self, name, value):
+        if isinstance(value, dict):
+            value = ConfigDict(value)
+        self._cfg_dict.__setitem__(name, value)
+
+    def __iter__(self):
+        return iter(self._cfg_dict)
+
+    def dump(self, file=None):
+        # import ipdb; ipdb.set_trace()
+        if file is None:
+            return self.pretty_text
+        else:
+            with open(file, 'w') as f:
+                f.write(self.pretty_text)
+
+    def merge_from_dict(self, options):
+        """Merge list into cfg_dict
+
+        Merge the dict parsed by MultipleKVAction into this cfg.
+
+        Examples:
+            >>> options = {'model.backbone.depth': 50,
+            ...            'model.backbone.with_cp':True}
+            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
+            >>> cfg.merge_from_dict(options)
+            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+            >>> assert cfg_dict == dict(
+            ...     model=dict(backbone=dict(depth=50, with_cp=True)))
+
+        Args:
+            options (dict): dict of configs to merge from.
+        """
+        option_cfg_dict = {}
+        for full_key, v in options.items():
+            d = option_cfg_dict
+            key_list = full_key.split('.')
+            for subkey in key_list[:-1]:
+                d.setdefault(subkey, ConfigDict())
+                d = d[subkey]
+            subkey = key_list[-1]
+            d[subkey] = v
+
+        cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+        super(Config, self).__setattr__(
+            '_cfg_dict', Config._merge_a_into_b(option_cfg_dict, cfg_dict))
+
+    # for multiprocess
+    def __setstate__(self, state):
+        self.__init__(state)
+
+
+    def copy(self):
+        return Config(self._cfg_dict.copy())
+
+    def deepcopy(self):
+        return Config(self._cfg_dict.deepcopy())
+
+
+class DictAction(Action):
+    """
+    argparse action to split an argument into KEY=VALUE form
+    on the first = and append to a dictionary. List options should
+    be passed as comma separated values, i.e KEY=V1,V2,V3
+    """
+
+    @staticmethod
+    def _parse_int_float_bool(val):
+        try:
+            return int(val)
+        except ValueError:
+            pass
+        try:
+            return float(val)
+        except ValueError:
+            pass
+        if val.lower() in ['true', 'false']:
+            return True if val.lower() == 'true' else False
+        if val.lower() in ['none', 'null']:
+            return None
+        return val
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        options = {}
+        for kv in values:
+            key, val = kv.split('=', maxsplit=1)
+            val = [self._parse_int_float_bool(v) for v in val.split(',')]
+            if len(val) == 1:
+                val = val[0]
+            options[key] = val
+        setattr(namespace, self.dest, options)
+
diff --git a/src/models/XPose/util/keypoint_ops.py b/src/models/XPose/util/keypoint_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..036d813d555f2f9beee252319c40b05c2f716168
--- /dev/null
+++ b/src/models/XPose/util/keypoint_ops.py
@@ -0,0 +1,29 @@
+import torch, os
+
+def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor):
+    """_summary_
+
+    Args:
+        keypoints (torch.Tensor): ..., 51
+    """
+    res = torch.zeros_like(keypoints)
+    num_points = keypoints.shape[-1] // 3
+    Z = keypoints[..., :2*num_points]
+    V = keypoints[..., 2*num_points:]
+    res[...,0::3] = Z[..., 0::2]
+    res[...,1::3] = Z[..., 1::2]
+    res[...,2::3] = V[...]
+    return res
+
+def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor):
+    """_summary_
+
+    Args:
+        keypoints (torch.Tensor): ..., 51
+    """
+    res = torch.zeros_like(keypoints)
+    num_points = keypoints.shape[-1] // 3
+    res[...,0:2*num_points:2] = keypoints[..., 0::3]
+    res[...,1:2*num_points:2] = keypoints[..., 1::3]
+    res[...,2*num_points:] = keypoints[..., 2::3]
+    return res
\ No newline at end of file
diff --git a/src/models/XPose/util/misc.py b/src/models/XPose/util/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa90f3be6f389cd3ecf7323b55583f021616247
--- /dev/null
+++ b/src/models/XPose/util/misc.py
@@ -0,0 +1,701 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Misc functions, including distributed helpers.
+
+Mostly copy-paste from torchvision references.
+"""
+import functools
+import io
+import os
+import random 
+import subprocess
+import time
+from collections import OrderedDict, defaultdict, deque
+import datetime
+import pickle
+from typing import Optional, List
+
+import json, time
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch import Tensor
+
+import colorsys
+
+# needed due to empty tensor bug in pytorch and torchvision 0.5
+import torchvision
+__torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7
+if __torchvision_need_compat_flag:
+    from torchvision.ops import _new_empty_tensor
+    from torchvision.ops.misc import _output_size
+
+
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        if d.shape[0] == 0:
+            return 0
+        return d.median().item()
+
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+
+    @property
+    def global_avg(self):
+        if os.environ.get("SHILONG_AMP", None) == '1':
+            eps = 1e-4
+        else:
+            eps = 1e-6
+        return self.total / (self.count + eps)
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a process group based on gloo backend, containing all the ranks
+    The result is cached.
+    """
+
+    if dist.get_backend() == "nccl":
+        return dist.new_group(backend="gloo")
+
+    return dist.group.WORLD
+
+def all_gather_cpu(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors),
+    using the group returned by ``_get_global_gloo_group``.
+
+    The object is serialized with ``torch.save`` and restored with
+    ``torch.load``, which unpickles the payload: only use this between
+    trusted processes.
+
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+
+    world_size = get_world_size()
+    if world_size == 1:
+        # Single process: nothing to exchange.
+        return [data]
+
+    cpu_group = _get_global_gloo_group()
+
+    # Serialize the object into a byte tensor.
+    buffer = io.BytesIO()
+    torch.save(data, buffer)
+    data_view = buffer.getbuffer()
+    # NOTE(review): the "cuda" branch is only taken when cpu_group is None — confirm whether that can happen.
+    device = "cuda" if cpu_group is None else "cpu"
+    tensor = torch.ByteTensor(data_view).to(device)
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long)
+    size_list = [torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)]
+    if cpu_group is None:
+        dist.all_gather(size_list, local_size)
+    else:
+        print("gathering on cpu")
+        dist.all_gather(size_list, local_size, group=cpu_group)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+    assert isinstance(local_size.item(), int)
+    local_size = int(local_size.item())
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device=device))
+    if local_size != max_size:
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=device)
+        tensor = torch.cat((tensor, padding), dim=0)
+    if cpu_group is None:
+        dist.all_gather(tensor_list, tensor)
+    else:
+        dist.all_gather(tensor_list, tensor, group=cpu_group)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        # Drop the padding: keep only the first `size` bytes sent by that rank.
+        tensor = torch.split(tensor, [size, max_size - size], dim=0)[0]
+        buffer = io.BytesIO(tensor.cpu().numpy())
+        obj = torch.load(buffer)
+        data_list.append(obj)
+
+    return data_list
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+    When the environment variable ``CPU_REDUCE`` is "1" the work is delegated
+    to ``all_gather_cpu``; otherwise the pickled payload is exchanged through
+    CUDA tensors. Received payloads are unpickled: only use this between
+    trusted processes.
+
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+
+    if os.getenv("CPU_REDUCE") == "1":
+        return all_gather_cpu(data)
+
+
+
+    world_size = get_world_size()
+    if world_size == 1:
+        # Single process: nothing to exchange.
+        return [data]
+
+    # serialized to a Tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device="cuda")
+    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
+    # local_size is a one-element tensor here; the comparison is evaluated for its truth value.
+    if local_size != max_size:
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        # Drop the padding before unpickling.
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    All-reduce every value of a dictionary across processes.
+
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): divide the summed values by the world size when True,
+            otherwise keep the sum
+    Returns:
+        dict: same keys as ``input_dict`` with reduced values. With fewer than
+        two processes ``input_dict`` itself is returned untouched.
+    """
+    num_ranks = get_world_size()
+    if num_ranks < 2:
+        return input_dict
+    with torch.no_grad():
+        # Sorted key order keeps the stacked layout identical on every process.
+        names = sorted(input_dict.keys())
+        values = torch.stack([input_dict[name] for name in names], dim=0)
+        dist.all_reduce(values)
+        if average:
+            values /= num_ranks
+        reduced_dict = dict(zip(names, values))
+    return reduced_dict
+
+
+class MetricLogger(object):
+    """
+    Registry of named meters plus a helper that prints progress while iterating.
+
+    Meters are created on first use as ``SmoothedValue`` instances and can be
+    read back as attributes (``logger.loss``).
+    """
+
+    def __init__(self, delimiter="\t"):
+        """
+        Args:
+            delimiter (str): separator placed between the fields of a log line
+        """
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        """Feed one scalar per keyword into the meter of the same name.
+
+        Tensor values are converted with ``.item()``; anything that is not a
+        float or int afterwards trips the assertion.
+        """
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        """Expose meters as attributes.
+
+        Raises:
+            AttributeError: if ``attr`` is neither a meter nor an instance attribute
+        """
+        # __getattr__ only runs after normal lookup failed. Read `meters` from
+        # __dict__ directly: going through `self.meters` would re-enter this
+        # method and recurse forever when `meters` is not set yet (for example
+        # while the object is being copied or unpickled).
+        meters = self.__dict__.get("meters")
+        if meters is not None and attr in meters:
+            return meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        """Join ``name: meter`` for every meter that has received a value."""
+        loss_str = []
+        for name, meter in self.meters.items():
+            if meter.count > 0:
+                loss_str.append(
+                    "{}: {}".format(name, str(meter))
+                )
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        """Ask every meter to synchronize itself across processes."""
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        """Register (or replace) the meter stored under ``name``."""
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None, logger=None):
+        """Yield the items of ``iterable`` and print a progress line periodically.
+
+        Args:
+            iterable: sized iterable (``len()`` is required)
+            print_freq (int): print every ``print_freq`` iterations and on the last one
+            header (str | None): prefix of every log line
+            logger: object with an ``info`` method; ``print`` is used when None
+        """
+        if logger is None:
+            print_func = print
+        else:
+            print_func = logger.info
+
+        i = 0
+        if not header:
+            header = ''
+        num_iters = len(iterable)
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        # Pad the iteration counter to the width of the total count.
+        space_fmt = ':' + str(len(str(num_iters))) + 'd'
+        if torch.cuda.is_available():
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}',
+                'max mem: {memory:.0f}'
+            ])
+        else:
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}'
+            ])
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            # Time spent waiting for the item (data loading).
+            data_time.update(time.time() - end)
+            yield obj
+            # Time for the whole iteration, including the consumer's work.
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == num_iters - 1:
+                eta_seconds = iter_time.global_avg * (num_iters - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print_func(log_msg.format(
+                        i, num_iters, eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print_func(log_msg.format(
+                        i, num_iters, eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        # max(..., 1) keeps the summary line from dividing by zero on an empty iterable.
+        print_func('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / max(num_iters, 1)))
+
+
+def get_sha():
+    """
+    Describe the git state of the directory that contains this file.
+
+    Returns:
+        str: ``"sha: <sha>, status: <clean|has uncommited changes>, branch: <branch>"``.
+        Any field whose git command fails keeps its default (``N/A`` / ``clean``).
+    """
+    repo_dir = os.path.dirname(os.path.abspath(__file__))
+
+    def _git(*argv):
+        raw = subprocess.check_output(['git', *argv], cwd=repo_dir)
+        return raw.decode('ascii').strip()
+
+    sha, diff, branch = 'N/A', "clean", 'N/A'
+    try:
+        sha = _git('rev-parse', 'HEAD')
+        subprocess.check_output(['git', 'diff'], cwd=repo_dir)
+        dirty = _git('diff-index', 'HEAD')
+        diff = "has uncommited changes" if dirty else "clean"
+        branch = _git('rev-parse', '--abbrev-ref', 'HEAD')
+    except Exception:
+        # Best effort: report whatever was collected before the failure.
+        pass
+    return f"sha: {sha}, status: {diff}, branch: {branch}"
+
+
+def collate_fn(batch):
+    """
+    Transpose a list of samples into per-field tuples and pack the first field
+    (the images) into a NestedTensor.
+
+    Returns:
+        tuple: ``(nested_images, field_1, field_2, ...)``
+    """
+    columns = [*zip(*batch)]
+    columns[0] = nested_tensor_from_tensor_list(columns[0])
+    return tuple(columns)
+
+
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    """
+    Element-wise maximum over a list of equally long integer lists.
+
+    Args:
+        the_list: non-empty list of lists, e.g. one shape per image
+    Returns:
+        A new list whose i-th entry is the largest i-th entry of all sublists.
+    """
+    # Copy the first sublist: updating it in place would silently modify the
+    # caller's data.
+    maxes = list(the_list[0])
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+class NestedTensor(object):
+    """
+    A padded batch tensor together with its padding mask.
+
+    The helper methods count ``~mask`` to recover the unpadded image size, i.e.
+    ``True`` in the mask marks padding.
+    """
+
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        """
+        Args:
+            tensors: image tensor of 3 or 4 dimensions
+            mask: boolean padding mask, ``None``, or the string ``'auto'`` to
+                build an all-False mask (no padding) matching ``tensors``
+        Raises:
+            ValueError: with ``mask='auto'`` when ``tensors`` is not 3- or 4-dimensional
+        """
+        self.tensors = tensors
+        self.mask = mask
+        # Only compare against the sentinel when mask really is a string;
+        # `Tensor == 'auto'` is not a meaningful comparison.
+        if isinstance(mask, str) and mask == 'auto':
+            # zeros_like already allocates on the device of `tensors`.
+            self.mask = torch.zeros_like(tensors)
+            if self.mask.dim() == 3:
+                self.mask = self.mask.sum(0).to(bool)
+            elif self.mask.dim() == 4:
+                self.mask = self.mask.sum(1).to(bool)
+            else:
+                raise ValueError("tensors dim must be 3 or 4 but {}({})".format(self.tensors.dim(), self.tensors.shape))
+
+    def imgsize(self):
+        """Return one ``Tensor([H, W])`` of unpadded size per batch element."""
+        res = []
+        for i in range(self.tensors.shape[0]):
+            mask = self.mask[i]
+            maxH = (~mask).sum(0).max()
+            maxW = (~mask).sum(1).max()
+            res.append(torch.Tensor([maxH, maxW]))
+        return res
+
+    def to(self, device):
+        # type: (Device) -> NestedTensor # noqa
+        """Return a new NestedTensor with tensors (and mask, if any) moved to ``device``."""
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        cast_mask = mask.to(device) if mask is not None else None
+        return NestedTensor(cast_tensor, cast_mask)
+
+    def to_img_list_single(self, tensor, mask):
+        """Crop a single 3-dimensional image down to its unpadded region."""
+        assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim())
+        maxH = (~mask).sum(0).max()
+        maxW = (~mask).sum(1).max()
+        img = tensor[:, :maxH, :maxW]
+        return img
+
+    def to_img_list(self):
+        """Remove the padding and return the image(s).
+
+        Returns:
+            A single cropped tensor when ``tensors`` is 3-dimensional, otherwise
+            a list with one cropped tensor per batch element.
+        """
+        if self.tensors.dim() == 3:
+            return self.to_img_list_single(self.tensors, self.mask)
+        else:
+            res = []
+            for i in range(self.tensors.shape[0]):
+                tensor_i = self.tensors[i]
+                mask_i = self.mask[i]
+                res.append(self.to_img_list_single(tensor_i, mask_i))
+            return res
+
+    @property
+    def device(self):
+        """Device of the underlying image tensor."""
+        return self.tensors.device
+
+    def decompose(self):
+        """Return the ``(tensors, mask)`` pair."""
+        return self.tensors, self.mask
+
+    def __repr__(self):
+        return str(self.tensors)
+
+    @property
+    def shape(self):
+        """Shapes of both members; the mask entry is None when there is no mask."""
+        return {
+            'tensors.shape': self.tensors.shape,
+            'mask.shape': self.mask.shape if self.mask is not None else None
+        }
+
+
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    """
+    Zero-pad a list of 3-dimensional image tensors to a common size and batch them.
+
+    Returns:
+        NestedTensor: the padded batch and a boolean mask that is False on
+        image pixels and True on padding.
+    Raises:
+        ValueError: when the first tensor is not 3-dimensional
+    """
+    # TODO make this more general
+    if tensor_list[0].ndim != 3:
+        raise ValueError('not supported')
+
+    if torchvision._is_tracing():
+        # nested_tensor_from_tensor_list() does not export well to ONNX
+        # call _onnx_nested_tensor_from_tensor_list() instead
+        return _onnx_nested_tensor_from_tensor_list(tensor_list)
+
+    # TODO make it support different-sized images
+    largest = _max_by_axis([list(image.shape) for image in tensor_list])
+    batch_shape = [len(tensor_list)] + largest
+    b, c, h, w = batch_shape
+    first = tensor_list[0]
+    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=first.device)
+    mask = torch.ones((b, h, w), dtype=torch.bool, device=first.device)
+    for image, padded, valid in zip(tensor_list, tensor, mask):
+        channels, rows, cols = image.shape[0], image.shape[1], image.shape[2]
+        padded[:channels, :rows, :cols].copy_(image)
+        valid[:rows, :cols] = False
+    return NestedTensor(tensor, mask)
+
+
+# _onnx_nested_tensor_from_tensor_list() is an implementation of
+# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
+@torch.jit.unused
+def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
+    """
+    Pad-and-batch variant that uses ``torch.nn.functional.pad`` instead of
+    sliced in-place copies.
+
+    Args:
+        tensor_list: image tensors to batch together
+    Returns:
+        NestedTensor holding the stacked padded images and the stacked boolean
+        masks (padding is filled with 1, the image area with 0).
+    """
+    # Per-dimension maximum over all images.
+    # NOTE(review): torch.stack over `img.shape[i]` presumably relies on shape
+    # entries being tensors while tracing — confirm before calling eagerly.
+    max_size = []
+    for i in range(tensor_list[0].dim()):
+        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
+        max_size.append(max_size_i)
+    max_size = tuple(max_size)
+
+    # work around for
+    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+    # m[: img.shape[1], :img.shape[2]] = False
+    # which is not yet supported in onnx
+    padded_imgs = []
+    padded_masks = []
+    for img in tensor_list:
+        # Amount of padding needed per dimension to reach max_size.
+        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
+        # F.pad takes (before, after) pairs starting from the last dimension.
+        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+        padded_imgs.append(padded_img)
+
+        # Mask starts as zeros over the image area; the padded border is set to 1.
+        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
+        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
+        padded_masks.append(padded_mask.to(torch.bool))
+
+    tensor = torch.stack(padded_imgs)
+    mask = torch.stack(padded_masks)
+
+    return NestedTensor(tensor, mask=mask)
+
+
+def setup_for_distributed(is_master):
+    """
+    Replace the builtin ``print`` with a version that stays silent unless
+    ``is_master`` is true or the call passes ``force=True``.
+    """
+    import builtins
+    original_print = builtins.print
+
+    def print(*args, **kwargs):
+        forced = kwargs.pop('force', False)
+        if not (is_master or forced):
+            return
+        original_print(*args, **kwargs)
+
+    builtins.print = print
+
+
+def is_dist_avail_and_initialized():
+    """Return True only when torch.distributed is available and initialized."""
+    if dist.is_available() and dist.is_initialized():
+        return True
+    return False
+
+
+def get_world_size():
+    """Number of distributed processes, or 1 outside distributed mode."""
+    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
+
+
+def get_rank():
+    """Rank of the current process, or 0 outside distributed mode."""
+    return dist.get_rank() if is_dist_avail_and_initialized() else 0
+
+
+def is_main_process():
+    """True when the current process has rank 0."""
+    rank = get_rank()
+    return rank == 0
+
+
+def save_on_master(*args, **kwargs):
+    """Forward the arguments to ``torch.save`` on the main process only."""
+    if not is_main_process():
+        return
+    torch.save(*args, **kwargs)
+
+def init_distributed_mode(args):
+    """
+    Fill the distributed fields of ``args`` from the environment and initialize
+    the NCCL process group.
+
+    Three cases are handled, in this order:
+      1. ``WORLD_SIZE`` is set and non-empty: rank, world size and local rank
+         are read from ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK``.
+      2. ``SLURM_PROCID`` is set: the values come from the SLURM variables and,
+         unless ``HAND_DEFINE_DIST_URL`` is ``'1'``, ``args.dist_url`` is built
+         from the job's node list.
+      3. Otherwise distributed mode is disabled and the function returns early.
+
+    Args:
+        args: namespace-like object; it is mutated in place. ``args.dist_url``
+            is read at the end and is only assigned here in the SLURM case, so
+            it must already be set in the other distributed cases.
+    """
+    if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
+
+        # launch by torch.distributed.launch
+        # Single node
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 1 --rank 0 ...
+        # Multi nodes
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 0 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...
+        #   python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 1 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...
+        # args.rank = int(os.environ.get('OMPI_COMM_WORLD_RANK'))
+        # local_world_size = int(os.environ['GPU_PER_NODE_COUNT'])
+        # args.world_size = args.world_size * local_world_size
+        # args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
+        # args.rank = args.rank * local_world_size + args.local_rank
+        print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
+        # NOTE(review): this dumps the complete process environment to stdout,
+        # including any credentials stored in environment variables.
+        print(json.dumps(dict(os.environ), indent=2))
+    elif 'SLURM_PROCID' in os.environ:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])
+        args.world_size = int(os.environ['SLURM_NPROCS'])
+
+        # With HAND_DEFINE_DIST_URL == '1' the caller-provided args.dist_url is kept.
+        if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1':
+            pass
+        else:
+            import util.hostlist as uh
+            nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST'])
+            # NOTE(review): assumes node names are a 3-character prefix followed
+            # by an integer id — confirm against the cluster's naming scheme.
+            gpu_ids = [int(node[3:]) for node in nodenames]
+            fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))
+            # fixid += random.randint(0, 300)
+            # Port = 3137 + smallest node id + optional offset from the environment.
+            port = str(3137 + int(min(gpu_ids)) + fixid)
+            args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)
+
+        print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))
+
+
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        args.world_size = 1
+        args.rank = 0
+        args.local_rank = 0
+        return
+
+    print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
+    args.distributed = True
+    torch.cuda.set_device(args.local_rank)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
+
+    torch.distributed.init_process_group(
+        backend=args.dist_backend, 
+        world_size=args.world_size, 
+        rank=args.rank,
+        init_method=args.dist_url,
+    )
+
+    print("Before torch.distributed.barrier()")
+    torch.distributed.barrier()
+    print("End torch.distributed.barrier()")
+    # From here on only rank 0 prints (unless a call passes force=True).
+    setup_for_distributed(args.rank == 0)
+
+
+@torch.no_grad()
+def accuracy(output, target, topk=(1,)):
+    """
+    Computes the precision@k for the specified values of k.
+
+    Args:
+        output: score tensor of shape (batch, num_classes)
+        target: class-index tensor of shape (batch,)
+        topk (tuple[int]): the k values to evaluate
+    Returns:
+        list[Tensor]: one percentage (0-100) per entry of ``topk``; a single
+        zero tensor when ``target`` is empty.
+    """
+    if target.numel() == 0:
+        return [torch.zeros([], device=output.device)]
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        # `correct` comes from a transposed tensor, so its first k rows are not
+        # guaranteed to be contiguous; reshape copies when needed, whereas
+        # view(-1) raises for k > 1.
+        correct_k = correct[:k].reshape(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+@torch.no_grad()
+def accuracy_onehot(pred, gt):
+    """
+    Percentage of rows where the prediction matches the ground truth.
+
+    A row counts as a match when the summed absolute difference over its last
+    dimension is below 1e-4.
+
+    Args:
+        pred: n, c
+        gt: n, c
+    Returns:
+        Scalar tensor in the range 0-100.
+    """
+    row_error = (pred - gt).abs().sum(-1)
+    matches = (row_error < 1e-4).float().sum()
+    return matches / gt.shape[0] * 100
+
+
+
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
+    """
+    Drop-in for nn.functional.interpolate that also accepts empty batches.
+
+    When the torchvision compatibility flag is 0.7 or above the call is
+    delegated to torchvision.ops.misc.interpolate. Below that, non-empty input
+    goes to torch.nn.functional.interpolate and empty input yields an empty
+    tensor of the computed output shape.
+    """
+    if not (__torchvision_need_compat_flag < 0.7):
+        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
+
+    if input.numel() > 0:
+        return torch.nn.functional.interpolate(
+            input, size, scale_factor, mode, align_corners
+        )
+
+    # Empty input: keep the leading dimensions, replace the last two.
+    spatial_shape = _output_size(2, input, size, scale_factor)
+    full_shape = list(input.shape[:-2]) + list(spatial_shape)
+    return _new_empty_tensor(input, full_shape)
+
+
+
+class color_sys():
+    """Palette of ``num_colors`` RGB tuples with evenly spaced hues.
+
+    Lightness is drawn from [0.50, 0.60) and saturation from [0.90, 1.00) with
+    ``np.random.rand``; indexing the instance returns the stored tuple.
+    """
+
+    def __init__(self, num_colors) -> None:
+        self.num_colors = num_colors
+        palette = []
+        step = 360. / num_colors
+        for degrees in np.arange(0., 360., step):
+            hue = degrees/360.
+            # Lightness is drawn before saturation; the order fixes the RNG stream.
+            lightness = (50 + np.random.rand() * 10)/100.
+            saturation = (90 + np.random.rand() * 10)/100.
+            rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
+            palette.append(tuple(int(channel*255) for channel in rgb))
+        self.colors = palette
+
+    def __call__(self, idx):
+        return self.colors[idx]
+
+def inverse_sigmoid(x, eps=1e-3):
+    """
+    Numerically guarded logit: ``log(x / (1 - x))``.
+
+    ``x`` is first clamped to [0, 1]; numerator and denominator are then each
+    clamped to at least ``eps`` so the logarithm stays finite.
+    """
+    bounded = x.clamp(min=0, max=1)
+    numerator = bounded.clamp(min=eps)
+    denominator = (1 - bounded).clamp(min=eps)
+    return torch.log(numerator/denominator)
+
+def clean_state_dict(state_dict):
+    """
+    Return an OrderedDict copy of ``state_dict`` with one leading ``module.``
+    removed from every key that has it; order and values are preserved.
+    """
+    prefix = 'module.'
+    cleaned = OrderedDict()
+    for name, value in state_dict.items():
+        has_prefix = name[:len(prefix)] == prefix
+        cleaned[name[len(prefix):] if has_prefix else name] = value
+    return cleaned
\ No newline at end of file
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e6b22f2d4cc6c83d49bcdbcd9a92a3a54224897
--- /dev/null
+++ b/src/models/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
+
+from .warping_spade_model import WarpingSpadeModel
+from .motion_extractor_model import MotionExtractorModel
+from .appearance_feature_extractor_model import AppearanceFeatureExtractorModel
+from .landmark_model import LandmarkModel
+from .face_analysis_model import FaceAnalysisModel
+from .stitching_model import StitchingModel
+from .mediapipe_face_model import MediaPipeFaceModel
diff --git a/src/models/appearance_feature_extractor_model.py b/src/models/appearance_feature_extractor_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f09517aa0de898ef33a03f8620aa88c55fe8841
--- /dev/null
+++ b/src/models/appearance_feature_extractor_model.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: appearance_feature_extractor_model.py
+import pdb
+import numpy as np
+from .base_model import BaseModel
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+
+
+class AppearanceFeatureExtractorModel(BaseModel):
+    """
+    Wrapper around the appearance feature extractor network.
+
+    Scales an HWC image to [0, 1], reorders it to a 1xCxHxW batch, runs the
+    predictor and returns its first output.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.predict_type = kwargs.get("predict_type", "trt")
+        print(self.predict_type)
+
+    def input_process(self, *data):
+        """Convert the first argument (HWC image) to a float32 1xCxHxW array in [0, 1]."""
+        scaled = data[0].astype(np.float32)
+        scaled /= 255.0
+        return np.transpose(scaled, (2, 0, 1))[None]
+
+    def output_process(self, *data):
+        """Return the first predictor output unchanged."""
+        return data[0]
+
+    def predict_trt(self, *data):
+        """Run the TensorRT predictor and return its outputs as NumPy arrays."""
+        nvtx.range_push("forward")
+        feed_dict = {}
+        for idx, spec in enumerate(self.predictor.inputs):
+            value = data[idx]
+            if not isinstance(value, torch.Tensor):
+                # NumPy input: move to the GPU with the dtype the engine expects.
+                value = torch.from_numpy(value).to(device=self.device,
+                                                   dtype=numpy_to_torch_dtype_dict[spec['dtype']])
+            feed_dict[spec['name']] = value
+        preds_dict = self.predictor.predict(feed_dict, self.cudaStream)
+        outs = [preds_dict[spec["name"]].cpu().numpy() for spec in self.predictor.outputs]
+        nvtx.range_pop()
+        return outs
+
+    def predict(self, *data):
+        """Preprocess, run the backend selected by ``predict_type`` and postprocess."""
+        batch = self.input_process(*data)
+        if self.predict_type == "trt":
+            preds = self.predict_trt(batch)
+        else:
+            preds = self.predictor.predict(batch)
+        return self.output_process(*preds)
diff --git a/src/models/base_model.py b/src/models/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..142790c68534e0f724834110d8f6c12ef0607a15
--- /dev/null
+++ b/src/models/base_model.py
@@ -0,0 +1,49 @@
+import copy
+import torch
+from .predictor import get_predictor
+
+
+class BaseModel:
+    """
+    Base class for model inference wrappers.
+
+    Builds a predictor from the keyword arguments and records the current CUDA
+    device and stream. Subclasses implement the three processing hooks.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Args:
+            **kwargs: forwarded (as a deep copy) to ``get_predictor``;
+                ``predict_type`` defaults to ``"trt"``.
+        """
+        self.kwargs = copy.deepcopy(kwargs)
+        self.predictor = get_predictor(**self.kwargs)
+        self.device = torch.cuda.current_device()
+        self.cudaStream = torch.cuda.current_stream().cuda_stream
+        self.predict_type = kwargs.get("predict_type", "trt")
+
+        # Always define the spec attributes so they can be read even when no
+        # predictor could be created.
+        self.input_shapes = None
+        self.output_shapes = None
+        if self.predictor is not None:
+            self.input_shapes = self.predictor.input_spec()
+            self.output_shapes = self.predictor.output_spec()
+
+    def input_process(self, *data):
+        """
+        Input preprocessing hook.
+        :return:
+        """
+        pass
+
+    def output_process(self, *data):
+        """
+        Output postprocessing hook.
+        :return:
+        """
+        pass
+
+    def predict(self, *data):
+        """
+        Run a prediction.
+        :return:
+        """
+        pass
+
+    def __del__(self):
+        """
+        Release the predictor when the instance is destroyed.
+        :return:
+        """
+        # __del__ also runs when __init__ failed before `predictor` was
+        # assigned, so the attribute may be missing.
+        if getattr(self, "predictor", None) is not None:
+            del self.predictor
diff --git a/src/models/face_analysis_model.py b/src/models/face_analysis_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f16ebb7f643bf7b393005c9f0f3410bae936ea
--- /dev/null
+++ b/src/models/face_analysis_model.py
@@ -0,0 +1,326 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: face_analysis_model.py
+import pdb
+
+import numpy as np
+from insightface.app.common import Face
+import cv2
+from .predictor import get_predictor
+from ..utils import face_align
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+
+
+def sort_by_direction(faces, direction: str = 'large-small', face_center=None):
+    """
+    Order faces by position, by bounding-box area or by distance to a point.
+
+    Args:
+        faces: sequence of items whose ``['bbox']`` is ``[x1, y1, x2, y2]``
+        direction: one of ``left-right``, ``right-left``, ``top-bottom``,
+            ``bottom-top``, ``small-large``, ``large-small`` or
+            ``distance-from-retarget-face``
+        face_center: ``(x, y)`` reference point, only used by the distance ordering
+    Returns:
+        A new sorted list, or ``faces`` itself when it is empty or the
+        direction is not recognised.
+    """
+    if len(faces) <= 0:
+        return faces
+
+    def left_edge(face):
+        return face['bbox'][0]
+
+    def top_edge(face):
+        return face['bbox'][1]
+
+    def area(face):
+        return (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1])
+
+    def distance_to_center(face):
+        return (((face['bbox'][2] + face['bbox'][0]) / 2 - face_center[0]) ** 2 + (
+                (face['bbox'][3] + face['bbox'][1]) / 2 - face_center[1]) ** 2) ** 0.5
+
+    # direction -> (sort key, descending?)
+    orderings = {
+        'left-right': (left_edge, False),
+        'right-left': (left_edge, True),
+        'top-bottom': (top_edge, False),
+        'bottom-top': (top_edge, True),
+        'small-large': (area, False),
+        'large-small': (area, True),
+        'distance-from-retarget-face': (distance_to_center, False),
+    }
+    if direction not in orderings:
+        return faces
+    key_fn, descending = orderings[direction]
+    return sorted(faces, key=key_fn, reverse=descending)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): Shape of the image as (height, width); when given,
+            the decoded coordinates are clipped to the image.
+
+    Returns:
+        ndarray: Decoded bboxes of shape (n, 4) as [x1, y1, x2, y2].
+    """
+    x1 = points[:, 0] - distance[:, 0]
+    y1 = points[:, 1] - distance[:, 1]
+    x2 = points[:, 0] + distance[:, 2]
+    y2 = points[:, 1] + distance[:, 3]
+    if max_shape is not None:
+        # The result is assembled with np.stack, so the inputs are NumPy arrays:
+        # np.clip is the NumPy counterpart of the tensor-only `.clamp`.
+        x1 = np.clip(x1, 0, max_shape[1])
+        y1 = np.clip(y1, 0, max_shape[0])
+        x2 = np.clip(x2, 0, max_shape[1])
+        y2 = np.clip(y2, 0, max_shape[0])
+    return np.stack([x1, y1, x2, y2], axis=-1)
+
+
+def distance2kps(points, distance, max_shape=None):
+    """Decode distance prediction to keypoints.
+
+    Args:
+        points (ndarray): Shape (n, 2), [x, y].
+        distance (ndarray): Shape (n, 2 * k), per-keypoint (dx, dy) offsets
+            from the given point.
+        max_shape (tuple): Shape of the image as (height, width); when given,
+            the decoded coordinates are clipped to the image.
+
+    Returns:
+        ndarray: Decoded keypoints of shape (n, 2 * k) as [x0, y0, x1, y1, ...].
+    """
+    preds = []
+    for i in range(0, distance.shape[1], 2):
+        # i is always even, so i % 2 selects column 0 (x) and i % 2 + 1 column 1 (y).
+        px = points[:, i % 2] + distance[:, i]
+        py = points[:, i % 2 + 1] + distance[:, i + 1]
+        if max_shape is not None:
+            # The result is assembled with np.stack, so the inputs are NumPy
+            # arrays: np.clip is the NumPy counterpart of the tensor-only `.clamp`.
+            px = np.clip(px, 0, max_shape[1])
+            py = np.clip(py, 0, max_shape[0])
+        preds.append(px)
+        preds.append(py)
+    return np.stack(preds, axis=-1)
+
+
+class FaceAnalysisModel:
+    """
+    Face detection followed by landmark estimation.
+
+    ``model_path`` must hold two model paths: the face detector first, the
+    landmark ("face pose") model second. ``predict`` returns the landmarks of
+    all detected faces, largest face first.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Args:
+            model_path (list): [detector path, landmark model path]
+            predict_type (str): predictor backend, defaults to ``"trt"``
+        """
+        self.model_paths = kwargs.get("model_path", [])
+        self.predict_type = kwargs.get("predict_type", "trt")
+        self.device = torch.cuda.current_device()
+        self.cudaStream = torch.cuda.current_stream().cuda_stream
+
+        assert self.model_paths
+        self.face_det = get_predictor(predict_type=self.predict_type, model_path=self.model_paths[0])
+        self.face_det.input_spec()
+        self.face_det.output_spec()
+        self.face_pose = get_predictor(predict_type=self.predict_type, model_path=self.model_paths[1])
+        self.face_pose.input_spec()
+        self.face_pose.output_spec()
+
+        # face det
+        self.input_mean = 127.5
+        self.input_std = 128.0
+        self.use_kps = False
+        self._anchor_ratio = 1.0
+        self._num_anchors = 1
+        self.center_cache = {}
+        self.nms_thresh = 0.4
+        self.det_thresh = 0.5
+        self.input_size = (512, 512)
+        # The number of detector outputs selects the FPN layout: 3 or 5 strides,
+        # with (9 / 15 outputs) or without (6 / 10 outputs) a keypoint branch.
+        if len(self.face_det.outputs) == 6:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+        elif len(self.face_det.outputs) == 9:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+            self.use_kps = True
+        elif len(self.face_det.outputs) == 10:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+        elif len(self.face_det.outputs) == 15:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+            self.use_kps = True
+
+        self.lmk_dim = 2
+        self.lmk_num = 212 // self.lmk_dim
+
+    def nms(self, dets):
+        """Greedy non-maximum suppression.
+
+        Args:
+            dets (ndarray): rows of [x1, y1, x2, y2, score]
+        Returns:
+            list: indices of the kept rows, highest score first
+        """
+        thresh = self.nms_thresh
+        x1 = dets[:, 0]
+        y1 = dets[:, 1]
+        x2 = dets[:, 2]
+        y2 = dets[:, 3]
+        scores = dets[:, 4]
+
+        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+        order = scores.argsort()[::-1]
+
+        keep = []
+        while order.size > 0:
+            i = order[0]
+            keep.append(i)
+            # Intersection of the best remaining box with all the others.
+            xx1 = np.maximum(x1[i], x1[order[1:]])
+            yy1 = np.maximum(y1[i], y1[order[1:]])
+            xx2 = np.minimum(x2[i], x2[order[1:]])
+            yy2 = np.minimum(y2[i], y2[order[1:]])
+
+            w = np.maximum(0.0, xx2 - xx1 + 1)
+            h = np.maximum(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+            # Keep only boxes whose overlap with the best box is small enough;
+            # +1 because `ovr` was computed on order[1:].
+            inds = np.where(ovr <= thresh)[0]
+            order = order[inds + 1]
+
+        return keep
+
+    def detect_face(self, *data):
+        """Detect faces in a BGR image.
+
+        Args:
+            data: ``data[0]`` is the HWC image in BGR order
+        Returns:
+            tuple: ``(det, kpss)`` where ``det`` has rows of
+            [x1, y1, x2, y2, score] in original image coordinates and ``kpss``
+            holds the keypoints per detection, or None when the detector has no
+            keypoint branch.
+        """
+        img = data[0]  # BGR mode
+        # Resize while keeping the aspect ratio, then pad into the model input.
+        im_ratio = float(img.shape[0]) / img.shape[1]
+        input_size = self.input_size
+        model_ratio = float(input_size[1]) / input_size[0]
+        if im_ratio > model_ratio:
+            new_height = input_size[1]
+            new_width = int(new_height / im_ratio)
+        else:
+            new_width = input_size[0]
+            new_height = int(new_width * im_ratio)
+        det_scale = float(new_height) / img.shape[0]
+        resized_img = cv2.resize(img, (new_width, new_height))
+        det_img = np.zeros((input_size[1], input_size[0], 3), dtype=np.uint8)
+        det_img[:new_height, :new_width, :] = resized_img
+
+        scores_list = []
+        bboxes_list = []
+        kpss_list = []
+
+        det_img = cv2.cvtColor(det_img, cv2.COLOR_BGR2RGB)
+        det_img = np.transpose(det_img, (2, 0, 1))
+        det_img = (det_img - self.input_mean) / self.input_std
+        if self.predict_type == "trt":
+            nvtx.range_push("forward")
+            feed_dict = {}
+            inp = self.face_det.inputs[0]
+            det_img_torch = torch.from_numpy(det_img[None]).to(device=self.device,
+                                                               dtype=numpy_to_torch_dtype_dict[inp['dtype']])
+            feed_dict[inp['name']] = det_img_torch
+            preds_dict = self.face_det.predict(feed_dict, self.cudaStream)
+            outs = []
+            # NOTE(review): the output tensor names are hard-coded, presumably
+            # for one specific exported detector — confirm when swapping models.
+            for key in ["448", "471", "494", "451", "474", "497", "454", "477", "500"]:
+                outs.append(preds_dict[key].cpu().numpy())
+            o448, o471, o494, o451, o474, o497, o454, o477, o500 = outs
+            nvtx.range_pop()
+        else:
+            o448, o471, o494, o451, o474, o497, o454, o477, o500 = self.face_det.predict(det_img[None])
+        faces_det = [o448, o471, o494, o451, o474, o497, o454, o477, o500]
+        input_height = det_img.shape[1]
+        input_width = det_img.shape[2]
+        fmc = self.fmc
+        for idx, stride in enumerate(self._feat_stride_fpn):
+            # Outputs are grouped as [scores..., bbox offsets..., keypoint offsets...],
+            # each group holding one entry per stride.
+            scores = faces_det[idx]
+            bbox_preds = faces_det[idx + fmc]
+            bbox_preds = bbox_preds * stride
+            if self.use_kps:
+                kps_preds = faces_det[idx + fmc * 2] * stride
+            height = input_height // stride
+            width = input_width // stride
+            key = (height, width, stride)
+            if key in self.center_cache:
+                anchor_centers = self.center_cache[key]
+            else:
+                # Grid of (x, y) anchor centers in input-image pixels.
+                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
+                anchor_centers = (anchor_centers * stride).reshape((-1, 2))
+                if self._num_anchors > 1:
+                    anchor_centers = np.stack([anchor_centers] * self._num_anchors, axis=1).reshape((-1, 2))
+                if len(self.center_cache) < 100:
+                    self.center_cache[key] = anchor_centers
+
+            pos_inds = np.where(scores >= self.det_thresh)[0]
+            bboxes = distance2bbox(anchor_centers, bbox_preds)
+            pos_scores = scores[pos_inds]
+            pos_bboxes = bboxes[pos_inds]
+            scores_list.append(pos_scores)
+            bboxes_list.append(pos_bboxes)
+            if self.use_kps:
+                kpss = distance2kps(anchor_centers, kps_preds)
+                kpss = kpss.reshape((kpss.shape[0], -1, 2))
+                pos_kpss = kpss[pos_inds]
+                kpss_list.append(pos_kpss)
+        scores = np.vstack(scores_list)
+        scores_ravel = scores.ravel()
+        order = scores_ravel.argsort()[::-1]
+        # Map coordinates back from the resized image to the original image.
+        bboxes = np.vstack(bboxes_list) / det_scale
+        if self.use_kps:
+            kpss = np.vstack(kpss_list) / det_scale
+        pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
+        pre_det = pre_det[order, :]
+        keep = self.nms(pre_det)
+        det = pre_det[keep, :]
+        if self.use_kps:
+            kpss = kpss[order, :, :]
+            kpss = kpss[keep, :, :]
+        else:
+            kpss = None
+        return det, kpss
+
+    def estimate_face_pose(self, *data):
+        """
+        Detect the facial landmarks of one face.
+
+        :param data: ``(img, face)``; ``face.bbox`` selects the crop and the
+            result is also stored in ``face["landmark"]``
+        :return: landmark array in original image coordinates
+        """
+        img, face = data
+        bbox = face.bbox
+        w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
+        center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
+        rotate = 0
+        input_size = (192, 192)
+        # Scale so that 1.5x the longer bbox side fills the crop.
+        _scale = input_size[0] / (max(w, h) * 1.5)
+        aimg, M = face_align.transform(img, center, input_size[0], _scale, rotate)
+        input_size = tuple(aimg.shape[0:2][::-1])
+
+        aimg = cv2.cvtColor(aimg, cv2.COLOR_BGR2RGB)
+        aimg = np.transpose(aimg, (2, 0, 1))
+        if self.predict_type == "trt":
+            nvtx.range_push("forward")
+            feed_dict = {}
+            inp = self.face_pose.inputs[0]
+            det_img_torch = torch.from_numpy(aimg[None]).to(device=self.device,
+                                                            dtype=numpy_to_torch_dtype_dict[inp['dtype']])
+            feed_dict[inp['name']] = det_img_torch
+            preds_dict = self.face_pose.predict(feed_dict, self.cudaStream)
+            outs = []
+            for i, out in enumerate(self.face_pose.outputs):
+                outs.append(preds_dict[out["name"]].cpu().numpy())
+            pred = outs[0]
+            nvtx.range_pop()
+        else:
+            pred = self.face_pose.predict(aimg[None])[0]
+        pred = pred.reshape((-1, 2))
+        if self.lmk_num < pred.shape[0]:
+            # Keep only the last lmk_num points.
+            pred = pred[self.lmk_num * -1:, :]
+        # Shift by +1 and scale by half the crop size to get crop pixel coordinates.
+        pred[:, 0:2] += 1
+        pred[:, 0:2] *= (input_size[0] // 2)
+        if pred.shape[1] == 3:
+            pred[:, 2] *= (input_size[0] // 2)
+
+        # Undo the alignment transform to return to original image coordinates.
+        IM = cv2.invertAffineTransform(M)
+        pred = face_align.trans_points(pred, IM)
+        face["landmark"] = pred
+        return pred
+
+    def predict(self, *data, **kwargs):
+        """Detect all faces in ``data[0]`` and return their landmarks.
+
+        Returns:
+            list: one landmark array per face, largest bounding box first;
+            empty when no face is found.
+        """
+        bboxes, kpss = self.detect_face(*data)
+        if bboxes.shape[0] == 0:
+            return []
+        ret = []
+        for i in range(bboxes.shape[0]):
+            bbox = bboxes[i, 0:4]
+            det_score = bboxes[i, 4]
+            # detect_face returns kpss=None for detectors without a keypoint branch.
+            kps = kpss[i] if kpss is not None else None
+            face = Face(bbox=bbox, kps=kps, det_score=det_score)
+            self.estimate_face_pose(data[0], face)
+            ret.append(face)
+        ret = sort_by_direction(ret, 'large-small', None)
+        outs = [x.landmark for x in ret]
+        return outs
+
+    def __del__(self):
+        # __del__ also runs when __init__ failed part-way, so either predictor
+        # may be missing.
+        if hasattr(self, "face_det"):
+            del self.face_det
+        if hasattr(self, "face_pose"):
+            del self.face_pose
diff --git a/src/models/kokoro/__init__.py b/src/models/kokoro/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d00495e31cc6665ed92f89b9ede53a774b5afdd3
--- /dev/null
+++ b/src/models/kokoro/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/14
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/models/kokoro/config.json b/src/models/kokoro/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..29e12f5e6f19d8b27dcdb2cd37e8b12fd89590c5
--- /dev/null
+++ b/src/models/kokoro/config.json
@@ -0,0 +1,26 @@
+{
+  "decoder": {
+    "type": "istftnet",
+    "upsample_kernel_sizes": [20, 12],
+    "upsample_rates": [10, 6],
+    "gen_istft_hop_size": 5,
+    "gen_istft_n_fft": 20,
+    "resblock_dilation_sizes": [
+      [1, 3, 5],
+      [1, 3, 5],
+      [1, 3, 5]
+    ],
+    "resblock_kernel_sizes": [3, 7, 11],
+    "upsample_initial_channel": 512
+  },
+  "dim_in": 64,
+  "dropout": 0.2,
+  "hidden_dim": 512,
+  "max_conv_dim": 512,
+  "max_dur": 50,
+  "multispeaker": true,
+  "n_layer": 3,
+  "n_mels": 80,
+  "n_token": 178,
+  "style_dim": 128
+}
\ No newline at end of file
diff --git a/src/models/kokoro/istftnet.py b/src/models/kokoro/istftnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..da29481368de41ce2a3ff9816c9bd3f11f3ab15e
--- /dev/null
+++ b/src/models/kokoro/istftnet.py
@@ -0,0 +1,523 @@
+# https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
+from scipy.signal import get_window
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
+def init_weights(m, mean=0.0, std=0.01):
+    """Fill the weights of convolution modules with normally distributed values.
+
+    Meant to be used with ``nn.Module.apply``: a module whose class name
+    contains "Conv" gets ``weight`` overwritten in place via
+    ``normal_(mean, std)``; any other module is left untouched.
+    """
+    is_conv = "Conv" in m.__class__.__name__
+    if is_conv:
+        m.weight.data.normal_(mean, std)
+
+def get_padding(kernel_size, dilation=1):
+    """Return the padding ``int(dilation * (kernel_size - 1) / 2)`` for a dilated conv."""
+    span = dilation * (kernel_size - 1)
+    return int(span / 2)
+
+# Negative slope passed to F.leaky_relu inside the Generator's upsampling loops.
+LRELU_SLOPE = 0.1
+
+class AdaIN1d(nn.Module):
+    """Adaptive instance normalisation for 1-D feature maps.
+
+    The style vector ``s`` is projected by a linear layer to
+    ``2 * num_features`` values, split in half into ``gamma`` and ``beta``,
+    and applied to the instance-normalised input as
+    ``(1 + gamma) * norm(x) + beta``.
+    """
+
+    def __init__(self, style_dim, num_features):
+        super().__init__()
+        self.norm = nn.InstanceNorm1d(num_features, affine=False)
+        self.fc = nn.Linear(style_dim, num_features*2)
+
+    def forward(self, x, s):
+        projected = self.fc(s)
+        # Add a trailing singleton axis so the scale/shift broadcast over time.
+        projected = projected.view(projected.size(0), projected.size(1), 1)
+        gamma, beta = torch.chunk(projected, chunks=2, dim=1)
+        normalised = self.norm(x)
+        return (1 + gamma) * normalised + beta
+
+class AdaINResBlock1(torch.nn.Module):
+    """Residual block made of three AdaIN -> Snake -> conv -> AdaIN -> Snake -> conv stages.
+
+    ``convs1`` uses the first three entries of ``dilation``; ``convs2`` always
+    uses dilation 1. Each stage has its own pair of AdaIN layers and a pair of
+    learnable Snake parameters (``alpha1`` / ``alpha2``) initialised to ones.
+    """
+
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
+        super().__init__()
+
+        def _wn_conv(rate):
+            # Length-preserving, weight-normalised convolution at the given dilation.
+            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
+                                      padding=get_padding(kernel_size, rate)))
+
+        self.convs1 = nn.ModuleList([_wn_conv(dilation[idx]) for idx in range(3)])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([_wn_conv(1) for _ in range(3)])
+        self.convs2.apply(init_weights)
+
+        self.adain1 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
+        self.adain2 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
+
+        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in self.convs1])
+        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in self.convs2])
+
+    @staticmethod
+    def _snake(h, alpha):
+        # Snake1D activation: h + sin^2(alpha * h) / alpha
+        return h + (1 / alpha) * (torch.sin(alpha * h) ** 2)
+
+    def forward(self, x, s):
+        stages = zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2)
+        for conv_a, conv_b, norm_a, norm_b, alpha_a, alpha_b in stages:
+            h = self._snake(norm_a(x, s), alpha_a)
+            h = conv_a(h)
+            h = self._snake(norm_b(h, s), alpha_b)
+            h = conv_b(h)
+            x = h + x  # residual connection
+        return x
+
+    def remove_weight_norm(self):
+        """Strip the weight-norm reparametrisation from every convolution."""
+        for layer in (*self.convs1, *self.convs2):
+            remove_weight_norm(layer)
+            
+class TorchSTFT(torch.nn.Module):
+    """Thin wrapper around ``torch.stft`` / ``torch.istft`` with a fixed window.
+
+    The window comes from ``scipy.signal.get_window`` and is kept as a plain
+    float32 tensor attribute; it is moved to the input's device on every call.
+    """
+
+    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
+        super().__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window_np = get_window(window, win_length, fftbins=True).astype(np.float32)
+        self.window = torch.from_numpy(window_np)
+
+    def transform(self, input_data):
+        """Return ``(magnitude, phase)`` of the STFT of ``input_data``."""
+        spectrum = torch.stft(
+            input_data,
+            n_fft=self.filter_length,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window=self.window.to(input_data.device),
+            return_complex=True,
+        )
+        return torch.abs(spectrum), torch.angle(spectrum)
+
+    def inverse(self, magnitude, phase):
+        """Rebuild a waveform from magnitude and phase; a channel axis is inserted at -2."""
+        spectrum = magnitude * torch.exp(phase * 1j)
+        waveform = torch.istft(
+            spectrum,
+            n_fft=self.filter_length,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window=self.window.to(magnitude.device),
+        )
+        # unsqueeze to stay consistent with conv_transpose1d implementation
+        return waveform.unsqueeze(-2)
+
+    def forward(self, input_data):
+        """Analyse then resynthesise; the spectra are cached on ``self``."""
+        self.magnitude, self.phase = self.transform(input_data)
+        return self.inverse(self.magnitude, self.phase)
+    
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, upsample_scale, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    upsample_scale: factor by which _f02sine shrinks the per-step phase
+        increments before the cumulative sum and stretches the phase afterwards
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+        segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        # one channel for the fundamental plus one per overtone
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.upsample_scale = upsample_scale
+
+    def _f02uv(self, f0):
+        """Return a float32 mask that is 1 where f0 > voiced_threshold, else 0."""
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+            where dim indicates fundamental tone and overtones
+
+            Returns the sine (or, when flag_for_pulse is set, cosine)
+            waveforms derived from the accumulated phase.
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
+                              device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+#             # for normal case
+
+#             # To prevent torch.cumsum numerical overflow,
+#             # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+#             # Buffer tmp_over_one_idx indicates the time step to add -1.
+#             # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+#             tmp_over_one = torch.cumsum(rad_values, 1) % 1
+#             tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+#             cumsum_shift = torch.zeros_like(rad_values)
+#             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+#             phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            # Active path: shrink the time axis by 1/upsample_scale, accumulate
+            # the phase on the shorter sequence, then rescale the phase by
+            # upsample_scale and interpolate back up by the same factor.
+            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2), 
+                                                         scale_factor=1/self.upsample_scale, 
+                                                         mode="linear").transpose(1, 2)
+    
+#             tmp_over_one = torch.cumsum(rad_values, 1) % 1
+#             tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+#             cumsum_shift = torch.zeros_like(rad_values)
+#             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+    
+            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale, 
+                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+            sines = torch.sin(phase)
+            
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segments is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            # true where the current step is unvoiced and the next one is voiced
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batch needs to be processed differently
+            for idx in range(f0_values.shape[0]):
+                # rows are selected using the mask of channel 0 only
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segments
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0):
+        """ sine_tensor, uv, noise = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        output noise: the additive noise that was mixed into sine_tensor
+        """
+        # NOTE(review): f0_buf is allocated but never read afterwards.
+        f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+                             device=f0.device)
+        # fundamental component
+        # multiply f0 by 1..harmonic_num+1 to get the fundamental and each overtone
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+        # generate sine waveforms
+        sine_waves = self._f02sine(fn) * self.sine_amp
+
+        # generate uv signal
+        # uv = torch.ones(f0.shape)
+        # uv = uv * (f0 > self.voiced_threshold)
+        uv = self._f02uv(f0)
+
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        # .       for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """Source module for hn-NSF: harmonic excitation plus a noise source.
+
+    SourceModuleHnNSF(sampling_rate, upsample_scale, harmonic_num=0,
+                      sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+    upsample_scale: forwarded unchanged to SineGen
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of the sine source signal (default: 0.1)
+    add_noise_std: std of the additive Gaussian noise (default: 0.003);
+        the noise amplitude in unvoiced regions is decided by sine_amp
+    voiced_threshod: threshold used to set U/V given F0 (default: 0)
+
+    sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super().__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # Generator of the fundamental and its overtones.
+        self.l_sin_gen = SineGen(
+            samp_rate=sampling_rate,
+            upsample_scale=upsample_scale,
+            harmonic_num=harmonic_num,
+            sine_amp=sine_amp,
+            noise_std=add_noise_std,
+            voiced_threshold=voiced_threshod,
+        )
+
+        # Collapse the harmonic channels into one excitation signal.
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x):
+        """Return ``(sine_merge, noise, uv)`` for the sampled F0 track ``x``.
+
+        The sine generator runs without gradient tracking; the linear merge
+        and tanh are applied outside the ``no_grad`` context.
+        """
+        # harmonic branch
+        with torch.no_grad():
+            harmonics, uv, _ = self.l_sin_gen(x)
+        merged = self.l_linear(harmonics)
+        merged = self.l_tanh(merged)
+
+        # noise branch, shaped like uv
+        noise_source = torch.randn_like(uv) * self.sine_amp / 3
+        return merged, noise_source, uv
+def padDiff(x):
+    """Forward difference along dim -2: ``x[:, t+1] - x[:, t]``, one step shorter than ``x``."""
+    # Negative left padding drops the first step; right padding appends a zero.
+    shifted = F.pad(x, (0, 0, -1, 1), 'constant', 0)
+    delta = shifted - x
+    # Drop the final step, which was compared against the appended zero.
+    return F.pad(delta, (0, 0, 0, -1), 'constant', 0)
+
+    
+class Generator(torch.nn.Module):
+    """iSTFT-based waveform generator driven by features, a style vector and F0.
+
+    The F0 track is upsampled, turned into a harmonic source by
+    ``SourceModuleHnNSF`` and analysed with an STFT. At every upsampling stage
+    the source spectrum is projected by ``noise_convs`` / ``noise_res`` and
+    added to the transposed-conv output, after which the stage's residual
+    blocks are averaged. ``conv_post`` finally predicts ``n_fft + 2`` channels
+    that are split into a magnitude (through ``exp``) and a phase (through
+    ``sin``) and inverted with the iSTFT.
+    """
+
+    def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size):
+        super(Generator, self).__init__()
+
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        resblock = AdaINResBlock1
+
+        # The source runs at the output rate: product of all upsample rates
+        # times the iSTFT hop size.
+        self.m_source = SourceModuleHnNSF(
+                    sampling_rate=24000,
+                    upsample_scale=np.prod(upsample_rates) * gen_istft_hop_size,
+                    harmonic_num=8, voiced_threshod=10)
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * gen_istft_hop_size)
+        self.noise_convs = nn.ModuleList()
+        self.noise_res = nn.ModuleList()
+
+        # Each stage halves the channel count while upsampling in time.
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(weight_norm(
+                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+                                k, u, padding=(k-u)//2)))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel//(2**(i+1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes,resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d, style_dim))
+
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+
+            if i + 1 < len(upsample_rates):
+                # Strided conv with the product of the remaining upsample rates
+                # as its stride.
+                stride_f0 = np.prod(upsample_rates[i + 1:])
+                self.noise_convs.append(Conv1d(
+                    gen_istft_n_fft + 2, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
+                self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
+            else:
+                # Last stage: a 1x1 conv, no striding.
+                self.noise_convs.append(Conv1d(gen_istft_n_fft + 2, c_cur, kernel_size=1))
+                self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
+
+        self.post_n_fft = gen_istft_n_fft
+        # `ch` is the channel count of the final upsampling stage.
+        self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
+        self.stft = TorchSTFT(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
+
+    def forward(self, x, s, f0):
+        """Synthesise a waveform.
+
+        Args:
+            x: input features fed to the first upsampling layer.
+            s: style vector passed to every AdaIN residual block.
+            f0: F0 track; a channel axis is inserted before upsampling.
+
+        Returns:
+            The output of ``TorchSTFT.inverse`` for the predicted magnitude
+            and phase.
+        """
+        with torch.no_grad():
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+
+            har_source, noi_source, uv = self.m_source(f0)
+            har_source = har_source.transpose(1, 2).squeeze(1)
+            har_spec, har_phase = self.stft.transform(har_source)
+            # Stack magnitude and phase along the channel axis.
+            har = torch.cat([har_spec, har_phase], dim=1)
+
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x_source = self.noise_convs[i](har)
+            x_source = self.noise_res[i](x_source, s)
+
+            x = self.ups[i](x)
+            if i == self.num_upsamples - 1:
+                # One extra left sample on the final stage before adding x_source.
+                x = self.reflection_pad(x)
+
+            x = x + x_source
+            # Average the outputs of this stage's residual blocks.
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i*self.num_kernels+j](x, s)
+                else:
+                    xs += self.resblocks[i*self.num_kernels+j](x, s)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        # First n_fft//2 + 1 channels -> magnitude, the remainder -> phase.
+        spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
+        phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
+        return self.stft.inverse(spec, phase)
+
+    def fw_phase(self, x, s):
+        """Run the upsampling stack without the source branch; return ``(spec, phase)``."""
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i*self.num_kernels+j](x, s)
+                else:
+                    xs += self.resblocks[i*self.num_kernels+j](x, s)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.reflection_pad(x)
+        x = self.conv_post(x)
+        spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
+        phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
+        return spec, phase
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from every weight-normed layer.
+
+        This class never creates a ``conv_pre`` layer, so the previous
+        ``remove_weight_norm(self.conv_pre)`` call always raised
+        ``AttributeError``. The ``noise_res`` residual blocks are also
+        weight-normed and are now handled as well; ``noise_convs`` are plain
+        convolutions and need nothing.
+        """
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        for l in self.noise_res:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_post)
+
+        
+class AdainResBlk1d(nn.Module):
+    """Style-conditioned residual block with optional 2x temporal upsampling.
+
+    The residual path is AdaIN -> activation -> pool -> conv -> AdaIN ->
+    activation -> conv (dropout before each conv). The shortcut path upsamples
+    and, when ``dim_in != dim_out``, applies a 1x1 conv. Both paths are summed
+    and divided by sqrt(2).
+    """
+
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+
+        # Any value other than the string 'none' enables the learned
+        # depthwise transposed conv on the residual path.
+        self.pool = (
+            nn.Identity()
+            if upsample == 'none'
+            else weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2,
+                                                groups=dim_in, padding=1, output_padding=1))
+        )
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if not self.learned_sc:
+            return
+        self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        skip = self.upsample(x)
+        return self.conv1x1(skip) if self.learned_sc else skip
+
+    def _residual(self, x, s):
+        h = self.pool(self.actv(self.norm1(x, s)))
+        h = self.conv1(self.dropout(h))
+        h = self.actv(self.norm2(h, s))
+        return self.conv2(self.dropout(h))
+
+    def forward(self, x, s):
+        residual = self._residual(x, s)
+        shortcut = self._shortcut(x)
+        return (residual + shortcut) / np.sqrt(2)
+    
+class UpSample1d(nn.Module):
+    """Identity when ``layer_type == 'none'``; otherwise 2x nearest-neighbour upsampling."""
+
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type != 'none':
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+        return x
+
+class Decoder(nn.Module):
+    """Decode aligned text features, F0 and N curves and a style vector with the Generator.
+
+    ``F0_channel`` and ``dim_out`` are accepted but not used by this class.
+    """
+
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80, 
+                resblock_kernel_sizes = [3,7,11],
+                upsample_rates = [10, 6],
+                upsample_initial_channel=512,
+                resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
+                upsample_kernel_sizes=[20, 12], 
+                gen_istft_n_fft=20, gen_istft_hop_size=5):
+        super().__init__()
+
+        self.decode = nn.ModuleList()
+
+        # Input features plus the two single-channel F0 / N curves.
+        self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
+
+        # Every decode block sees the previous output (1024), the F0 and N
+        # curves (2) and the projected text features (64).
+        skip_channels = 1024 + 2 + 64
+        for _ in range(3):
+            self.decode.append(AdainResBlk1d(skip_channels, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(skip_channels, 512, style_dim, upsample=True))
+
+        # Stride-2 convs that halve the time axis of the F0 and N curves.
+        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+
+        self.asr_res = nn.Sequential(
+            weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
+        )
+
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
+                                   upsample_initial_channel, resblock_dilation_sizes,
+                                   upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size)
+
+    def forward(self, asr, F0_curve, N, s):
+        f0_down = self.F0_conv(F0_curve.unsqueeze(1))
+        n_down = self.N_conv(N.unsqueeze(1))
+
+        hidden = self.encode(torch.cat([asr, f0_down, n_down], dim=1), s)
+        asr_proj = self.asr_res(asr)
+
+        # Skip features are concatenated before every block up to and
+        # including the first one that upsamples; later blocks get none.
+        attach_skips = True
+        for block in self.decode:
+            if attach_skips:
+                hidden = torch.cat([hidden, asr_proj, f0_down, n_down], dim=1)
+            hidden = block(hidden, s)
+            if block.upsample_type != "none":
+                attach_skips = False
+
+        return self.generator(hidden, s, F0_curve)
diff --git a/src/models/kokoro/kokoro.py b/src/models/kokoro/kokoro.py
new file mode 100644
index 0000000000000000000000000000000000000000..df4a695440c274de632e18a18e884130b03aa57c
--- /dev/null
+++ b/src/models/kokoro/kokoro.py
@@ -0,0 +1,149 @@
+import phonemizer
+import re
+import torch
+
+def split_num(num):
+    """Regex callback that spells out clock times and four-digit years.
+
+    ``num`` is a match object. Decimals are returned unchanged; ``H:MM``
+    becomes "H o'clock" / "H oh M" / "H MM"; a year is split into two
+    two-digit halves ("19 99", "19 hundred", "19 oh 5"), keeping a trailing
+    "s". Years below 1100 or whose last three digits are below 10 are
+    returned unchanged.
+    """
+    token = num.group()
+    if '.' in token:
+        return token
+
+    if ':' in token:
+        hours, minutes = (int(part) for part in token.split(':'))
+        if minutes == 0:
+            return f"{hours} o'clock"
+        if minutes < 10:
+            return f'{hours} oh {minutes}'
+        return f'{hours} {minutes}'
+
+    year = int(token[:4])
+    within_millennium = year % 1000
+    if year < 1100 or within_millennium < 10:
+        return token
+
+    century = token[:2]
+    tail = int(token[2:4])
+    plural = 's' if token.endswith('s') else ''
+    if 100 <= within_millennium <= 999:
+        if tail == 0:
+            return f'{century} hundred{plural}'
+        if tail < 10:
+            return f'{century} oh {tail}{plural}'
+    return f'{century} {tail}{plural}'
+
+def flip_money(m):
+    """Regex callback that moves the currency word after the amount.
+
+    ``m`` is a match object whose text starts with ``$`` (dollar) or any other
+    symbol (treated as pound). Amounts ending in a letter (e.g. "3 million")
+    always take the plural; decimal amounts are rendered as
+    "<whole> <bill> and <fraction> <coins>".
+    """
+    text = m.group()
+    symbol, amount = text[0], text[1:]
+    bill = 'dollar' if symbol == '$' else 'pound'
+
+    if text[-1].isalpha():
+        return f'{amount} {bill}s'
+
+    if '.' not in text:
+        plural = '' if amount == '1' else 's'
+        return f'{amount} {bill}{plural}'
+
+    whole, fraction = amount.split('.')
+    plural = '' if whole == '1' else 's'
+    # A single fractional digit means tens: "1.5" -> 50.
+    minor = int(fraction.ljust(2, '0'))
+    if symbol == '$':
+        coins = 'cent' if minor == 1 else 'cents'
+    else:
+        coins = 'penny' if minor == 1 else 'pence'
+    return f'{whole} {bill}{plural} and {minor} {coins}'
+
+def point_num(num):
+    """Regex callback: read a decimal as "<int> point <d> <d> ..." with spaced digits."""
+    whole, fraction = num.group().split('.')
+    spaced_digits = ' '.join(fraction)
+    return f'{whole} point {spaced_digits}'
+
+def normalize_text(text):
+    """Rewrite raw text before phonemisation.
+
+    The substitutions below run strictly in order and later ones rely on the
+    output of earlier ones, so they must not be reordered. Returns the
+    rewritten text with surrounding whitespace stripped.
+    """
+    # Curly single quotes (U+2018 / U+2019) -> ASCII apostrophe.
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    # Guillemets -> curly double quotes, then all curly double quotes -> '"'.
+    text = text.replace('«', chr(8220)).replace('»', chr(8221))
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    # Parentheses are re-encoded as guillemets.
+    text = text.replace('(', '«').replace(')', '»')
+    # Full-width CJK punctuation -> ASCII equivalent followed by a space.
+    for a, b in zip('、。！，：；？', ',.!,:;?'):
+        text = text.replace(a, b+' ')
+    # Any whitespace other than space/newline -> space; collapse runs of
+    # spaces; drop spaces that sit alone between two newlines.
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r'  +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    # Expand titles and drop the period after "etc".
+    text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
+    text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
+    text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
+    text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
+    text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
+    # "yeah" / "yea" -> "ye'a", keeping the case of the leading letter.
+    text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
+    # Decimals, four-digit years (optionally with "s") and H:MM times go
+    # through split_num.
+    text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
+    # Remove thousands separators between digits.
+    text = re.sub(r'(?<=\d),(?=\d)', '', text)
+    # Dollar / pound amounts go through flip_money.
+    text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
+    # Remaining decimals are read digit by digit by point_num.
+    text = re.sub(r'\d*\.\d+', point_num, text)
+    # A hyphen between digits is read as a range.
+    text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
+    # Separate a capital S that directly follows a digit.
+    text = re.sub(r'(?<=\d)S', ' S', text)
+    # After an upper-case consonant, "s" / "'s" becomes "'S"; the next line
+    # reverts that to lower case when the preceding text is "X'".
+    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
+    text = re.sub(r"(?<=X')S\b", 's', text)
+    # Dotted initialisms followed by a lower-case word: dots -> hyphens.
+    text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
+    # A dot between two letters (case-insensitive) -> hyphen.
+    text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
+    return text.strip()
+
+def get_vocab():
+    """Build the symbol -> integer id table used for tokenisation.
+
+    Ids follow the order pad, punctuation, ASCII letters, IPA symbols. When a
+    symbol occurs more than once, the id of its last occurrence is kept.
+    """
+    pad = "$"
+    punctuation = ';:,.!?¡¿—…"«»“” '
+    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+    letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+    ordered = [pad, *punctuation, *letters, *letters_ipa]
+    return {symbol: index for index, symbol in enumerate(ordered)}
+
+VOCAB = get_vocab()
+def tokenize(ps):
+    """Map each symbol of ``ps`` to its id in ``VOCAB``, silently dropping unknown symbols."""
+    looked_up = (VOCAB.get(symbol) for symbol in ps)
+    return [token for token in looked_up if token is not None]
+
+# Espeak backends keyed by the `lang` code accepted by phonemize():
+# 'a' -> en-us, 'b' -> en-gb. Both backends are constructed when this module
+# is imported.
+phonemizers = dict(
+    a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
+    b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
+)
+def phonemize(text, lang, norm=True):
+    """Convert ``text`` to a phoneme string restricted to symbols in ``VOCAB``.
+
+    Args:
+        text: input text.
+        lang: key into ``phonemizers`` ('a' or 'b').
+        norm: when true, ``normalize_text`` is applied first.
+
+    Returns:
+        The post-processed phoneme string, stripped of surrounding whitespace;
+        empty when the backend returns nothing.
+    """
+    source = normalize_text(text) if norm else text
+    results = phonemizers[lang].phonemize([source])
+    ps = results[0] if results else ''
+
+    literal_fixes = (
+        # https://en.wiktionary.org/wiki/kokoro#English
+        ('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ'),
+        ('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ'),
+        ('ʲ', 'j'),
+        ('r', 'ɹ'),
+        ('x', 'k'),
+        ('ɬ', 'l'),
+    )
+    for old, new in literal_fixes:
+        ps = ps.replace(old, new)
+
+    ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
+    ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
+    if lang == 'a':
+        ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
+
+    # Drop every symbol the vocabulary does not know.
+    kept = [symbol for symbol in ps if symbol in VOCAB]
+    return ''.join(kept).strip()
+
+def length_to_mask(lengths):
+    """Return a boolean mask of shape (batch, max_len) that is True at padded positions."""
+    batch = lengths.shape[0]
+    positions = torch.arange(lengths.max()).unsqueeze(0).expand(batch, -1).type_as(lengths)
+    # Position p (0-based) is padding when p + 1 > length.
+    return (positions + 1) > lengths.unsqueeze(1)
+
+@torch.no_grad()
+def forward(model, tokens, ref_s, speed):
+    """Run the full synthesis pass for one token sequence.
+
+    Args:
+        model: object exposing ``bert``, ``bert_encoder``, ``predictor``,
+            ``text_encoder`` and ``decoder``.
+        tokens: list of token ids; an id 0 is added at both ends.
+        ref_s: reference style tensor. Columns ``128:`` are passed to the
+            predictor modules and columns ``:128`` to the decoder.
+        speed: divisor applied to the predicted durations.
+
+    Returns:
+        The decoder output moved to the CPU.
+    """
+    device = ref_s.device
+    token_ids = torch.LongTensor([[0, *tokens, 0]]).to(device)
+    input_lengths = torch.LongTensor([token_ids.shape[-1]]).to(device)
+    text_mask = length_to_mask(input_lengths).to(device)
+    s = ref_s[:, 128:]
+
+    bert_out = model.bert(token_ids, attention_mask=(~text_mask).int())
+    d_en = model.bert_encoder(bert_out).transpose(-1, -2)
+    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+    lstm_out, _ = model.predictor.lstm(d)
+
+    # Per-token duration: sum of sigmoids, scaled by speed, at least one frame.
+    duration = torch.sigmoid(model.predictor.duration_proj(lstm_out)).sum(axis=-1) / speed
+    pred_dur = torch.round(duration).clamp(min=1).long()
+
+    # Alignment matrix (tokens x frames): row i is 1 over token i's frames.
+    alignment = torch.zeros(input_lengths, pred_dur.sum().item())
+    frame_start = 0
+    for row in range(alignment.size(0)):
+        span = pred_dur[0, row].item()
+        alignment[row, frame_start:frame_start + span] = 1
+        frame_start += span
+    alignment = alignment.unsqueeze(0).to(device)
+
+    en = d.transpose(-1, -2) @ alignment
+    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+    t_en = model.text_encoder(token_ids, input_lengths, text_mask)
+    asr = t_en @ alignment
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).cpu()
+
+def generate(model, text, voicepack, lang='a', speed=1, ps=None):
+    """Synthesise ``text`` (or pre-computed phonemes ``ps``) with ``model``.
+
+    Args:
+        model: model object passed through to ``forward``.
+        text: input text; ignored when a non-empty ``ps`` is given.
+        voicepack: indexable by token count; yields the reference style.
+        lang: language key passed to ``phonemize``.
+        speed: duration divisor passed to ``forward``.
+        ps: optional phoneme string that bypasses ``phonemize``.
+
+    Returns:
+        ``None`` when no token survives tokenisation; otherwise a tuple
+        ``(out, ps)`` where ``ps`` is rebuilt from the tokens actually used
+        (after dropping unknown symbols and truncating to 510 tokens).
+    """
+    ps = ps or phonemize(text, lang)
+    tokens = tokenize(ps)
+    if not tokens:
+        return None
+    if len(tokens) > 510:
+        tokens = tokens[:510]
+        print('Truncated to 510 tokens')
+    ref_s = voicepack[len(tokens)]
+    out = forward(model, tokens, ref_s, speed)
+    # Invert the vocabulary once instead of scanning it for every token.
+    id_to_symbol = {index: symbol for symbol, index in VOCAB.items()}
+    ps = ''.join(id_to_symbol[i] for i in tokens)
+    return out, ps
diff --git a/src/models/kokoro/models.py b/src/models/kokoro/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7235f00364419fda61fd179207944ca09b194e1
--- /dev/null
+++ b/src/models/kokoro/models.py
@@ -0,0 +1,372 @@
+# https://github.com/yl4579/StyleTTS2/blob/main/models.py
+from .istftnet import AdaIN1d, Decoder
+from munch import Munch
+from pathlib import Path
+from .plbert import load_plbert
+from torch.nn.utils import weight_norm, spectral_norm
+import json
+import numpy as np
+import os
+import os.path as osp
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LinearNorm(torch.nn.Module):
+    """A single `torch.nn.Linear` whose weight is Xavier-uniform initialised with the gain for `w_init_gain`."""
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super().__init__()
+        gain = torch.nn.init.calculate_gain(w_init_gain)
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)
+
+    def forward(self, x):
+        """Apply the linear layer to `x`."""
+        return self.linear_layer(x)
+
+class LayerNorm(nn.Module):
+    """Layer normalisation applied over dim 1 with learnable per-channel `gamma` and `beta`."""
+
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels, self.eps = channels, eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        """Swap dim 1 to the end, run `F.layer_norm` over it, then swap it back."""
+        normed = F.layer_norm(x.transpose(1, -1), (self.channels,), self.gamma, self.beta, self.eps)
+        return normed.transpose(1, -1)
+    
+class TextEncoder(nn.Module):
+    """Token embedding, `depth` weight-normed Conv1d blocks, then one bidirectional LSTM."""
+    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
+        super().__init__()
+        self.embedding = nn.Embedding(n_symbols, channels)
+
+        padding = (kernel_size - 1) // 2
+        self.cnn = nn.ModuleList()
+        for _ in range(depth):
+            self.cnn.append(nn.Sequential(
+                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
+                LayerNorm(channels),
+                actv,
+                nn.Dropout(0.2),
+            ))
+        # self.cnn stays a ModuleList (not callable as a whole): forward() re-masks after every block.
+        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
+
+    def forward(self, x, input_lengths, m):
+        """Encode token ids `x` to [B, chn, T] with T = m.shape[-1]; positions where `m` is True are zeroed."""
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+        m = m.to(input_lengths.device).unsqueeze(1)
+        x.masked_fill_(m, 0.0)
+        for c in self.cnn:
+            x = c(x)
+            x.masked_fill_(m, 0.0)
+        x = x.transpose(1, 2)  # [B, T, chn]
+        input_lengths = input_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            x, input_lengths, batch_first=True, enforce_sorted=False)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+
+        x = x.transpose(-1, -2)
+        # pad_packed_sequence pads only to the longest sequence, so re-pad to the mask width.
+        # Allocating on x's device avoids bouncing a GPU activation through host memory.
+        x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]], device=x.device)
+        x_pad[:, :, :x.shape[-1]] = x
+        x = x_pad
+
+        x.masked_fill_(m, 0.0)
+        return x
+
+    def inference(self, x):
+        """Unmasked encoding of token ids `x`; returns the LSTM output ([B, T, chn])."""
+        x = self.embedding(x)
+        x = x.transpose(1, 2)
+        for c in self.cnn:  # a ModuleList is not callable; the original `self.cnn(x)` always raised
+            x = c(x)
+        x = x.transpose(1, 2)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        return x
+
+    def length_to_mask(self, lengths):
+        """Return a [B, lengths.max()] boolean mask that is True at positions past each length."""
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+
+class UpSample1d(nn.Module):
+    """Identity when `layer_type == 'none'`; otherwise 2x nearest-neighbour upsampling."""
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type != 'none':
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+        return x
+
+class AdainResBlk1d(nn.Module):
+    """Residual block with two style-conditioned (AdaIN1d) conv stages and an optionally upsampled shortcut."""
+
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+
+        # Anything other than 'none' puts a depthwise stride-2 transposed conv in the residual path.
+        if upsample != 'none':
+            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+        else:
+            self.pool = nn.Identity()
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        """Create both convs, their AdaIN1d norms and, if the channel counts differ, a 1x1 shortcut conv."""
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        """Skip path: upsample `x`, then project channels when `learned_sc` is set."""
+        skip = self.upsample(x)
+        return self.conv1x1(skip) if self.learned_sc else skip
+
+    def _residual(self, x, s):
+        """Main path: norm1 -> actv -> pool -> conv1 -> norm2 -> actv -> conv2, with dropout before each conv."""
+        h = self.actv(self.norm1(x, s))
+        h = self.conv1(self.dropout(self.pool(h)))
+        h = self.actv(self.norm2(h, s))
+        return self.conv2(self.dropout(h))
+
+    def forward(self, x, s):
+        """Return `(residual + shortcut) / sqrt(2)` for input `x` and style `s`."""
+        residual = self._residual(x, s)
+        shortcut = self._shortcut(x)
+        return (residual + shortcut) / np.sqrt(2)
+    
+class AdaLayerNorm(nn.Module):
+    """Layer norm whose scale and shift come from a linear projection of a style vector."""
+
+    def __init__(self, style_dim, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.fc = nn.Linear(style_dim, channels*2)
+
+    def forward(self, x, s):
+        """Apply `F.layer_norm` over `channels`, then `(1 + gamma) * x + beta` with gamma/beta from `fc(s)`."""
+        x = x.transpose(-1, -2).transpose(1, -1)
+
+        # fc(s) is viewed as [B, 2*channels, 1] and split along dim 1 into gamma and beta.
+        style = self.fc(s)
+        style = style.view(style.size(0), style.size(1), 1)
+        gamma, beta = (part.transpose(1, -1) for part in torch.chunk(style, chunks=2, dim=1))
+
+        normed = F.layer_norm(x, (self.channels,), eps=self.eps)
+        scaled = (1 + gamma) * normed + beta
+        return scaled.transpose(1, -1).transpose(-1, -2)
+
+class ProsodyPredictor(nn.Module):
+    """Style-conditioned duration predictor plus the `F0` and `N` prediction heads."""
+    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
+        super().__init__()
+        self.text_encoder = DurationEncoder(sty_dim=style_dim,
+                                            d_model=d_hid,
+                                            nlayers=nlayers,
+                                            dropout=dropout)
+
+        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        self.duration_proj = LinearNorm(d_hid, max_dur)
+
+        self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        # `upsample=True` is not the string 'none', so the middle block of each stack upsamples.
+        self.F0 = nn.ModuleList()
+        self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+        self.N = nn.ModuleList()
+        self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+
+    def forward(self, texts, style, text_lengths, alignment, m):
+        """Return `(duration, en)`: the duration projection re-padded to `m.shape[-1]` steps, and the
+        duration-encoder output (transposed) multiplied by `alignment`.
+        """
+        d = self.text_encoder(texts, style, text_lengths, m)
+
+        # predict duration
+        input_lengths = text_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            d, input_lengths, batch_first=True, enforce_sorted=False)
+
+        m = m.to(text_lengths.device).unsqueeze(1)
+
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+
+        # Re-pad to the mask width on x's own device; allocating on the CPU and copying back
+        # (as before) forced a device round trip whenever x lives on the GPU.
+        x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]], device=x.device)
+        x_pad[:, :x.shape[1], :] = x
+        x = x_pad
+
+        duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
+        en = (d.transpose(-1, -2) @ alignment)
+        return duration.squeeze(-1), en
+
+    def F0Ntrain(self, x, s):
+        """Run the shared LSTM on `x` (transposed), then the `F0` and `N` stacks conditioned on `s`."""
+        x, _ = self.shared(x.transpose(-1, -2))
+
+        F0 = x.transpose(-1, -2)
+        for block in self.F0:
+            F0 = block(F0, s)
+        F0 = self.F0_proj(F0)
+
+        N = x.transpose(-1, -2)
+        for block in self.N:
+            N = block(N, s)
+        N = self.N_proj(N)
+
+        return F0.squeeze(1), N.squeeze(1)
+
+    def length_to_mask(self, lengths):
+        """Return a [B, lengths.max()] boolean mask that is True at positions past each length."""
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+class DurationEncoder(nn.Module):
+    """Stack of `nlayers` (bidirectional LSTM, AdaLayerNorm) pairs; the style is re-appended after every norm."""
+    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
+        super().__init__()
+        self.lstms = nn.ModuleList()
+        for _ in range(nlayers):
+            self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                 d_model // 2,
+                                 num_layers=1,
+                                 batch_first=True,
+                                 bidirectional=True,
+                                 dropout=dropout))  # NOTE: with num_layers=1 PyTorch only warns; no dropout is applied here
+            self.lstms.append(AdaLayerNorm(sty_dim, d_model))
+        self.dropout = dropout
+        self.d_model = d_model
+        self.sty_dim = sty_dim
+
+    def forward(self, x, style, text_lengths, m):
+        """Encode `x` with `style`; True positions of mask `m` are zeroed. Returns features with time on dim 1."""
+        masks = m.to(text_lengths.device)
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
+        x = x.transpose(0, 1)
+        input_lengths = text_lengths.cpu().numpy()
+        x = x.transpose(-1, -2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
+            else:
+                x = x.transpose(-1, -2)
+                x = nn.utils.rnn.pack_padded_sequence(
+                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                block.flatten_parameters()
+                x, _ = block(x)
+                x, _ = nn.utils.rnn.pad_packed_sequence(
+                    x, batch_first=True)
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+                # Re-pad to the mask width on x's own device instead of on the CPU (the original
+                # copied GPU activations to host memory and back once per LSTM layer).
+                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]], device=x.device)
+                x_pad[:, :, :x.shape[-1]] = x
+                x = x_pad
+
+        return x.transpose(-1, -2)
+
+    def inference(self, x, style):
+        """NOTE(review): reads `embedding`, `pos_encoder` and `transformer_encoder`, none of which
+        __init__ defines, so calling this raises AttributeError as written — confirm it is dead code."""
+        x = self.embedding(x.transpose(-1, -2)) * np.sqrt(self.d_model)
+        style = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, style], axis=-1)
+        src = self.pos_encoder(x)
+        output = self.transformer_encoder(src).transpose(0, 1)
+        return output
+
+    def length_to_mask(self, lengths):
+        """Return a [B, lengths.max()] boolean mask that is True at positions past each length."""
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+# https://github.com/yl4579/StyleTTS2/blob/main/utils.py
+def recursive_munch(d):
+    """Recursively wrap dicts in `Munch` (descending into lists); any other value is returned unchanged."""
+    if isinstance(d, dict):
+        return Munch((key, recursive_munch(value)) for key, value in d.items())
+    if isinstance(d, list):
+        return [recursive_munch(item) for item in d]
+    return d
+
+def build_model(path, device):
+    """Build the sub-models from the config.json next to this file, load weights from `path`, return them as a Munch."""
+    config = Path(__file__).parent / 'config.json'
+    assert config.exists(), f'Config path incorrect: config.json not found at {config}'
+    with open(config, 'r') as r:
+        args = recursive_munch(json.load(r))
+    assert args.decoder.type == 'istftnet', f'Unknown decoder type: {args.decoder.type}'
+    decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
+            resblock_kernel_sizes=args.decoder.resblock_kernel_sizes, upsample_rates=args.decoder.upsample_rates,
+            upsample_initial_channel=args.decoder.upsample_initial_channel, resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
+            upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
+            gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
+    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
+    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
+    bert = load_plbert()
+    bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
+    for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
+        for child in parent.children():
+            if isinstance(child, nn.RNNBase):
+                child.flatten_parameters()
+    model = Munch(
+        bert=bert.to(device).eval(),
+        bert_encoder=bert_encoder.to(device).eval(),
+        predictor=predictor.to(device).eval(),
+        decoder=decoder.to(device).eval(),
+        text_encoder=text_encoder.to(device).eval(),
+    )
+    for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
+        assert key in model, key
+        try:
+            model[key].load_state_dict(state_dict)
+        except RuntimeError:
+            # Key mismatch: presumably a 'module.' wrapper prefix — strip only that prefix, retry non-strictly.
+            state_dict = {(k[7:] if k.startswith('module.') else k): v for k, v in state_dict.items()}
+            model[key].load_state_dict(state_dict, strict=False)
+    return model
diff --git a/src/models/kokoro/plbert.py b/src/models/kokoro/plbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef54f57bb8405abebfcd052bcb2be1249ce510bc
--- /dev/null
+++ b/src/models/kokoro/plbert.py
@@ -0,0 +1,15 @@
+# https://github.com/yl4579/StyleTTS2/blob/main/Utils/PLBERT/util.py
+from transformers import AlbertConfig, AlbertModel
+
+class CustomAlbert(AlbertModel):
+    """AlbertModel whose forward returns only `last_hidden_state`."""
+    def forward(self, *args, **kwargs):
+        # Delegate to AlbertModel.forward, then unwrap the one field callers use.
+        full_output = super().forward(*args, **kwargs)
+        return full_output.last_hidden_state
+
+def load_plbert():
+    """Return a freshly constructed CustomAlbert built from the hard-coded PL-BERT settings (no weights are loaded here)."""
+    # NOTE(review): 'dropout' does not look like an AlbertConfig field — confirm it has any effect.
+    plbert_config = {'vocab_size': 178, 'hidden_size': 768, 'num_attention_heads': 12, 'intermediate_size': 2048, 'max_position_embeddings': 512, 'num_hidden_layers': 12, 'dropout': 0.1}
+    return CustomAlbert(AlbertConfig(**plbert_config))
diff --git a/src/models/landmark_model.py b/src/models/landmark_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8d3721b08e4c842a57c37f265633d404016262
--- /dev/null
+++ b/src/models/landmark_model.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: landmark_model.py
+import pdb
+
+from .base_model import BaseModel
+import cv2
+import numpy as np
+from src.utils.crop import crop_image, _transform_pts
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+
+
+class LandmarkModel(BaseModel):
+    """
+    Landmark model: crops or resizes the input to `dsize` (224), runs the predictor, maps points back via `M_c2o`.
+    """
+
+    def __init__(self, **kwargs):
+        super(LandmarkModel, self).__init__(**kwargs)
+        self.dsize = 224
+
+    def input_process(self, *data):
+        """Return `(inp, crop_dct)`: the crop as a 1x3xHxW float32 array divided by 255, and a dict with `M_c2o`."""
+        if len(data) > 1:
+            img_rgb, lmk = data
+        else:
+            img_rgb, lmk = data[0], None
+        if lmk is None:
+            # NOTE: force resize to 224x224, NOT RECOMMEND!
+            img_crop_rgb = cv2.resize(img_rgb, (self.dsize, self.dsize))
+            scale = max(img_rgb.shape[:2]) / self.dsize
+            crop_dct = {
+                'M_c2o': np.array([
+                    [scale, 0., 0.],
+                    [0., scale, 0.],
+                    [0., 0., 1.],
+                ], dtype=np.float32),
+            }
+        else:
+            crop_dct = crop_image(img_rgb, lmk, dsize=self.dsize, scale=1.5, vy_ratio=-0.1)
+            img_crop_rgb = crop_dct['img_crop']
+
+        # HxWx3 -> 1x3xHxW; only the axes move, the channel order is left as it came in.
+        chw = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)
+        return chw[None, ...], crop_dct
+
+    def output_process(self, *data):
+        """Reshape `out_pts[2]` to (N, 2), scale by `dsize` and map through `crop_dct['M_c2o']`."""
+        out_pts, crop_dct = data
+        pts = out_pts[2].reshape(-1, 2) * self.dsize  # scale to 0-224
+        return _transform_pts(pts, M=crop_dct['M_c2o'])
+
+    def predict_trt(self, *data):
+        """Bind positional `data` to the TensorRT inputs and return every output as a numpy array."""
+        nvtx.range_push("forward")
+        feed_dict = {}
+        for idx, spec in enumerate(self.predictor.inputs):
+            value = data[idx]
+            if not isinstance(value, torch.Tensor):
+                value = torch.from_numpy(value).to(device=self.device,
+                                                   dtype=numpy_to_torch_dtype_dict[spec['dtype']])
+            feed_dict[spec['name']] = value
+        preds_dict = self.predictor.predict(feed_dict, self.cudaStream)
+        outs = [preds_dict[spec["name"]].cpu().numpy() for spec in self.predictor.outputs]
+        nvtx.range_pop()
+        return outs
+
+    def predict(self, *data):
+        """input_process -> `predict_trt` (when predict_type is "trt") or `predictor.predict` -> output_process."""
+        inp, crop_dct = self.input_process(*data)
+        if self.predict_type == "trt":
+            preds = self.predict_trt(inp)
+        else:
+            preds = self.predictor.predict(inp)
+        return self.output_process(preds, crop_dct)
diff --git a/src/models/mediapipe_face_model.py b/src/models/mediapipe_face_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..87ef24a192ef783de52e64ad2b8988726afc0c37
--- /dev/null
+++ b/src/models/mediapipe_face_model.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/8/7 9:00
+# @Author  : shaoguowen
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: mediapipe_face_model.py
+import cv2
+import mediapipe as mp
+import numpy as np
+
+
+class MediaPipeFaceModel:
+    """
+    MediaPipeFaceModel: wraps MediaPipe FaceMesh (static-image mode, at most one face, refined landmarks).
+    """
+
+    def __init__(self, **kwargs):
+        mp_face_mesh = mp.solutions.face_mesh
+        self.face_mesh = mp_face_mesh.FaceMesh(
+            static_image_mode=True,
+            max_num_faces=1,
+            refine_landmarks=True,
+            min_detection_confidence=0.5)
+
+    def predict(self, *data):
+        """Return a list with one (N, 2) array of pixel-space landmarks per detected face ([] if none).
+
+        `data[0]` is a BGR image; FaceMesh is given RGB, so the channels are swapped exactly once
+        (the original swapped twice, which handed the BGR image back to FaceMesh).
+        """
+        img_bgr = data[0]
+        h, w = img_bgr.shape[:2]
+        results = self.face_mesh.process(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
+        if not results.multi_face_landmarks:
+            return []
+        outs = []
+        for face_landmarks in results.multi_face_landmarks:
+            # landmark.x / .y are scaled by image width / height to get pixel coordinates; z is ignored
+            landmarks = [[landmark.x * w, landmark.y * h] for landmark in face_landmarks.landmark]
+            outs.append(np.array(landmarks))
+        return outs
diff --git a/src/models/motion_extractor_model.py b/src/models/motion_extractor_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb45276bea79edf6b00d7a0a1bf8e6d567a46493
--- /dev/null
+++ b/src/models/motion_extractor_model.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: motion_extractor_model.py
+import pdb
+
+import numpy as np
+
+from .base_model import BaseModel
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+import torch.nn.functional as F
+
+
+def headpose_pred_to_degree(pred):
+    """
+    pred: (bs, 66) or (bs, 1) or others
+    For (bs, 66) bin logits, return the softmax-expected bin index * 3 - 97.5, shape (bs,); otherwise return pred as is.
+    """
+    if pred.ndim > 1 and pred.shape[1] == 66:
+        # NOTE: note that the average is modified to 97.5
+        idx_array = np.arange(0, 66)
+        # Subtract the row max before exp: same softmax, but large logits no longer overflow to inf/nan.
+        exp = np.exp(pred - np.max(pred, axis=1, keepdims=True))
+        prob = exp / np.sum(exp, axis=1, keepdims=True)
+        return np.sum(prob * idx_array, axis=1) * 3 - 97.5
+    return pred
+
+
+class MotionExtractorModel(BaseModel):
+    """
+    MotionExtractorModel: returns `(pitch, yaw, roll, t, exp, scale, kp)` from the predictor outputs.
+    """
+
+    def __init__(self, **kwargs):
+        super(MotionExtractorModel, self).__init__(**kwargs)
+        self.flag_refine_info = kwargs.get("flag_refine_info", True)
+
+    def input_process(self, *data):
+        """Convert the HxWxC image `data[0]` to a 1xCxHxW float32 array divided by 255."""
+        scaled = data[0].astype(np.float32) / 255.0
+        return np.transpose(scaled, (2, 0, 1))[None]
+
+    def output_process(self, *data):
+        """Reorder predictor outputs to `(pitch, yaw, roll, t, exp, scale, kp)`, refining them if enabled."""
+        # The "trt" predictor emits kp first; the other backend emits it last.
+        if self.predict_type == "trt":
+            kp, pitch, yaw, roll, t, exp, scale = data
+        else:
+            pitch, yaw, roll, t, exp, scale, kp = data
+        if not self.flag_refine_info:
+            return pitch, yaw, roll, t, exp, scale, kp
+
+        bs = kp.shape[0]
+        pitch, yaw, roll = (headpose_pred_to_degree(angle)[:, None] for angle in (pitch, yaw, roll))  # each Bx1
+        kp = kp.reshape(bs, -1, 3)  # BxNx3
+        exp = exp.reshape(bs, -1, 3)  # BxNx3
+        return pitch, yaw, roll, t, exp, scale, kp
+
+    def predict_trt(self, *data):
+        """Bind positional `data` to the TensorRT inputs and return every output as a numpy array."""
+        nvtx.range_push("forward")
+        feed_dict = {}
+        for idx, spec in enumerate(self.predictor.inputs):
+            value = data[idx]
+            if not isinstance(value, torch.Tensor):
+                value = torch.from_numpy(value).to(device=self.device,
+                                                   dtype=numpy_to_torch_dtype_dict[spec['dtype']])
+            feed_dict[spec['name']] = value
+        preds_dict = self.predictor.predict(feed_dict, self.cudaStream)
+        outs = [preds_dict[spec["name"]].cpu().numpy() for spec in self.predictor.outputs]
+        nvtx.range_pop()
+        return outs
+
+    def predict(self, *data):
+        """input_process -> `predict_trt` (when predict_type is "trt") or `predictor.predict` -> output_process."""
+        img = self.input_process(*data)
+        if self.predict_type == "trt":
+            preds = self.predict_trt(img)
+        else:
+            preds = self.predictor.predict(img)
+        return self.output_process(*preds)
diff --git a/src/models/predictor.py b/src/models/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eb55b666c6ec4169807e7d2580719c65bebda36
--- /dev/null
+++ b/src/models/predictor.py
@@ -0,0 +1,259 @@
+import pdb
+import threading
+import os
+import time
+
+import numpy as np
+import onnxruntime
+
+import torch
+from torch.cuda import nvtx
+from collections import OrderedDict
+import platform
+
+try:
+    import tensorrt as trt
+    import ctypes
+except ModuleNotFoundError:
+    print("No TensorRT Found")
+
+numpy_to_torch_dtype_dict = {
+    np.uint8: torch.uint8,
+    np.int8: torch.int8,
+    np.int16: torch.int16,
+    np.int32: torch.int32,
+    np.int64: torch.int64,
+    np.float16: torch.float16,
+    np.float32: torch.float32,
+    np.float64: torch.float64,
+    np.complex64: torch.complex64,
+    np.complex128: torch.complex128,
+    np.bool_: torch.bool,
+    # `np.bool` was a plain alias of the builtin `bool` until NumPy 1.24 removed it; registering the
+    # builtin covers those releases without comparing version numbers as strings.
+    bool: torch.bool,
+}
+
+
+class TensorRTPredictor:
+    """
+    Implements inference for a serialized TensorRT engine using preallocated torch buffers.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        :param model_path: The path to the serialized engine to load from disk (`debug` enables spec printing).
+        """
+        if platform.system().lower() == 'linux':
+            ctypes.CDLL("./checkpoints/liveportrait_onnx/libgrid_sample_3d_plugin.so", mode=ctypes.RTLD_GLOBAL)
+        else:
+            ctypes.CDLL("./checkpoints/liveportrait_onnx/grid_sample_3d_plugin.dll", mode=ctypes.RTLD_GLOBAL,
+                        winmode=0)
+        # Load TRT engine
+        self.logger = trt.Logger(trt.Logger.ERROR)
+        trt.init_libnvinfer_plugins(self.logger, "")
+        engine_path = kwargs.get("model_path", None)
+        self.debug = kwargs.get("debug", False)
+        assert engine_path, f"model:{engine_path} must exist!"
+        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
+            assert runtime
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        assert self.engine
+        self.context = self.engine.create_execution_context()
+        assert self.context
+
+        # Setup I/O bindings
+        self.inputs = []
+        self.outputs = []
+        self.tensors = OrderedDict()
+
+        # TODO: support dynamic-shape inputs
+        for idx in range(self.engine.num_io_tensors):
+            name = self.engine[idx]
+            is_input = self.engine.get_tensor_mode(name).name == "INPUT"
+            shape = self.engine.get_tensor_shape(name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+
+            binding = {
+                "index": idx,
+                "name": name,
+                "dtype": dtype,
+                "shape": list(shape)
+            }
+            if is_input:
+                self.inputs.append(binding)
+            else:
+                self.outputs.append(binding)
+
+        assert len(self.inputs) > 0
+        assert len(self.outputs) > 0
+        self.allocate_max_buffers()
+
+    def allocate_max_buffers(self, device="cuda"):
+        """Allocate one torch buffer per I/O tensor at its largest shape; only a dynamic batch dimension is handled."""
+        nvtx.range_push("allocate_max_buffers")
+        batch_size = 1
+        for idx in range(self.engine.num_io_tensors):
+            binding = self.engine[idx]
+            shape = self.engine.get_tensor_shape(binding)
+            is_input = self.engine.get_tensor_mode(binding).name == "INPUT"
+            if -1 in shape:
+                if is_input:
+                    shape = self.engine.get_tensor_profile_shape(binding, 0)[-1]
+                    batch_size = shape[0]
+                else:
+                    shape[0] = batch_size
+            dtype = trt.nptype(self.engine.get_tensor_dtype(binding))
+            tensor = torch.empty(
+                tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]
+            ).to(device=device)
+            self.tensors[binding] = tensor
+        nvtx.range_pop()
+
+    def input_spec(self):
+        """
+        Get the specs for the input tensors of the network. Useful to prepare memory allocations.
+        :return: A list of (name, shape, numpy dtype) tuples, one per input tensor.
+        """
+        specs = []
+        for i, o in enumerate(self.inputs):
+            specs.append((o["name"], o['shape'], o['dtype']))
+            if self.debug:
+                print(f"trt input {i} -> {o['name']} -> {o['shape']}")
+        return specs
+
+    def output_spec(self):
+        """
+        Get the specs for the output tensors of the network. Useful to prepare memory allocations.
+        :return: A list of (name, shape, numpy dtype) tuples, one per output tensor.
+        """
+        specs = []
+        for i, o in enumerate(self.outputs):
+            specs.append((o["name"], o['shape'], o['dtype']))
+            if self.debug:
+                print(f"trt output {i} -> {o['name']} -> {o['shape']}")
+        return specs
+
+    def adjust_buffer(self, feed_dict):
+        """Copy each input into the leading corner of its preallocated buffer and tell TensorRT its shape."""
+        nvtx.range_push("adjust_buffer")
+        for name, buf in feed_dict.items():
+            current_shape = list(buf.shape)
+            slices = tuple(slice(0, dim) for dim in current_shape)
+            self.tensors[name][slices].copy_(buf)
+            self.context.set_input_shape(name, current_shape)
+        nvtx.range_pop()
+
+    def predict(self, feed_dict, stream):
+        """
+        Enqueue inference on `stream` for the inputs in `feed_dict`.
+        :param feed_dict: Mapping from input tensor name to the torch tensor to copy into its buffer.
+        :return The OrderedDict of all preallocated I/O buffers (inputs and outputs), keyed by tensor name.
+        """
+        nvtx.range_push("set_tensors")
+        self.adjust_buffer(feed_dict)
+        for name, tensor in self.tensors.items():
+            self.context.set_tensor_address(name, tensor.data_ptr())
+        nvtx.range_pop()
+        nvtx.range_push("execute")
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError("ERROR: inference failed.")
+        nvtx.range_pop()
+        return self.tensors
+
+    def __del__(self):
+        # __init__ may have failed part-way, so only drop attributes that exist; release the
+        # execution context before the engine that created it.
+        for attr in ("context", "engine", "inputs", "outputs", "tensors"):
+            if attr in self.__dict__:
+                delattr(self, attr)
+
+
+class OnnxRuntimePredictor:
+    """
+    OnnxRuntime Prediction: wraps one `onnxruntime.InferenceSession` loaded from `model_path`.
+    """
+
+    def __init__(self, **kwargs):
+        model_path = kwargs.get("model_path", "")  # the model path identifies which model this instance serves
+        assert os.path.exists(model_path), "model path must exist!"
+        # print("loading ort model:{}".format(model_path))
+        self.debug = kwargs.get("debug", False)
+        providers = ['CUDAExecutionProvider', 'CoreMLExecutionProvider', 'CPUExecutionProvider']
+
+        print(f"OnnxRuntime use {providers}")
+        opts = onnxruntime.SessionOptions()
+        # opts.inter_op_num_threads = kwargs.get("num_threads", 4)
+        # opts.intra_op_num_threads = kwargs.get("num_threads", 4)
+        # opts.log_severity_level = 3
+        self.onnx_model = onnxruntime.InferenceSession(model_path, providers=providers, sess_options=opts)
+        self.inputs = self.onnx_model.get_inputs()
+        self.outputs = self.onnx_model.get_outputs()
+
+    def input_spec(self):
+        """
+        Get the specs for the input tensors of the network. Useful to prepare memory allocations.
+        :return: A list of (name, shape, type string) tuples, one per input tensor.
+        """
+        specs = []
+        for i, o in enumerate(self.inputs):
+            specs.append((o.name, o.shape, o.type))
+            if self.debug:
+                print(f"ort {i} -> {o.name} -> {o.shape}")
+        return specs
+
+    def output_spec(self):
+        """
+        Get the specs for the output tensors of the network. Useful to prepare memory allocations.
+        :return: A list of (name, shape, type string) tuples, one per output tensor.
+        """
+        specs = []
+        for i, o in enumerate(self.outputs):
+            specs.append((o.name, o.shape, o.type))
+            if self.debug:
+                print(f"ort output {i} -> {o.name} -> {o.shape}")
+        return specs
+
+    def predict(self, *data):
+        """Run the session on positional numpy inputs, cast to float16 or float32 per the model's input type."""
+        input_feeds = {}
+        for i, array in enumerate(data):
+            spec = self.inputs[i]
+            target = np.float16 if spec.type == 'tensor(float16)' else np.float32
+            input_feeds[spec.name] = array.astype(target)
+        results = self.onnx_model.run(None, input_feeds)
+        return results
+
+    def __del__(self):
+        # Rebinding drops the session reference without raising when __init__ never created it.
+        self.onnx_model = None
+
+
+class OnnxRuntimePredictorSingleton(OnnxRuntimePredictor):
+    """
+    Singleton per model path: prevents the same model from being loaded more than once.
+    """
+    _instance_lock = threading.Lock()
+    _instance = {}
+
+    def __new__(cls, *args, **kwargs):
+        # The model path is the cache key that decides whether two requests share an instance.
+        model_path = kwargs.get("model_path", "")
+        assert os.path.exists(model_path), "model path must exist!"
+        registry = OnnxRuntimePredictorSingleton._instance
+        with OnnxRuntimePredictorSingleton._instance_lock:
+            cached = registry.get(model_path)
+            if cached is None or cached.onnx_model is None:
+                registry[model_path] = OnnxRuntimePredictor(**kwargs)
+        return registry[model_path]
+
+
+def get_predictor(**kwargs):
+    """Create the predictor named by `predict_type` ("trt" by default, or "ort"), forwarding all kwargs."""
+    predict_type = kwargs.get("predict_type", "trt")
+    if predict_type == "trt":
+        return TensorRTPredictor(**kwargs)
+    if predict_type == "ort":
+        return OnnxRuntimePredictorSingleton(**kwargs)
+    raise NotImplementedError
diff --git a/src/models/stitching_model.py b/src/models/stitching_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25bd4bf26446cff1a74e823ff7ad87776908480
--- /dev/null
+++ b/src/models/stitching_model.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: stitching_model.py
+
+from .base_model import BaseModel
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+
+
+class StitchingModel(BaseModel):
+    """
+    Model wrapper that feeds a single input to the stitching network and returns
+    the first output.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def input_process(self, *data):
+        # Only the first positional argument is used as the network input.
+        return data[0]
+
+    def output_process(self, *data):
+        # The first prediction is the result.
+        return data[0]
+
+    def predict_trt(self, *data):
+        """
+        Run the TensorRT predictor.
+
+        Positional inputs are matched to ``self.predictor.inputs`` by index; anything
+        that is not already a ``torch.Tensor`` is converted with ``torch.from_numpy``
+        and moved to ``self.device`` with the dtype declared for that input.
+
+        :return: list with one NumPy array per predictor output
+        """
+        nvtx.range_push("forward")
+        feed_dict = {}
+        for idx, spec in enumerate(self.predictor.inputs):
+            value = data[idx]
+            if not isinstance(value, torch.Tensor):
+                value = torch.from_numpy(value).to(device=self.device,
+                                                   dtype=numpy_to_torch_dtype_dict[spec['dtype']])
+            feed_dict[spec['name']] = value
+        preds_dict = self.predictor.predict(feed_dict, self.cudaStream)
+        # Copy every output back to host memory as a NumPy array.
+        outs = [preds_dict[spec["name"]].cpu().numpy() for spec in self.predictor.outputs]
+        nvtx.range_pop()
+        return outs
+
+    def predict(self, *data):
+        """
+        Pre-process, run the backend selected by ``self.predict_type``, post-process.
+        """
+        model_input = self.input_process(*data)
+        if self.predict_type == "trt":
+            preds = self.predict_trt(model_input)
+        else:
+            preds = self.predictor.predict(model_input)
+        return self.output_process(*preds)
diff --git a/src/models/util.py b/src/models/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc6b925ff4d93dbb89d0d1e593bee15c888c39ee
--- /dev/null
+++ b/src/models/util.py
@@ -0,0 +1,452 @@
+# coding: utf-8
+
+"""
+This file defines various neural network modules and utility functions, including convolutional and residual blocks,
+normalizations, and functions for spatial transformation and tensor manipulation.
+"""
+
+from torch import nn
+import torch.nn.functional as F
+import torch
+import torch.nn.utils.spectral_norm as spectral_norm
+import math
+import warnings
+import collections.abc
+from itertools import repeat
+
+def kp2gaussian(kp, spatial_size, kp_variance):
+    """
+    Turn keypoints into a Gaussian-like response over a 3D coordinate grid.
+
+    :param kp: keypoint tensor whose last dimension holds 3 coordinates; all leading
+        dimensions are kept as-is in the result
+    :param spatial_size: (d, h, w) size of the grid
+    :param kp_variance: variance used in the Gaussian exponent
+    :return: tensor of shape ``kp.shape[:-1] + (d, h, w)``
+    """
+    grid = make_coordinate_grid(spatial_size, kp)
+    n_leading = len(kp.shape) - 1
+    leading_shape = kp.shape[:n_leading]
+
+    # Give the grid singleton leading dims, then tile it once per keypoint.
+    grid = grid.view(*((1,) * n_leading + grid.shape))
+    grid = grid.repeat(*(leading_shape + (1, 1, 1, 1)))
+
+    # Reshape the keypoints so they broadcast against the (d, h, w, 3) grid.
+    centers = kp.view(*(leading_shape + (1, 1, 1, 3)))
+
+    squared_dist = ((grid - centers) ** 2).sum(-1)
+    return torch.exp(-0.5 * squared_dist / kp_variance)
+
+
+def make_coordinate_grid(spatial_size, ref, **kwargs):
+    """
+    Build a (d, h, w, 3) grid of normalized (x, y, z) coordinates in [-1, 1].
+
+    :param spatial_size: (d, h, w) grid size
+    :param ref: tensor whose dtype and device the grid adopts
+    :param kwargs: accepted but not used
+    :return: tensor of shape (d, h, w, 3); the last axis is ordered x, y, z
+    """
+    d, h, w = spatial_size
+
+    def axis(length):
+        # Evenly spaced coordinates from -1 to 1 along one axis.
+        steps = torch.arange(length).type(ref.dtype).to(ref.device)
+        return 2 * (steps / (length - 1)) - 1
+
+    # NOTE: must be right-down-in
+    x = axis(w)  # the x axis faces to the right
+    y = axis(h)  # the y axis faces to the bottom
+    z = axis(d)  # the z axis faces to the inner
+
+    xx = x.view(1, 1, -1).repeat(d, h, 1)
+    yy = y.view(1, -1, 1).repeat(d, 1, w)
+    zz = z.view(-1, 1, 1).repeat(1, h, w)
+
+    return torch.stack([xx, yy, zz], dim=3)
+
+
+class ConvT2d(nn.Module):
+    """
+    Transposed-convolution block: ConvTranspose2d -> InstanceNorm2d -> leaky ReLU.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):
+        super().__init__()
+
+        self.convT = nn.ConvTranspose2d(
+            in_features,
+            out_features,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            output_padding=output_padding,
+        )
+        self.norm = nn.InstanceNorm2d(out_features)
+
+    def forward(self, x):
+        return F.leaky_relu(self.norm(self.convT(x)))
+
+
+class ResBlock3d(nn.Module):
+    """
+    Pre-activation 3D residual block: two (BatchNorm3d -> ReLU -> Conv3d) stages
+    whose result is added to the input; the channel count is unchanged.
+    """
+
+    def __init__(self, in_features, kernel_size, padding):
+        super().__init__()
+        conv_args = dict(in_channels=in_features, out_channels=in_features,
+                         kernel_size=kernel_size, padding=padding)
+        self.conv1 = nn.Conv3d(**conv_args)
+        self.conv2 = nn.Conv3d(**conv_args)
+        self.norm1 = nn.BatchNorm3d(in_features, affine=True)
+        self.norm2 = nn.BatchNorm3d(in_features, affine=True)
+
+    def forward(self, x):
+        out = self.conv1(F.relu(self.norm1(x)))
+        out = self.conv2(F.relu(self.norm2(out)))
+        # Residual connection, accumulated in place on the conv output.
+        out += x
+        return out
+
+
+class UpBlock3d(nn.Module):
+    """
+    Decoder block: upsample by (1, 2, 2) over the last three dims, then
+    Conv3d -> BatchNorm3d -> ReLU.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+
+        self.conv = nn.Conv3d(
+            in_channels=in_features,
+            out_channels=out_features,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=groups,
+        )
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+
+    def forward(self, x):
+        upsampled = F.interpolate(x, scale_factor=(1, 2, 2))
+        return F.relu(self.norm(self.conv(upsampled)))
+
+
+class DownBlock2d(nn.Module):
+    """
+    Encoder block: Conv2d -> BatchNorm2d -> ReLU -> 2x2 average pooling.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_features,
+            out_channels=out_features,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=groups,
+        )
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+        self.pool = nn.AvgPool2d(kernel_size=(2, 2))
+
+    def forward(self, x):
+        activated = F.relu(self.norm(self.conv(x)))
+        return self.pool(activated)
+
+
+class DownBlock3d(nn.Module):
+    """
+    Encoder block: Conv3d -> BatchNorm3d -> ReLU -> (1, 2, 2) average pooling.
+    """
+
+    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
+        super().__init__()
+        # Disabled alternative kept for reference: a Conv3d with stride=(1, 2, 2)
+        # in place of the convolution + average pooling used below.
+        self.conv = nn.Conv3d(
+            in_channels=in_features,
+            out_channels=out_features,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=groups,
+        )
+        self.norm = nn.BatchNorm3d(out_features, affine=True)
+        self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))
+
+    def forward(self, x):
+        activated = F.relu(self.norm(self.conv(x)))
+        return self.pool(activated)
+
+
+class SameBlock2d(nn.Module):
+    """
+    Conv2d -> BatchNorm2d -> activation, with no pooling or upsampling step.
+
+    The activation is ``nn.LeakyReLU`` when ``lrelu`` is true, ``nn.ReLU`` otherwise.
+    """
+
+    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_features,
+            out_channels=out_features,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=groups,
+        )
+        self.norm = nn.BatchNorm2d(out_features, affine=True)
+        self.ac = nn.LeakyReLU() if lrelu else nn.ReLU()
+
+    def forward(self, x):
+        return self.ac(self.norm(self.conv(x)))
+
+
+class Encoder(nn.Module):
+    """
+    Hourglass encoder: a stack of ``DownBlock3d`` whose channel count doubles per
+    level, capped at ``max_features``.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+
+        def width(level):
+            # Channel count at a given depth, capped at max_features.
+            return min(max_features, block_expansion * (2 ** level))
+
+        self.down_blocks = nn.ModuleList([
+            DownBlock3d(in_features if level == 0 else width(level), width(level + 1), kernel_size=3, padding=1)
+            for level in range(num_blocks)
+        ])
+
+    def forward(self, x):
+        """
+        :return: list starting with the input, followed by the output of every down block
+        """
+        feature_maps = [x]
+        for block in self.down_blocks:
+            feature_maps.append(block(feature_maps[-1]))
+        return feature_maps
+
+
+class Decoder(nn.Module):
+    """
+    Hourglass decoder: ``UpBlock3d`` stages joined with skip connections, followed
+    by a final Conv3d -> BatchNorm3d -> ReLU.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+
+        stages = []
+        for level in reversed(range(num_blocks)):
+            width_in = min(max_features, block_expansion * (2 ** (level + 1)))
+            if level != num_blocks - 1:
+                # Every stage except the deepest also receives a concatenated skip tensor.
+                width_in *= 2
+            width_out = min(max_features, block_expansion * (2 ** level))
+            stages.append(UpBlock3d(width_in, width_out, kernel_size=3, padding=1))
+
+        self.up_blocks = nn.ModuleList(stages)
+        self.out_filters = block_expansion + in_features
+
+        self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)
+        self.norm = nn.BatchNorm3d(self.out_filters, affine=True)
+
+    def forward(self, x):
+        """
+        :param x: list of feature maps; it is consumed from the end with ``pop()``,
+            so the caller's list is modified
+        :return: decoded tensor with ``self.out_filters`` channels
+        """
+        out = x.pop()
+        for stage in self.up_blocks:
+            # Upsample first, then append the next skip tensor along the channel axis.
+            out = torch.cat([stage(out), x.pop()], dim=1)
+        return F.relu(self.norm(self.conv(out)))
+
+
+class Hourglass(nn.Module):
+    """
+    Encoder followed by decoder; ``out_filters`` mirrors the decoder's output channel count.
+    """
+
+    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
+        super().__init__()
+        shared_args = (block_expansion, in_features, num_blocks, max_features)
+        self.encoder = Encoder(*shared_args)
+        self.decoder = Decoder(*shared_args)
+        self.out_filters = self.decoder.out_filters
+
+    def forward(self, x):
+        encoded = self.encoder(x)
+        return self.decoder(encoded)
+
+
+class SPADE(nn.Module):
+    """
+    Conditional normalization: the input is instance-normalized without affine
+    parameters, then scaled and shifted by maps predicted from ``segmap``.
+    """
+
+    def __init__(self, norm_nc, label_nc):
+        super().__init__()
+
+        self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
+        nhidden = 128
+
+        self.mlp_shared = nn.Sequential(
+            nn.Conv2d(label_nc, nhidden, kernel_size=3, padding=1),
+            nn.ReLU())
+        self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
+        self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
+
+    def forward(self, x, segmap):
+        normalized = self.param_free_norm(x)
+        # Bring the conditioning map to the spatial size of x before predicting the modulation.
+        resized = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
+        hidden = self.mlp_shared(resized)
+        gamma = self.mlp_gamma(hidden)
+        beta = self.mlp_beta(hidden)
+        return normalized * (1 + gamma) + beta
+
+
+class SPADEResnetBlock(nn.Module):
+    """
+    Residual block whose normalization layers are ``SPADE`` modules conditioned on
+    ``seg1``. A 1x1 convolution shortcut is used when ``fin != fout``.
+    """
+
+    def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):
+        super().__init__()
+        # Attributes
+        self.learned_shortcut = (fin != fout)
+        fmiddle = min(fin, fout)
+        self.use_se = use_se
+        # Convolutions; padding equals dilation, so a 3x3 kernel keeps the spatial size.
+        self.conv_0 = nn.Conv2d(fin, fmiddle, kernel_size=3, padding=dilation, dilation=dilation)
+        self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=dilation, dilation=dilation)
+        if self.learned_shortcut:
+            self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
+        # Optional spectral normalization, selected by a substring of norm_G.
+        if 'spectral' in norm_G:
+            self.conv_0 = spectral_norm(self.conv_0)
+            self.conv_1 = spectral_norm(self.conv_1)
+            if self.learned_shortcut:
+                self.conv_s = spectral_norm(self.conv_s)
+        # Conditional normalization layers.
+        self.norm_0 = SPADE(fin, label_nc)
+        self.norm_1 = SPADE(fmiddle, label_nc)
+        if self.learned_shortcut:
+            self.norm_s = SPADE(fin, label_nc)
+
+    def forward(self, x, seg1):
+        residual = self.shortcut(x, seg1)
+        hidden = self.conv_0(self.actvn(self.norm_0(x, seg1)))
+        hidden = self.conv_1(self.actvn(self.norm_1(hidden, seg1)))
+        return residual + hidden
+
+    def shortcut(self, x, seg1):
+        # Identity unless the channel count changes.
+        if not self.learned_shortcut:
+            return x
+        return self.conv_s(self.norm_s(x, seg1))
+
+    def actvn(self, x):
+        # Leaky ReLU with negative slope 0.2.
+        return F.leaky_relu(x, 2e-1)
+
+
+def filter_state_dict(state_dict, remove_name='fc'):
+    """
+    Return a new dict without the entries whose key contains ``remove_name``.
+
+    :param state_dict: mapping to filter (left unmodified)
+    :param remove_name: substring that marks keys to drop
+    :return: new dict with the remaining key/value pairs in their original order
+    """
+    return {key: state_dict[key] for key in state_dict if remove_name not in key}
+
+
+class GRN(nn.Module):
+    """ GRN (Global Response Normalization) layer.
+
+    ``gamma`` and ``beta`` have shape (1, 1, 1, dim) and start at zero, so a freshly
+    constructed layer returns its input unchanged.
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        param_shape = (1, 1, 1, dim)
+        self.gamma = nn.Parameter(torch.zeros(*param_shape))
+        self.beta = nn.Parameter(torch.zeros(*param_shape))
+
+    def forward(self, x):
+        # L2 norm over dims 1 and 2, then divided by its mean over the last dim.
+        global_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        relative = global_norm / (global_norm.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * relative) + self.beta + x
+
+
+class LayerNorm(nn.Module):
+    r""" LayerNorm supporting two data formats: channels_last (default) or channels_first.
+
+    channels_last expects inputs shaped (batch_size, height, width, channels);
+    channels_first expects inputs shaped (batch_size, channels, height, width).
+    Any other ``data_format`` raises ``NotImplementedError`` at construction time.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_first":
+            # Normalize over dim 1 by hand, then apply the per-channel affine parameters.
+            mean = x.mean(1, keepdim=True)
+            centered = x - mean
+            variance = centered.pow(2).mean(1, keepdim=True)
+            normalized = centered / torch.sqrt(variance + self.eps)
+            return self.weight[:, None, None] * normalized + self.bias[:, None, None]
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    """
+    Fill ``tensor`` in place with values from a normal distribution truncated to [a, b].
+
+    All tensor operations run under ``torch.no_grad()``. A warning is issued when
+    ``mean`` lies more than two standard deviations outside [a, b].
+
+    :param tensor: tensor to overwrite
+    :param mean: mean of the underlying normal distribution
+    :param std: standard deviation of the underlying normal distribution
+    :param a: lower truncation bound
+    :param b: upper truncation bound
+    :return: the same ``tensor`` object, after being filled
+    """
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                      "The distribution of values may be incorrect.",
+                      stacklevel=2)
+
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+
+
+def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
+    """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Returns ``x`` itself when ``drop_prob`` is 0 or ``training`` is false. Otherwise one
+    Bernoulli keep/drop value is drawn per sample (first dimension) and broadcast over
+    the remaining dimensions; kept samples are divided by the keep probability when
+    ``scale_by_keep`` is true.
+
+    Historical note: this matches the 'DropConnect' implementation used for
+    EfficientNet-style networks; the name 'drop path' is used because 'Drop Connect'
+    refers to a different technique. See
+    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+    """
+    if not training or drop_prob == 0.:
+        return x
+    keep_prob = 1 - drop_prob
+    # One mask entry per sample, broadcastable to any tensor rank (not just 2D ConvNets).
+    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
+    if scale_by_keep and keep_prob > 0.0:
+        mask.div_(keep_prob)
+    return x * mask
+
+
+class DropPath(nn.Module):
+    """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Module wrapper around ``drop_path`` that passes the module's own ``training`` flag.
+    """
+
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, drop_prob=self.drop_prob, training=self.training, scale_by_keep=self.scale_by_keep)
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    """
+    Fill ``tensor`` in place from a normal distribution truncated to [a, b].
+
+    Thin wrapper that forwards all arguments to ``_no_grad_trunc_normal_`` and
+    returns its result.
+    """
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+# From PyTorch internals
+def _ntuple(n):
+    """
+    Build a converter that turns a value into a tuple.
+
+    The returned function passes non-string iterables through ``tuple()`` unchanged
+    in length, and repeats any other value (including strings) ``n`` times.
+    """
+    def parse(x):
+        is_iterable = isinstance(x, collections.abc.Iterable)
+        if is_iterable and not isinstance(x, str):
+            return tuple(x)
+        return (x,) * n
+    return parse
+
+to_2tuple = _ntuple(2)
diff --git a/src/models/warping_spade_model.py b/src/models/warping_spade_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..06ae4ad96d72e37eba5b0ec28f98338142d11c16
--- /dev/null
+++ b/src/models/warping_spade_model.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: warping_spade_model.py
+import pdb
+import numpy as np
+from .base_model import BaseModel
+import torch
+from torch.cuda import nvtx
+from .predictor import numpy_to_torch_dtype_dict
+
+
+class WarpingSpadeModel(BaseModel):
+    """
+    Wrapper around the warping + SPADE network: takes a 3D feature volume and two
+    keypoint sets, and returns the first generated image scaled to the 0-255 range.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def input_process(self, *data):
+        # Callers pass (feature_3d, kp_source, kp_driving); the two keypoint
+        # arguments are swapped before being handed to the predictor.
+        feature_3d, kp_source, kp_driving = data
+        return feature_3d, kp_driving, kp_source
+
+    def output_process(self, *data):
+        """
+        Convert the first prediction to a channels-last tensor clipped to [0, 1] and
+        multiplied by 255, and return its first batch element.
+        """
+        first = data[0]
+        if self.predict_type == "trt":
+            out = first
+        else:
+            # Non-TensorRT predictions are NumPy arrays: move them to the model device as float32.
+            out = torch.from_numpy(first).to(self.device).float()
+        out = torch.clip(out.permute(0, 2, 3, 1), 0, 1) * 255
+        return out[0]
+
+    def predict_trt(self, *data):
+        """
+        Run the TensorRT predictor.
+
+        Positional inputs are matched to ``self.predictor.inputs`` by index; anything
+        that is not already a ``torch.Tensor`` is converted with ``torch.from_numpy``
+        and moved to ``self.device`` with the dtype declared for that input.
+
+        :return: list with a cloned tensor per predictor output
+        """
+        nvtx.range_push("forward")
+        feed_dict = {}
+        for idx, spec in enumerate(self.predictor.inputs):
+            value = data[idx]
+            if not isinstance(value, torch.Tensor):
+                value = torch.from_numpy(value).to(device=self.device,
+                                                   dtype=numpy_to_torch_dtype_dict[spec['dtype']])
+            feed_dict[spec['name']] = value
+        preds_dict = self.predictor.predict(feed_dict, self.cudaStream)
+        # Clone each output so the returned tensors do not alias the predictor's result tensors.
+        outs = [preds_dict[spec["name"]].clone() for spec in self.predictor.outputs]
+        nvtx.range_pop()
+        return outs
+
+    def predict(self, *data):
+        """
+        Pre-process, run the backend selected by ``self.predict_type``, post-process.
+        """
+        model_inputs = self.input_process(*data)
+        if self.predict_type == "trt":
+            preds = self.predict_trt(*model_inputs)
+        else:
+            preds = self.predictor.predict(*model_inputs)
+        return self.output_process(*preds)
diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4281f36399765389fef51c9025d4e12f9e58fe75
--- /dev/null
+++ b/src/pipelines/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/7/16 19:22
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py.py
diff --git a/src/pipelines/faster_live_portrait_pipeline.py b/src/pipelines/faster_live_portrait_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..9031fc312c298068d411bb1cc830e6062981084a
--- /dev/null
+++ b/src/pipelines/faster_live_portrait_pipeline.py
@@ -0,0 +1,592 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: faster_live_portrait_pipeline.py
+
+import copy
+import os.path
+import pdb
+import time
+import traceback
+from PIL import Image
+import cv2
+from tqdm import tqdm
+import numpy as np
+import torch
+
+from .. import models
+from ..utils.crop import crop_image, parse_bbox_from_landmark, crop_image_by_bbox, paste_back, paste_back_pytorch
+from ..utils.utils import resize_to_limit, prepare_paste_back, get_rotation_matrix, calc_lip_close_ratio, \
+    calc_eye_close_ratio, transform_keypoint, concat_feat
+from src.utils import utils
+
+
+class FasterLivePortraitPipeline:
+    def __init__(self, cfg, **kwargs):
+        """
+        Store the configuration and run the full initialization.
+
+        :param cfg: configuration object; kept as ``self.cfg``
+        :param kwargs: forwarded to ``init``
+        """
+        self.cfg = cfg
+        self.init(**kwargs)
+
+    def init(self, **kwargs):
+        """
+        Initialize runtime state first, then load the models; ``kwargs`` go to both steps.
+        """
+        self.init_vars(**kwargs)
+        self.init_models(**kwargs)
+
+    def update_cfg(self, args_user):
+        """
+        Merge user-supplied options into the configuration.
+
+        Keys already present in ``cfg.infer_params`` are updated there; otherwise keys
+        present in ``cfg.crop_params`` are updated there; unknown keys are added to
+        ``cfg.infer_params``.
+
+        :param args_user: mapping of option name to new value
+        :return: True when at least one existing infer/crop option changed its value;
+            adding a previously unknown key does not count as a change
+        """
+        update_ret = False
+        for key in args_user:
+            new_value = args_user[key]
+            if key in self.cfg.infer_params:
+                if self.cfg.infer_params[key] != new_value:
+                    update_ret = True
+                print("update infer cfg {} from {} to {}".format(key, self.cfg.infer_params[key], new_value))
+                self.cfg.infer_params[key] = new_value
+            elif key in self.cfg.crop_params:
+                if self.cfg.crop_params[key] != new_value:
+                    update_ret = True
+                print("update crop cfg {} from {} to {}".format(key, self.cfg.crop_params[key], new_value))
+                self.cfg.crop_params[key] = new_value
+            else:
+                # The key is known to be absent from infer_params here, so there is no
+                # previous value to compare against. The former comparison in this
+                # branch could never be true and has been removed.
+                # NOTE(review): confirm whether a newly added key should also report a change.
+                print("add {}:{} to infer cfg".format(key, new_value))
+                self.cfg.infer_params[key] = new_value
+        return update_ret
+
+    def clean_models(self, **kwargs):
+        """
+        Release every loaded model.
+
+        The current ``model_dict`` is emptied, then replaced by a fresh empty dict.
+
+        :param kwargs: accepted but not used
+        :return: None
+        """
+        self.model_dict.clear()
+        self.model_dict = {}
+
+    def init_models(self, **kwargs):
+        """
+        Instantiate the models listed in the configuration and store them in ``self.model_dict``.
+
+        With ``is_animal`` false or absent, the entries of ``cfg.models`` are loaded.
+        With ``is_animal`` true, the entries of ``cfg.animal_models`` are loaded and an
+        ``XPoseRunner`` is added under the key "xpose".
+
+        :param kwargs: only ``is_animal`` is read
+        """
+        if not kwargs.get("is_animal", False):
+            print("load Human Model >>>")
+            self.is_animal = False
+            self.model_dict = {}
+            for model_name in self.cfg.models:
+                print(f"loading model: {model_name}")
+                print(self.cfg.models[model_name])
+                # The config entry's "name" selects the class in the models package;
+                # the whole entry is passed to it as keyword arguments.
+                self.model_dict[model_name] = getattr(models, self.cfg.models[model_name]["name"])(
+                    **self.cfg.models[model_name])
+        else:
+            print("load Animal Model >>>")
+            self.is_animal = True
+            self.model_dict = {}
+            from src.utils.animal_landmark_runner import XPoseRunner
+            from src.utils.utils import make_abs_path
+            checkpoint_dir = None
+            for model_name in self.cfg.animal_models:
+                print(f"loading model: {model_name}")
+                print(self.cfg.animal_models[model_name])
+                # The directory of the first string-valued model_path is used as the checkpoint directory.
+                if checkpoint_dir is None and isinstance(self.cfg.animal_models[model_name].model_path, str):
+                    checkpoint_dir = os.path.dirname(self.cfg.animal_models[model_name].model_path)
+                self.model_dict[model_name] = getattr(models, self.cfg.animal_models[model_name]["name"])(
+                    **self.cfg.animal_models[model_name])
+
+            # NOTE(review): checkpoint_dir stays None when no animal model has a string
+            # model_path; os.path.join below would then fail. Confirm configs always provide one.
+            xpose_config_file_path: str = make_abs_path("models/XPose/config_model/UniPose_SwinT.py")
+            xpose_ckpt_path: str = os.path.join(checkpoint_dir, "xpose.pth")
+            xpose_embedding_cache_path: str = os.path.join(checkpoint_dir, 'clip_embedding')
+            self.model_dict["xpose"] = XPoseRunner(model_config_path=xpose_config_file_path,
+                                                   model_checkpoint_path=xpose_ckpt_path,
+                                                   embeddings_cache_path=xpose_embedding_cache_path,
+                                                   flag_use_half_precision=True)
+
+    def init_vars(self, **kwargs):
+        """
+        Reset per-run state: crop mask, frame counter, cached driving/source
+        information, smoothing filters and the torch device.
+
+        :param kwargs: accepted but not used
+        """
+        # NOTE(review): cv2.imread returns None instead of raising when the file cannot
+        # be read — confirm mask_crop_path is always valid.
+        self.mask_crop = cv2.imread(self.cfg.infer_params.mask_crop_path, cv2.IMREAD_COLOR)
+        self.frame_id = 0
+        self.src_lmk_pre = None
+        self.R_d_0 = None
+        self.x_d_0_info = None
+        self.R_d_smooth = utils.OneEuroFilter(4, 0.3)
+        self.exp_smooth = utils.OneEuroFilter(4, 0.3)
+
+        ## Bookkeeping for the current source
+        self.source_path = None
+        self.src_infos = []
+        self.src_imgs = []
+        self.is_source_video = False
+        # Use CUDA when available, otherwise fall back to the CPU.
+        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    def calc_combined_eye_ratio(self, c_d_eyes_i, source_lmk):
+        """
+        Concatenate the source eye-close ratio with the driving eye-close ratio.
+
+        :param c_d_eyes_i: driving eye ratio; reshaped to 1x1
+        :param source_lmk: source landmarks; a leading axis is added before the
+            ratio is computed
+        :return: array ordered [c_s,eyes, c_d,eyes,i], joined along axis 1
+        """
+        source_ratio = calc_eye_close_ratio(source_lmk[None])
+        driving_ratio = np.array(c_d_eyes_i).reshape(1, 1)
+        return np.concatenate([source_ratio, driving_ratio], axis=1)
+
+    def calc_combined_lip_ratio(self, c_d_lip_i, source_lmk):
+        """
+        Concatenate the source lip-close ratio with the driving lip-close ratio.
+
+        :param c_d_lip_i: driving lip ratio; reshaped to 1x1
+        :param source_lmk: source landmarks; a leading axis is added before the
+            ratio is computed
+        :return: array ordered [c_s,lip, c_d,lip,i], joined along axis 1 (1x2 per the
+            original comment)
+        """
+        source_ratio = calc_lip_close_ratio(source_lmk[None])
+        driving_ratio = np.array(c_d_lip_i).reshape(1, 1)  # 1x1
+        return np.concatenate([source_ratio, driving_ratio], axis=1)  # 1x2
+
+    def prepare_source(self, source_path, **kwargs):
+        print(f"process source:{source_path} >>>>>>>>")
+        try:
+            if utils.is_video(source_path):
+                self.is_source_video = True
+            else:
+                self.is_source_video = False
+
+            if self.is_source_video:
+                src_imgs_bgr = []
+                src_vcap = cv2.VideoCapture(source_path)
+                while True:
+                    ret, frame = src_vcap.read()
+                    if not ret:
+                        break
+                    src_imgs_bgr.append(frame)
+                src_vcap.release()
+            else:
+                img_bgr = cv2.imread(source_path, cv2.IMREAD_COLOR)
+                src_imgs_bgr = [img_bgr]
+
+            self.src_imgs = []
+            self.src_infos = []
+            self.source_path = source_path
+
+            for ii, img_bgr in tqdm(enumerate(src_imgs_bgr), total=len(src_imgs_bgr)):
+                img_bgr = resize_to_limit(img_bgr, self.cfg.infer_params.source_max_dim,
+                                          self.cfg.infer_params.source_division)
+                img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+                src_faces = []
+                if self.is_animal:
+                    with torch.no_grad():
+                        img_rgb_pil = Image.fromarray(img_rgb)
+                        lmk = self.model_dict["xpose"].run(
+                            img_rgb_pil,
+                            'face',
+                            'animal_face',
+                            0,
+                            0
+                        )
+                    if lmk is None:
+                        continue
+                    self.src_imgs.append(img_rgb)
+                    src_faces.append(lmk)
+                else:
+                    src_faces = self.model_dict["face_analysis"].predict(img_bgr)
+                    if len(src_faces) == 0:
+                        print("No face detected in the this image.")
+                        continue
+                    self.src_imgs.append(img_rgb)
+                    # 如果是实时，只关注最大的那张脸
+                    if kwargs.get("realtime", False):
+                        src_faces = src_faces[:1]
+
+                crop_infos = []
+                for i in range(len(src_faces)):
+                    # NOTE: temporarily only pick the first face, to support multiple face in the future
+                    lmk = src_faces[i]
+                    # crop the face
+                    ret_dct = crop_image(
+                        img_rgb,  # ndarray
+                        lmk,  # 106x2 or Nx2
+                        dsize=self.cfg.crop_params.src_dsize,
+                        scale=self.cfg.crop_params.src_scale,
+                        vx_ratio=self.cfg.crop_params.src_vx_ratio,
+                        vy_ratio=self.cfg.crop_params.src_vy_ratio,
+                    )
+                    if self.is_animal:
+                        ret_dct["lmk_crop"] = lmk
+                    else:
+                        lmk = self.model_dict["landmark"].predict(img_rgb, lmk)
+                        ret_dct["lmk_crop"] = lmk
+                        ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / self.cfg.crop_params.src_dsize
+
+                    # update a 256x256 version for network input
+                    ret_dct["img_crop_256x256"] = cv2.resize(
+                        ret_dct["img_crop"], (256, 256), interpolation=cv2.INTER_AREA
+                    )
+                    crop_infos.append(ret_dct)
+
+                src_infos = [[] for _ in range(len(crop_infos))]
+                for i, crop_info in enumerate(crop_infos):
+                    source_lmk = crop_info['lmk_crop']
+                    img_crop, img_crop_256x256 = crop_info['img_crop'], crop_info['img_crop_256x256']
+                    pitch, yaw, roll, t, exp, scale, kp = self.model_dict["motion_extractor"].predict(
+                        img_crop_256x256)
+                    x_s_info = {
+                        "pitch": pitch,
+                        "yaw": yaw,
+                        "roll": roll,
+                        "t": t,
+                        "exp": exp,
+                        "scale": scale,
+                        "kp": kp
+                    }
+                    src_infos[i].append(copy.deepcopy(x_s_info))
+                    x_c_s = kp
+                    R_s = get_rotation_matrix(pitch, yaw, roll)
+                    f_s = self.model_dict["app_feat_extractor"].predict(img_crop_256x256)
+                    x_s = transform_keypoint(pitch, yaw, roll, t, exp, scale, kp)
+                    src_infos[i].extend([source_lmk.copy(), R_s.copy(), f_s.copy(), x_s.copy(), x_c_s.copy()])
+                    if not self.is_animal:
+                        flag_lip_zero = self.cfg.infer_params.flag_normalize_lip  # not overwrite
+                        if flag_lip_zero:
+                            # let lip-open scalar to be 0 at first
+                            # 似乎要调参？
+                            c_d_lip_before_animation = [0.05]
+                            combined_lip_ratio_tensor_before_animation = self.calc_combined_lip_ratio(
+                                c_d_lip_before_animation, source_lmk.copy())
+                            if combined_lip_ratio_tensor_before_animation[0][
+                                0] < self.cfg.infer_params.lip_normalize_threshold:
+                                flag_lip_zero = False
+                                src_infos[i].append(None)
+                                src_infos[i].append(flag_lip_zero)
+                            else:
+                                lip_delta_before_animation = self.model_dict['stitching_lip_retarget'].predict(
+                                    concat_feat(x_s, combined_lip_ratio_tensor_before_animation))
+                                src_infos[i].append(lip_delta_before_animation.copy())
+                                src_infos[i].append(flag_lip_zero)
+                        else:
+                            src_infos[i].append(None)
+                            src_infos[i].append(flag_lip_zero)
+                    else:
+                        src_infos[i].append(None)
+                        src_infos[i].append(False)
+
+                    ######## prepare for pasteback ########
+                    if self.cfg.infer_params.flag_pasteback and self.cfg.infer_params.flag_do_crop and self.cfg.infer_params.flag_stitching:
+                        mask_ori_float = prepare_paste_back(self.mask_crop, crop_info['M_c2o'],
+                                                            dsize=(img_rgb.shape[1], img_rgb.shape[0]))
+                        mask_ori_float = torch.from_numpy(mask_ori_float).to(self.device)
+                        src_infos[i].append(mask_ori_float)
+                    else:
+                        src_infos[i].append(None)
+                    M = torch.from_numpy(crop_info['M_c2o']).to(self.device)
+                    src_infos[i].append(M)
+                self.src_infos.append(src_infos[:])
+            print(f"finish process source:{source_path} >>>>>>>>")
+            return len(self.src_infos) > 0
+        except Exception as e:
+            traceback.print_exc()
+            return False
+
+    def retarget_eye(self, kp_source, eye_close_ratio):
+        """
+        kp_source: BxNx3
+        eye_close_ratio: Bx3
+        Return: Bx(3*num_kp+2)
+        """
+        feat_eye = concat_feat(kp_source, eye_close_ratio)
+        delta = self.model_dict['stitching_eye_retarget'].predict(feat_eye)
+        return delta
+
+    def retarget_lip(self, kp_source, lip_close_ratio):
+        """
+        kp_source: BxNx3
+        lip_close_ratio: Bx2
+        """
+        feat_lip = concat_feat(kp_source, lip_close_ratio)
+        delta = self.model_dict['stitching_lip_retarget'].predict(feat_lip)
+        return delta
+
+    def stitching(self, kp_source, kp_driving):
+        """ conduct the stitching
+        kp_source: Bxnum_kpx3
+        kp_driving: Bxnum_kpx3
+        """
+
+        bs, num_kp = kp_source.shape[:2]
+
+        kp_driving_new = kp_driving.copy()
+
+        delta = self.model_dict['stitching'].predict(concat_feat(kp_source, kp_driving_new))
+
+        delta_exp = delta[..., :3 * num_kp].reshape(bs, num_kp, 3)  # 1x20x3
+        delta_tx_ty = delta[..., 3 * num_kp:3 * num_kp + 2].reshape(bs, 1, 2)  # 1x1x2
+
+        kp_driving_new += delta_exp
+        kp_driving_new[..., :2] += delta_tx_ty
+
+        return kp_driving_new
+
+    def _run(self, src_info, x_d_i_info, x_d_0_info, R_d_i, R_d_0, realtime, input_eye_ratio, input_lip_ratio,
+             I_p_pstbk, **kwargs):
+        """Animate every entry of ``src_info`` with one driving frame.
+
+        For each entry the target rotation, expression, scale and translation
+        are selected from the source / driving motion according to
+        ``cfg.infer_params`` (``flag_relative_motion``, ``animation_region``)
+        and ``self.is_source_video``. Optional stitching and eye / lip
+        retargeting are then applied (for animals only stitching is applied),
+        and the ``warping_spade`` model renders the crop. When paste-back is
+        enabled and ``realtime`` is false the crop is pasted into ``I_p_pstbk``.
+
+        Args:
+            src_info: list of entries, each unpacking into ``(x_s_info,
+                source_lmk, R_s, f_s, x_s, x_c_s, lip_delta_before_animation,
+                flag_lip_zero, mask_ori_float, M)``.
+            x_d_i_info: motion dict of the current driving frame; the keys
+                ``'exp'``, ``'scale'`` and ``'t'`` are read here.
+            x_d_0_info: motion dict of the reference driving frame, used only
+                in relative-motion mode for non-video sources.
+            R_d_i: rotation of the current driving frame.
+            R_d_0: rotation of the reference driving frame.
+            realtime: when true, the paste-back step is skipped.
+            input_eye_ratio: driving eye ratio, read only when
+                ``flag_eye_retargeting`` is set.
+            input_lip_ratio: driving lip ratio, read only when
+                ``flag_lip_retargeting`` is set.
+            I_p_pstbk: tensor that rendered crops are pasted back into.
+            **kwargs: accepted but not read by this method.
+
+        Returns:
+            ``(out_crop, I_p_pstbk)``, both converted to uint8 numpy arrays.
+            ``out_crop`` is the render of the last entry in ``src_info``.
+        """
+        out_crop, out_org = None, None
+        eye_delta_before_animation = None
+        for j in range(len(src_info)):
+            if self.is_source_video:
+                x_s_info, source_lmk, R_s, f_s, x_s, x_c_s, lip_delta_before_animation, flag_lip_zero, mask_ori_float, M = \
+                    src_info[j]
+                # let lip-open scalar to be 0 at first if the input is a video and flag_relative_motion
+                if not (self.cfg.infer_params.flag_normalize_lip and self.cfg.infer_params.flag_relative_motion):
+                    lip_delta_before_animation = None
+                # let eye-open scalar to be the same as the first frame if the latter is eye-open state
+                if self.cfg.infer_params.flag_source_video_eye_retargeting and source_lmk is not None:
+                    # NOTE(review): src_info[0][1] is the landmark of the first entry of *this* src_info list
+                    combined_eye_ratio_tensor_frame_zero = utils.calc_eye_close_ratio(src_info[0][1])
+                    c_d_eye_before_animation_frame_zero = [
+                        [combined_eye_ratio_tensor_frame_zero[0][:2].mean()]]
+                    # below the threshold, fall back to a fixed eye ratio of 0.39
+                    if c_d_eye_before_animation_frame_zero[0][
+                        0] < self.cfg.infer_params.source_video_eye_retargeting_threshold:
+                        c_d_eye_before_animation_frame_zero = [[0.39]]
+                    combined_eye_ratio_tensor_before_animation = self.calc_combined_eye_ratio(
+                        c_d_eye_before_animation_frame_zero, source_lmk)
+                    eye_delta_before_animation = self.retarget_eye(x_s, combined_eye_ratio_tensor_before_animation)
+
+                # for video sources the paste-back mask is rebuilt from this entry's M on every call
+                if not realtime and self.cfg.infer_params.flag_pasteback and self.cfg.infer_params.flag_do_crop and \
+                        self.cfg.infer_params.flag_stitching:
+                    mask_ori_float = prepare_paste_back(self.mask_crop, M.cpu().numpy(),
+                                                        dsize=(self.src_imgs[0].shape[1], self.src_imgs[0].shape[0]))
+                    mask_ori_float = torch.from_numpy(mask_ori_float).to(self.device)
+            else:
+                x_s_info, source_lmk, R_s, f_s, x_s, x_c_s, lip_delta_before_animation, flag_lip_zero, mask_ori_float, M = \
+                    src_info[j]
+            if self.cfg.infer_params.flag_relative_motion:
+                # ---- relative motion: apply the driving change w.r.t. the reference frame onto the source ----
+                if self.cfg.infer_params.animation_region in ["all", "pose"]:
+                    if self.is_source_video:
+                        R_new = self.R_d_smooth.process(R_d_i)
+                    else:
+                        R_new = (R_d_i @ np.transpose(R_d_0, (0, 2, 1))) @ R_s
+                else:
+                    R_new = R_s
+
+                delta_new = x_s_info['exp'].copy()
+                x_d_exp_smooth = x_d_i_info['exp'].copy()
+                if self.is_source_video:
+                    x_d_exp_smooth = self.exp_smooth.process(x_d_exp_smooth)
+                if self.cfg.infer_params.animation_region in ["all", "exp"]:
+                    if self.is_source_video:
+                        # video source: overwrite selected expression entries with the smoothed driving values
+                        for idx in [1, 2, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]:
+                            delta_new[:, idx, :] = x_d_exp_smooth[:, idx, :]
+                        delta_new[:, 3:5, 1] = x_d_exp_smooth[:, 3:5, 1]
+                        delta_new[:, 5, 2] = x_d_exp_smooth[:, 5, 2]
+                        delta_new[:, 8, 2] = x_d_exp_smooth[:, 8, 2]
+                        delta_new[:, 9, 1:] = x_d_exp_smooth[:, 9, 1:]
+                    else:
+                        delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+                elif self.cfg.infer_params.animation_region in ["lip"]:
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        if self.is_source_video:
+                            delta_new[:, lip_idx, :] = x_d_exp_smooth[:, lip_idx, :]
+                        else:
+                            delta_new[:, lip_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:,
+                                                       lip_idx, :]
+                elif self.cfg.infer_params.animation_region in ["eyes"]:
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        if self.is_source_video:
+                            delta_new[:, eyes_idx, :] = x_d_exp_smooth[:, eyes_idx, :]
+                        else:
+                            delta_new[:, eyes_idx, :] = (x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp']))[:,
+                                                        eyes_idx, :]
+                if self.cfg.infer_params.animation_region in ["all"]:
+                    scale_new = x_s_info['scale'] if self.is_source_video else x_s_info['scale'] * (
+                            x_d_i_info['scale'] / x_d_0_info['scale'])
+                else:
+                    scale_new = x_s_info['scale']
+                if self.cfg.infer_params.animation_region in ["all"]:
+                    t_new = x_s_info['t'] if self.is_source_video else x_s_info['t'] + (
+                            x_d_i_info['t'] - x_d_0_info['t'])
+                else:
+                    t_new = x_s_info['t']
+            else:
+                # ---- absolute motion: take the driving values directly ----
+                if self.cfg.infer_params.animation_region in ["all", "pose"]:
+                    if self.is_source_video:
+                        R_new = self.R_d_smooth.process(R_d_i)
+                    else:
+                        R_new = R_d_i
+                else:
+                    R_new = R_s
+
+                delta_new = x_s_info['exp'].copy()
+                x_d_exp_smooth = x_d_i_info['exp'].copy()
+                if self.is_source_video:
+                    x_d_exp_smooth = self.exp_smooth.process(x_d_exp_smooth)
+                if self.cfg.infer_params.animation_region in ["all", "exp"]:
+                    for idx in [1, 2, 6, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]:
+                        delta_new[:, idx, :] = x_d_exp_smooth[:, idx, :] if self.is_source_video else x_d_i_info['exp'][
+                                                                                                      :, idx, :]
+                    delta_new[:, 3:5, 1] = x_d_exp_smooth[:, 3:5, 1] if self.is_source_video else x_d_i_info['exp'][:,
+                                                                                                  3:5, 1]
+                    delta_new[:, 5, 2] = x_d_exp_smooth[:, 5, 2] if self.is_source_video else x_d_i_info['exp'][:,
+                                                                                              5, 2]
+                    delta_new[:, 8, 2] = x_d_exp_smooth[:, 8, 2] if self.is_source_video else x_d_i_info['exp'][:,
+                                                                                              8, 2]
+                    delta_new[:, 9, 1:] = x_d_exp_smooth[:, 9, 1:] if self.is_source_video else x_d_i_info['exp'][:,
+                                                                                                9, 1:]
+                elif self.cfg.infer_params.animation_region in ["lip"]:
+                    for lip_idx in [6, 12, 14, 17, 19, 20]:
+                        delta_new[:, lip_idx, :] = x_d_exp_smooth[:, lip_idx, :] if self.is_source_video else \
+                            x_d_i_info['exp'][:, lip_idx, :]
+                elif self.cfg.infer_params.animation_region in ["eyes"]:
+                    for eyes_idx in [11, 13, 15, 16, 18]:
+                        delta_new[:, eyes_idx, :] = x_d_exp_smooth[:, eyes_idx, :] if self.is_source_video else \
+                            x_d_i_info['exp'][:, eyes_idx, :]
+                scale_new = x_s_info['scale'].copy()
+                if self.cfg.infer_params.animation_region in ["all", "pose"]:
+                    t_new = x_d_i_info['t'].copy()
+                else:
+                    t_new = x_s_info['t'].copy()
+
+            # NOTE(review): in relative-motion mode t_new can be x_s_info['t'] itself (no copy is taken there),
+            # so this in-place write also zeroes tz in the stored source info.
+            t_new[..., 2] = 0  # zero tz
+            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+            if not self.is_animal:
+                # Algorithm 1:
+                if not self.cfg.infer_params.flag_stitching and not self.cfg.infer_params.flag_eye_retargeting and not self.cfg.infer_params.flag_lip_retargeting:
+                    # without stitching or retargeting
+                    if flag_lip_zero and lip_delta_before_animation is not None:
+                        x_d_i_new += lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
+                    if self.cfg.infer_params.flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                        x_d_i_new += eye_delta_before_animation
+                elif self.cfg.infer_params.flag_stitching and not self.cfg.infer_params.flag_eye_retargeting and not self.cfg.infer_params.flag_lip_retargeting:
+                    # with stitching and without retargeting
+                    if flag_lip_zero and lip_delta_before_animation is not None:
+                        x_d_i_new = self.stitching(x_s, x_d_i_new) + lip_delta_before_animation.reshape(
+                            -1, x_s.shape[1], 3)
+                    else:
+                        x_d_i_new = self.stitching(x_s, x_d_i_new)
+                    if self.cfg.infer_params.flag_source_video_eye_retargeting and eye_delta_before_animation is not None:
+                        x_d_i_new += eye_delta_before_animation
+                else:
+                    # at least one of eye / lip retargeting is enabled
+                    eyes_delta, lip_delta = None, None
+                    if self.cfg.infer_params.flag_eye_retargeting:
+                        c_d_eyes_i = input_eye_ratio
+                        combined_eye_ratio_tensor = self.calc_combined_eye_ratio(c_d_eyes_i,
+                                                                                 source_lmk)
+                        # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
+                        eyes_delta = self.retarget_eye(x_s, combined_eye_ratio_tensor)
+                    if self.cfg.infer_params.flag_lip_retargeting:
+                        c_d_lip_i = input_lip_ratio
+                        combined_lip_ratio_tensor = self.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
+                        # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+                        lip_delta = self.retarget_lip(x_s, combined_lip_ratio_tensor)
+
+                    if self.cfg.infer_params.flag_relative_motion:  # use x_s
+                        x_d_i_new = x_s + \
+                                    (eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
+                                    (lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
+                    else:  # use x_d,i
+                        x_d_i_new = x_d_i_new + \
+                                    (eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
+                                    (lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
+
+                    if self.cfg.infer_params.flag_stitching:
+                        x_d_i_new = self.stitching(x_s, x_d_i_new)
+            else:
+                # animal models: only stitching is applied, no eye / lip retargeting
+                if self.cfg.infer_params.flag_stitching:
+                    x_d_i_new = self.stitching(x_s, x_d_i_new)
+
+            # scale the displacement from the source keypoints by driving_multiplier
+            x_d_i_new = x_s + (x_d_i_new - x_s) * self.cfg.infer_params.driving_multiplier
+            out_crop = self.model_dict["warping_spade"].predict(f_s, x_s, x_d_i_new)
+            if not realtime and self.cfg.infer_params.flag_pasteback and self.cfg.infer_params.flag_do_crop and self.cfg.infer_params.flag_stitching:
+                # TODO: pasteback is slow, considering optimize it using multi-threading or GPU
+                # I_p_pstbk = paste_back(out_crop, crop_info['M_c2o'], I_p_pstbk, mask_ori_float)
+                I_p_pstbk = paste_back_pytorch(out_crop, M, I_p_pstbk, mask_ori_float)
+        # NOTE(review): if src_info is empty the loop never runs, out_crop is still None and .to() raises here.
+        return out_crop.to(dtype=torch.uint8).cpu().numpy(), I_p_pstbk.to(dtype=torch.uint8).cpu().numpy()
+
+    def run(self, image, img_src, src_info, **kwargs):
+        img_bgr = image
+        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+        I_p_pstbk = torch.from_numpy(img_src).to(self.device).float()
+        realtime = kwargs.get("realtime", False)
+        if self.cfg.infer_params.flag_crop_driving_video:
+            if self.src_lmk_pre is None:
+                src_face = self.model_dict["face_analysis"].predict(img_bgr)
+                if len(src_face) == 0:
+                    return None, None, None, None
+                lmk = src_face[0]
+                lmk = self.model_dict["landmark"].predict(img_rgb, lmk)
+                self.src_lmk_pre = lmk.copy()
+            else:
+                lmk = self.model_dict["landmark"].predict(img_rgb, self.src_lmk_pre)
+                self.src_lmk_pre = lmk.copy()
+
+            ret_bbox = parse_bbox_from_landmark(
+                lmk,
+                scale=self.cfg.crop_params.dri_scale,
+                vx_ratio_crop_video=self.cfg.crop_params.dri_vx_ratio,
+                vy_ratio=self.cfg.crop_params.dri_vy_ratio,
+            )["bbox"]
+            global_bbox = [
+                ret_bbox[0, 0],
+                ret_bbox[0, 1],
+                ret_bbox[2, 0],
+                ret_bbox[2, 1],
+            ]
+            ret_dct = crop_image_by_bbox(
+                img_rgb,
+                global_bbox,
+                lmk=lmk,
+                dsize=kwargs.get("dsize", 512),
+                flag_rot=False,
+                borderValue=(0, 0, 0),
+            )
+            lmk_crop = ret_dct["lmk_crop"]
+            img_crop = ret_dct["img_crop"]
+            img_crop = cv2.resize(img_crop, (256, 256))
+        else:
+            if self.src_lmk_pre is None:
+                src_face = self.model_dict["face_analysis"].predict(img_bgr)
+                if len(src_face) == 0:
+                    return None, None, None, None
+                lmk = src_face[0]
+                lmk = self.model_dict["landmark"].predict(img_rgb, lmk)
+                self.src_lmk_pre = lmk.copy()
+            else:
+                lmk = self.model_dict["landmark"].predict(img_rgb, self.src_lmk_pre)
+                self.src_lmk_pre = lmk.copy()
+            lmk_crop = lmk.copy()
+            img_crop = cv2.resize(img_rgb, (256, 256))
+
+        input_eye_ratio = calc_eye_close_ratio(lmk_crop[None])
+        input_lip_ratio = calc_lip_close_ratio(lmk_crop[None])
+        pitch, yaw, roll, t, exp, scale, kp = self.model_dict["motion_extractor"].predict(img_crop)
+        x_d_i_info = {
+            "pitch": pitch,
+            "yaw": yaw,
+            "roll": roll,
+            "t": t,
+            "exp": exp,
+            "scale": scale,
+            "kp": kp
+        }
+        R_d_i = get_rotation_matrix(pitch, yaw, roll)
+        x_d_i_info["R"] = R_d_i
+        x_d_i_info_copy = copy.deepcopy(x_d_i_info)
+        for key in x_d_i_info_copy:
+            x_d_i_info_copy[key] = x_d_i_info_copy[key].astype(np.float32)
+        dri_motion_info = [x_d_i_info_copy, copy.deepcopy(input_eye_ratio.astype(np.float32)),
+                           copy.deepcopy(input_lip_ratio.astype(np.float32))]
+        if kwargs.get("first_frame", False) or self.R_d_0 is None:
+            self.frame_id = 0
+            self.R_d_0 = R_d_i.copy()
+            self.x_d_0_info = copy.deepcopy(x_d_i_info)
+            # realtime smooth
+            self.R_d_smooth = utils.OneEuroFilter(4, 0.3)
+            self.exp_smooth = utils.OneEuroFilter(4, 0.3)
+        R_d_0 = self.R_d_0.copy()
+        x_d_0_info = copy.deepcopy(self.x_d_0_info)
+        out_crop, I_p_pstbk = self._run(src_info, x_d_i_info, x_d_0_info, R_d_i, R_d_0, realtime, input_eye_ratio,
+                                        input_lip_ratio,
+                                        I_p_pstbk, **kwargs)
+        return img_crop, out_crop, I_p_pstbk, dri_motion_info
+
+    def run_with_pkl(self, dri_motion_info, img_src, src_info, **kwargs):
+        I_p_pstbk = torch.from_numpy(img_src).to(self.device).float()
+        realtime = kwargs.get("realtime", False)
+
+        input_eye_ratio = dri_motion_info[1]
+        input_lip_ratio = dri_motion_info[2]
+        x_d_i_info = dri_motion_info[0]
+        R_d_i = x_d_i_info["R"] if "R" in x_d_i_info else x_d_i_info["R_d"]
+
+        if kwargs.get("first_frame", False) or self.R_d_0 is None:
+            self.frame_id = 0
+            self.R_d_0 = R_d_i.copy()
+            self.x_d_0_info = copy.deepcopy(x_d_i_info)
+            # realtime smooth
+            self.R_d_smooth = utils.OneEuroFilter(4, 0.3)
+            self.exp_smooth = utils.OneEuroFilter(4, 0.3)
+        R_d_0 = self.R_d_0.copy()
+        x_d_0_info = copy.deepcopy(self.x_d_0_info)
+        out_crop, I_p_pstbk = self._run(src_info, x_d_i_info, x_d_0_info, R_d_i, R_d_0, realtime, input_eye_ratio,
+                                        input_lip_ratio, I_p_pstbk, **kwargs)
+        return out_crop, I_p_pstbk
+
+    def __del__(self):
+        # Finalizer: delegates to clean_models() when the pipeline object is destroyed.
+        # NOTE(review): presumably releases the loaded models - clean_models is defined outside this view.
+        self.clean_models()
diff --git a/src/pipelines/gradio_live_portrait_pipeline.py b/src/pipelines/gradio_live_portrait_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f998f34c100e73d3674c9dda168852ac508d7fb
--- /dev/null
+++ b/src/pipelines/gradio_live_portrait_pipeline.py
@@ -0,0 +1,562 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: gradio_live_portrait_pipeline.py
+import pdb
+
+import gradio as gr
+import cv2
+import datetime
+import os
+import time
+import torchaudio
+from tqdm import tqdm
+import subprocess
+import pickle
+import numpy as np
+from .faster_live_portrait_pipeline import FasterLivePortraitPipeline
+from .joyvasa_audio_to_motion_pipeline import JoyVASAAudio2MotionPipeline
+from ..utils.utils import video_has_audio
+from ..utils.utils import resize_to_limit, prepare_paste_back, get_rotation_matrix, calc_lip_close_ratio, \
+    calc_eye_close_ratio, transform_keypoint, concat_feat
+from ..utils.crop import crop_image, parse_bbox_from_landmark, crop_image_by_bbox, paste_back, paste_back_pytorch
+from src.utils import utils
+import platform
+import torch
+from PIL import Image
+
+if platform.system().lower() == 'windows':
+    FFMPEG = "third_party/ffmpeg-7.0.1-full_build/bin/ffmpeg.exe"
+else:
+    FFMPEG = "ffmpeg"
+
+
+class GradioLivePortraitPipeline(FasterLivePortraitPipeline):
+    def __init__(self, cfg, **kwargs):
+        super(GradioLivePortraitPipeline, self).__init__(cfg, **kwargs)
+        self.joyvasa_pipe = None
+        self.kokoro_model = None
+
+    def execute_video(
+            self,
+            input_source_image_path=None,
+            input_source_video_path=None,
+            input_driving_video_path=None,
+            input_driving_image_path=None,
+            input_driving_pickle_path=None,
+            input_driving_audio_path=None,
+            input_driving_text=None,
+            flag_relative_input=True,
+            flag_do_crop_input=True,
+            flag_remap_input=True,
+            driving_multiplier=1.0,
+            flag_stitching=True,
+            flag_crop_driving_video_input=True,
+            flag_video_editing_head_rotation=False,
+            flag_is_animal=False,
+            animation_region="all",
+            scale=2.3,
+            vx_ratio=0.0,
+            vy_ratio=-0.125,
+            scale_crop_driving_video=2.2,
+            vx_ratio_crop_driving_video=0.0,
+            vy_ratio_crop_driving_video=-0.1,
+            driving_smooth_observation_variance=1e-7,
+            tab_selection=None,
+            v_tab_selection=None,
+            cfg_scale=4.0,
+            voice_name='af',
+    ):
+        """ for video driven potrait animation
+        """
+        if tab_selection == 'Video':
+            input_source_path = input_source_video_path
+        else:
+            input_source_path = input_source_image_path
+
+        if v_tab_selection == 'Image':
+            input_driving_path = str(input_driving_image_path)
+        elif v_tab_selection == 'Pickle':
+            input_driving_path = str(input_driving_pickle_path)
+        elif v_tab_selection == 'Audio':
+            input_driving_path = str(input_driving_audio_path)
+        elif v_tab_selection == 'Text':
+            input_driving_path = input_driving_text
+        else:
+            input_driving_path = str(input_driving_video_path)
+
+        if flag_is_animal != self.is_animal:
+            self.init_models(is_animal=flag_is_animal)
+
+        if input_source_path and input_driving_path:
+            args_user = {
+                'source': input_source_path,
+                'driving': input_driving_path,
+                'flag_relative_motion': flag_relative_input,
+                'flag_do_crop': flag_do_crop_input,
+                'flag_pasteback': flag_remap_input,
+                'driving_multiplier': driving_multiplier,
+                'flag_stitching': flag_stitching,
+                'flag_crop_driving_video': flag_crop_driving_video_input,
+                'flag_video_editing_head_rotation': flag_video_editing_head_rotation,
+                'src_scale': scale,
+                'src_vx_ratio': vx_ratio,
+                'src_vy_ratio': vy_ratio,
+                'dri_scale': scale_crop_driving_video,
+                'dri_vx_ratio': vx_ratio_crop_driving_video,
+                'dri_vy_ratio': vy_ratio_crop_driving_video,
+                'driving_smooth_observation_variance': driving_smooth_observation_variance,
+                'animation_region': animation_region,
+                'cfg_scale': cfg_scale
+            }
+            # update config from user input
+            update_ret = self.update_cfg(args_user)
+            if v_tab_selection == 'Video':
+                # video driven animation
+                video_path, video_path_concat, total_time = self.run_video_driving(input_driving_path,
+                                                                                   input_source_path,
+                                                                                   update_ret=update_ret)
+                gr.Info(f"Run successfully! Cost: {total_time} seconds!", duration=3)
+                return gr.update(visible=True), video_path, gr.update(visible=True), video_path_concat, gr.update(
+                    visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            elif v_tab_selection == 'Pickle':
+                # pickle driven animation
+                video_path, video_path_concat, total_time = self.run_pickle_driving(input_driving_path,
+                                                                                    input_source_path,
+                                                                                    update_ret=update_ret)
+                gr.Info(f"Run successfully! Cost: {total_time} seconds!", duration=3)
+                return gr.update(visible=True), video_path, gr.update(visible=True), video_path_concat, gr.update(
+                    visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            elif v_tab_selection == 'Audio':
+                # audio driven animation
+                video_path, video_path_concat, total_time = self.run_audio_driving(input_driving_path,
+                                                                                   input_source_path,
+                                                                                   update_ret=update_ret)
+                gr.Info(f"Run successfully! Cost: {total_time} seconds!", duration=3)
+                return gr.update(visible=True), video_path, gr.update(visible=True), video_path_concat, gr.update(
+                    visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            elif v_tab_selection == 'Text':
+                # Text driven animation
+                video_path, video_path_concat, total_time = self.run_text_driving(input_driving_path,
+                                                                                  voice_name,
+                                                                                  input_source_path,
+                                                                                  update_ret=update_ret)
+                gr.Info(f"Run successfully! Cost: {total_time} seconds!", duration=3)
+                return gr.update(visible=True), video_path, gr.update(visible=True), video_path_concat, gr.update(
+                    visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            else:
+                # video driven animation
+                image_path, image_path_concat, total_time = self.run_image_driving(input_driving_path,
+                                                                                   input_source_path,
+                                                                                   update_ret=update_ret)
+                gr.Info(f"Run successfully! Cost: {total_time} seconds!", duration=3)
+                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
+                    visible=False), gr.update(visible=True), image_path, gr.update(
+                    visible=True), image_path_concat
+        else:
+            raise gr.Error("The input source portrait or driving video hasn't been prepared yet 💥!", duration=5)
+
+    def run_image_driving(self, driving_image_path, source_path, **kwargs):
+        if self.source_path != source_path or kwargs.get("update_ret", False):
+            # 如果不一样要重新初始化变量
+            self.init_vars(**kwargs)
+            ret = self.prepare_source(source_path)
+            if not ret:
+                raise gr.Error(f"Error in processing source:{source_path} 💥!", duration=5)
+
+        driving_image = cv2.imread(driving_image_path)
+        save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
+        os.makedirs(save_dir, exist_ok=True)
+
+        image_crop_path = os.path.join(save_dir,
+                                       f"{os.path.basename(source_path)}-{os.path.basename(driving_image_path)}-crop.jpg")
+        image_org_path = os.path.join(save_dir,
+                                      f"{os.path.basename(source_path)}-{os.path.basename(driving_image_path)}-org.jpg")
+
+        t0 = time.time()
+        dri_crop, out_crop, out_org = self.run(driving_image, self.src_imgs[0], self.src_infos[0],
+                                               first_frame=True)[:3]
+
+        dri_crop = cv2.resize(dri_crop, (512, 512))
+        out_crop = np.concatenate([dri_crop, out_crop], axis=1)
+        out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+        cv2.imwrite(image_crop_path, out_crop)
+        out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+        cv2.imwrite(image_org_path, out_org)
+        total_time = time.time() - t0
+
+        return image_org_path, image_crop_path, total_time
+
+    def run_video_driving(self, driving_video_path, source_path, **kwargs):
+        t00 = time.time()
+
+        if self.source_path != source_path or kwargs.get("update_ret", False):
+            # 如果不一样要重新初始化变量
+            self.init_vars(**kwargs)
+            ret = self.prepare_source(source_path)
+            if not ret:
+                raise gr.Error(f"Error in processing source:{source_path} 💥!", duration=5)
+
+        vcap = cv2.VideoCapture(driving_video_path)
+        if self.is_source_video:
+            duration, fps = utils.get_video_info(self.source_path)
+            fps = int(fps)
+        else:
+            fps = int(vcap.get(cv2.CAP_PROP_FPS))
+
+        dframe = int(vcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if self.is_source_video:
+            max_frame = min(dframe, len(self.src_imgs))
+        else:
+            max_frame = dframe
+        h, w = self.src_imgs[0].shape[:2]
+        save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
+        os.makedirs(save_dir, exist_ok=True)
+
+        # render output video
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        vsave_crop_path = os.path.join(save_dir,
+                                       f"{os.path.basename(source_path)}-{os.path.basename(driving_video_path)}-crop.mp4")
+        vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512 * 2, 512))
+        vsave_org_path = os.path.join(save_dir,
+                                      f"{os.path.basename(source_path)}-{os.path.basename(driving_video_path)}-org.mp4")
+        vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+        infer_times = []
+        for i in tqdm(range(max_frame)):
+            ret, frame = vcap.read()
+            if not ret:
+                break
+            t0 = time.time()
+            first_frame = i == 0
+            if self.is_source_video:
+                dri_crop, out_crop, out_org = self.run(frame, self.src_imgs[i], self.src_infos[i],
+                                                       first_frame=first_frame)[:3]
+            else:
+                dri_crop, out_crop, out_org = self.run(frame, self.src_imgs[0], self.src_infos[0],
+                                                       first_frame=first_frame)[:3]
+            if out_crop is None:
+                print(f"no face in driving frame:{i}")
+                continue
+            infer_times.append(time.time() - t0)
+            dri_crop = cv2.resize(dri_crop, (512, 512))
+            out_crop = np.concatenate([dri_crop, out_crop], axis=1)
+            out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+            vout_crop.write(out_crop)
+            out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+            vout_org.write(out_org)
+        total_time = time.time() - t00
+        vcap.release()
+        vout_crop.release()
+        vout_org.release()
+
+        if video_has_audio(driving_video_path):
+            vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
+            vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
+            if self.is_source_video:
+                duration, fps = utils.get_video_info(vsave_crop_path)
+                subprocess.call(
+                    [FFMPEG, "-i", vsave_crop_path, "-i", driving_video_path,
+                     "-b:v", "10M", "-c:v", "libx264", "-map", "0:v", "-map", "1:a",
+                     "-c:a", "aac", "-pix_fmt", "yuv420p",
+                     "-shortest",  # 以最短的流为基准
+                     "-t", str(duration),  # 设置时长
+                     "-r", str(fps),  # 设置帧率
+                     vsave_crop_path_new, "-y"])
+                subprocess.call(
+                    [FFMPEG, "-i", vsave_org_path, "-i", driving_video_path,
+                     "-b:v", "10M", "-c:v", "libx264", "-map", "0:v", "-map", "1:a",
+                     "-c:a", "aac", "-pix_fmt", "yuv420p",
+                     "-shortest",  # 以最短的流为基准
+                     "-t", str(duration),  # 设置时长
+                     "-r", str(fps),  # 设置帧率
+                     vsave_org_path_new, "-y"])
+            else:
+                subprocess.call(
+                    [FFMPEG, "-i", vsave_crop_path, "-i", driving_video_path,
+                     "-b:v", "10M", "-c:v",
+                     "libx264", "-map", "0:v", "-map", "1:a",
+                     "-c:a", "aac",
+                     "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
+                subprocess.call(
+                    [FFMPEG, "-i", vsave_org_path, "-i", driving_video_path,
+                     "-b:v", "10M", "-c:v",
+                     "libx264", "-map", "0:v", "-map", "1:a",
+                     "-c:a", "aac",
+                     "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
+
+            return vsave_org_path_new, vsave_crop_path_new, total_time
+        else:
+            return vsave_org_path, vsave_crop_path, total_time
+
+    def run_pickle_driving(self, driving_pickle_path, source_path, **kwargs):
+        t00 = time.time()
+
+        if self.source_path != source_path or kwargs.get("update_ret", False):
+            # 如果不一样要重新初始化变量
+            self.init_vars(**kwargs)
+            ret = self.prepare_source(source_path)
+            if not ret:
+                raise gr.Error(f"Error in processing source:{source_path} 💥!", duration=5)
+
+        with open(driving_pickle_path, "rb") as fin:
+            dri_motion_infos = pickle.load(fin)
+
+        if self.is_source_video:
+            duration, fps = utils.get_video_info(self.source_path)
+            fps = int(fps)
+        else:
+            fps = int(dri_motion_infos["output_fps"])
+
+        motion_lst = dri_motion_infos["motion"]
+        c_eyes_lst = dri_motion_infos["c_eyes_lst"] if "c_eyes_lst" in dri_motion_infos else dri_motion_infos[
+            "c_d_eyes_lst"]
+        c_lip_lst = dri_motion_infos["c_lip_lst"] if "c_lip_lst" in dri_motion_infos else dri_motion_infos[
+            "c_d_lip_lst"]
+        dframe = len(motion_lst)
+
+        if self.is_source_video:
+            max_frame = min(dframe, len(self.src_imgs))
+        else:
+            max_frame = dframe
+        h, w = self.src_imgs[0].shape[:2]
+        save_dir = kwargs.get("save_dir", f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
+        os.makedirs(save_dir, exist_ok=True)
+
+        # render output video
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        vsave_crop_path = os.path.join(save_dir,
+                                       f"{os.path.basename(source_path)}-{os.path.basename(driving_pickle_path)}-crop.mp4")
+        vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512, 512))
+        vsave_org_path = os.path.join(save_dir,
+                                      f"{os.path.basename(source_path)}-{os.path.basename(driving_pickle_path)}-org.mp4")
+        vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
+
+        infer_times = []
+        for frame_ind in tqdm(range(max_frame)):
+            t0 = time.time()
+            first_frame = frame_ind == 0
+            dri_motion_info_ = [motion_lst[frame_ind]]
+            if c_eyes_lst:
+                dri_motion_info_.append(c_eyes_lst[frame_ind])
+            else:
+                dri_motion_info_.append(None)
+            if c_lip_lst:
+                dri_motion_info_.append(c_lip_lst[frame_ind])
+            else:
+                dri_motion_info_.append(None)
+            if self.is_source_video:
+                out_crop, out_org = self.run_with_pkl(dri_motion_info_, self.src_imgs[frame_ind],
+                                                      self.src_infos[frame_ind],
+                                                      first_frame=first_frame)[:3]
+            else:
+                out_crop, out_org = self.run_with_pkl(dri_motion_info_, self.src_imgs[0], self.src_infos[0],
+                                                      first_frame=first_frame)[:3]
+            if out_crop is None:
+                print(f"no face in driving frame:{frame_ind}")
+                continue
+            infer_times.append(time.time() - t0)
+            out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
+            vout_crop.write(out_crop)
+            out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
+            vout_org.write(out_org)
+        total_time = time.time() - t00
+        vout_crop.release()
+        vout_org.release()
+
+        return vsave_org_path, vsave_crop_path, total_time
+
+    def run_audio_driving(self, driving_audio_path, source_path, **kwargs):
+        t00 = time.time()
+
+        if self.source_path != source_path or kwargs.get("update_ret", False):
+            # 如果不一样要重新初始化变量
+            self.init_vars(**kwargs)
+            ret = self.prepare_source(source_path)
+            if not ret:
+                raise gr.Error(f"Error in processing source:{source_path} 💥!", duration=5)
+        save_dir = kwargs.get("save_dir", f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
+        os.makedirs(save_dir, exist_ok=True)
+
+        if self.joyvasa_pipe is None:
+            self.joyvasa_pipe = JoyVASAAudio2MotionPipeline(motion_model_path=self.cfg.joyvasa_models.motion_model_path,
+                                                            audio_model_path=self.cfg.joyvasa_models.audio_model_path,
+                                                            motion_template_path=self.cfg.joyvasa_models.motion_template_path,
+                                                            cfg_mode=self.cfg.infer_params.cfg_mode,
+                                                            cfg_scale=self.cfg.infer_params.cfg_scale
+                                                            )
+        t01 = time.time()
+        dri_motion_infos = self.joyvasa_pipe.gen_motion_sequence(driving_audio_path)
+        gr.Info(f"JoyVASA cost time:{time.time() - t01}", duration=2)
+        motion_pickle_path = os.path.join(save_dir,
+                                          f"{os.path.basename(source_path)}-{os.path.basename(driving_audio_path)}.pkl")
+        with open(motion_pickle_path, "wb") as fw:
+            pickle.dump(dri_motion_infos, fw)
+
+        vsave_org_path, vsave_crop_path, total_time = self.run_pickle_driving(motion_pickle_path, source_path,
+                                                                              save_dir=save_dir)
+
+        vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
+        vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
+
+        duration, fps = utils.get_video_info(vsave_crop_path)
+        subprocess.call(
+            [FFMPEG, "-i", vsave_crop_path, "-i", driving_audio_path,
+             "-b:v", "10M", "-c:v", "libx264", "-map", "0:v", "-map", "1:a",
+             "-c:a", "aac", "-pix_fmt", "yuv420p",
+             "-shortest",  # 以最短的流为基准
+             "-t", str(duration),  # 设置时长
+             "-r", str(fps),  # 设置帧率
+             vsave_crop_path_new, "-y"])
+        subprocess.call(
+            [FFMPEG, "-i", vsave_org_path, "-i", driving_audio_path,
+             "-b:v", "10M", "-c:v", "libx264", "-map", "0:v", "-map", "1:a",
+             "-c:a", "aac", "-pix_fmt", "yuv420p",
+             "-shortest",  # 以最短的流为基准
+             "-t", str(duration),  # 设置时长
+             "-r", str(fps),  # 设置帧率
+             vsave_org_path_new, "-y"])
+
+        return vsave_org_path_new, vsave_crop_path_new, time.time() - t00
+
+    def run_text_driving(self, driving_text, voice_name, source_path, **kwargs):
+        if self.source_path != source_path or kwargs.get("update_ret", False):
+            # 如果不一样要重新初始化变量
+            self.init_vars(**kwargs)
+            ret = self.prepare_source(source_path)
+            if not ret:
+                raise gr.Error(f"Error in processing source:{source_path} 💥!", duration=5)
+        save_dir = kwargs.get("save_dir", f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
+        os.makedirs(save_dir, exist_ok=True)
+        # TODO: make it better
+        import platform
+        if platform.system() == "Windows":
+            # refer: https://huggingface.co/hexgrad/Kokoro-82M/discussions/12
+            # if you install in different path, remember to change below envs
+            os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
+            os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
+        from kokoro import KPipeline, KModel
+        import soundfile as sf
+        import json
+        with open("checkpoints/Kokoro-82M/config.json", "r", encoding="utf-8") as fin:
+            model_config = json.load(fin)
+        model = KModel(config=model_config, model="checkpoints/Kokoro-82M/kokoro-v1_0.pth")
+        pipeline = KPipeline(lang_code=voice_name[0], model=model)  # <= make sure lang_code matches voice
+        model.voices = {}
+        voice_path = "checkpoints/Kokoro-82M/voices"
+        for vname in os.listdir(voice_path):
+            pipeline.voices[os.path.splitext(vname)[0]] = torch.load(os.path.join(voice_path, vname), weights_only=True)
+        generator = pipeline(
+            driving_text, voice=voice_name,  # <= change voice here
+            speed=1, split_pattern=r'\n+'
+        )
+        audios = []
+        for i, (gs, ps, audio) in enumerate(generator):
+            audios.append(audio)
+        audios = np.concatenate(audios)
+        audio_save_path = os.path.join(save_dir, f"kokoro-82m-{voice_name}.wav")
+        sf.write(audio_save_path, audios, 24000)
+        print("save audio to:", audio_save_path)
+        vsave_org_path, vsave_crop_path, total_time = self.run_audio_driving(audio_save_path, source_path,
+                                                                             save_dir=save_dir)
+
+        return vsave_org_path, vsave_crop_path, total_time
+
+    def execute_image(self, input_eye_ratio: float, input_lip_ratio: float, input_image, flag_do_crop=True):
+        """ for single image retargeting
+        """
+        # disposable feature
+        f_s_user, x_s_user, source_lmk_user, crop_M_c2o, mask_ori, img_rgb = \
+            self.prepare_retargeting(input_image, flag_do_crop)
+
+        if input_eye_ratio is None or input_lip_ratio is None:
+            raise gr.Error("Invalid ratio input 💥!", duration=5)
+        else:
+            # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
+            combined_eye_ratio_tensor = self.calc_combined_eye_ratio([[input_eye_ratio]], source_lmk_user)
+            eyes_delta = self.retarget_eye(x_s_user, combined_eye_ratio_tensor)
+            # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
+            combined_lip_ratio_tensor = self.calc_combined_lip_ratio([[input_lip_ratio]], source_lmk_user)
+            lip_delta = self.retarget_lip(x_s_user, combined_lip_ratio_tensor)
+            num_kp = x_s_user.shape[1]
+            # default: use x_s
+            x_d_new = x_s_user + eyes_delta.reshape(-1, num_kp, 3) + lip_delta.reshape(-1, num_kp, 3)
+            # D(W(f_s; x_s, x′_d))
+            out = self.model_dict["warping_spade"].predict(f_s_user, x_s_user, x_d_new)
+            img_rgb = torch.from_numpy(img_rgb).to(self.device)
+            out_to_ori_blend = paste_back_pytorch(out, crop_M_c2o, img_rgb, mask_ori)
+            gr.Info("Run successfully!", duration=2)
+            return out.to(dtype=torch.uint8).cpu().numpy(), out_to_ori_blend.to(dtype=torch.uint8).cpu().numpy()
+
+    def prepare_retargeting(self, input_image, flag_do_crop=True):
+        """ for single image retargeting
+        """
+        if input_image is not None:
+            ######## process source portrait ########
+            img_bgr = cv2.imread(input_image, cv2.IMREAD_COLOR)
+            img_bgr = resize_to_limit(img_bgr, self.cfg.infer_params.source_max_dim,
+                                      self.cfg.infer_params.source_division)
+            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+
+            if self.is_animal:
+                raise gr.Error("Animal Model Not Supported in Face Retarget 💥!", duration=5)
+            else:
+                src_faces = self.model_dict["face_analysis"].predict(img_bgr)
+
+            if len(src_faces) == 0:
+                raise gr.Error("No face detect in image 💥!", duration=5)
+            src_faces = src_faces[:1]
+            crop_infos = []
+            for i in range(len(src_faces)):
+                # NOTE: temporarily only pick the first face, to support multiple face in the future
+                lmk = src_faces[i]
+                # crop the face
+                ret_dct = crop_image(
+                    img_rgb,  # ndarray
+                    lmk,  # 106x2 or Nx2
+                    dsize=self.cfg.crop_params.src_dsize,
+                    scale=self.cfg.crop_params.src_scale,
+                    vx_ratio=self.cfg.crop_params.src_vx_ratio,
+                    vy_ratio=self.cfg.crop_params.src_vy_ratio,
+                )
+
+                lmk = self.model_dict["landmark"].predict(img_rgb, lmk)
+                ret_dct["lmk_crop"] = lmk
+                ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / self.cfg.crop_params.src_dsize
+
+                # update a 256x256 version for network input
+                ret_dct["img_crop_256x256"] = cv2.resize(
+                    ret_dct["img_crop"], (256, 256), interpolation=cv2.INTER_AREA
+                )
+                ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / self.cfg.crop_params.src_dsize
+                crop_infos.append(ret_dct)
+            crop_info = crop_infos[0]
+            if flag_do_crop:
+                I_s = crop_info['img_crop_256x256'].copy()
+            else:
+                I_s = img_rgb.copy()
+            pitch, yaw, roll, t, exp, scale, kp = self.model_dict["motion_extractor"].predict(I_s)
+            x_s_info = {
+                "pitch": pitch,
+                "yaw": yaw,
+                "roll": roll,
+                "t": t,
+                "exp": exp,
+                "scale": scale,
+                "kp": kp
+            }
+            R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+            ############################################
+            f_s_user = self.model_dict["app_feat_extractor"].predict(I_s)
+            x_s_user = transform_keypoint(pitch, yaw, roll, t, exp, scale, kp)
+            source_lmk_user = crop_info['lmk_crop']
+            crop_M_c2o = crop_info['M_c2o']
+            crop_M_c2o = torch.from_numpy(crop_M_c2o).to(self.device)
+            mask_ori = prepare_paste_back(self.mask_crop, crop_info['M_c2o'],
+                                          dsize=(img_rgb.shape[1], img_rgb.shape[0]))
+            mask_ori = torch.from_numpy(mask_ori).to(self.device).float()
+            return f_s_user, x_s_user, source_lmk_user, crop_M_c2o, mask_ori, img_rgb
+        else:
+            # when press the clear button, go here
+            raise gr.Error("The retargeting input hasn't been prepared yet 💥!", duration=5)
diff --git a/src/pipelines/joyvasa_audio_to_motion_pipeline.py b/src/pipelines/joyvasa_audio_to_motion_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..35380805ff3cf4a273407b2540984dd0cfd11dde
--- /dev/null
+++ b/src/pipelines/joyvasa_audio_to_motion_pipeline.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/12/15
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: joyvasa_audio_to_motion_pipeline.py
+
+import math
+import pdb
+
+import torch
+import torchaudio
+import numpy as np
+import torch.nn.functional as F
+import pickle
+from tqdm import tqdm
+import pathlib
+import os
+
+from ..models.JoyVASA.dit_talking_head import DitTalkingHead
+from ..models.JoyVASA.helper import NullableArgs
+from ..utils import utils
+
+
+class JoyVASAAudio2MotionPipeline:
+    """
+    JoyVASA 声音生成LivePortrait Motion
+    """
+
+    def __init__(self, **kwargs):
+        self.device, self.dtype = utils.get_opt_device_dtype()
+        # Check if the operating system is Windows
+        if os.name == 'nt':
+            temp = pathlib.PosixPath
+            pathlib.PosixPath = pathlib.WindowsPath
+        motion_model_path = kwargs.get("motion_model_path", "")
+        audio_model_path = kwargs.get("audio_model_path", "")
+        motion_template_path = kwargs.get("motion_template_path", "")
+        model_data = torch.load(motion_model_path, map_location="cpu")
+        model_args = NullableArgs(model_data['args'])
+        model = DitTalkingHead(motion_feat_dim=model_args.motion_feat_dim,
+                               n_motions=model_args.n_motions,
+                               n_prev_motions=model_args.n_prev_motions,
+                               feature_dim=model_args.feature_dim,
+                               audio_model=model_args.audio_model,
+                               n_diff_steps=model_args.n_diff_steps,
+                               audio_encoder_path=audio_model_path)
+        model_data['model'].pop('denoising_net.TE.pe')
+        model.load_state_dict(model_data['model'], strict=False)
+        model.to(self.device, dtype=self.dtype)
+        model.eval()
+
+        # Restore the original PosixPath if it was changed
+        if os.name == 'nt':
+            pathlib.PosixPath = temp
+
+        self.motion_generator = model
+        self.n_motions = model_args.n_motions
+        self.n_prev_motions = model_args.n_prev_motions
+        self.fps = model_args.fps
+        self.audio_unit = 16000. / self.fps  # num of samples per frame
+        self.n_audio_samples = round(self.audio_unit * self.n_motions)
+        self.pad_mode = model_args.pad_mode
+        self.use_indicator = model_args.use_indicator
+        self.cfg_mode = kwargs.get("cfg_mode", "incremental")
+        self.cfg_cond = kwargs.get("cfg_cond", None)
+        self.cfg_scale = kwargs.get("cfg_scale", 2.8)
+        with open(motion_template_path, 'rb') as fin:
+            self.templete_dict = pickle.load(fin)
+
+    @torch.inference_mode()
+    def gen_motion_sequence(self, audio_path, **kwargs):
+        # preprocess audio
+        audio, sample_rate = torchaudio.load(audio_path)
+        if sample_rate != 16000:
+            audio = torchaudio.functional.resample(
+                audio,
+                orig_freq=sample_rate,
+                new_freq=16000,
+            )
+        audio = audio.mean(0).to(self.device, dtype=self.dtype)
+        # audio = F.pad(audio, (1280, 640), "constant", 0)
+        # audio_mean, audio_std = torch.mean(audio), torch.std(audio)
+        # audio = (audio - audio_mean) / (audio_std + 1e-5)
+
+        # crop audio into n_subdivision according to n_motions
+        clip_len = int(len(audio) / 16000 * self.fps)
+        stride = self.n_motions
+        if clip_len <= self.n_motions:
+            n_subdivision = 1
+        else:
+            n_subdivision = math.ceil(clip_len / stride)
+
+        # padding
+        n_padding_audio_samples = self.n_audio_samples * n_subdivision - len(audio)
+        n_padding_frames = math.ceil(n_padding_audio_samples / self.audio_unit)
+        if n_padding_audio_samples > 0:
+            if self.pad_mode == 'zero':
+                padding_value = 0
+            elif self.pad_mode == 'replicate':
+                padding_value = audio[-1]
+            else:
+                raise ValueError(f'Unknown pad mode: {self.pad_mode}')
+            audio = F.pad(audio, (0, n_padding_audio_samples), value=padding_value)
+
+        # generate motions
+        coef_list = []
+        for i in range(0, n_subdivision):
+            start_idx = i * stride
+            end_idx = start_idx + self.n_motions
+            indicator = torch.ones((1, self.n_motions)).to(self.device) if self.use_indicator else None
+            if indicator is not None and i == n_subdivision - 1 and n_padding_frames > 0:
+                indicator[:, -n_padding_frames:] = 0
+            audio_in = audio[round(start_idx * self.audio_unit):round(end_idx * self.audio_unit)].unsqueeze(0)
+
+            if i == 0:
+                motion_feat, noise, prev_audio_feat = self.motion_generator.sample(audio_in,
+                                                                                   indicator=indicator,
+                                                                                   cfg_mode=self.cfg_mode,
+                                                                                   cfg_cond=self.cfg_cond,
+                                                                                   cfg_scale=self.cfg_scale,
+                                                                                   dynamic_threshold=0)
+            else:
+                motion_feat, noise, prev_audio_feat = self.motion_generator.sample(audio_in,
+                                                                                   prev_motion_feat.to(self.dtype),
+                                                                                   prev_audio_feat.to(self.dtype),
+                                                                                   noise.to(self.dtype),
+                                                                                   indicator=indicator,
+                                                                                   cfg_mode=self.cfg_mode,
+                                                                                   cfg_cond=self.cfg_cond,
+                                                                                   cfg_scale=self.cfg_scale,
+                                                                                   dynamic_threshold=0)
+            prev_motion_feat = motion_feat[:, -self.n_prev_motions:].clone()
+            prev_audio_feat = prev_audio_feat[:, -self.n_prev_motions:]
+
+            motion_coef = motion_feat
+            if i == n_subdivision - 1 and n_padding_frames > 0:
+                motion_coef = motion_coef[:, :-n_padding_frames]  # delete padded frames
+            coef_list.append(motion_coef)
+            motion_coef = torch.cat(coef_list, dim=1)
+            # motion_coef = self.reformat_motion(args, motion_coef)
+
+        motion_coef = motion_coef.squeeze().cpu().numpy().astype(np.float32)
+        motion_list = []
+        for idx in tqdm(range(motion_coef.shape[0]), total=motion_coef.shape[0]):
+            exp = motion_coef[idx][:63] * self.templete_dict["std_exp"] + self.templete_dict["mean_exp"]
+            scale = motion_coef[idx][63:64] * (
+                    self.templete_dict["max_scale"] - self.templete_dict["min_scale"]) + self.templete_dict[
+                        "min_scale"]
+            t = motion_coef[idx][64:67] * (self.templete_dict["max_t"] - self.templete_dict["min_t"]) + \
+                self.templete_dict["min_t"]
+            pitch = motion_coef[idx][67:68] * (
+                    self.templete_dict["max_pitch"] - self.templete_dict["min_pitch"]) + self.templete_dict[
+                        "min_pitch"]
+            yaw = motion_coef[idx][68:69] * (self.templete_dict["max_yaw"] - self.templete_dict["min_yaw"]) + \
+                  self.templete_dict["min_yaw"]
+            roll = motion_coef[idx][69:70] * (self.templete_dict["max_roll"] - self.templete_dict["min_roll"]) + \
+                   self.templete_dict["min_roll"]
+
+            R = utils.get_rotation_matrix(pitch, yaw, roll)
+            R = R.reshape(1, 3, 3).astype(np.float32)
+
+            exp = exp.reshape(1, 21, 3).astype(np.float32)
+            scale = scale.reshape(1, 1).astype(np.float32)
+            t = t.reshape(1, 3).astype(np.float32)
+            pitch = pitch.reshape(1, 1).astype(np.float32)
+            yaw = yaw.reshape(1, 1).astype(np.float32)
+            roll = roll.reshape(1, 1).astype(np.float32)
+
+            motion_list.append({"exp": exp, "scale": scale, "R": R, "t": t, "pitch": pitch, "yaw": yaw, "roll": roll})
+        tgt_motion = {'n_frames': motion_coef.shape[0], 'output_fps': self.fps, 'motion': motion_list, 'c_eyes_lst': [],
+                      'c_lip_lst': []}
+        return tgt_motion
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9085a37e4abb3b69ea913c0919667ff6ca3c8a
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author  : wenshao
+# @Email   : wenshaoguo0611@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: __init__.py
diff --git a/src/utils/animal_landmark_runner.py b/src/utils/animal_landmark_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b5c094a4936c138969cc419e01f93bc291fb151
--- /dev/null
+++ b/src/utils/animal_landmark_runner.py
@@ -0,0 +1,144 @@
+# coding: utf-8
+
+"""
+face detection and alignment using XPose
+"""
+
+import os
+import pickle
+import torch
+import numpy as np
+from PIL import Image
+from torchvision.ops import nms
+from collections import OrderedDict
+
+
+def clean_state_dict(state_dict):
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        if k[:7] == 'module.':
+            k = k[7:]  # remove `module.`
+        new_state_dict[k] = v
+    return new_state_dict
+
+
+from src.models.XPose import transforms as T
+from src.models.XPose.models import build_model
+from src.models.XPose.predefined_keypoints import *
+from src.models.XPose.util import box_ops
+from src.models.XPose.util.config import Config
+
+
+class XPoseRunner(object):
+    def __init__(self, model_config_path, model_checkpoint_path, embeddings_cache_path=None, cpu_only=False, **kwargs):
+        self.device_id = kwargs.get("device_id", 0)
+        self.flag_use_half_precision = kwargs.get("flag_use_half_precision", True)
+        self.device = f"cuda:{self.device_id}" if not cpu_only else "cpu"
+        self.model = self.load_animal_model(model_config_path, model_checkpoint_path, self.device)
+        # Load cached embeddings if available
+        try:
+            with open(f'{embeddings_cache_path}_9.pkl', 'rb') as f:
+                self.ins_text_embeddings_9, self.kpt_text_embeddings_9 = pickle.load(f)
+            with open(f'{embeddings_cache_path}_68.pkl', 'rb') as f:
+                self.ins_text_embeddings_68, self.kpt_text_embeddings_68 = pickle.load(f)
+            print("Loaded cached embeddings from file.")
+        except Exception:
+            raise ValueError("Could not load clip embeddings from file, please check your file path.")
+
+    def load_animal_model(self, model_config_path, model_checkpoint_path, device):
+        args = Config.fromfile(model_config_path)
+        args.device = device
+        model = build_model(args)
+        checkpoint = torch.load(model_checkpoint_path, map_location=lambda storage, loc: storage)
+        load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+        model.eval()
+        return model
+
+    def load_image(self, input_image):
+        image_pil = input_image.convert("RGB")
+        transform = T.Compose([
+            T.RandomResize([800], max_size=1333),  # NOTE: fixed size to 800
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ])
+        image, _ = transform(image_pil, None)
+        return image_pil, image
+
+    def get_unipose_output(self, image, instance_text_prompt, keypoint_text_prompt, box_threshold, IoU_threshold):
+        instance_list = instance_text_prompt.split(',')
+
+        if len(keypoint_text_prompt) == 9:
+            # torch.Size([1, 512]) torch.Size([9, 512])
+            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_9, self.kpt_text_embeddings_9
+        elif len(keypoint_text_prompt) == 68:
+            # torch.Size([1, 512]) torch.Size([68, 512])
+            ins_text_embeddings, kpt_text_embeddings = self.ins_text_embeddings_68, self.kpt_text_embeddings_68
+        else:
+            raise ValueError("Invalid number of keypoint embeddings.")
+        target = {
+            "instance_text_prompt": instance_list,
+            "keypoint_text_prompt": keypoint_text_prompt,
+            "object_embeddings_text": ins_text_embeddings.float(),
+            "kpts_embeddings_text": torch.cat(
+                (kpt_text_embeddings.float(), torch.zeros(100 - kpt_text_embeddings.shape[0], 512, device=self.device)),
+                dim=0),
+            "kpt_vis_text": torch.cat((torch.ones(kpt_text_embeddings.shape[0], device=self.device),
+                                       torch.zeros(100 - kpt_text_embeddings.shape[0], device=self.device)), dim=0)
+        }
+
+        self.model = self.model.to(self.device)
+        image = image.to(self.device)
+
+        with torch.no_grad():
+            with torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=self.flag_use_half_precision):
+                outputs = self.model(image[None], [target])
+
+        logits = outputs["pred_logits"].sigmoid()[0]
+        boxes = outputs["pred_boxes"][0]
+        keypoints = outputs["pred_keypoints"][0][:, :2 * len(keypoint_text_prompt)]
+
+        logits_filt = logits.cpu().clone()
+        boxes_filt = boxes.cpu().clone()
+        keypoints_filt = keypoints.cpu().clone()
+        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+        logits_filt = logits_filt[filt_mask]
+        boxes_filt = boxes_filt[filt_mask]
+        keypoints_filt = keypoints_filt[filt_mask]
+
+        keep_indices = nms(box_ops.box_cxcywh_to_xyxy(boxes_filt), logits_filt.max(dim=1)[0],
+                           iou_threshold=IoU_threshold)
+
+        filtered_boxes = boxes_filt[keep_indices]
+        filtered_keypoints = keypoints_filt[keep_indices]
+
+        return filtered_boxes, filtered_keypoints
+
+    def run(self, input_image, instance_text_prompt, keypoint_text_example, box_threshold, IoU_threshold):
+        if keypoint_text_example in globals():
+            keypoint_dict = globals()[keypoint_text_example]
+        elif instance_text_prompt in globals():
+            keypoint_dict = globals()[instance_text_prompt]
+        else:
+            keypoint_dict = globals()["animal"]
+
+        keypoint_text_prompt = keypoint_dict.get("keypoints")
+        keypoint_skeleton = keypoint_dict.get("skeleton")
+
+        image_pil, image = self.load_image(input_image)
+        boxes_filt, keypoints_filt = self.get_unipose_output(image, instance_text_prompt, keypoint_text_prompt,
+                                                             box_threshold, IoU_threshold)
+
+        size = image_pil.size
+        H, W = size[1], size[0]
+        keypoints_filt = keypoints_filt[0].squeeze(0)
+        kp = np.array(keypoints_filt.cpu())
+        num_kpts = len(keypoint_text_prompt)
+        Z = kp[:num_kpts * 2] * np.array([W, H] * num_kpts)
+        Z = Z.reshape(num_kpts * 2)
+        x = Z[0::2]
+        y = Z[1::2]
+        return np.stack((x, y), axis=1)
+
+    def warmup(self):
+        img_rgb = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))
+        self.run(img_rgb, 'face', 'face', box_threshold=0.0, IoU_threshold=0.0)
diff --git a/src/utils/crop.py b/src/utils/crop.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63ea46a4e388d79e95f5dc0f9c276994b8b08e3
--- /dev/null
+++ b/src/utils/crop.py
@@ -0,0 +1,490 @@
+# coding: utf-8
+
+"""
+cropping function and the related preprocess functions for cropping
+"""
+import pdb
+
+import numpy as np
+import os.path as osp
+from math import sin, cos, acos, degrees
+import cv2
+import torch
+import torch.nn.functional as F
+import torchgeometry as tgm
+
+DTYPE = np.float32
+CV2_INTERP = cv2.INTER_LINEAR
+
+
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
+    """ conduct similarity or affine transformation to the image, do not do border operation!
+    img:
+    M: 2x3 matrix or 3x3 matrix
+    dsize: target shape (width, height)
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+
+    if borderMode is not None:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+    else:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
+
+
+def _transform_img_torch(img, M, dsize, rotation_center=None, flags=None, borderMode=None):
+    """ Conduct similarity or affine transformation to the image using PyTorch CUDA.
+
+    Args:
+    img (torch.Tensor): Input image tensor (C x H x W)
+    M (torch.Tensor): 2x3 or 3x3 transformation matrix
+    dsize (tuple or int): Target shape (width, height)
+    rotation_center (tuple): Center of rotation (x, y), if None, use image center
+    flags: Not used in this implementation (for compatibility)
+    borderMode: 'zeros' or 'border' for handling out-of-bounds pixels
+
+    Returns:
+    torch.Tensor: Transformed image
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+
+        # Prepare the transformation matrix
+    M = M[:2, :]  # Ensure it's a 2x3 matrix
+    img_transformed = tgm.warp_affine(img.unsqueeze(0), M[None], (_dsize[1], _dsize[0]))
+    img_transformed = img_transformed.squeeze(0)
+    return img_transformed
+
+
+def _transform_pts(pts, M):
+    """ conduct similarity or affine transformation to the pts
+    pts: Nx2 ndarray
+    M: 2x3 matrix or 3x3 matrix
+    return: Nx2
+    """
+    return pts @ M[:2, :2].T + M[:2, 2]
+
+
+def parse_pt2_from_pt101(pt101, use_lip=True):
+    """
+    parsing the 2 points according to the 101 points, which cancels the roll
+    """
+    # the former version use the eye center, but it is not robust, now use interpolation
+    pt_left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)  # right eye center
+
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt101[75] + pt101[81]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+
+
+def parse_pt2_from_pt106(pt106, use_lip=True):
+    """
+    parsing the 2 points according to the 106 points, which cancels the roll
+    """
+    pt_left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0)  # right eye center
+
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt106[52] + pt106[61]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+
+
+def parse_pt2_from_pt203(pt203, use_lip=True):
+    """
+    parsing the 2 points according to the 203 points, which cancels the roll
+    """
+    pt_left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0)  # right eye center
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt203[48] + pt203[66]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+
+
+def parse_pt2_from_pt68(pt68, use_lip=True):
+    """
+    parsing the 2 points according to the 68 points, which cancels the roll
+    """
+    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1
+    if use_lip:
+        pt5 = np.stack([
+            np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye
+            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye
+            pt68[lm_idx[0], :],  # nose
+            pt68[lm_idx[5], :],  # lip
+            pt68[lm_idx[6], :]  # lip
+        ], axis=0)
+
+        pt2 = np.stack([
+            (pt5[0] + pt5[1]) / 2,
+            (pt5[3] + pt5[4]) / 2
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye
+            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye
+        ], axis=0)
+
+    return pt2
+
+
+def parse_pt2_from_pt5(pt5, use_lip=True):
+    """
+    parsing the 2 points according to the 5 points, which cancels the roll
+    """
+    if use_lip:
+        pt2 = np.stack([
+            (pt5[0] + pt5[1]) / 2,
+            (pt5[3] + pt5[4]) / 2
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            pt5[0],
+            pt5[1]
+        ], axis=0)
+    return pt2
+
+
+def parse_pt2_from_pt9(pt9, use_lip=True):
+    '''
+    parsing the 2 points according to the 9 points, which cancels the roll
+    ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip']
+    '''
+    if use_lip:
+        pt9 = np.stack([
+            (pt9[2] + pt9[3]) / 2,  # left eye
+            (pt9[0] + pt9[1]) / 2,  # right eye
+            pt9[4],
+            (pt9[5] + pt9[6]) / 2  # lip
+        ], axis=0)
+        pt2 = np.stack([
+            (pt9[0] + pt9[1]) / 2,  # eye
+            pt9[3]  # lip
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            (pt9[2] + pt9[3]) / 2,
+            (pt9[0] + pt9[1]) / 2,
+        ], axis=0)
+
+    return pt2
+
+
+def parse_pt2_from_pt478(pt478, use_lip=True):
+    """
+    parsing the 2 points according to the 101 points, which cancels the roll
+    """
+    # the former version use the eye center, but it is not robust, now use interpolation
+    pt_left_eye = pt478[468]  # left eye center
+    pt_right_eye = pt478[473]  # right eye center
+
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = pt478[14]
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+
+
+def parse_pt2_from_pt_x(pts, use_lip=True):
+    if pts.shape[0] == 101:
+        pt2 = parse_pt2_from_pt101(pts, use_lip=use_lip)
+    elif pts.shape[0] == 106:
+        pt2 = parse_pt2_from_pt106(pts, use_lip=use_lip)
+    elif pts.shape[0] == 68:
+        pt2 = parse_pt2_from_pt68(pts, use_lip=use_lip)
+    elif pts.shape[0] == 5:
+        pt2 = parse_pt2_from_pt5(pts, use_lip=use_lip)
+    elif pts.shape[0] == 478:
+        pt2 = parse_pt2_from_pt478(pts, use_lip=use_lip)
+    elif pts.shape[0] == 203:
+        pt2 = parse_pt2_from_pt203(pts, use_lip=use_lip)
+    elif pts.shape[0] > 101:
+        # take the first 101 points
+        pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)
+    elif pts.shape[0] == 9:
+        pt2 = parse_pt2_from_pt9(pts, use_lip=use_lip)
+    else:
+        raise Exception(f'Unknow shape: {pts.shape}')
+
+    if not use_lip:
+        # NOTE: to compile with the latter code, need to rotate the pt2 90 degrees clockwise manually
+        v = pt2[1] - pt2[0]
+        pt2[1, 0] = pt2[0, 0] - v[1]
+        pt2[1, 1] = pt2[0, 1] + v[0]
+
+    return pt2
+
+
+def parse_rect_from_landmark(
+        pts,
+        scale=1.5,
+        need_square=True,
+        vx_ratio=0,
+        vy_ratio=0,
+        use_deg_flag=False,
+        **kwargs
+):
+    """parsing center, size, angle from 101/68/5/x landmarks
+    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
+    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area
+
+    judge with pts.shape
+    """
+    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))
+
+    uy = pt2[1] - pt2[0]
+    l = np.linalg.norm(uy)
+    if l <= 1e-3:
+        uy = np.array([0, 1], dtype=DTYPE)
+    else:
+        uy /= l
+    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)
+
+    # the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)
+    # print(uy)
+    # print(ux)
+    angle = acos(ux[0])
+    if ux[1] < 0:
+        angle = -angle
+
+    # rotation matrix
+    M = np.array([ux, uy])
+
+    # calculate the size which contains the angle degree of the bbox, and the center
+    center0 = np.mean(pts, axis=0)
+    rpts = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T
+    lt_pt = np.min(rpts, axis=0)
+    rb_pt = np.max(rpts, axis=0)
+    center1 = (lt_pt + rb_pt) / 2
+
+    size = rb_pt - lt_pt
+    if need_square:
+        m = max(size[0], size[1])
+        size[0] = m
+        size[1] = m
+
+    size *= scale  # scale size
+    center = center0 + ux * center1[0] + uy * center1[1]  # counterclockwise rotation, equivalent to M.T @ center1.T
+    center = center + ux * (vx_ratio * size) + uy * \
+             (vy_ratio * size)  # considering the offset in vx and vy direction
+
+    if use_deg_flag:
+        angle = degrees(angle)
+
+    return center, size, angle
+
+
+def parse_bbox_from_landmark(pts, **kwargs):
+    center, size, angle = parse_rect_from_landmark(pts, **kwargs)
+    cx, cy = center
+    w, h = size
+
+    # calculate the vertex positions before rotation
+    bbox = np.array([
+        [cx - w / 2, cy - h / 2],  # left, top
+        [cx + w / 2, cy - h / 2],
+        [cx + w / 2, cy + h / 2],  # right, bottom
+        [cx - w / 2, cy + h / 2]
+    ], dtype=DTYPE)
+
+    # construct rotation matrix
+    bbox_rot = bbox.copy()
+    R = np.array([
+        [np.cos(angle), -np.sin(angle)],
+        [np.sin(angle), np.cos(angle)]
+    ], dtype=DTYPE)
+
+    # calculate the relative position of each vertex from the rotation center, then rotate these positions, and finally add the coordinates of the rotation center
+    bbox_rot = (bbox_rot - center) @ R.T + center
+
+    return {
+        'center': center,  # 2x1
+        'size': size,  # scalar
+        'angle': angle,  # rad, counterclockwise
+        'bbox': bbox,  # 4x2
+        'bbox_rot': bbox_rot,  # 4x2
+    }
+
+
+def crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):
+    """Crop the region described by ``bbox`` out of ``img`` into a ``dsize`` x ``dsize`` image.
+
+    img: source image handed to ``_transform_img``
+    bbox: ``(left, top, right, bot)``; a message is printed when it is not square,
+        and the width alone determines the scale
+    lmk: optional landmarks that are mapped into the crop with the same matrix
+    dsize: output side length
+    angle: rotation in radians, applied only when ``flag_rot`` is True and it is not None
+    flag_rot: enable the rotated crop
+    kwargs: only ``borderMode`` is read and forwarded to ``_transform_img``
+
+    Returns a dict with ``img_crop``, ``lmk_crop`` (None without ``lmk``), ``M_o2c``
+    (3x3, original -> crop) and ``M_c2o`` (its inverse).
+    """
+    left, top, right, bot = bbox
+    if int(right - left) != int(bot - top):
+        print(f'right-left {right - left} != bot-top {bot - top}')
+    size = right - left
+
+    src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)
+
+    s = dsize / size  # scale
+    if flag_rot and angle is not None:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = src_center[0], src_center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # scale + rotation that sends the source center onto the target center
+        M_o2c = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        # pure scale + translation
+        M_o2c = np.array(
+            [[s, 0, tgt_center[0] - s * src_center[0]],
+             [0, s, tgt_center[1] - s * src_center[1]]],
+            dtype=DTYPE
+        )
+
+    # if flag_rot and angle is None:
+    # print('angle is None, but flag_rotate is True', style="bold yellow")
+
+    img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None))
+    lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None
+
+    # promote to 3x3 so the matrix can be inverted
+    M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+
+    # cv2.imwrite('crop.jpg', img_crop)
+
+    return {
+        'img_crop': img_crop,
+        'lmk_crop': lmk_crop,
+        'M_o2c': M_o2c,
+        'M_c2o': M_c2o,
+    }
+
+
+def _estimate_similar_transform_from_pts(
+        pts,
+        dsize,
+        scale=1.5,
+        vx_ratio=0,
+        vy_ratio=-0.1,
+        flag_do_rot=True,
+        **kwargs
+):
+    """ calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image
+    pts: landmark, 101 or 68 points or other points, Nx2
+    dsize: side length of the square crop
+    scale: the larger scale factor, the smaller face ratio
+    vx_ratio: x shift
+    vy_ratio: y shift, the smaller the y shift, the lower the face region
+    flag_do_rot: if it is true, the crop is rotated by the angle estimated from the landmarks
+    kwargs: only ``use_lip`` (default True) is read
+
+    return: (M_INV, M) where M_INV is 2x3 original -> crop and M is 2x3 crop -> original
+    """
+    center, size, angle = parse_rect_from_landmark(
+        pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio,
+        use_lip=kwargs.get('use_lip', True)
+    )
+
+    s = dsize / size[0]  # scale
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)  # center of dsize
+
+    if flag_do_rot:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = center[0], center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # scale + rotation that sends the landmark center onto the crop center
+        M_INV = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        # pure scale + translation
+        M_INV = np.array(
+            [[s, 0, tgt_center[0] - s * center[0]],
+             [0, s, tgt_center[1] - s * center[1]]],
+            dtype=DTYPE
+        )
+
+    # promote to 3x3 so the matrix can be inverted
+    M_INV_H = np.vstack([M_INV, np.array([0, 0, 1])])
+    M = np.linalg.inv(M_INV_H)
+
+    # M_INV is from the original image to the cropped image, M is from the cropped image to the original image
+    return M_INV, M[:2, ...]
+
+
+def crop_image(img, pts: np.ndarray, **kwargs):
+    dsize = kwargs.get('dsize', 224)
+    scale = kwargs.get('scale', 1.5)  # 1.5 | 1.6
+    vy_ratio = kwargs.get('vy_ratio', -0.1)  # -0.0625 | -0.1
+
+    M_INV, _ = _estimate_similar_transform_from_pts(
+        pts,
+        dsize=dsize,
+        scale=scale,
+        vy_ratio=vy_ratio,
+        flag_do_rot=kwargs.get('flag_do_rot', True),
+    )
+
+    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop
+    pt_crop = _transform_pts(pts, M_INV)
+
+    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+
+    ret_dct = {
+        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
+        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
+        'img_crop': img_crop,  # the cropped image
+        'pt_crop': pt_crop,  # the landmarks of the cropped image
+    }
+
+    return ret_dct
+
+
+def average_bbox_lst(bbox_lst):
+    if len(bbox_lst) == 0:
+        return None
+    bbox_arr = np.array(bbox_lst)
+    return np.mean(bbox_arr, axis=0).tolist()
+
+
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """prepare mask for later image paste back
+    """
+    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
+    mask_ori = mask_ori.astype(np.float32) / 255.
+    return mask_ori
+
+
+def paste_back(img_crop, M_c2o, img_ori, mask_ori):
+    """paste back the image
+    """
+    dsize = (img_ori.shape[1], img_ori.shape[0])
+    result = _transform_img(img_crop, M_c2o, dsize=dsize)
+    result = np.clip(mask_ori * result + (1 - mask_ori) * img_ori, 0, 255).astype(np.uint8)
+    return result
+
+
+def paste_back_pytorch(img_crop, M_c2o, img_ori, mask_ori):
+    """paste back the image
+    """
+    dsize = (img_ori.shape[1], img_ori.shape[0])
+    img_crop = img_crop.permute(2, 0, 1).float()
+    img_back = _transform_img_torch(img_crop, M_c2o, dsize=dsize)
+    img_back = img_back.permute(1, 2, 0)
+    img_back = torch.clip(mask_ori * img_back + (1 - mask_ori) * img_ori, 0, 255)
+    return img_back
diff --git a/src/utils/face_align.py b/src/utils/face_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..907e9b81d814e1edd0414a8414f7b5259a6ebb90
--- /dev/null
+++ b/src/utils/face_align.py
@@ -0,0 +1,105 @@
+import cv2
+import numpy as np
+from skimage import transform as trans
+
+arcface_dst = np.array(
+    [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
+     [41.5493, 92.3655], [70.7299, 92.2041]],
+    dtype=np.float32)
+
+
+def estimate_norm(lmk, image_size=112, mode='arcface'):
+    assert lmk.shape == (5, 2)
+    assert image_size % 112 == 0 or image_size % 128 == 0
+    if image_size % 112 == 0:
+        ratio = float(image_size) / 112.0
+        diff_x = 0
+    else:
+        ratio = float(image_size) / 128.0
+        diff_x = 8.0 * ratio
+    dst = arcface_dst * ratio
+    dst[:, 0] += diff_x
+    tform = trans.SimilarityTransform()
+    tform.estimate(lmk, dst)
+    M = tform.params[0:2, :]
+    return M
+
+
+def norm_crop(img, landmark, image_size=112, mode='arcface'):
+    M = estimate_norm(landmark, image_size, mode)
+    warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
+    return warped
+
+
+def norm_crop2(img, landmark, image_size=112, mode='arcface'):
+    M = estimate_norm(landmark, image_size, mode)
+    warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
+    return warped, M
+
+
+def square_crop(im, S):
+    if im.shape[0] > im.shape[1]:
+        height = S
+        width = int(float(im.shape[1]) / im.shape[0] * S)
+        scale = float(S) / im.shape[0]
+    else:
+        width = S
+        height = int(float(im.shape[0]) / im.shape[1] * S)
+        scale = float(S) / im.shape[1]
+    resized_im = cv2.resize(im, (width, height))
+    det_im = np.zeros((S, S, 3), dtype=np.uint8)
+    det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im
+    return det_im, scale
+
+
+def transform(data, center, output_size, scale, rotation):
+    scale_ratio = scale
+    rot = float(rotation) * np.pi / 180.0
+    # translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)
+    t1 = trans.SimilarityTransform(scale=scale_ratio)
+    cx = center[0] * scale_ratio
+    cy = center[1] * scale_ratio
+    t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
+    t3 = trans.SimilarityTransform(rotation=rot)
+    t4 = trans.SimilarityTransform(translation=(output_size / 2,
+                                                output_size / 2))
+    t = t1 + t2 + t3 + t4
+    M = t.params[0:2]
+    cropped = cv2.warpAffine(data,
+                             M, (output_size, output_size),
+                             borderValue=0.0)
+    return cropped, M
+
+
+def trans_points2d(pts, M):
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for i in range(pts.shape[0]):
+        pt = pts[i]
+        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        new_pt = np.dot(M, new_pt)
+        # print('new_pt', new_pt.shape, new_pt)
+        new_pts[i] = new_pt[0:2]
+
+    return new_pts
+
+
+def trans_points3d(pts, M):
+    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
+    # print(scale)
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for i in range(pts.shape[0]):
+        pt = pts[i]
+        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        new_pt = np.dot(M, new_pt)
+        # print('new_pt', new_pt.shape, new_pt)
+        new_pts[i][0:2] = new_pt[0:2]
+        new_pts[i][2] = pts[i][2] * scale
+
+    return new_pts
+
+
+def trans_points(pts, M):
+    if pts.shape[1] == 2:
+        return trans_points2d(pts, M)
+    else:
+        return trans_points3d(pts, M)
diff --git a/src/utils/logger.py b/src/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb8da4470b49f8bfc9164a616d35ad27ddaafc34
--- /dev/null
+++ b/src/utils/logger.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/9/13 20:30
+# @Project : FasterLivePortrait
+# @FileName: logger.py
+
+import platform, sys
+import logging
+from datetime import datetime, timezone
+
+logging.getLogger("numba").setLevel(logging.WARNING)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("wetext-zh_normalizer").setLevel(logging.WARNING)
+logging.getLogger("NeMo-text-processing").setLevel(logging.WARNING)
+
+colorCodePanic = "\x1b[1;31m"
+colorCodeFatal = "\x1b[1;31m"
+colorCodeError = "\x1b[31m"
+colorCodeWarn = "\x1b[33m"
+colorCodeInfo = "\x1b[37m"
+colorCodeDebug = "\x1b[32m"
+colorCodeTrace = "\x1b[36m"
+colorReset = "\x1b[0m"
+
+log_level_color_code = {
+    logging.DEBUG: colorCodeDebug,
+    logging.INFO: colorCodeInfo,
+    logging.WARN: colorCodeWarn,
+    logging.ERROR: colorCodeError,
+    logging.FATAL: colorCodeFatal,
+}
+
+log_level_msg_str = {
+    logging.DEBUG: "DEBU",
+    logging.INFO: "INFO",
+    logging.WARN: "WARN",
+    logging.ERROR: "ERRO",
+    logging.FATAL: "FATL",
+}
+
+
+class Formatter(logging.Formatter):
+    def __init__(self, color=platform.system().lower() != "windows"):
+        self.tz = datetime.now(timezone.utc).astimezone().tzinfo
+        self.color = color
+
+    def format(self, record: logging.LogRecord):
+        logstr = "[" + datetime.now(self.tz).strftime("%z %Y%m%d %H:%M:%S") + "] ["
+        if self.color:
+            logstr += log_level_color_code.get(record.levelno, colorCodeInfo)
+        logstr += log_level_msg_str.get(record.levelno, record.levelname)
+        if self.color:
+            logstr += colorReset
+        if sys.version_info >= (3, 9):
+            fn = record.filename.removesuffix(".py")
+        elif record.filename.endswith(".py"):
+            fn = record.filename[:-3]
+        logstr += f"] {str(record.name)} | {fn} | {str(record.msg) % record.args}"
+        return logstr
+
+
+def get_logger(name: str, lv=logging.INFO, remove_exist=False, format_root=False, log_file=None):
+    logger = logging.getLogger(name)
+    logger.setLevel(lv)
+
+    # Remove existing handlers if requested
+    if remove_exist and logger.hasHandlers():
+        logger.handlers.clear()
+
+    # Console handler
+    if not logger.hasHandlers():
+        syslog = logging.StreamHandler()
+        syslog.setFormatter(Formatter())
+        logger.addHandler(syslog)
+
+    # File handler
+    if log_file:
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(Formatter(color=False))  # No color in file logs
+        logger.addHandler(file_handler)
+
+    # Reformat existing handlers if necessary
+    for h in logger.handlers:
+        h.setFormatter(Formatter())
+
+    # Optionally reformat root logger handlers
+    if format_root:
+        for h in logger.root.handlers:
+            h.setFormatter(Formatter())
+
+    return logger
diff --git a/src/utils/transform.py b/src/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..c095be9714424ae7e081d0aaf34d003510f2140f
--- /dev/null
+++ b/src/utils/transform.py
@@ -0,0 +1,118 @@
+import cv2
+import math
+import numpy as np
+from skimage import transform as trans
+
+
+def transform(data, center, output_size, scale, rotation):
+    scale_ratio = scale
+    rot = float(rotation) * np.pi / 180.0
+    # translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio)
+    t1 = trans.SimilarityTransform(scale=scale_ratio)
+    cx = center[0] * scale_ratio
+    cy = center[1] * scale_ratio
+    t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
+    t3 = trans.SimilarityTransform(rotation=rot)
+    t4 = trans.SimilarityTransform(translation=(output_size / 2,
+                                                output_size / 2))
+    t = t1 + t2 + t3 + t4
+    M = t.params[0:2]
+    cropped = cv2.warpAffine(data,
+                             M, (output_size, output_size),
+                             borderValue=0.0)
+    return cropped, M
+
+
+def trans_points2d(pts, M):
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for i in range(pts.shape[0]):
+        pt = pts[i]
+        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        new_pt = np.dot(M, new_pt)
+        # print('new_pt', new_pt.shape, new_pt)
+        new_pts[i] = new_pt[0:2]
+
+    return new_pts
+
+
+def trans_points3d(pts, M):
+    scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1])
+    # print(scale)
+    new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
+    for i in range(pts.shape[0]):
+        pt = pts[i]
+        new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
+        new_pt = np.dot(M, new_pt)
+        # print('new_pt', new_pt.shape, new_pt)
+        new_pts[i][0:2] = new_pt[0:2]
+        new_pts[i][2] = pts[i][2] * scale
+
+    return new_pts
+
+
+def trans_points(pts, M):
+    if pts.shape[1] == 2:
+        return trans_points2d(pts, M)
+    else:
+        return trans_points3d(pts, M)
+
+
+def estimate_affine_matrix_3d23d(X, Y):
+    ''' Using least-squares solution 
+    Args:
+        X: [n, 3]. 3d points(fixed)
+        Y: [n, 3]. corresponding 3d points(moving). Y = PX
+    Returns:
+        P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]).
+    '''
+    X_homo = np.hstack((X, np.ones([X.shape[0], 1])))  # n x 4
+    P = np.linalg.lstsq(X_homo, Y)[0].T  # Affine matrix. 3 x 4
+    return P
+
+
+def P2sRt(P):
+    ''' decompositing camera matrix P
+    Args: 
+        P: (3, 4). Affine Camera Matrix.
+    Returns:
+        s: scale factor.
+        R: (3, 3). rotation matrix.
+        t: (3,). translation. 
+    '''
+    t = P[:, 3]
+    R1 = P[0:1, :3]
+    R2 = P[1:2, :3]
+    s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2.0
+    r1 = R1 / np.linalg.norm(R1)
+    r2 = R2 / np.linalg.norm(R2)
+    r3 = np.cross(r1, r2)
+
+    R = np.concatenate((r1, r2, r3), 0)
+    return s, R, t
+
+
+def matrix2angle(R):
+    ''' get three Euler angles from Rotation Matrix
+    Args:
+        R: (3,3). rotation matrix
+    Returns:
+        x: pitch
+        y: yaw
+        z: roll
+    '''
+    sy = math.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0])
+
+    singular = sy < 1e-6
+
+    if not singular:
+        x = math.atan2(R[2, 1], R[2, 2])
+        y = math.atan2(-R[2, 0], sy)
+        z = math.atan2(R[1, 0], R[0, 0])
+    else:
+        x = math.atan2(-R[1, 2], R[1, 1])
+        y = math.atan2(-R[2, 0], sy)
+        z = 0
+
+    # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z)
+    rx, ry, rz = x * 180 / np.pi, y * 180 / np.pi, z * 180 / np.pi
+    return rx, ry, rz
diff --git a/src/utils/utils.py b/src/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..72b92372e1c9a1016ff97023bc72e6d7aa772156
--- /dev/null
+++ b/src/utils/utils.py
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+import pdb
+
+import cv2
+import numpy as np
+import ffmpeg
+import os
+import os.path as osp
+import torch
+
+
+def get_opt_device_dtype():
+    if torch.cuda.is_available():
+        return torch.device("cuda"), torch.float16
+    elif torch.backends.mps.is_available():
+        return torch.device("mps"), torch.float32
+    else:
+        return torch.device("cpu"), torch.float32
+
+
+def video_has_audio(video_file):
+    try:
+        ret = ffmpeg.probe(video_file, select_streams='a')
+        return len(ret["streams"]) > 0
+    except ffmpeg.Error:
+        return False
+
+
+def get_video_info(video_path):
+    # 使用 ffmpeg.probe 获取视频信息
+    probe = ffmpeg.probe(video_path)
+    video_streams = [stream for stream in probe['streams'] if stream['codec_type'] == 'video']
+
+    if not video_streams:
+        raise ValueError("No video stream found")
+
+    # 获取视频时长
+    duration = float(probe['format']['duration'])
+
+    # 获取帧率 (r_frame_rate)，通常是一个分数字符串，如 "30000/1001"
+    fps_string = video_streams[0]['r_frame_rate']
+    numerator, denominator = map(int, fps_string.split('/'))
+    fps = numerator / denominator
+
+    return duration, fps
+
+
+def resize_to_limit(img: np.ndarray, max_dim=1280, division=2):
+    """
+    ajust the size of the image so that the maximum dimension does not exceed max_dim, and the width and the height of the image are multiples of n.
+    :param img: the image to be processed.
+    :param max_dim: the maximum dimension constraint.
+    :param n: the number that needs to be multiples of.
+    :return: the adjusted image.
+    """
+    h, w = img.shape[:2]
+
+    # ajust the size of the image according to the maximum dimension
+    if max_dim > 0 and max(h, w) > max_dim:
+        if h > w:
+            new_h = max_dim
+            new_w = int(w * (max_dim / h))
+        else:
+            new_w = max_dim
+            new_h = int(h * (max_dim / w))
+        img = cv2.resize(img, (new_w, new_h))
+
+    # ensure that the image dimensions are multiples of n
+    division = max(division, 1)
+    new_h = img.shape[0] - (img.shape[0] % division)
+    new_w = img.shape[1] - (img.shape[1] % division)
+
+    if new_h == 0 or new_w == 0:
+        # when the width or height is less than n, no need to process
+        return img
+
+    if new_h != img.shape[0] or new_w != img.shape[1]:
+        img = img[:new_h, :new_w]
+
+    return img
+
+
+def get_rotation_matrix(pitch_, yaw_, roll_):
+    """ the input is in degree
+    """
+    PI = np.pi
+    # transform to radian
+    pitch = pitch_ / 180 * PI
+    yaw = yaw_ / 180 * PI
+    roll = roll_ / 180 * PI
+
+    if pitch.ndim == 1:
+        pitch = np.expand_dims(pitch, axis=1)
+    if yaw.ndim == 1:
+        yaw = np.expand_dims(yaw, axis=1)
+    if roll.ndim == 1:
+        roll = np.expand_dims(roll, axis=1)
+
+    # calculate the euler matrix
+    bs = pitch.shape[0]
+    ones = np.ones([bs, 1])
+    zeros = np.zeros([bs, 1])
+    x, y, z = pitch, yaw, roll
+
+    rot_x = np.concatenate([
+        ones, zeros, zeros,
+        zeros, np.cos(x), -np.sin(x),
+        zeros, np.sin(x), np.cos(x)
+    ], axis=1).reshape([bs, 3, 3])
+
+    rot_y = np.concatenate([
+        np.cos(y), zeros, np.sin(y),
+        zeros, ones, zeros,
+        -np.sin(y), zeros, np.cos(y)
+    ], axis=1).reshape([bs, 3, 3])
+
+    rot_z = np.concatenate([
+        np.cos(z), -np.sin(z), zeros,
+        np.sin(z), np.cos(z), zeros,
+        zeros, zeros, ones
+    ], axis=1).reshape([bs, 3, 3])
+
+    rot = np.matmul(rot_z, np.matmul(rot_y, rot_x))
+    return np.transpose(rot, (0, 2, 1))  # transpose
+
+
+def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int,
+                             eps: float = 1e-6) -> np.ndarray:
+    return (np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True) /
+            (np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps))
+
+
+def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
+    lefteye_close_ratio = calculate_distance_ratio(lmk, 6, 18, 0, 12)
+    righteye_close_ratio = calculate_distance_ratio(lmk, 30, 42, 24, 36)
+    if target_eye_ratio is not None:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio, target_eye_ratio], axis=1)
+    else:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio], axis=1)
+
+
+def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
+    return calculate_distance_ratio(lmk, 90, 102, 48, 66)
+
+
+def _transform_img(img, M, dsize, flags=cv2.INTER_LINEAR, borderMode=None):
+    """ conduct similarity or affine transformation to the image, do not do border operation!
+    img:
+    M: 2x3 matrix or 3x3 matrix
+    dsize: target shape (width, height)
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+
+    if borderMode is not None:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+    else:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
+
+
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """prepare mask for later image paste back
+    """
+    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
+    mask_ori = mask_ori.astype(np.float32) / 255.
+    return mask_ori
+
+
+def transform_keypoint(pitch, yaw, roll, t, exp, scale, kp):
+    """
+    transform the implicit keypoints with the pose, shift, and expression deformation
+    kp: BxNx3
+    """
+    bs = kp.shape[0]
+    if kp.ndim == 2:
+        num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
+    else:
+        num_kp = kp.shape[1]  # Bxnum_kpx3
+
+    rot_mat = get_rotation_matrix(pitch, yaw, roll)  # (bs, 3, 3)
+
+    # Eqn.2: s * (R * x_c,s + exp) + t
+    kp_transformed = kp.reshape(bs, num_kp, 3) @ rot_mat + exp.reshape(bs, num_kp, 3)
+    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
+    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
+
+    return kp_transformed
+
+
+def concat_feat(x, y):
+    bs = x.shape[0]
+    return np.concatenate([x.reshape(bs, -1), y.reshape(bs, -1)], axis=1)
+
+
+def is_image(file_path):
+    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')
+    return file_path.lower().endswith(image_extensions)
+
+
+def is_video(file_path):
+    if file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm")) or os.path.isdir(file_path):
+        return True
+    return False
+
+
+def make_abs_path(fn):
+    return osp.join(os.path.dirname(osp.dirname(osp.realpath(__file__))), fn)
+
+
+class LowPassFilter:
+    def __init__(self):
+        self.prev_raw_value = None
+        self.prev_filtered_value = None
+
+    def process(self, value, alpha):
+        if self.prev_raw_value is None:
+            s = value
+        else:
+            s = alpha * value + (1.0 - alpha) * self.prev_filtered_value
+        self.prev_raw_value = value
+        self.prev_filtered_value = s
+        return s
+
+
+class OneEuroFilter:
+    def __init__(self, mincutoff=1.0, beta=0.0, dcutoff=1.0, freq=30):
+        self.freq = freq
+        self.mincutoff = mincutoff
+        self.beta = beta
+        self.dcutoff = dcutoff
+        self.x_filter = LowPassFilter()
+        self.dx_filter = LowPassFilter()
+
+    def compute_alpha(self, cutoff):
+        te = 1.0 / self.freq
+        tau = 1.0 / (2 * np.pi * cutoff)
+        return 1.0 / (1.0 + tau / te)
+
+    def get_pre_x(self):
+        return self.x_filter.prev_filtered_value
+
+    def process(self, x):
+        prev_x = self.x_filter.prev_raw_value
+        dx = 0.0 if prev_x is None else (x - prev_x) * self.freq
+        edx = self.dx_filter.process(dx, self.compute_alpha(self.dcutoff))
+        cutoff = self.mincutoff + self.beta * np.abs(edx)
+        return self.x_filter.process(x, self.compute_alpha(cutoff))
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d99be94b15c58e1302df144621ce1bc16e0b245
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/9/14 8:50
+# @Project : FasterLivePortrait
+# @FileName: test_api.py
+import os
+import requests
+import zipfile
+from io import BytesIO
+import datetime
+import json
+
+
+def test_with_pickle_animal():
+    try:
+        data = {
+            'flag_is_animal': True,
+            'flag_pickle': True,
+            'flag_relative_input': True,
+            'flag_do_crop_input': True,
+            'flag_remap_input': True,
+            'driving_multiplier': 1.0,
+            'flag_stitching': True,
+            'flag_crop_driving_video_input': True,
+            'flag_video_editing_head_rotation': False,
+            'scale': 2.3,
+            'vx_ratio': 0.0,
+            'vy_ratio': -0.125,
+            'scale_crop_driving_video': 2.2,
+            'vx_ratio_crop_driving_video': 0.0,
+            'vy_ratio_crop_driving_video': -0.1,
+            'driving_smooth_observation_variance': 1e-7
+        }
+        source_image_path = "./assets/examples/source/s39.jpg"
+        driving_pickle_path = "./assets/examples/driving/d8.pkl"
+
+        # 打开文件
+        files = {
+            'source_image': open(source_image_path, 'rb'),
+            'driving_pickle': open(driving_pickle_path, 'rb')
+        }
+
+        # 发送 POST 请求
+        response = requests.post("http://127.0.0.1:9871/predict/", files=files, data=data)
+        response.raise_for_status()
+        with zipfile.ZipFile(BytesIO(response.content), "r") as zip_ref:
+            # save files for each request in a different folder
+            dt = datetime.datetime.now()
+            ts = int(dt.timestamp())
+            tgt = f"./results/api_{ts}/"
+            os.makedirs(tgt, exist_ok=True)
+            zip_ref.extractall(tgt)
+            print("Extracted files into", tgt)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Request Error: {e}")
+
+
+def test_with_video_animal():
+    try:
+        data = {
+            'flag_is_animal': True,
+            'flag_pickle': False,
+            'flag_relative_input': True,
+            'flag_do_crop_input': True,
+            'flag_remap_input': True,
+            'driving_multiplier': 1.0,
+            'flag_stitching': True,
+            'flag_crop_driving_video_input': True,
+            'flag_video_editing_head_rotation': False,
+            'scale': 2.3,
+            'vx_ratio': 0.0,
+            'vy_ratio': -0.125,
+            'scale_crop_driving_video': 2.2,
+            'vx_ratio_crop_driving_video': 0.0,
+            'vy_ratio_crop_driving_video': -0.1,
+            'driving_smooth_observation_variance': 1e-7
+        }
+        source_image_path = "./assets/examples/source/s39.jpg"
+        driving_video_path = "./assets/examples/driving/d0.mp4"
+        files = {
+            'source_image': open(source_image_path, 'rb'),
+            'driving_video': open(driving_video_path, 'rb')
+        }
+        response = requests.post("http://127.0.0.1:9871/predict/", files=files, data=data)
+        response.raise_for_status()
+        with zipfile.ZipFile(BytesIO(response.content), "r") as zip_ref:
+            # save files for each request in a different folder
+            dt = datetime.datetime.now()
+            ts = int(dt.timestamp())
+            tgt = f"./results/api_{ts}/"
+            os.makedirs(tgt, exist_ok=True)
+            zip_ref.extractall(tgt)
+            print("Extracted files into", tgt)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Request Error: {e}")
+
+
+def test_with_video_human():
+    try:
+        data = {
+            'flag_is_animal': False,
+            'flag_pickle': False,
+            'flag_relative_input': True,
+            'flag_do_crop_input': True,
+            'flag_remap_input': True,
+            'driving_multiplier': 1.0,
+            'flag_stitching': True,
+            'flag_crop_driving_video_input': True,
+            'flag_video_editing_head_rotation': False,
+            'scale': 2.3,
+            'vx_ratio': 0.0,
+            'vy_ratio': -0.125,
+            'scale_crop_driving_video': 2.2,
+            'vx_ratio_crop_driving_video': 0.0,
+            'vy_ratio_crop_driving_video': -0.1,
+            'driving_smooth_observation_variance': 1e-7
+        }
+        source_image_path = "./assets/examples/source/s11.jpg"
+        driving_video_path = "./assets/examples/driving/d0.mp4"
+        files = {
+            'source_image': open(source_image_path, 'rb'),
+            'driving_video': open(driving_video_path, 'rb')
+        }
+        response = requests.post("http://127.0.0.1:9871/predict/", files=files, data=data)
+        response.raise_for_status()
+        with zipfile.ZipFile(BytesIO(response.content), "r") as zip_ref:
+            # save files for each request in a different folder
+            dt = datetime.datetime.now()
+            ts = int(dt.timestamp())
+            tgt = f"./results/api_{ts}/"
+            os.makedirs(tgt, exist_ok=True)
+            zip_ref.extractall(tgt)
+            print("Extracted files into", tgt)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Request Error: {e}")
+
+
+if __name__ == '__main__':
+    # Manual entry point: runs a single scenario; swap the comments below to pick another one.
+    test_with_video_animal()
+    # test_with_pickle_animal()
+    # test_with_video_human()
diff --git a/tests/test_gradio_local.py b/tests/test_gradio_local.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d1102b147ca9c65c93db04b310ec7f7ca959bf9
--- /dev/null
+++ b/tests/test_gradio_local.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/12/28
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: test_gradio_local.py
+"""
+python tests/test_gradio_local.py \
+ --src assets/examples/driving/d13.mp4 \
+ --dri assets/examples/driving/d11.mp4 \
+ --cfg configs/trt_infer.yaml
+"""
+
+import sys
+sys.path.append(".")
+import os
+import argparse
+import pdb
+import subprocess
+import ffmpeg
+import cv2
+import time
+import numpy as np
+import os
+import datetime
+import platform
+import pickle
+from omegaconf import OmegaConf
+from tqdm import tqdm
+
+from src.pipelines.gradio_live_portrait_pipeline import GradioLivePortraitPipeline
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Faster Live Portrait Pipeline')
+    parser.add_argument('--src', required=False, type=str, default="assets/examples/source/s12.jpg",
+                        help='source path')
+    parser.add_argument('--dri', required=False, type=str, default="assets/examples/driving/d14.mp4",
+                        help='driving path')
+    parser.add_argument('--cfg', required=False, type=str, default="configs/trt_infer.yaml", help='inference config')
+    parser.add_argument('--animal', action='store_true', help='use animal model')
+    parser.add_argument('--paste_back', action='store_true', default=False, help='paste back to origin image')
+    args, unknown = parser.parse_known_args()
+
+    infer_cfg = OmegaConf.load(args.cfg)
+    pipe = GradioLivePortraitPipeline(infer_cfg)
+    if args.animal:
+        pipe.init_models(is_animal=True)
+
+    dri_ext = os.path.splitext(args.dri)[-1][1:].lower()
+    if dri_ext in ["pkl"]:
+        out_path, out_path_concat, total_time = pipe.run_pickle_driving(args.dri,
+                                                                        args.src,
+                                                                        update_ret=True)
+    elif dri_ext in ["mp4"]:
+        out_path, out_path_concat, total_time = pipe.run_video_driving(args.dri,
+                                                                       args.src,
+                                                                       update_ret=True)
+    elif dri_ext in ["mp3", "wav"]:
+        out_path, out_path_concat, total_time = pipe.run_audio_driving(args.dri,
+                                                                       args.src,
+                                                                       update_ret=True)
+    else:
+        out_path, out_path_concat, total_time = pipe.run_image_driving(args.dri,
+                                                                       args.src,
+                                                                       update_ret=True)
+    print(out_path, out_path_concat, total_time)
diff --git a/tests/test_models.py b/tests/test_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..f054effe095f5c92f555bc8d6dd4a36a2e0d7415
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,480 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/7/13 17:20
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: test_models.py
+import json
+import os, sys
+import pdb
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+
+def test_warping_spade_model():
+    """
+    test warping model in onnx and trt
+    :return:
+    """
+    import numpy as np
+    import time
+    from src.models import WarpingSpadeModel
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path="./checkpoints/liveportrait_animal_onnx/warping_spade-fix.trt",
+    )
+
+    trt_model = WarpingSpadeModel(**trt_kwargs)
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path="./checkpoints/liveportrait_animal_onnx/warping_spade.onnx",
+    )
+    onnx_model = WarpingSpadeModel(**onnx_kwargs)
+
+    feature_3d = np.random.randn(1, 32, 16, 64, 64)
+    kp_source = np.random.randn(1, 21, 3)
+    kp_driving = np.random.randn(1, 21, 3)
+
+    trt_rets = trt_model.predict(feature_3d, kp_source, kp_driving)
+    onnx_rets = onnx_model.predict(feature_3d, kp_source, kp_driving)
+
+    # for i in range(len(trt_rets)):
+    print(f"output max diff:{np.abs(trt_rets - onnx_rets).max()}")
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        trt_rets = trt_model.predict(feature_3d, kp_source, kp_driving)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} tensorrt inference time: min: {}, max: {}, mean: {}".format(WarpingSpadeModel.__name__, np.min(infer_times),
+                                                                        np.max(infer_times), np.median(infer_times)))
+
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(feature_3d, kp_source, kp_driving)
+        infer_times.append(time.time() - t0)
+    print("{} onnx inference time: min: {}, max: {}, mean: {}".format(WarpingSpadeModel.__name__, np.min(infer_times),
+                                                                      np.max(infer_times), np.median(infer_times)))
+
+
+def test_motion_extractor_model():
+    """
+    test motion_extractor model in onnx and trt
+    :return:
+    """
+    import numpy as np
+    import time
+    import cv2
+    from src.models import MotionExtractorModel
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path="./checkpoints/liveportrait_animal_onnx/motion_extractor.trt",
+        debug=True
+    )
+
+    trt_model = MotionExtractorModel(**trt_kwargs)
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path="./checkpoints/liveportrait_animal_onnx/motion_extractor.onnx",
+        debug=True
+    )
+    onnx_model = MotionExtractorModel(**onnx_kwargs)
+
+    img_bgr = cv2.imread("assets/examples/source/s1.jpg")
+    img_rgb = img_bgr[:, :, ::-1]
+    input = cv2.resize(img_rgb, (256, 256))
+
+    trt_rets = trt_model.predict(input)
+    onnx_rets = onnx_model.predict(input)
+    for i in range(len(trt_rets)):
+        print(f"output {i} max diff:{np.abs(trt_rets[i] - onnx_rets[i]).max()}")
+    pdb.set_trace()
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        trt_rets = trt_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print("{} tensorrt inference time: min: {}, max: {}, mean: {}".format(MotionExtractorModel.__name__,
+                                                                          np.min(infer_times),
+                                                                          np.max(infer_times), np.median(infer_times)))
+
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} onnx inference time: min: {}, max: {}, mean: {}".format(MotionExtractorModel.__name__, np.min(infer_times),
+                                                                    np.max(infer_times), np.median(infer_times)))
+
+
+def test_appearance_extractor_model():
+    """
+    test motion_extractor model in onnx and trt
+    :return:
+    """
+    import numpy as np
+    import time
+    import cv2
+    from src.models import AppearanceFeatureExtractorModel
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path="./checkpoints/liveportrait_onnx/appearance_feature_extractor.trt",
+    )
+
+    trt_model = AppearanceFeatureExtractorModel(**trt_kwargs)
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path="./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx",
+    )
+    onnx_model = AppearanceFeatureExtractorModel(**onnx_kwargs)
+
+    img_bgr = cv2.imread("assets/examples/source/s1.jpg")
+    img_rgb = img_bgr[:, :, ::-1]
+    input = cv2.resize(img_rgb, (256, 256))
+
+    trt_rets = trt_model.predict(input)
+    onnx_rets = onnx_model.predict(input)
+    print(f"output max diff:{np.abs(trt_rets - onnx_rets).max()}")
+    pdb.set_trace()
+    infer_times = []
+    for _ in range(20):
+        t0 = time.time()
+        trt_rets = trt_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print("{} tensorrt inference time: min: {}, max: {}, mean: {}".format(AppearanceFeatureExtractorModel.__name__,
+                                                                          np.min(infer_times),
+                                                                          np.max(infer_times), np.mean(infer_times)))
+
+    # onnx is so slow, don't why, maybe the grid_sample op not implemented well?
+    infer_times = []
+    for _ in range(20):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} onnx inference time: min: {}, max: {}, mean: {}".format(AppearanceFeatureExtractorModel.__name__,
+                                                                    np.min(infer_times),
+                                                                    np.max(infer_times), np.mean(infer_times)))
+
+
+def test_landmark_model():
+    """
+    test motion_extractor model in onnx and trt
+    :return:
+    """
+    import numpy as np
+    import time
+    import cv2
+    from src.models import LandmarkModel
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path="./checkpoints/liveportrait_onnx/landmark.trt",
+        debug=True
+    )
+
+    trt_model = LandmarkModel(**trt_kwargs)
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path="./checkpoints/liveportrait_onnx/landmark.onnx",
+        debug=True
+    )
+    onnx_model = LandmarkModel(**onnx_kwargs)
+
+    img_bgr = cv2.imread("assets/examples/source/s1.jpg")
+    img_rgb = img_bgr[:, :, ::-1]
+    input = cv2.resize(img_rgb, (224, 224))
+
+    trt_rets = trt_model.predict(input)
+    onnx_rets = onnx_model.predict(input)
+    print(f"output max diff:{np.abs(trt_rets - onnx_rets).max()}")
+    pdb.set_trace()
+
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        trt_rets = trt_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print("{} tensorrt inference time: min: {}, max: {}, mean: {}".format(LandmarkModel.__name__,
+                                                                          np.min(infer_times),
+                                                                          np.max(infer_times), np.median(infer_times)))
+
+    # onnx is so slow, don't why, maybe the grid_sample op not implemented well?
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} onnx inference time: min: {}, max: {}, mean: {}".format(LandmarkModel.__name__,
+                                                                    np.min(infer_times),
+                                                                    np.max(infer_times), np.median(infer_times)))
+
+
+def test_face_analysis_model():
+    import numpy as np
+    import cv2
+    import time
+    from src.models import FaceAnalysisModel
+    img_bgr = cv2.imread("assets/examples/source/s1.jpg")
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path=["./checkpoints/liveportrait_onnx/retinaface_det_static.onnx",
+                    "./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx"],
+    )
+    onnx_model = FaceAnalysisModel(**onnx_kwargs)
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path=["./checkpoints/liveportrait_onnx/retinaface_det_static.trt",
+                    "./checkpoints/liveportrait_onnx/face_2dpose_106_static.trt"],
+    )
+
+    trt_model = FaceAnalysisModel(**trt_kwargs)
+
+    trt_rets = trt_model.predict(img_bgr)[0]
+    onnx_rets = onnx_model.predict(img_bgr)[0]
+    for key in trt_rets:
+        print(f"output {key} max diff:{np.abs(trt_rets[key] - onnx_rets[key]).max()}")
+    pdb.set_trace()
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        trt_rets = trt_model.predict(img_bgr)
+        infer_times.append(time.time() - t0)
+    print("{} tensorrt inference time: min: {}, max: {}, mean: {}".format(FaceAnalysisModel.__name__,
+                                                                          np.min(infer_times),
+                                                                          np.max(infer_times), np.median(infer_times)))
+
+    infer_times = []
+    for _ in range(30):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(img_bgr)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} onnx inference time: min: {}, max: {}, mean: {}".format(FaceAnalysisModel.__name__, np.min(infer_times),
+                                                                    np.max(infer_times), np.median(infer_times)))
+
+
+def test_stitching_model():
+    """
+    test stitching model in onnx and trt
+    :return:
+    """
+    import numpy as np
+    import time
+    from src.models import StitchingModel
+
+    # tensorrt 模型加载
+    trt_kwargs = dict(
+        predict_type="trt",
+        model_path="./checkpoints/liveportrait_onnx/stitching.trt",
+    )
+
+    trt_model = StitchingModel(**trt_kwargs)
+
+    # onnx 模型加载
+    onnx_kwargs = dict(
+        predict_type="ort",
+        model_path="./checkpoints/liveportrait_onnx/stitching.onnx"
+    )
+    onnx_model = StitchingModel(**onnx_kwargs)
+
+    input = np.random.randn(1, 126)
+
+    trt_rets = trt_model.predict(input)
+    onnx_rets = onnx_model.predict(input)
+    print(f"output max diff:{np.abs(trt_rets - onnx_rets).max()}")
+
+    infer_times = []
+    for _ in range(20):
+        t0 = time.time()
+        trt_rets = trt_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print("{} tensorrt inference time: min: {}, max: {}, mean: {}".format(StitchingModel.__name__,
+                                                                          np.min(infer_times),
+                                                                          np.max(infer_times), np.median(infer_times)))
+
+    # onnx is so slow, don't why, maybe the grid_sample op not implemented well?
+    infer_times = []
+    for _ in range(20):
+        t0 = time.time()
+        onnx_rets = onnx_model.predict(input)
+        infer_times.append(time.time() - t0)
+    print(
+        "{} onnx inference time: min: {}, max: {}, mean: {}".format(StitchingModel.__name__,
+                                                                    np.min(infer_times),
+                                                                    np.max(infer_times), np.median(infer_times)))
+
+
+def test_mediapipe_face():
+    img_path = ""
+    import cv2
+    import mediapipe as mp
+    mp_drawing = mp.solutions.drawing_utils
+    mp_drawing_styles = mp.solutions.drawing_styles
+    mp_face_mesh = mp.solutions.face_mesh
+    os.makedirs('./results/mediapipe_test', exist_ok=True)
+    # For static images:
+    IMAGE_FILES = ["assets/examples/source/s9.jpg"]
+    drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
+    with mp_face_mesh.FaceMesh(
+            static_image_mode=True,
+            max_num_faces=1,
+            refine_landmarks=True,
+            min_detection_confidence=0.5) as face_mesh:
+        for idx, file in enumerate(IMAGE_FILES):
+            image = cv2.imread(file)
+            # Convert the BGR image to RGB before processing.
+            results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+
+            # Print and draw face mesh landmarks on the image.
+            if not results.multi_face_landmarks:
+                continue
+            annotated_image = image.copy()
+            for face_landmarks in results.multi_face_landmarks:
+                landmarks = []
+                for landmark in face_landmarks.landmark:
+                    # 提取每个关键点的 x, y, z 坐标
+                    landmarks.append({
+                        'x': landmark.x,
+                        'y': landmark.y,
+                        'z': landmark.z
+                    })
+                pdb.set_trace()
+                mp_drawing.draw_landmarks(
+                    image=annotated_image,
+                    landmark_list=face_landmarks,
+                    connections=mp_face_mesh.FACEMESH_TESSELATION,
+                    landmark_drawing_spec=None,
+                    connection_drawing_spec=mp_drawing_styles
+                    .get_default_face_mesh_tesselation_style())
+                mp_drawing.draw_landmarks(
+                    image=annotated_image,
+                    landmark_list=face_landmarks,
+                    connections=mp_face_mesh.FACEMESH_CONTOURS,
+                    landmark_drawing_spec=None,
+                    connection_drawing_spec=mp_drawing_styles
+                    .get_default_face_mesh_contours_style())
+                mp_drawing.draw_landmarks(
+                    image=annotated_image,
+                    landmark_list=face_landmarks,
+                    connections=mp_face_mesh.FACEMESH_IRISES,
+                    landmark_drawing_spec=None,
+                    connection_drawing_spec=mp_drawing_styles
+                    .get_default_face_mesh_iris_connections_style())
+            cv2.imwrite('./results/mediapipe_test/' + os.path.basename(file), annotated_image)
+
+
+def test_kokoro_model():
+    import os
+    os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
+    os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
+    import torchaudio
+
+    from src.models.kokoro.models import build_model
+    from src.models.kokoro.kokoro import generate
+    import torch
+
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    MODEL = build_model('checkpoints/Kokoro-82M/kokoro-v0_19.pth', device)
+    VOICE_NAME = [
+        'af',  # Default voice is a 50-50 mix of Bella & Sarah
+        'af_bella', 'af_sarah', 'am_adam', 'am_michael',
+        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
+        'af_nicole', 'af_sky',
+    ][0]
+    VOICEPACK = torch.load(f'checkpoints/Kokoro-82M/voices/{VOICE_NAME}.pt', weights_only=True).to(device)
+    print(f'Loaded voice: {VOICE_NAME}')
+
+    text = "How could I know? It's an unanswerable question. Like asking an unborn child if they'll lead a good life. They haven't even been born."
+    audio, out_ps = generate(MODEL, text, VOICEPACK, lang=VOICE_NAME[0])
+    audio_save_path = "./results/kokoro-82m/kokoro_test.wav"
+    os.makedirs(os.path.dirname(audio_save_path), exist_ok=True)
+    torchaudio.save(audio_save_path, audio[0], 24000)
+    print(f"audio save to {audio_save_path}")
+
+
+def test_kokoro_v1_model():
+    # import os
+    # os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
+    # os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
+    import torchaudio
+    from kokoro import KPipeline, KModel
+    import soundfile as sf
+    import numpy as np
+    import torch
+
+    # 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
+    # 🇯🇵 'j' => Japanese: pip install misaki[ja]
+    # 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
+    voice = 'jf_tebukuro'
+    with open("checkpoints/Kokoro-82M/config.json", "r", encoding="utf-8") as fin:
+        model_config = json.load(fin)
+    model = KModel(config=model_config, model="checkpoints/Kokoro-82M/kokoro-v1_0.pth")
+    pipeline = KPipeline(lang_code=voice[0], model=model)  # <= make sure lang_code matches voice
+    model.voices = {}
+    voice_path = "checkpoints/Kokoro-82M/voices"
+    for vname in os.listdir(voice_path):
+        pipeline.voices[os.path.splitext(vname)[0]] = torch.load(os.path.join(voice_path, vname), weights_only=True)
+    # This text is for demonstration purposes only, unseen during training
+    # text = '''
+    # The sky above the port was the color of television, tuned to a dead channel.
+    # "It's not like I'm using," Case heard someone say, as he shouldered his way through the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
+    # It was a Sprawl voice and a Sprawl joke. The Chatsubo was a bar for professional expatriates; you could drink there for a week and never hear two words in Japanese.
+    #
+    # These were to have an enormous impact, not only because they were associated with Constantine, but also because, as in so many other areas, the decisions taken by Constantine (or in his name) were to have great significance for centuries to come. One of the main issues was the shape that Christian churches were to take, since there was not, apparently, a tradition of monumental church buildings when Constantine decided to help the Christian church build a series of truly spectacular structures. The main form that these churches took was that of the basilica, a multipurpose rectangular structure, based ultimately on the earlier Greek stoa, which could be found in most of the great cities of the empire. Christianity, unlike classical polytheism, needed a large interior space for the celebration of its religious services, and the basilica aptly filled that need. We naturally do not know the degree to which the emperor was involved in the design of new churches, but it is tempting to connect this with the secular basilica that Constantine completed in the Roman forum (the so-called Basilica of Maxentius) and the one he probably built in Trier, in connection with his residence in the city at a time when he was still caesar.
+    #
+    # [Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
+    # '''
+    text = '「もしおれがただ偶然、そしてこうしようというつもりでなくここに立っているのなら、ちょっとばかり絶望するところだな」と、そんなことが彼の頭に思い浮かんだ。'
+    # text = '中國人民不信邪也不怕邪，不惹事也不怕事，任何外國不要指望我們會拿自己的核心利益做交易，不要指望我們會吞下損害我國主權、安全、發展利益的苦果！'
+    # text = 'Los partidos políticos tradicionales compiten con los populismos y los movimientos asamblearios.'
+    # text = 'Le dromadaire resplendissant déambulait tranquillement dans les méandres en mastiquant de petites feuilles vernissées.'
+    # text = 'ट्रांसपोर्टरों की हड़ताल लगातार पांचवें दिन जारी, दिसंबर से इलेक्ट्रॉनिक टोल कलेक्शनल सिस्टम'
+    # text = "Allora cominciava l'insonnia, o un dormiveglia peggiore dell'insonnia, che talvolta assumeva i caratteri dell'incubo."
+    # text = 'Elabora relatórios de acompanhamento cronológico para as diferentes unidades do Departamento que propõem contratos.'
+
+    # 4️⃣ Generate, display, and save audio files in a loop.
+    generator = pipeline(
+        text, voice=voice,  # <= change voice here
+        speed=1, split_pattern=r'\n+'
+    )
+    audios = []
+    for i, (gs, ps, audio) in enumerate(generator):
+        audios.append(audio)
+    audios = np.concatenate(audios)
+    sf.write(f'./results/kokoro-82m/kokoro_v1_0_{voice}.wav', audios, 24000)  # save each audio file
+    print(f'./results/kokoro-82m/kokoro_v1_0_{voice}.wav')
+
+
+if __name__ == '__main__':
+    # Manual entry point: exactly one check is enabled at a time; uncomment the one to run.
+    # test_warping_spade_model()
+    # test_motion_extractor_model()
+    # test_landmark_model()
+    # test_face_analysis_model()
+    # test_appearance_extractor_model()
+    # test_stitching_model()
+    # test_mediapipe_face()
+    # test_kokoro_model()
+    test_kokoro_v1_model()
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..db66a9ba246307a8d55a29ba80f2c66f9099c008
--- /dev/null
+++ b/tests/test_pipelines.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/12/15
+# @Author  : wenshao
+# @Email   : wenshaoguo1026@gmail.com
+# @Project : FasterLivePortrait
+# @FileName: test_pipelines.py
+import pdb
+import pickle
+import sys
+
+sys.path.append(".")
+
+
+def test_joyvasa_pipeline():
+    from src.pipelines.joyvasa_audio_to_motion_pipeline import JoyVASAAudio2MotionPipeline
+
+    pipe = JoyVASAAudio2MotionPipeline(
+        motion_model_path="checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt",
+        audio_model_path="checkpoints/chinese-hubert-base",
+        motion_template_path="checkpoints/JoyVASA/motion_template/motion_template.pkl")
+
+    audio_path = "assets/examples/driving/a-01.wav"
+    motion_data = pipe.gen_motion_sequence(audio_path)
+    with open("assets/examples/driving/d1-joyvasa.pkl", "wb") as fw:
+        pickle.dump(motion_data, fw)
+    pdb.set_trace()
+
+
+if __name__ == '__main__':
+    # Manual entry point when the file is run as a script.
+    test_joyvasa_pipeline()
diff --git a/update.bat b/update.bat
new file mode 100644
index 0000000000000000000000000000000000000000..3f51db02c066516915eb9844e9d13709d7260a6e
--- /dev/null
+++ b/update.bat
@@ -0,0 +1,7 @@
+@echo off
+rem Sync the working tree with origin/master (local changes are discarded), then refresh dependencies.
+git fetch origin
+git reset --hard origin/master
+
+rem Best-effort removal of a configured pip proxy. The previous "python -c" one-liner could never
+rem run (a try statement cannot follow ";" on one line), so call the pip CLI and ignore failures.
+".\venv\python.exe" -m pip config unset global.proxy >nul 2>&1
+".\venv\python.exe" -m pip install -r .\requirements_win.txt
+pause
\ No newline at end of file
diff --git a/webui.bat b/webui.bat
new file mode 100644
index 0000000000000000000000000000000000000000..fa6be93de44824824588093df7ccff36b4b49b14
--- /dev/null
+++ b/webui.bat
@@ -0,0 +1 @@
+:: Launch the web UI with the bundled venv interpreter, passing "--mode trt".
+.\venv\python.exe .\webui.py --mode trt
\ No newline at end of file
diff --git a/webui.py b/webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c2414bd9390c611192b39b1108752a4a0563e65
--- /dev/null
+++ b/webui.py
@@ -0,0 +1,350 @@
+# coding: utf-8
+
+"""
+The entrance of the gradio
+"""
+import os
+import pdb
+
+import gradio as gr
+import os.path as osp
+from omegaconf import OmegaConf
+
+from src.pipelines.gradio_live_portrait_pipeline import GradioLivePortraitPipeline
+
+
+def load_description(fp):
+    """Return the complete text of the UTF-8 encoded file at ``fp``."""
+    with open(fp, mode='r', encoding='utf-8') as handle:
+        return handle.read()
+
+
+import argparse
+
+# Command-line options. parse_known_args() is used, so unrecognised arguments end up in ``unknown``.
+parser = argparse.ArgumentParser(description='Faster Live Portrait Pipeline')
+parser.add_argument('--mode', required=False, type=str, default="onnx")
+parser.add_argument('--use_mp', action='store_true', help='use mediapipe or not')
+parser.add_argument("--host_ip", type=str, default="127.0.0.1", help="host ip")
+parser.add_argument("--port", type=int, default=9870, help="server port")
+args, unknown = parser.parse_known_args()
+
+# Config lookup keyed by (mode is "onnx", --use_mp given); any other --mode value uses the trt files.
+config_table = {(True, True): "configs/onnx_mp_infer.yaml", (True, False): "configs/onnx_infer.yaml",
+                (False, True): "configs/trt_mp_infer.yaml", (False, False): "configs/trt_infer.yaml"}
+cfg_path = config_table[(args.mode == "onnx", args.use_mp)]
+# The pipeline is built once at import time and shared by every callback below.
+infer_cfg = OmegaConf.load(cfg_path)
+gradio_pipeline = GradioLivePortraitPipeline(infer_cfg)
+
+
+def gpu_wrapped_execute_video(*call_args, **call_kwargs):  # forwards everything to gradio_pipeline.execute_video
+    return gradio_pipeline.execute_video(*call_args, **call_kwargs)
+
+
+def gpu_wrapped_execute_image(*call_args, **call_kwargs):  # forwards everything to gradio_pipeline.execute_image
+    return gradio_pipeline.execute_image(*call_args, **call_kwargs)
+
+
+def change_animal_model(is_animal):
+    """Call ``clean_models()`` on the shared pipeline, then ``init_models`` with the new ``is_animal`` flag."""
+    gradio_pipeline.clean_models()
+    gradio_pipeline.init_models(is_animal=is_animal)
+
+
+# Asset locations: the title file shown at the top of the page and the folders holding the example inputs
+title_md = "assets/gradio/gradio_title.md"
+example_portrait_dir = "assets/examples/source"
+example_video_dir = "assets/examples/driving"
+#################### interface logic ####################
+
+# Define the retargeting components first; they are placed into the layout later through .render()
+eye_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target eyes-open ratio")
+lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target lip-open ratio")
+retargeting_input_image = gr.Image(type="filepath")
+output_image = gr.Image(format="png", type="numpy")
+output_image_paste_back = gr.Image(format="png", type="numpy")
+# JavaScript handed to gr.Blocks(js=...): navigates to the same URL with __theme=dark unless it is already set
+js_func = """
+    function refresh() {
+        const url = new URL(window.location);
+
+        if (url.searchParams.get('__theme') !== 'dark') {
+            url.searchParams.set('__theme', 'dark');
+            window.location.href = url.href;
+        }
+    }
+    """
+
+with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")]), js=js_func) as demo:
+    gr.HTML(load_description(title_md))
+
+    gr.Markdown(load_description("assets/gradio/gradio_description_upload.md"))
+    with gr.Row():
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("🖼️ Source Image") as tab_image:
+                    with gr.Accordion(open=True, label="Source Image"):
+                        source_image_input = gr.Image(type="filepath")
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_portrait_dir, "s9.jpg")],
+                                [osp.join(example_portrait_dir, "s6.jpg")],
+                                [osp.join(example_portrait_dir, "s10.jpg")],
+                                [osp.join(example_portrait_dir, "s5.jpg")],
+                                [osp.join(example_portrait_dir, "s7.jpg")],
+                                [osp.join(example_portrait_dir, "s12.jpg")],
+                            ],
+                            inputs=[source_image_input],
+                            cache_examples=False,
+                        )
+
+                with gr.TabItem("🎞️ Source Video") as tab_video:
+                    with gr.Accordion(open=True, label="Source Video"):
+                        source_video_input = gr.Video()
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_video_dir, "d9.mp4")],
+                                [osp.join(example_video_dir, "d10.mp4")],
+                                [osp.join(example_video_dir, "d11.mp4")],
+                                [osp.join(example_video_dir, "d12.mp4")],
+                                [osp.join(example_video_dir, "d13.mp4")],
+                                [osp.join(example_video_dir, "d14.mp4")],
+                            ],
+                            inputs=[source_video_input],
+                            cache_examples=False,
+                        )
+
+                tab_selection = gr.Textbox(visible=False)
+                tab_image.select(lambda: "Image", None, tab_selection)
+                tab_video.select(lambda: "Video", None, tab_selection)
+            with gr.Accordion(open=True, label="Cropping Options for Source Image or Video"):
+                with gr.Row():
+                    flag_do_crop_input = gr.Checkbox(value=True, label="do crop (source)")
+                    scale = gr.Number(value=2.3, label="source crop scale", minimum=1.8, maximum=3.2, step=0.05)
+                    vx_ratio = gr.Number(value=0.0, label="source crop x", minimum=-0.5, maximum=0.5, step=0.01)
+                    vy_ratio = gr.Number(value=-0.125, label="source crop y", minimum=-0.5, maximum=0.5, step=0.01)
+
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("🎞️ Driving Video") as v_tab_video:
+                    with gr.Accordion(open=True, label="Driving Video"):
+                        driving_video_input = gr.Video()
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_video_dir, "d9.mp4")],
+                                [osp.join(example_video_dir, "d10.mp4")],
+                                [osp.join(example_video_dir, "d11.mp4")],
+                                [osp.join(example_video_dir, "d12.mp4")],
+                                [osp.join(example_video_dir, "d13.mp4")],
+                                [osp.join(example_video_dir, "d14.mp4")],
+                            ],
+                            inputs=[driving_video_input],
+                            cache_examples=False,
+                        )
+                with gr.TabItem("🖼️ Driving Image") as v_tab_image:
+                    with gr.Accordion(open=True, label="Driving Image"):
+                        driving_image_input = gr.Image(type="filepath")
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_portrait_dir, "s9.jpg")],
+                                [osp.join(example_portrait_dir, "s6.jpg")],
+                                [osp.join(example_portrait_dir, "s10.jpg")],
+                                [osp.join(example_portrait_dir, "s5.jpg")],
+                                [osp.join(example_portrait_dir, "s7.jpg")],
+                                [osp.join(example_portrait_dir, "s12.jpg")],
+                            ],
+                            inputs=[driving_image_input],
+                            cache_examples=False,
+                        )
+
+                with gr.TabItem("📁 Driving Pickle") as v_tab_pickle:
+                    with gr.Accordion(open=True, label="Driving Pickle"):
+                        driving_pickle_input = gr.File(type="filepath", file_types=[".pkl"])
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_video_dir, "d2.pkl")],
+                                [osp.join(example_video_dir, "d8.pkl")],
+                            ],
+                            inputs=[driving_pickle_input],
+                            cache_examples=False,
+                        )
+
+                with gr.TabItem("🎵 Driving Audio") as v_tab_audio:
+                    with gr.Accordion(open=True, label="Driving Audio"):
+                        driving_audio_input = gr.Audio(
+                            value=None,
+                            type="filepath",
+                            interactive=True,
+                            show_label=False,
+                            waveform_options=gr.WaveformOptions(
+                                sample_rate=24000,
+                            ),
+                        )
+                        gr.Examples(
+                            examples=[
+                                [osp.join(example_video_dir, "a-01.wav")],
+                            ],
+                            inputs=[driving_audio_input],
+                            cache_examples=False,
+                        )
+
+                with gr.TabItem("📄Driving Text") as v_tab_text:
+                    with gr.Accordion(open=True, label="Driving Text"):
+                        driving_text_input = gr.Textbox(value="Hi, I am created by Faster LivePortrait!", label="Driving Text")
+                        voice_dir = "checkpoints/Kokoro-82M/voices/"  # may be absent: then no voice is offered, no crash
+                        voice_files = sorted(os.listdir(voice_dir)) if osp.isdir(voice_dir) else []
+                        voice_names = [osp.splitext(vname)[0] for vname in voice_files if vname.endswith(".pt")]
+                        voice_name = gr.Dropdown(choices=voice_names, label="Voice Name",
+                                                 value='af_heart' if 'af_heart' in voice_names else None)  # default must be a listed choice
+
+                v_tab_selection = gr.Textbox(value="Video", visible=False)
+                v_tab_video.select(lambda: "Video", None, v_tab_selection)
+                v_tab_image.select(lambda: "Image", None, v_tab_selection)
+                v_tab_pickle.select(lambda: "Pickle", None, v_tab_selection)
+                v_tab_audio.select(lambda: "Audio", None, v_tab_selection)
+                v_tab_text.select(lambda: "Text", None, v_tab_selection)
+
+            # with gr.Accordion(open=False, label="Animation Instructions"):
+            # gr.Markdown(load_description("assets/gradio/gradio_description_animation.md"))
+            with gr.Accordion(open=True, label="Cropping Options for Driving Video"):
+                with gr.Row():
+                    flag_crop_driving_video_input = gr.Checkbox(value=False, label="do crop (driving)")
+                    scale_crop_driving_video = gr.Number(value=2.2, label="driving crop scale", minimum=1.8,
+                                                         maximum=3.2, step=0.05)
+                    vx_ratio_crop_driving_video = gr.Number(value=0.0, label="driving crop x", minimum=-0.5,
+                                                            maximum=0.5, step=0.01)
+                    vy_ratio_crop_driving_video = gr.Number(value=-0.1, label="driving crop y", minimum=-0.5,
+                                                            maximum=0.5, step=0.01)
+
+    with gr.Row():
+        with gr.Accordion(open=True, label="Animation Options"):
+            with gr.Row():
+                flag_relative_input = gr.Checkbox(value=False, label="relative motion")
+                flag_stitching = gr.Checkbox(value=True, label="stitching")
+                driving_multiplier = gr.Number(value=1.0, label="driving multiplier", minimum=0.0, maximum=2.0,
+                                               step=0.02)
+                cfg_scale = gr.Number(value=4.0, label="cfg_scale", minimum=0.0, maximum=10.0, step=0.5)
+                flag_remap_input = gr.Checkbox(value=True, label="paste-back")
+                animation_region = gr.Radio(["exp", "pose", "lip", "eyes", "all"], value="all",
+                                            label="animation region")
+                flag_video_editing_head_rotation = gr.Checkbox(value=False, label="relative head rotation (v2v)")
+                driving_smooth_observation_variance = gr.Number(value=1e-7, label="motion smooth strength (v2v)",
+                                                                minimum=1e-11, maximum=1e-2, step=1e-8)
+                flag_is_animal = gr.Checkbox(value=False, label="is_animal")
+
+    gr.Markdown(load_description("assets/gradio/gradio_description_animate_clear.md"))
+    with gr.Row():
+        process_button_animation = gr.Button("🚀 Animate", variant="primary")
+
+    with gr.Column():
+        with gr.Row():
+            with gr.Column():
+                output_video_i2v = gr.Video(autoplay=False, label="The animated video in the original image space")
+            with gr.Column():
+                output_video_concat_i2v = gr.Video(autoplay=False, label="The animated video")
+        with gr.Row():
+            with gr.Column():
+                output_image_i2i = gr.Image(format="png", type="numpy",
+                                            label="The animated image in the original image space",
+                                            visible=False)
+            with gr.Column():
+                output_image_concat_i2i = gr.Image(format="png", type="numpy", label="The animated image",
+                                                   visible=False)
+    with gr.Row():
+        process_button_reset = gr.ClearButton(  # also clears the driving audio, like every other media input
+            [source_image_input, source_video_input, driving_pickle_input, driving_video_input, driving_image_input,
+             driving_audio_input, output_video_i2v, output_video_concat_i2v, output_image_i2i, output_image_concat_i2i],
+            value="🧹 Clear")
+
+    # Retargeting
+    gr.Markdown(load_description("assets/gradio/gradio_description_retargeting.md"), visible=True)
+    with gr.Row(visible=True):
+        eye_retargeting_slider.render()
+        lip_retargeting_slider.render()
+    with gr.Row(visible=True):
+        process_button_retargeting = gr.Button("🚗 Retargeting", variant="primary")
+        process_button_reset_retargeting = gr.ClearButton(
+            [
+                eye_retargeting_slider,
+                lip_retargeting_slider,
+                retargeting_input_image,
+                output_image,
+                output_image_paste_back
+            ],
+            value="🧹 Clear"
+        )
+    with gr.Row(visible=True):
+        with gr.Column():
+            with gr.Accordion(open=True, label="Retargeting Input"):
+                retargeting_input_image.render()
+                gr.Examples(
+                    examples=[
+                        [osp.join(example_portrait_dir, "s9.jpg")],
+                        [osp.join(example_portrait_dir, "s6.jpg")],
+                        [osp.join(example_portrait_dir, "s10.jpg")],
+                        [osp.join(example_portrait_dir, "s5.jpg")],
+                        [osp.join(example_portrait_dir, "s7.jpg")],
+                        [osp.join(example_portrait_dir, "s12.jpg")],
+                    ],
+                    inputs=[retargeting_input_image],
+                    cache_examples=False,
+                )
+        with gr.Column():
+            with gr.Accordion(open=True, label="Retargeting Result"):
+                output_image.render()
+        with gr.Column():
+            with gr.Accordion(open=True, label="Paste-back Result"):
+                output_image_paste_back.render()
+
+    flag_is_animal.change(change_animal_model, inputs=[flag_is_animal])  # re-runs clean_models/init_models on toggle
+    # Bind the button callbacks; both go through the module-level wrapper functions
+    process_button_retargeting.click(
+        # fn=gradio_pipeline.execute_image,  (direct binding kept for reference; the wrapper below is what is used)
+        fn=gpu_wrapped_execute_image,
+        inputs=[eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image, flag_do_crop_input],
+        outputs=[output_image, output_image_paste_back],
+        show_progress=True
+    )
+    process_button_animation.click(
+        fn=gpu_wrapped_execute_video,
+        inputs=[  # handed to the *args wrapper in this order; NOTE(review): keep in sync with execute_video's parameters
+            source_image_input,
+            source_video_input,
+            driving_video_input,
+            driving_image_input,
+            driving_pickle_input,
+            driving_audio_input,
+            driving_text_input,
+            flag_relative_input,
+            flag_do_crop_input,
+            flag_remap_input,
+            driving_multiplier,
+            flag_stitching,
+            flag_crop_driving_video_input,
+            flag_video_editing_head_rotation,
+            flag_is_animal,
+            animation_region,
+            scale,
+            vx_ratio,
+            vy_ratio,
+            scale_crop_driving_video,
+            vx_ratio_crop_driving_video,
+            vy_ratio_crop_driving_video,
+            driving_smooth_observation_variance,
+            tab_selection,  # hidden textbox: set to "Image" or "Video" when a source tab is selected
+            v_tab_selection,  # hidden textbox: "Video" (initial), "Image", "Pickle", "Audio" or "Text"
+            cfg_scale,
+            voice_name
+        ],
+        outputs=[output_video_i2v, output_video_i2v, output_video_concat_i2v, output_video_concat_i2v,
+                 output_image_i2i, output_image_i2i, output_image_concat_i2i, output_image_concat_i2i],  # each listed twice: presumably value + visibility update; TODO confirm
+        show_progress=True
+    )
+
+if __name__ == '__main__':
+    demo.launch(
+        server_port=args.port,
+        share=False,
+        server_name=args.host_ip
+    )