AndroidGuy committed
Commit 8dc9718 · 1 Parent(s): 736c8f2

Add files with Git LFS support

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. .gitignore +14 -0
  3. DockerfileAPI +7 -0
  4. LICENSE +33 -0
  5. README.md +183 -11
  6. README_ZH.md +173 -0
  7. api.py +479 -0
  8. assets/.gitignore +2 -0
  9. assets/docs/API.md +41 -0
  10. assets/docs/API_ZH.md +47 -0
  11. assets/gradio/gradio_description_animate_clear.md +6 -0
  12. assets/gradio/gradio_description_animation.md +19 -0
  13. assets/gradio/gradio_description_retargeting.md +14 -0
  14. assets/gradio/gradio_description_upload.md +16 -0
  15. assets/gradio/gradio_title.md +19 -0
  16. assets/mask_template.png +0 -0
  17. camera.bat +32 -0
  18. configs/onnx_infer.yaml +114 -0
  19. configs/onnx_mp_infer.yaml +108 -0
  20. configs/trt_infer.yaml +114 -0
  21. configs/trt_mp_infer.yaml +108 -0
  22. requirements.txt +18 -0
  23. requirements_macos.txt +18 -0
  24. requirements_win.txt +17 -0
  25. run.py +322 -0
  26. scripts/all_onnx2trt.bat +29 -0
  27. scripts/all_onnx2trt.sh +17 -0
  28. scripts/all_onnx2trt_animal.sh +12 -0
  29. scripts/onnx2trt.py +161 -0
  30. scripts/start_api.sh +3 -0
  31. src/__init__.py +5 -0
  32. src/models/JoyVASA/__init__.py +6 -0
  33. src/models/JoyVASA/common.py +46 -0
  34. src/models/JoyVASA/dit_talking_head.py +538 -0
  35. src/models/JoyVASA/helper.py +32 -0
  36. src/models/JoyVASA/hubert.py +51 -0
  37. src/models/JoyVASA/wav2vec2.py +119 -0
  38. src/models/XPose/__init__.py +6 -0
  39. src/models/XPose/config_model/UniPose_SwinT.py +125 -0
  40. src/models/XPose/config_model/__init__.py +6 -0
  41. src/models/XPose/config_model/coco_transformer.py +8 -0
  42. src/models/XPose/models/UniPose/__init__.py +10 -0
  43. src/models/XPose/models/UniPose/attention.py +373 -0
  44. src/models/XPose/models/UniPose/backbone.py +211 -0
  45. src/models/XPose/models/UniPose/deformable_transformer.py +1230 -0
  46. src/models/XPose/models/UniPose/fuse_modules.py +276 -0
  47. src/models/XPose/models/UniPose/mask_generate.py +56 -0
  48. src/models/XPose/models/UniPose/ops/__init__.py +6 -0
  49. src/models/XPose/models/UniPose/ops/functions/__init__.py +10 -0
  50. src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py +61 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.gif filter=lfs diff=lfs merge=lfs -text
37
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ *.wav filter=lfs diff=lfs merge=lfs -text
39
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ __pycache__
2
+ .idea
3
+ *.pyc
4
+ .DS_Store
5
+ checkpoints
6
+ results
7
+ venv
8
+ *.egg-info
9
+ build
10
+ dist
11
+ *.eg
12
+ checkpoints_test
13
+ logs
14
+ third_party
DockerfileAPI ADDED
@@ -0,0 +1,7 @@
1
+ FROM shaoguo/faster_liveportrait:v3
2
+ USER root
3
+ RUN mkdir -p /root/FasterLiveportrait
4
+ RUN chown -R root:root /root/FasterLiveportrait
5
+ COPY . /root/FasterLiveportrait
6
+ WORKDIR /root/FasterLiveportrait
7
+ CMD ["/bin/bash", "-c", "bash scripts/start_api.sh"]
LICENSE ADDED
@@ -0,0 +1,33 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 warmshao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ ADDITIONAL NOTICE FOR MODELS:
26
+
27
+ This repository may contain or reference machine learning models. These models
28
+ are subject to their respective licenses, which may differ from the MIT license
29
+ applied to the code in this repository. Users are responsible for complying
30
+ with the license terms of any models they use. This repository and its
31
+ maintainers assume no responsibility for model licensing compliance.
32
+
33
+ Please check the original source and license of each model before use.
README.md CHANGED
@@ -1,11 +1,183 @@
1
- ---
2
- title: FasterLivepotrait
3
- emoji: 💻
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ## FasterLivePortrait: Bring portraits to life in Real Time!
2
+ <a href="README.md">English</a> | <a href="README_ZH.md">中文</a>
3
+
4
+ **Original repository: [LivePortrait](https://github.com/KwaiVGI/LivePortrait), thanks to the authors for sharing**
5
+
6
+ **New features:**
7
+ * Achieved real-time running of LivePortrait on RTX 3090 GPU using TensorRT, reaching speeds of 30+ FPS. This is the speed for rendering a single frame, including pre- and post-processing, not just the model inference speed.
8
+ * Seamless support for the native Gradio app, running several times faster, with simultaneous inference on multiple faces and support for the animal model.
9
+ * Added support for [JoyVASA](https://github.com/jdh-algo/JoyVASA), which can drive videos or images with audio.
10
+
11
+ **If you find this project useful, please give it a star ✨✨**
12
+
13
+ ### Demo (Explore more features)
14
+ * Anyone want this? Feel free to contact me.
15
+
16
+ <video src="https://github.com/user-attachments/assets/554c37fc-d098-4938-a638-1660d85d222e" controls="controls" width="500" height="300">Your browser does not support this video!</video>
17
+
18
+
19
+ * Text-driven video, based on kokoro-82M:
20
+
21
+ <video src="https://github.com/user-attachments/assets/04e962e2-6c57-4d01-ae4a-2f6d2d501c5a" controls="controls" width="500" height="300">Your browser does not support this video!</video>
22
+
23
+ * Audio-driven video (real-time):
24
+
25
+ <video src="https://github.com/user-attachments/assets/98bb5ff7-0796-42db-9d7b-e04ddd2c3c14" controls="controls" width="500" height="300">Your browser does not support this video!</video>
26
+
27
+ * Animal-driven:
28
+
29
+ <video src="https://github.com/user-attachments/assets/dada0a92-593a-480b-a034-cbcce16e38b9" controls="controls" width="500" height="300">Your browser does not support this video!</video>
30
+
31
+ * Multiple faces driven simultaneously:
32
+
33
+ <video src="https://github.com/KwaiVGI/LivePortrait/assets/138360003/b37de35d-6feb-4100-b73f-58ac23121483" controls="controls" width="500" height="300">Your browser does not support this video!</video>
34
+
35
+
36
+ ### Environment Setup
37
+ * Option 1 (recommended): If you are a Windows user, you can directly download the [integrated package](https://github.com/warmshao/FasterLivePortrait/releases/tag/v1.8).
38
+ * You need to install [git](https://git-scm.com/downloads) first, then double-click `update.bat` to update the code.
39
+ * Double-click `scripts/all_onnx2trt.bat` to convert onnx files to tensorrt files.
40
+ * Double-click `webui.bat` to open the webpage, or double-click `camera.bat` to open the camera for real-time operation.
41
+ * Option 2: Docker. A Docker image is provided that eliminates the need to install onnxruntime-gpu and TensorRT manually.
42
+ * Install [Docker](https://docs.docker.com/desktop/install/windows-install/) according to your system
43
+ * Download the image: `docker pull shaoguo/faster_liveportrait:v3`
44
+ * Execute the command, replace `$FasterLivePortrait_ROOT` with the local directory where you downloaded FasterLivePortrait:
45
+ ```shell
46
+ docker run -it --gpus=all \
47
+ --name faster_liveportrait \
48
+ -v $FasterLivePortrait_ROOT:/root/FasterLivePortrait \
49
+ --restart=always \
50
+ -p 9870:9870 \
51
+ shaoguo/faster_liveportrait:v3 \
52
+ /bin/bash
53
+ ```
54
+ * Option 3: Create a new Python virtual environment and install the necessary Python packages manually.
55
+ * First, install [ffmpeg](https://www.ffmpeg.org/download.html)
56
+ * Run `pip install -r requirements.txt`
57
+ * Then follow the tutorials below to install onnxruntime-gpu or TensorRT. Note that this has only been tested on Linux systems.
58
+
59
+ ### Usage
60
+ #### 1. TensorRT Inference (Recommended)
61
+ * (Ignored in Docker) Install TensorRT 8.x (versions >=10.x are not compatible). Remember the installation path of [TensorRT](https://developer.nvidia.com/tensorrt).
62
+ * (Ignored in Docker) Install the grid_sample TensorRT plugin, because the model uses a grid_sample operation with 5D input, which the native grid_sample operator does not support.
63
+ * `git clone https://github.com/SeanWangJS/grid-sample3d-trt-plugin`
64
+ * Modify line 30 in `CMakeLists.txt` to: `set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "60;70;75;80;86")`
65
+ * `export PATH=/usr/local/cuda/bin:$PATH`
66
+ * `mkdir build && cd build`
67
+ * `cmake .. -DTensorRT_ROOT=$TENSORRT_HOME`, replace $TENSORRT_HOME with your own TensorRT root directory.
68
+ * `make`, then note the path of the generated `.so` file and replace `/opt/grid-sample3d-trt-plugin/build/libgrid_sample_3d_plugin.so` in `scripts/onnx2trt.py` and `src/models/predictor.py` with your own `.so` path
69
+ * Download the ONNX model files: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`. Convert all ONNX models to TensorRT by running `sh scripts/all_onnx2trt.sh` and `sh scripts/all_onnx2trt_animal.sh`
70
+ * Test the pipeline using tensorrt:
71
+ ```shell
72
+ python run.py \
73
+ --src_image assets/examples/source/s10.jpg \
74
+ --dri_video assets/examples/driving/d14.mp4 \
75
+ --cfg configs/trt_infer.yaml
+ ```
76
+ * To run in real-time using a camera:
77
+ ```shell
78
+ python run.py \
79
+ --src_image assets/examples/source/s10.jpg \
80
+ --dri_video 0 \
81
+ --cfg configs/trt_infer.yaml \
82
+ --realtime
83
+ ```
84
+
85
+ #### 2. Onnxruntime Inference
86
+ * First, download the converted onnx model files:`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`.
87
+ * (Ignored in Docker) If you want to use onnxruntime CPU inference, simply `pip install onnxruntime`. However, CPU inference is extremely slow and not recommended. The latest onnxruntime-gpu still doesn't support grid_sample on CUDA, but I found a branch that supports it. Follow these steps to install `onnxruntime-gpu` from source:
88
+ * `git clone https://github.com/microsoft/onnxruntime`
89
+ * `git checkout liqun/ImageDecoder-cuda`. Thanks to liqun for the grid_sample with cuda implementation!
90
+ * Run the following commands to compile, changing `cuda_version` and `CMAKE_CUDA_ARCHITECTURES` according to your machine (your cuDNN version must be 8.x, 9.x is not compatible):
91
+ ```shell
92
+ ./build.sh --parallel \
93
+ --build_shared_lib --use_cuda \
94
+ --cuda_version 11.8 \
95
+ --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda/ \
96
+ --config Release --build_wheel --skip_tests \
97
+ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="60;70;75;80;86" \
98
+ --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
99
+ --disable_contrib_ops \
100
+ --allow_running_as_root
101
+ ```
102
+ * `pip install build/Linux/Release/dist/onnxruntime_gpu-1.17.0-cp310-cp310-linux_x86_64.whl`
103
+ * Test the pipeline using onnxruntime:
104
+ ```shell
105
+ python run.py \
106
+ --src_image assets/examples/source/s10.jpg \
107
+ --dri_video assets/examples/driving/d14.mp4 \
108
+ --cfg configs/onnx_infer.yaml
109
+ ```
110
+
111
+
112
+ ### Gradio WebUI
113
+ * onnxruntime: `python webui.py --mode onnx`
114
+ * tensorrt: `python webui.py --mode trt`
115
+ * The default port is 9870. Open the webpage: `http://localhost:9870/`
116
+
117
+ Hotkeys for webcam mode (when the render window is in focus)\
118
+ Q > exit\
119
+ S > Stitching\
120
+ Z > RelativeMotion\
121
+ X > AnimationRegion\
122
+ C > CropDrivingVideo\
123
+ K,L > AdjustSourceScale\
124
+ N,M > AdjustDriverScale
125
+
126
+ ## License
127
+
128
+ - **Code**: This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
129
+ - **Models**: Any machine learning models used in this project are subject to their respective licenses. Please refer to the original model sources for license information. We do not take responsibility for model license compliance.
130
+
131
+
132
+ **Changelog**
133
+ - [x] **2025/06/29:** LivePortrait animal v1.1 ONNX models are available. Download them from [here](https://huggingface.co/warmshao/FasterLivePortrait/tree/main/liveportrait_animal_onnx_v1.1).
134
+ - [x] **2024/12/22:** Added API deployment (`python api.py`). For more information, please refer to the [tutorial](assets/docs/API.md).
135
+ - [x] **2024/12/21:** Added support for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M), enabling text-driven video or image generation.
136
+ - Update the code with `git pull origin master` and install the latest Python dependencies with `pip install -r requirements.txt`, or simply double-click `update.bat` on Windows.
137
+ - Download the model: `huggingface-cli download hexgrad/Kokoro-82M --local-dir .\checkpoints\Kokoro-82M`.
138
+ - For Linux, install `espeak-ng`: `apt-get -qq -y install espeak-ng > /dev/null 2>&1`
139
+ - For Windows, refer to [manual installation instructions](https://huggingface.co/hexgrad/Kokoro-82M/discussions/12) and configure the `espeak-ng` environment variables. The current read location is [here](src/pipelines/gradio_live_portrait_pipeline.py:437); modify it if your installation path differs.
140
+ - Now you can use it normally in the "Drive Text" tab.
141
+ - [x] **2024/12/16:** Added support for [JoyVASA](https://github.com/jdh-algo/JoyVASA), which can drive videos or images with audio. Very cool!
142
+ - Update code, then download the models: `huggingface-cli download TencentGameMate/chinese-hubert-base --local-dir .\checkpoints\chinese-hubert-base` and `huggingface-cli download jdh-algo/JoyVASA --local-dir ./checkpoints/JoyVASA`
143
+ - After launching the webui, follow the tutorial below. When the source is a video, it's recommended to drive only the mouth movements.
144
+
145
+ <video src="https://github.com/user-attachments/assets/42fb24be-0cde-4138-9671-e52eec95e7f5" controls="controls" width="500" height="400">Your browser does not support this video!</video>
146
+
147
+ - [x] **2024/12/14:** Added pickle and image driving, as well as region-based driving (`animation_region`).
148
+ - Please update the latest code. Windows users can directly double-click `update.bat` to update, but note that your local code will be overwritten.
149
+ - Running `python run.py` now automatically saves the corresponding pickle to the same directory as the driving video, allowing for direct reuse.
150
+ - After opening webui, you can experience the new pickle and image driving, as well as the region driving animation_region features. Note that for image driving, remember to disable `relative motion`.
151
+ - [x] **2024/08/11:** Optimized paste_back speed and fixed some bugs.
152
+ - Used torchgeometry + cuda to optimize the paste_back function, significantly improving speed. Example: `python run.py --src_image assets/examples/source/s39.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_infer.yaml --paste_back --animal`
153
+ - Fixed issues with Xpose ops causing errors on some GPUs and other bugs. Please use the latest docker image: `docker pull shaoguo/faster_liveportrait:v3`
157
+ - [x] **2024/08/07:** Added support for animal models and MediaPipe models, so you no longer need to worry about copyright issues.
158
+ - Added support for animal models.
159
+ - Download the animal ONNX file: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`, then convert it to TRT format.
160
+ - Update the Docker image: `docker pull shaoguo/faster_liveportrait:v3`. Using animal model:`python run.py --src_image assets/examples/source/s39.jpg --dri_video 0 --cfg configs/trt_infer.yaml --realtime --animal`
161
+ - Windows users can download the latest [Windows all-in-one package](https://github.com/warmshao/FasterLivePortrait/releases) from the release page, then unzip and use it.
162
+ - Simple usage tutorial:
163
+
164
+ <video src="https://github.com/user-attachments/assets/dc37e2dd-551a-43b0-8929-fc5d5fe16ec5" controls="controls" width="500" height="300">Your browser does not support this video!</video>
165
+
166
+ - Using MediaPipe model to replace InsightFace
167
+ - For web usage: `python webui.py --mode trt --mp` or `python webui.py --mode onnx --mp`
168
+ - For local webcam: `python run.py --src_image assets/examples/source/s12.jpg --dri_video 0 --cfg configs/trt_mp_infer.yaml`
169
+ - [x] **2024/07/24:** Windows integration package, no installation required, one-click run, supports TensorRT and OnnxruntimeGPU. Thanks to @zhanghongyong123456 for their contribution in this [issue](https://github.com/warmshao/FasterLivePortrait/issues/22).
170
+ - [Optional] If you have already installed CUDA and cuDNN on your Windows computer, please skip this step. I have only verified on CUDA 12.2. If you haven't installed CUDA or encounter CUDA-related errors, you need to follow these steps:
171
+ - Download [CUDA 12.2](https://developer.nvidia.com/cuda-12-2-0-download-archive?target_os=Windows&target_arch=x86_64), double-click the exe and install following the default settings step by step.
172
+ - Download the [cuDNN](https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip) zip file, extract it, and copy the lib, bin, and include folders from the cuDNN folder to the CUDA 12.2 folder (default is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2)
173
+ - Download the installation-free [Windows integration package](https://github.com/warmshao/FasterLivePortrait/releases) from the release page and extract it.
174
+ - Enter `FasterLivePortrait-windows` and double-click `scripts/all_onnx2trt.bat` to convert onnx files, which will take some time.
175
+ - For web demo: Double-click `webui.bat`, open the webpage: `http://localhost:9870/`
176
+ - For real-time camera operation, double-click `camera.bat`; press `q` to stop. If you want to change the target image, run from the command line: `camera.bat assets/examples/source/s9.jpg`
177
+ - [x] **2024/07/18:** macOS support added (no need for Docker; Python is enough). M1/M2 chips are faster, but it's still quite slow 😟
178
+ - Install ffmpeg: `brew install ffmpeg`
179
+ - Set up a Python 3.10 virtual environment. Recommend using [miniforge](https://github.com/conda-forge/miniforge): `conda create -n flip python=3.10 && conda activate flip`
180
+ - Install requirements: `pip install -r requirements_macos.txt`
181
+ - Download ONNX files: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`
182
+ - Test: `python webui.py --mode onnx`
183
+ - [x] **2024/07/17:** Added support for Docker environment, providing a runnable image.
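
The WebUI and `run.py` entry points above cover most uses; for readers who want to script the pipeline directly, the `api.py` added in this commit shows the underlying call pattern (`prepare_source` once, then `run` per driving frame). The sketch below follows that pattern; the source image, driving video, and output paths are illustrative, and it assumes the checkpoints and TensorRT engines prepared above already exist under `./checkpoints`.

```python
# Minimal sketch of driving FasterLivePortraitPipeline from Python,
# mirroring the loop in api.py's run_with_video().
import os
import cv2
from omegaconf import OmegaConf
from src.pipelines.faster_live_portrait_pipeline import FasterLivePortraitPipeline

infer_cfg = OmegaConf.load("configs/trt_infer.yaml")
infer_cfg.infer_params.flag_pasteback = True  # paste the animated crop back into the full frame

pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=False)
if not pipe.prepare_source("assets/examples/source/s10.jpg", realtime=False):
    raise RuntimeError("no face detected in the source image")

os.makedirs("results/frames", exist_ok=True)
vcap = cv2.VideoCapture("assets/examples/driving/d14.mp4")
frame_ind = 0
while vcap.isOpened():
    ret, frame = vcap.read()
    if not ret:
        break
    # run() returns the cropped driving frame, the animated crop,
    # the full-resolution composite, and the driving motion info
    dri_crop, out_crop, out_org, dri_motion_info = pipe.run(
        frame, pipe.src_imgs[0], pipe.src_infos[0], first_frame=(frame_ind == 0))
    frame_ind += 1
    if out_crop is None:
        continue  # no face found in this driving frame
    out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)  # pipeline outputs RGB
    cv2.imwrite(f"results/frames/{frame_ind:05d}.jpg", out_org)
vcap.release()
```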
README_ZH.md ADDED
@@ -0,0 +1,173 @@
1
+ ## FasterLivePortrait:Bring portrait to life in Real Time!
2
+ <a href="README.md">English</a> | <a href="README_ZH.md">中文</a>
3
+
4
+ **原仓库: [LivePortrait](https://github.com/KwaiVGI/LivePortrait),感谢作者的分享**
5
+
6
+ **新增功能:**
7
+ * 通过TensorRT实现在RTX 3090显卡上**实时**运行LivePortrait,速度达到 30+ FPS. 这个速度是实测渲染出一帧的速度,而不仅仅是模型的推理时间。
8
+ * 无缝支持原生的gradio app, 速度快了好几倍,同时支持多张人脸、Animal模型。
9
+ * 增加[JoyVASA](https://github.com/jdh-algo/JoyVASA)的支持,可以用音频驱动视频或图片。
10
+
11
+ **如果你觉得这个项目有用,帮我点个star吧✨✨**
12
+
13
+ ### Demo(还有很多功能等你探索)
14
+ * 文本驱动视频,基于kokoro-82M:
15
+
16
+ <video src="https://github.com/user-attachments/assets/04e962e2-6c57-4d01-ae4a-2f6d2d501c5a" controls="controls" width="500" height="300">您的浏览器不支持播放该视频!</video>
17
+ * 声音驱动视频(可以实时):
18
+
19
+ <video src="https://github.com/user-attachments/assets/98bb5ff7-0796-42db-9d7b-e04ddd2c3c14" controls="controls" width="500" height="300">您的浏览器不支持播放该视频!</video>
20
+ * 动物驱动:
21
+
22
+ <video src="https://github.com/user-attachments/assets/dada0a92-593a-480b-a034-cbcce16e38b9" controls="controls" width="500" height="300">您的浏览器不支持播放该视频!</video>
23
+ * 多张人脸同时驱动:
24
+
25
+ <video src="https://github.com/KwaiVGI/LivePortrait/assets/138360003/b37de35d-6feb-4100-b73f-58ac23121483" controls="controls" width="500" height="300">您的浏览器不支持播放该视频!</video>
26
+
27
+
28
+ ### 环境安装
29
+ * 方式1:如果你是Windows用户,推荐可以直接下载[整合包](https://github.com/warmshao/FasterLivePortrait/releases/tag/v1.8)。
30
+ * 需要先安装好[git](https://git-scm.com/downloads), 双击`update.bat`更新代码。
31
+ * 双击`scripts/all_onnx2trt.bat`转换onnx文件为tensorrt文件。
32
+ * 双击`webui.bat`打开网页,或者双击`camera.bat`打开摄像头实时运行。
33
+ * 方式2:Docker,提供了一个镜像,不用再自己安装onnxruntime-gpu和TensorRT。
34
+ * 根据自己的系统安装[docker](https://docs.docker.com/desktop/install/windows-install/)
35
+ * 下载镜像:`docker pull shaoguo/faster_liveportrait:v3`
36
+ * 执行命令, `$FasterLivePortrait_ROOT`要替换成你下载的FasterLivePortrait在本地的目录:
37
+ ```shell
38
+ docker run -it --gpus=all \
39
+ --name faster_liveportrait \
40
+ -v $FasterLivePortrait_ROOT:/root/FasterLivePortrait \
41
+ --restart=always \
42
+ -p 9870:9870 \
43
+ shaoguo/faster_liveportrait:v3 \
44
+ /bin/bash
45
+ ```
46
+ * 然后可以根据下面Onnxruntime 推理和TensorRT 推理教程进行使用。
47
+
48
+ * 方式3:新建一个python虚拟环境,自己安装必要的python包
49
+ * 请先安装[ffmpeg](https://www.ffmpeg.org/download.html)
50
+ * `pip install -r requirements.txt`
51
+ * 再根据以下教程安装onnxruntime-gpu或TensorRT。
52
+
53
+ ### 使用方法
54
+ #### 1. TensorRT 推理(推荐, 可以实时)
55
+ * (Docker环境可忽略)安装TensorRT,请记住[TensorRT](https://developer.nvidia.com/tensorrt)安装的路径。
56
+ * (Docker环境可忽略)安装 grid_sample的tensorrt插件,因为模型用到的grid sample需要有5d的输入,原生的grid_sample 算子不支持。
57
+ * `git clone https://github.com/SeanWangJS/grid-sample3d-trt-plugin`
58
+ * 修改`CMakeLists.txt`中第30行为:`set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "60;70;75;80;86")`
59
+ * `export PATH=/usr/local/cuda/bin:$PATH`
60
+ * `mkdir build && cd build`
61
+ * `cmake .. -DTensorRT_ROOT=$TENSORRT_HOME`,$TENSORRT_HOME 替换成你自己TensorRT的根目录。
62
+ * `make`,记住so文件的地址,将`scripts/onnx2trt.py`和`src/models/predictor.py`里`/opt/grid-sample3d-trt-plugin/build/libgrid_sample_3d_plugin.so`替换成自己的so路径
63
+ * 下载Onnx文件:`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`。将onnx模型转为tensorrt,运行`sh scripts/all_onnx2trt.sh`和`sh scripts/all_onnx2trt_animal.sh`
64
+ * 用tensorrt测试pipeline:
65
+ ```shell
66
+ python run.py \
67
+ --src_image assets/examples/source/s10.jpg \
68
+ --dri_video assets/examples/driving/d14.mp4 \
69
+ --cfg configs/trt_infer.yaml
70
+ ```
71
+ 如果要使用摄像头实时运行:
72
+ ```shell
73
+ python run.py \
74
+ --src_image assets/examples/source/s10.jpg \
75
+ --dri_video 0 \
76
+ --cfg configs/trt_infer.yaml \
77
+ --realtime
78
+ ```
79
+ #### 2. Onnxruntime 推理
80
+ * 首先下载我转换好的[模型onnx文件](https://huggingface.co/warmshao/FasterLivePortrait): `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`。
81
+ * (Docker环境可忽略)如果你要用onnxruntime cpu推理的话,直接`pip install onnxruntime`即可,但是cpu推理超级慢。但是最新的onnxruntime-gpu仍然无法支持grid_sample cuda,好在我看到一位大佬在分支上支持了,按照以下步骤源码安装`onnxruntime-gpu`:
82
+ * `git clone https://github.com/microsoft/onnxruntime`
83
+ * `git checkout liqun/ImageDecoder-cuda`. Thanks for liqun's grid_sample with cuda implementation!
84
+ * 运行以下命令编译,`cuda_version`和`CMAKE_CUDA_ARCHITECTURES`根据自己的机器更改:
85
+ ```shell
86
+ ./build.sh --parallel \
87
+ --build_shared_lib --use_cuda \
88
+ --cuda_version 11.8 \
89
+ --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda/ \
90
+ --config Release --build_wheel --skip_tests \
91
+ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="60;70;75;80;86" \
92
+ --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
93
+ --disable_contrib_ops \
94
+ --allow_running_as_root
95
+ ```
96
+ * `pip install build/Linux/Release/dist/onnxruntime_gpu-1.17.0-cp310-cp310-linux_x86_64.whl`就可以了
97
+ * 用onnxruntime测试pipeline:
98
+ ```shell
99
+ python run.py \
100
+ --src_image assets/examples/source/s10.jpg \
101
+ --dri_video assets/examples/driving/d14.mp4 \
102
+ --cfg configs/onnx_infer.yaml
103
+ ```
104
+
105
+ ### Gradio WebUI
106
+ * onnxruntime: `python webui.py --mode onnx`
107
+ * tensorrt: `python webui.py --mode trt`
108
+ * 默认端口在9870,打开网页:`http://localhost:9870/`
109
+
110
+ Hotkeys for webcam mode (when render window is on focus)\
111
+ Q > exit\
112
+ S > Stitching\
113
+ Z > RelativeMotion\
114
+ X > AnimationRegion\
115
+ C > CropDrivingVideo\
116
+ K,L > AdjustSourceScale\
117
+ N,M > AdjustDriverScale
118
+
119
+ ## 许可证
120
+
121
+ - **代码**: 本项目采用 MIT 许可证 - 详细信息请查看 [LICENSE](LICENSE) 文件。
122
+ - **模型**: 本项目中使用的任何机器学习模型均遵循其各自的许可证。请参考原始模型来源获取许可证信息。我们不承担模型许可证合规性的责任。
123
+
124
+
125
+ **日志**
126
+ - [x] **2025/06/29:** [LivePortrait animal v1.1 onnx模型](https://huggingface.co/warmshao/FasterLivePortrait/tree/main/liveportrait_animal_onnx_v1.1)。
127
+ - [x] **2024/12/22:** 增加api部署`python api.py`, 其他参考[教程](assets/docs/API_ZH.md)使用。
128
+ - [x] **2024/12/21:** 增加[Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)的支持,可以用文本驱动视频或图片。
129
+ - 更新代码, `git pull origin master`并安装最新的python依赖 `pip install -r requirements.txt`, 或者 windows下直接双击 `update.bat`.
130
+ - 然后下载模型: `huggingface-cli download hexgrad/Kokoro-82M --local-dir .\checkpoints\Kokoro-82M`.
131
+ - 如果是Linux请安装`apt-get -qq -y install espeak-ng > /dev/null 2>&1`
132
+ - 如果是windows请参考[自行安装](https://huggingface.co/hexgrad/Kokoro-82M/discussions/12)并配置好`espeak-ng`环境变量。我是在[这里](src/pipelines/gradio_live_portrait_pipeline.py:437)读取,如果你的位置变了,请自行修改。
133
+ - 然后就可以在Drive Text的标签页正常使用了。
134
+ - [x] **2024/12/16:** 增加[JoyVASA](https://github.com/jdh-algo/JoyVASA)的支持,可以用音频驱动视频或图片。非常酷!
135
+ - 更新代码,然后下载模型: `huggingface-cli download TencentGameMate/chinese-hubert-base --local-dir .\checkpoints\chinese-hubert-base` 和 ` huggingface-cli download jdh-algo/JoyVASA --local-dir ./checkpoints/JoyVASA`
136
+ - 启动webui后根据以下教程使用即可,建议source 是视频的情况下只驱动嘴部
137
+
138
+ <video src="https://github.com/user-attachments/assets/42fb24be-0cde-4138-9671-e52eec95e7f5" controls="controls" width="500" height="400">您的浏览器不支持播放该视频!</video>
139
+
140
+ - [x] **2024/12/14:** 增加pickle和image驱动以及区域驱动`animation_region`。
141
+ - 请更新最新的代码,windows用户可以直接双击`update.bat`更新,但请注意本地的代码将会被覆盖。
142
+ - `python run.py ` 现在运行 `driving video`会自动保存对应的pickle到跟`driving video`一样的目录,可以直接复用。
143
+ - 打开`webui`后即可体验新的pickle和image驱动以及区域驱动`animation_region`等功能。注意image驱动记得把`relative motion`取消掉。
144
+ - [x] **2024/08/11:** 优化paste_back的速度,修复一些bug。
145
+ - 用torchgeometry + cuda优化paste_back函数,现在速度提升了很多。示例:`python run.py --src_image assets/examples/source/s39.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_infer.yaml --paste_back --animal`
146
+ - 修复Xpose的ops在一些显卡运行报错的问题等bug。请使用最新的镜像:`docker pull shaoguo/faster_liveportrait:v3`
147
+ - [x] **2024/08/07:** 增加animal模型的支持,同时支持mediapipe模型,现在你不用再担心版权的问题。
148
+ - 增加对animal模型的支持。
149
+ - 需要下载animal的onnx文件:`huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`,然后转换成trt文件。
150
+ - 更新镜像`docker pull shaoguo/faster_liveportrait:v3`, 使用animal模型的示例:`python run.py --src_image assets/examples/source/s39.jpg --dri_video 0 --cfg configs/trt_infer.yaml --realtime --animal`
151
+ - windows系统可以从release页下载最新的[windows 整合包](https://github.com/warmshao/FasterLivePortrait/releases),解压后使用。
152
+ - 简单的使用教程:
153
+
154
+ <video src="https://github.com/user-attachments/assets/dc37e2dd-551a-43b0-8929-fc5d5fe16ec5" controls="controls" width="500" height="300">您的浏览器不支持播放该视频!</video>
155
+
156
+ - 使用mediapipe模型替代insight_face
157
+ - 网页端使用: `python webui.py --mode trt --mp` 或 `python webui.py --mode onnx --mp`
158
+ - 本地摄像头运行: `python run.py --src_image assets/examples/source/s12.jpg --dri_video assets/examples/driving/d0.mp4 --cfg configs/trt_mp_infer.yaml`
159
+ - [x] **2024/07/24:** Windows的整合包, 免安装一键运行,支持TensorRT和OnnxruntimeGPU。感谢@zhanghongyong123456在[issue](https://github.com/warmshao/FasterLivePortrait/issues/22)的贡献。
160
+ - 【可选】如果你的windows电脑已经装过cuda和cudnn,请忽略这一步。我只在cuda12.2上验证过,如果没安装cuda或报cuda相关的错,你需要按照以下步骤进行安装:
161
+ - 下载[cuda12.2](https://developer.nvidia.com/cuda-12-2-0-download-archive?target_os=Windows&target_arch=x86_64), 双击exe后按照默认设置一步步安装即可。
162
+ - 下载[cudnn](https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip) 压缩包,解压后将cudnn 文件夹下的lib、bin、include 文件夹复制到 CUDA12.2 文件夹下(默认为C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2)
163
+ - 从release页下载免安装[windows 整合包](https://github.com/warmshao/FasterLivePortrait/releases)并解压。
164
+ - 进入`FasterLivePortrait-windows`后双击`scripts/all_onnx2trt.bat`对onnx文件进行转换,这会等上一段时间。
165
+ - 网页端demo:双击`webui.bat`, 打开网页:`http://localhost:9870/`
166
+ - 摄像头实时运行,双击`camera.bat`,按`q`停止。如果你想更换目标图像,命令行运行:`camera.bat assets/examples/source/s9.jpg`。
167
+ - [x] **2024/07/18:** MacOS支持(不需要Docker,python就可以了),M1/M2的速度比较快,但还是很慢😟
168
+ - 安装ffmpeg: `brew install ffmpeg`
169
+ - 安装python=3.10的虚拟环境,推荐可以用[miniforge](https://github.com/conda-forge/miniforge).`conda create -n flip python=3.10 && conda activate flip`
170
+ - `pip install -r requirements_macos.txt`
171
+ - 下载onnx文件: `huggingface-cli download warmshao/FasterLivePortrait --local-dir ./checkpoints`
172
+ - 测试: `python webui.py --mode onnx`
173
+ - [x] **2024/07/17:** 增加docker环境的支持,提供可运行的镜像。
api.py ADDED
@@ -0,0 +1,479 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/9/13 0:23
3
+ # @Project : FasterLivePortrait
4
+ # @FileName: api.py
5
+ import pdb
6
+ import shutil
7
+ from typing import Optional, Dict, Any
8
+ import io
9
+ import os
10
+ import subprocess
11
+ import uvicorn
12
+ import cv2
13
+ import time
14
+ import numpy as np
15
+ import os
16
+ import datetime
17
+ import platform
18
+ import pickle
19
+ from tqdm import tqdm
20
+ from pydantic import BaseModel
21
+ from fastapi import APIRouter, Depends, FastAPI, Request, Response, UploadFile
22
+ from fastapi import File, Body, Form
23
+ from omegaconf import OmegaConf
24
+ from fastapi.responses import StreamingResponse
25
+ from zipfile import ZipFile
26
+ from src.pipelines.faster_live_portrait_pipeline import FasterLivePortraitPipeline
27
+ from src.utils.utils import video_has_audio
28
+ from src.utils import logger
29
+
30
+ # model dir
31
+ project_dir = os.path.dirname(__file__)
32
+ checkpoints_dir = os.environ.get("FLIP_CHECKPOINT_DIR", os.path.join(project_dir, "checkpoints"))
33
+ log_dir = os.path.join(project_dir, "logs")
34
+ os.makedirs(log_dir, exist_ok=True)
35
+ result_dir = os.path.join(project_dir, "results")
36
+ os.makedirs(result_dir, exist_ok=True)
37
+
38
+ logger_f = logger.get_logger("faster_liveportrait_api", log_file=os.path.join(log_dir, "log_run.log"))
39
+
40
+ app = FastAPI()
41
+
42
+ global pipe
43
+
44
+ if platform.system().lower() == 'windows':
45
+ FFMPEG = "third_party/ffmpeg-7.0.1-full_build/bin/ffmpeg.exe"
46
+ else:
47
+ FFMPEG = "ffmpeg"
48
+
49
+
50
+ def check_all_checkpoints_exist(infer_cfg):
51
+ """
52
+ check whether all checkpoints exist
53
+ :return:
54
+ """
55
+ ret = True
56
+ for name in infer_cfg.models:
57
+ if not isinstance(infer_cfg.models[name].model_path, str):
58
+ for i in range(len(infer_cfg.models[name].model_path)):
59
+ infer_cfg.models[name].model_path[i] = infer_cfg.models[name].model_path[i].replace("./checkpoints",
60
+ checkpoints_dir)
61
+ if not os.path.exists(infer_cfg.models[name].model_path[i]) and not os.path.exists(
62
+ infer_cfg.models[name].model_path[i][:-4] + ".onnx"):
63
+ return False
64
+ else:
65
+ infer_cfg.models[name].model_path = infer_cfg.models[name].model_path.replace("./checkpoints",
66
+ checkpoints_dir)
67
+ if not os.path.exists(infer_cfg.models[name].model_path) and not os.path.exists(
68
+ infer_cfg.models[name].model_path[:-4] + ".onnx"):
69
+ return False
70
+ for name in infer_cfg.animal_models:
71
+ if not isinstance(infer_cfg.animal_models[name].model_path, str):
72
+ for i in range(len(infer_cfg.animal_models[name].model_path)):
73
+ infer_cfg.animal_models[name].model_path[i] = infer_cfg.animal_models[name].model_path[i].replace(
74
+ "./checkpoints",
75
+ checkpoints_dir)
76
+ if not os.path.exists(infer_cfg.animal_models[name].model_path[i]) and not os.path.exists(
77
+ infer_cfg.animal_models[name].model_path[i][:-4] + ".onnx"):
78
+ return False
79
+ else:
80
+ infer_cfg.animal_models[name].model_path = infer_cfg.animal_models[name].model_path.replace("./checkpoints",
81
+ checkpoints_dir)
82
+ if not os.path.exists(infer_cfg.animal_models[name].model_path) and not os.path.exists(
83
+ infer_cfg.animal_models[name].model_path[:-4] + ".onnx"):
84
+ return False
85
+
86
+ # XPOSE
87
+ xpose_model_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/xpose.pth")
88
+ if not os.path.exists(xpose_model_path):
89
+ return False
90
+ embeddings_cache_9_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/clip_embedding_9.pkl")
91
+ if not os.path.exists(embeddings_cache_9_path):
92
+ return False
93
+ embeddings_cache_68_path = os.path.join(checkpoints_dir, "liveportrait_animal_onnx/clip_embedding_68.pkl")
94
+ if not os.path.exists(embeddings_cache_68_path):
95
+ return False
96
+ return ret
97
+
98
+
99
+ def convert_onnx_to_trt_models(infer_cfg):
100
+ ret = True
101
+ for name in infer_cfg.models:
102
+ if not isinstance(infer_cfg.models[name].model_path, str):
103
+ for i in range(len(infer_cfg.models[name].model_path)):
104
+ trt_path = infer_cfg.models[name].model_path[i]
105
+ onnx_path = trt_path[:-4] + ".onnx"
106
+ if not os.path.exists(trt_path):
107
+ convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
108
+ logger_f.info(f"convert onnx model: {onnx_path}")
109
+ result = subprocess.run(convert_cmd, shell=True, check=True)
110
+ # check the conversion result
111
+ if result.returncode == 0:
112
+ logger_f.info(f"convert onnx model: {onnx_path} successful")
113
+ else:
114
+ logger_f.error(f"convert onnx model: {onnx_path} failed")
115
+ return False
116
+ else:
117
+ trt_path = infer_cfg.models[name].model_path
118
+ onnx_path = trt_path[:-4] + ".onnx"
119
+ if not os.path.exists(trt_path):
120
+ convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
121
+ logger_f.info(f"convert onnx model: {onnx_path}")
122
+ result = subprocess.run(convert_cmd, shell=True, check=True)
123
+ # check the conversion result
124
+ if result.returncode == 0:
125
+ logger_f.info(f"convert onnx model: {onnx_path} successful")
126
+ else:
127
+ logger_f.error(f"convert onnx model: {onnx_path} failed")
128
+ return False
129
+
130
+ for name in infer_cfg.animal_models:
131
+ if not isinstance(infer_cfg.animal_models[name].model_path, str):
132
+ for i in range(len(infer_cfg.animal_models[name].model_path)):
133
+ trt_path = infer_cfg.animal_models[name].model_path[i]
134
+ onnx_path = trt_path[:-4] + ".onnx"
135
+ if not os.path.exists(trt_path):
136
+ convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
137
+ logger_f.info(f"convert onnx model: {onnx_path}")
138
+ result = subprocess.run(convert_cmd, shell=True, check=True)
139
+ # check the conversion result
140
+ if result.returncode == 0:
141
+ logger_f.info(f"convert onnx model: {onnx_path} successful")
142
+ else:
143
+ logger_f.error(f"convert onnx model: {onnx_path} failed")
144
+ return False
145
+ else:
146
+ trt_path = infer_cfg.animal_models[name].model_path
147
+ onnx_path = trt_path[:-4] + ".onnx"
148
+ if not os.path.exists(trt_path):
149
+ convert_cmd = f"python scripts/onnx2trt.py -o {onnx_path}"
150
+ logger_f.info(f"convert onnx model: {onnx_path}")
151
+ result = subprocess.run(convert_cmd, shell=True, check=True)
152
+ # check the conversion result
153
+ if result.returncode == 0:
154
+ logger_f.info(f"convert onnx model: {onnx_path} successful")
155
+ else:
156
+ logger_f.error(f"convert onnx model: {onnx_path} failed")
157
+ return False
158
+ return ret
159
+
160
+
161
+ @app.on_event("startup")
162
+ async def startup_event():
163
+ global pipe
164
+ # default use trt model
165
+ cfg_file = os.path.join(project_dir, "configs/trt_infer.yaml")
166
+ infer_cfg = OmegaConf.load(cfg_file)
167
+ checkpoints_exist = check_all_checkpoints_exist(infer_cfg)
168
+
169
+ # first: download model if not exist
170
+ if not checkpoints_exist:
171
+ download_cmd = f"huggingface-cli download warmshao/FasterLivePortrait --local-dir {checkpoints_dir}"
172
+ logger_f.info(f"download model: {download_cmd}")
173
+ result = subprocess.run(download_cmd, shell=True, check=True)
174
+ # check the download result
175
+ if result.returncode == 0:
176
+ logger_f.info(f"Download checkpoints to {checkpoints_dir} successful")
177
+ else:
178
+ logger_f.error(f"Download checkpoints to {checkpoints_dir} failed")
179
+ exit(1)
180
+ # second: convert onnx model to trt
181
+ convert_ret = convert_onnx_to_trt_models(infer_cfg)
182
+ if not convert_ret:
183
+ logger_f.error(f"convert onnx model to trt failed")
184
+ exit(1)
185
+
186
+ infer_cfg.infer_params.flag_pasteback = True
187
+ pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=True)
188
+
189
+
190
+ def run_with_video(source_image_path, driving_video_path, save_dir):
191
+ global pipe
192
+ ret = pipe.prepare_source(source_image_path, realtime=False)
193
+ if not ret:
194
+ logger_f.warning(f"no face in {source_image_path}! exit!")
195
+ return
196
+ vcap = cv2.VideoCapture(driving_video_path)
197
+ fps = int(vcap.get(cv2.CAP_PROP_FPS))
198
+ h, w = pipe.src_imgs[0].shape[:2]
199
+
200
+ # render output video
201
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
202
+ vsave_crop_path = os.path.join(save_dir,
203
+ f"{os.path.basename(source_image_path)}-{os.path.basename(driving_video_path)}-crop.mp4")
204
+ vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512 * 2, 512))
205
+ vsave_org_path = os.path.join(save_dir,
206
+ f"{os.path.basename(source_image_path)}-{os.path.basename(driving_video_path)}-org.mp4")
207
+ vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
208
+
209
+ infer_times = []
210
+ motion_lst = []
211
+ c_eyes_lst = []
212
+ c_lip_lst = []
213
+
214
+ frame_ind = 0
215
+ while vcap.isOpened():
216
+ ret, frame = vcap.read()
217
+ if not ret:
218
+ break
219
+ t0 = time.time()
220
+ first_frame = frame_ind == 0
221
+ dri_crop, out_crop, out_org, dri_motion_info = pipe.run(frame, pipe.src_imgs[0], pipe.src_infos[0],
222
+ first_frame=first_frame)
223
+ frame_ind += 1
224
+ if out_crop is None:
225
+ logger_f.warning(f"no face in driving frame:{frame_ind}")
226
+ continue
227
+
228
+ motion_lst.append(dri_motion_info[0])
229
+ c_eyes_lst.append(dri_motion_info[1])
230
+ c_lip_lst.append(dri_motion_info[2])
231
+
232
+ infer_times.append(time.time() - t0)
233
+ # print(time.time() - t0)
234
+ dri_crop = cv2.resize(dri_crop, (512, 512))
235
+ out_crop = np.concatenate([dri_crop, out_crop], axis=1)
236
+ out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
237
+ vout_crop.write(out_crop)
238
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
239
+ vout_org.write(out_org)
240
+ vcap.release()
241
+ vout_crop.release()
242
+ vout_org.release()
243
+ if video_has_audio(driving_video_path):
244
+ vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
245
+ subprocess.call(
246
+ [FFMPEG, "-i", vsave_crop_path, "-i", driving_video_path,
247
+ "-b:v", "10M", "-c:v",
248
+ "libx264", "-map", "0:v", "-map", "1:a",
249
+ "-c:a", "aac",
250
+ "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
251
+ vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
252
+ subprocess.call(
253
+ [FFMPEG, "-i", vsave_org_path, "-i", driving_video_path,
254
+ "-b:v", "10M", "-c:v",
255
+ "libx264", "-map", "0:v", "-map", "1:a",
256
+ "-c:a", "aac",
257
+ "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
258
+
259
+ logger_f.info(vsave_crop_path_new)
260
+ logger_f.info(vsave_org_path_new)
261
+ else:
262
+ logger_f.info(vsave_crop_path)
263
+ logger_f.info(vsave_org_path)
264
+
265
+ logger_f.info(
266
+ "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
267
+ np.mean(infer_times) * 1000))
268
+ # save driving motion to pkl
269
+ template_dct = {
270
+ 'n_frames': len(motion_lst),
271
+ 'output_fps': fps,
272
+ 'motion': motion_lst,
273
+ 'c_eyes_lst': c_eyes_lst,
274
+ 'c_lip_lst': c_lip_lst,
275
+ }
276
+ template_pkl_path = os.path.join(save_dir,
277
+ f"{os.path.basename(driving_video_path)}.pkl")
278
+ with open(template_pkl_path, "wb") as fw:
279
+ pickle.dump(template_dct, fw)
280
+ logger_f.info(f"save driving motion pkl file at : {template_pkl_path}")
281
+
282
+
283
+ def run_with_pkl(source_image_path, driving_pickle_path, save_dir):
284
+ global pipe
285
+ ret = pipe.prepare_source(source_image_path, realtime=False)
286
+ if not ret:
287
+ logger_f.warning(f"no face in {source_image_path}! exit!")
288
+ return
289
+
290
+ with open(driving_pickle_path, "rb") as fin:
291
+ dri_motion_infos = pickle.load(fin)
292
+
293
+ fps = int(dri_motion_infos["output_fps"])
294
+ h, w = pipe.src_imgs[0].shape[:2]
295
+
296
+ # render output video
297
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
298
+ vsave_crop_path = os.path.join(save_dir,
299
+ f"{os.path.basename(source_image_path)}-{os.path.basename(driving_pickle_path)}-crop.mp4")
300
+ vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512, 512))
301
+ vsave_org_path = os.path.join(save_dir,
302
+ f"{os.path.basename(source_image_path)}-{os.path.basename(driving_pickle_path)}-org.mp4")
303
+ vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
304
+
305
+ infer_times = []
306
+ motion_lst = dri_motion_infos["motion"]
307
+ c_eyes_lst = dri_motion_infos["c_eyes_lst"] if "c_eyes_lst" in dri_motion_infos else dri_motion_infos[
308
+ "c_d_eyes_lst"]
309
+ c_lip_lst = dri_motion_infos["c_lip_lst"] if "c_lip_lst" in dri_motion_infos else dri_motion_infos["c_d_lip_lst"]
310
+
311
+ frame_num = len(motion_lst)
312
+ for frame_ind in tqdm(range(frame_num)):
313
+ t0 = time.time()
314
+ first_frame = frame_ind == 0
315
+ dri_motion_info_ = [motion_lst[frame_ind], c_eyes_lst[frame_ind], c_lip_lst[frame_ind]]
316
+ out_crop, out_org = pipe.run_with_pkl(dri_motion_info_, pipe.src_imgs[0], pipe.src_infos[0],
317
+ first_frame=first_frame)
318
+ if out_crop is None:
319
+ logger_f.warning(f"no face in driving frame:{frame_ind}")
320
+ continue
321
+
322
+ infer_times.append(time.time() - t0)
323
+ # print(time.time() - t0)
324
+ out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
325
+ vout_crop.write(out_crop)
326
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
327
+ vout_org.write(out_org)
328
+
329
+ vout_crop.release()
330
+ vout_org.release()
331
+ logger_f.info(vsave_crop_path)
332
+ logger_f.info(vsave_org_path)
333
+ logger_f.info(
334
+ "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
335
+ np.mean(infer_times) * 1000))
336
+
337
+
338
+ class LivePortraitParams(BaseModel):
339
+ flag_pickle: bool = False
340
+ flag_relative_input: bool = True
341
+ flag_do_crop_input: bool = True
342
+ flag_remap_input: bool = True
343
+ driving_multiplier: float = 1.0
344
+ flag_stitching: bool = True
345
+ flag_crop_driving_video_input: bool = True
346
+ flag_video_editing_head_rotation: bool = False
347
+ flag_is_animal: bool = True
348
+ scale: float = 2.3
349
+ vx_ratio: float = 0.0
350
+ vy_ratio: float = -0.125
351
+ scale_crop_driving_video: float = 2.2
352
+ vx_ratio_crop_driving_video: float = 0.0
353
+ vy_ratio_crop_driving_video: float = -0.1
354
+ driving_smooth_observation_variance: float = 1e-7
355
+
356
+
357
+ @app.post("/predict/")
358
+ async def upload_files(
359
+ source_image: Optional[UploadFile] = File(None),
360
+ driving_video: Optional[UploadFile] = File(None),
361
+ driving_pickle: Optional[UploadFile] = File(None),
362
+ flag_is_animal: bool = Form(...),
363
+ flag_pickle: bool = Form(...),
364
+ flag_relative_input: bool = Form(...),
365
+ flag_do_crop_input: bool = Form(...),
366
+ flag_remap_input: bool = Form(...),
367
+ driving_multiplier: float = Form(...),
368
+ flag_stitching: bool = Form(...),
369
+ flag_crop_driving_video_input: bool = Form(...),
370
+ flag_video_editing_head_rotation: bool = Form(...),
371
+ scale: float = Form(...),
372
+ vx_ratio: float = Form(...),
373
+ vy_ratio: float = Form(...),
374
+ scale_crop_driving_video: float = Form(...),
375
+ vx_ratio_crop_driving_video: float = Form(...),
376
+ vy_ratio_crop_driving_video: float = Form(...),
377
+ driving_smooth_observation_variance: float = Form(...)
378
+ ):
379
+ # build infer_params from the submitted form fields
380
+ infer_params = LivePortraitParams(
381
+ flag_is_animal=flag_is_animal,
382
+ flag_pickle=flag_pickle,
383
+ flag_relative_input=flag_relative_input,
384
+ flag_do_crop_input=flag_do_crop_input,
385
+ flag_remap_input=flag_remap_input,
386
+ driving_multiplier=driving_multiplier,
387
+ flag_stitching=flag_stitching,
388
+ flag_crop_driving_video_input=flag_crop_driving_video_input,
389
+ flag_video_editing_head_rotation=flag_video_editing_head_rotation,
390
+ scale=scale,
391
+ vx_ratio=vx_ratio,
392
+ vy_ratio=vy_ratio,
393
+ scale_crop_driving_video=scale_crop_driving_video,
394
+ vx_ratio_crop_driving_video=vx_ratio_crop_driving_video,
395
+ vy_ratio_crop_driving_video=vy_ratio_crop_driving_video,
396
+ driving_smooth_observation_variance=driving_smooth_observation_variance
397
+ )
398
+
399
+ global pipe
400
+ pipe.init_vars()
401
+ if infer_params.flag_is_animal != pipe.is_animal:
402
+ pipe.init_models(is_animal=infer_params.flag_is_animal)
403
+
404
+ args_user = {
405
+ 'flag_relative_motion': infer_params.flag_relative_input,
406
+ 'flag_do_crop': infer_params.flag_do_crop_input,
407
+ 'flag_pasteback': infer_params.flag_remap_input,
408
+ 'driving_multiplier': infer_params.driving_multiplier,
409
+ 'flag_stitching': infer_params.flag_stitching,
410
+ 'flag_crop_driving_video': infer_params.flag_crop_driving_video_input,
411
+ 'flag_video_editing_head_rotation': infer_params.flag_video_editing_head_rotation,
412
+ 'src_scale': infer_params.scale,
413
+ 'src_vx_ratio': infer_params.vx_ratio,
414
+ 'src_vy_ratio': infer_params.vy_ratio,
415
+ 'dri_scale': infer_params.scale_crop_driving_video,
416
+ 'dri_vx_ratio': infer_params.vx_ratio_crop_driving_video,
417
+ 'dri_vy_ratio': infer_params.vy_ratio_crop_driving_video,
418
+ }
419
+ # update config from user input
420
+ update_ret = pipe.update_cfg(args_user)
421
+
422
+ # save source_image to a temporary directory
423
+ temp_dir = os.path.join(result_dir, f"temp-{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
424
+ os.makedirs(temp_dir, exist_ok=True)
425
+ if source_image and source_image.filename:
426
+ source_image_path = os.path.join(temp_dir, source_image.filename)
427
+ with open(source_image_path, "wb") as buffer:
428
+ buffer.write(await source_image.read())  # write the uploaded content to disk
429
+ else:
430
+ source_image_path = None
431
+
432
+ if driving_video and driving_video.filename:
433
+ driving_video_path = os.path.join(temp_dir, driving_video.filename)
434
+ with open(driving_video_path, "wb") as buffer:
435
+ buffer.write(await driving_video.read())  # write the uploaded content to disk
436
+ else:
437
+ driving_video_path = None
438
+
439
+ if driving_pickle and driving_pickle.filename:
440
+ driving_pickle_path = os.path.join(temp_dir, driving_pickle.filename)
441
+ with open(driving_pickle_path, "wb") as buffer:
442
+ buffer.write(await driving_pickle.read())  # write the uploaded content to disk
443
+ else:
444
+ driving_pickle_path = None
445
+
446
+ save_dir = os.path.join(result_dir, f"{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}")
447
+ os.makedirs(save_dir, exist_ok=True)
448
+
449
+ if infer_params.flag_pickle:
450
+ if source_image_path and driving_pickle_path:
451
+ run_with_pkl(source_image_path, driving_pickle_path, save_dir)
452
+ else:
453
+ if source_image_path and driving_video_path:
454
+ run_with_video(source_image_path, driving_video_path, save_dir)
455
+ # zip all files and return
456
+ # create an in-memory byte stream with BytesIO
457
+ zip_buffer = io.BytesIO()
458
+
459
+ # compress the contents of save_dir into zip_buffer with ZipFile
460
+ with ZipFile(zip_buffer, "w") as zip_file:
461
+ for root, dirs, files in os.walk(save_dir):
462
+ for file in files:
463
+ file_path = os.path.join(root, file)
464
+ # add the file to the ZIP archive
465
+ zip_file.write(file_path, arcname=os.path.relpath(file_path, save_dir))
466
+
467
+ # rewind the buffer so the whole content can be read
468
+ zip_buffer.seek(0)
469
+ shutil.rmtree(temp_dir)
470
+ shutil.rmtree(save_dir)
471
+ # return the zip file via StreamingResponse
472
+ return StreamingResponse(zip_buffer, media_type="application/zip",
473
+ headers={"Content-Disposition": "attachment; filename=output.zip"})
474
+
475
+
476
+ if __name__ == "__main__":
477
+ import uvicorn
478
+
479
+ uvicorn.run(app, host=os.environ.get("FLIP_IP", "127.0.0.1"), port=int(os.environ.get("FLIP_PORT", 9871)))
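
A note on the driving-motion template written by `run_with_video` above: it is a plain pickle whose keys match `template_dct`, and `run_with_pkl` consumes the three per-frame lists in the same order. A small sketch of reloading one (the path is illustrative):

```python
# Sketch: inspect a driving-motion pickle produced by run_with_video()
import pickle

with open("results/d14.mp4.pkl", "rb") as fin:  # illustrative path
    dri_motion_infos = pickle.load(fin)

print(dri_motion_infos["n_frames"], "frames at", dri_motion_infos["output_fps"], "fps")
# per-frame lists consumed by run_with_pkl(), one entry per driving frame
motion_lst = dri_motion_infos["motion"]
c_eyes_lst = dri_motion_infos["c_eyes_lst"]
c_lip_lst = dri_motion_infos["c_lip_lst"]
first_frame_info = [motion_lst[0], c_eyes_lst[0], c_lip_lst[0]]
```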
assets/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ examples/driving/*.pkl
2
+ examples/driving/*_crop.mp4
assets/docs/API.md ADDED
@@ -0,0 +1,41 @@
1
+ ## FasterLivePortrait API Usage Guide
2
+
3
+ ### Building the Image
4
+ * Decide on an image name, for example `shaoguo/faster_liveportrait_api:v1.0`. Replace the `-t` parameter in the following command with your chosen name.
5
+ * Run `docker build -t shaoguo/faster_liveportrait_api:v1.0 -f DockerfileAPI .`
6
+
7
+ ### Running the Image
8
+ Ensure that your machine has Nvidia GPU drivers installed. CUDA version should be 12.0 or higher. Two scenarios are described below.
9
+
10
+ * Running on a Local Machine (typically for self-testing)
11
+ * Modify the image name according to what you defined above.
12
+ * Confirm the service port number, default is `9871`. You can define your own by changing the `SERVER_PORT` environment variable in the command below. Remember to also change `-p 9871:9871` to map the port.
13
+ * Set the model path environment variable `CHECKPOINT_DIR`. If you have previously downloaded FasterLivePortrait's ONNX models and converted them to TRT, I recommend mapping the model files into the container with `-v`, for example `-v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints`. This avoids re-downloading the ONNX models and redoing the TRT conversion. Otherwise, the service checks whether `CHECKPOINT_DIR` contains models; if not, it automatically downloads them (ensure network connectivity) and runs the TRT conversion, which takes considerable time.
14
+ * Run command (note: modify the following command according to your settings):
15
+ ```shell
16
+ docker run -d --gpus=all \
17
+ --name faster_liveportrait_api \
18
+ -v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints \
19
+ -e CHECKPOINT_DIR=/root/FasterLivePortrait/checkpoints \
20
+ -e SERVER_PORT=9871 \
21
+ -p 9871:9871 \
22
+ --restart=always \
23
+ shaoguo/faster_liveportrait_api:v1.0
25
+ ```
26
+ * Normal operation should display the following information (check with `docker logs $container_id`). The running logs are saved in `/root/FasterLivePortrait/logs/log_run.log`:
27
+ ```shell
28
+ INFO: Application startup complete.
29
+ INFO: Uvicorn running on http://0.0.0.0:9871 (Press CTRL+C to quit)
30
+ ```
31
+
32
+ * Running on Cloud GPU Cluster (production environment)
33
+ * This needs to be configured according to different clusters, but the core is the configuration of docker image and environment variables.
34
+ * Load balancing may need to be set up.
35
+
36
+ ### API Call Testing
37
+ Refer to `tests/test_api.py`. The default is the Animal model, but now it also supports the Human model.
38
+ The response is a compressed package, which is unzipped to `./results/api_*` by default; confirm the exact path from the printed log.
39
+ * `test_with_video_animal()`, image and video driving. Set `flag_pickle=False`. It will additionally return the driving video's pkl file, which can be reused directly next time.
40
+ * `test_with_pkl_animal()`, image and pkl driving.
41
+ * `test_with_video_human()`, image and video driving under the Human model, set `flag_is_animal=False`
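
`tests/test_api.py` is not among the 50 files shown in this view, so the sketch below is a minimal stand-in client for the `/predict/` endpoint defined in `api.py`. The host, port, file paths, and output directory are illustrative; the form fields mirror the parameters of `upload_files` and the defaults of `LivePortraitParams`.

```python
# Sketch: call the FasterLivePortrait API and unpack the returned zip
import io
import zipfile
import requests

url = "http://127.0.0.1:9871/predict/"  # adjust host/port to your deployment
files = {
    "source_image": open("assets/examples/source/s10.jpg", "rb"),
    "driving_video": open("assets/examples/driving/d14.mp4", "rb"),
}
data = {  # mirrors LivePortraitParams defaults in api.py
    "flag_is_animal": True,
    "flag_pickle": False,
    "flag_relative_input": True,
    "flag_do_crop_input": True,
    "flag_remap_input": True,
    "driving_multiplier": 1.0,
    "flag_stitching": True,
    "flag_crop_driving_video_input": True,
    "flag_video_editing_head_rotation": False,
    "scale": 2.3,
    "vx_ratio": 0.0,
    "vy_ratio": -0.125,
    "scale_crop_driving_video": 2.2,
    "vx_ratio_crop_driving_video": 0.0,
    "vy_ratio_crop_driving_video": -0.1,
    "driving_smooth_observation_variance": 1e-7,
}
resp = requests.post(url, files=files, data=data, timeout=600)
resp.raise_for_status()
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    zf.extractall("results/api_output")  # illustrative output directory
for f in files.values():
    f.close()
```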
assets/docs/API_ZH.md ADDED
@@ -0,0 +1,47 @@
1
+ ## FasterLivePortrait API使用教程
2
+
3
+ ### 构建镜像
4
+
5
+ * 确定镜像的名字,比如 `shaoguo/faster_liveportrait_api:v1.0`。确认后替换为下面命令 `-t` 的参数。
6
+ * 运行 `docker build -t shaoguo/faster_liveportrait_api:v1.0 -f DockerfileAPI .`
7
+
8
+ ### 运行镜像
9
+
10
+ 请确保你的机器已经装了Nvidia显卡的驱动。CUDA的版本在cuda12.0及以上。以下分两种情况介绍。
11
+
12
+ * 本地机器运行(一般自己测试使用)
13
+ * 镜像名称根据上面你自己定义的更改。
14
+ * 确认服务的端口号,默认为`9871`,你可以自己定义,更改下面命令里环境变量`SERVER_PORT`。同时要记得更改`-p 9871:9871`,
15
+ 将端口映射出来。
16
+ * 设置模型路径环境变量 `CHECKPOINT_DIR`。如果你之前下载过FasterLivePortrait的onnx模型并做过trt的转换,我建议
17
+ 是可以通过 `-v`把
18
+ 模型文件映射进入容器,比如 `-v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints`,
19
+ 这样就避免重新下载onnx模型和做trt的转换。否则我将会检测`CHECKPOINT_DIR`是否有模型,没有的话,我将自动下载(确保有网络)和做trt的转换,这将耗时比较久的时间。
20
+ * 运行命令(注意你要根据自己的设置更改以下命令的信息):
21
+ ```shell
22
+ docker run -d --gpus=all \
23
+ --name faster_liveportrait_api \
24
+ -v E:\my_projects\FasterLivePortrait\checkpoints:/root/FasterLivePortrait/checkpoints \
25
+ -e CHECKPOINT_DIR=/root/FasterLivePortrait/checkpoints \
26
+ -e SERVER_PORT=9871 \
27
+ -p 9871:9871 \
28
+ --restart=always \
29
+ shaoguo/faster_liveportrait_api:v1.0
30
+ ```
31
+ * 正常运行应该会显示以下信息(docker logs container_id), 运行的日志保存在`/root/FasterLivePortrait/logs/log_run.log`:
32
+ ```shell
33
+ INFO: Application startup complete.
34
+ INFO: Uvicorn running on http://0.0.0.0:9871 (Press CTRL+C to quit)
35
+ ```
36
+ * 云端GPU集群运行(生产环境)
37
+ * 这需要根据不同的集群做配置,但核心就是镜像和环境变量的配置。
38
+ * 可能要设置负载均衡。
39
+
40
+ ### API调用测试
41
+
42
+ 可以参考`tests/test_api.py`, 默认是Animal的模型,但现在同时也支持Human的模型了。
43
+ 返回的是压缩包,默认解压在`./results/api_*`, 根据实际打印出来的日志确认。
44
+
45
+ * `test_with_video_animal()`, 图像和视频的驱动。设置`flag_pickle=False`。会额外返回driving video的pkl文件,下次可以直接调用。
46
+ * `test_with_pkl_animal()`, 图像和pkl的驱动。
47
+ * `test_with_video_human()`, Human模型下图像和视频的驱动,设置`flag_is_animal=False`
assets/gradio/gradio_description_animate_clear.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <div style="font-size: 1.2em; text-align: center;">
2
+ Step 3: Click the <strong>🚀 Animate</strong> button below to generate, or click <strong>🧹 Clear</strong> to erase the results
3
+ </div>
4
+ <!-- <div style="font-size: 1.1em; text-align: center;">
5
+ <strong style="color: red;">Note:</strong> If both <strong>Source Image</strong> and <strong>Video</strong> are uploaded, the <strong>Source Image</strong> will be used. Please click the <strong>🧹 Clear</strong> button, then re-upload the <strong>Source Image</strong> or <strong>Video</strong>.
6
+ </div> -->
assets/gradio/gradio_description_animation.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <span style="font-size: 1.2em;">🔥 To animate the source image or video with the driving video, please follow these steps:</span>
2
+ <div style="font-size: 1.2em; margin-left: 20px;">
3
+ 1. In the <strong>Animation Options for Source Image or Video</strong> section, we recommend enabling the <code>do crop (source)</code> option if faces occupy a small portion of your source image or video.
4
+ </div>
5
+ <div style="font-size: 1.2em; margin-left: 20px;">
6
+ 2. In the <strong>Animation Options for Driving Video</strong> section, the <code>relative head rotation</code> and <code>smooth strength</code> options only take effect if the source input is a video.
7
+ </div>
8
+ <div style="font-size: 1.2em; margin-left: 20px;">
9
+ 3. Press the <strong>🚀 Animate</strong> button and wait a moment; your animated video will appear in the result block. This may take a few moments. If the input is a source video, the length of the animated video is the minimum of the lengths of the source video and the driving video.
10
+ </div>
11
+ <div style="font-size: 1.2em; margin-left: 20px;">
12
+ 4. If you want to upload your own driving video, <strong>the best practice</strong>:
13
+
14
+ - Crop it to a 1:1 aspect ratio (e.g., 512x512 or 256x256 pixels), or enable auto-cropping by checking `do crop (driving video)`.
15
+ - Focus on the head area, similar to the example videos.
16
+ - Minimize shoulder movement.
17
+ - Make sure the first frame of the driving video is a frontal face with a **neutral expression**.
18
+
19
+ </div>
assets/gradio/gradio_description_retargeting.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <br>
2
+
3
+ <!-- ## Retargeting -->
4
+ <!-- <span style="font-size: 1.2em;">🔥 To edit the eyes and lip open ratio of the source portrait, drag the sliders and click the <strong>🚗 Retargeting</strong> button. You can try running it multiple times. <strong>😊 Set both ratios to 0.8 to see what's going on!</strong> </span> -->
5
+
6
+
7
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; font-size: 1.2em;">
8
+ <div>
9
+ <h2>Retargeting</h2>
10
+ <p>Upload a Source Portrait as Retargeting Input, then drag the sliders and click the <strong>🚗 Retargeting</strong> button. You can try running it multiple times.
11
+ <br>
12
+ <strong>😊 Set both ratios to 0.8 to see what's going on!</strong></p>
13
+ </div>
14
+ </div>
assets/gradio/gradio_description_upload.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <br>
2
+ <div style="font-size: 1.2em; display: flex; justify-content: space-between;">
3
+ <div style="flex: 1; text-align: center; margin-right: 20px;">
4
+ <div style="display: inline-block;">
5
+ Step 1: Upload a <strong>Source Image</strong> or <strong>Video</strong> (any aspect ratio) ⬇️
6
+ </div>
7
+ </div>
8
+ <div style="flex: 1; text-align: center; margin-left: 20px;">
9
+ <div style="display: inline-block;">
10
+ Step 2: Upload a <strong>Driving Video</strong> (any aspect ratio) ⬇️
11
+ </div>
12
+ <div style="display: inline-block; font-size: 0.8em;">
13
+ <strong>Tips:</strong> Focus on the head, minimize shoulder movement, <strong>neutral expression</strong> in first frame.
14
+ </div>
15
+ </div>
16
+ </div>
assets/gradio/gradio_title.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
2
+ <div>
3
+ <h1>FasterLivePortrait: Bring Portraits to Life in Real Time</h1>
4
+ <span>Built on <a href="https://github.com/KwaiVGI/LivePortrait">LivePortrait</a></span>
5
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; margin-top: 10px;">
6
+ <a href="https://huggingface.co/warmshao/FasterLivePortrait">
7
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Hugging Face Spaces">
8
+ </a>
9
+ &nbsp;
10
+ <a href="https://github.com/warmshao/FasterLivePortrait">
11
+ <img src="https://img.shields.io/badge/Github-Code-blue" alt="Github Code">
12
+ </a>
13
+ &nbsp;
14
+ <a href="https://github.com/warmshao/FasterLivePortrait">
15
+ <img src="https://img.shields.io/github/stars/warmshao/FasterLivePortrait" alt="Github Stars">
16
+ </a>
17
+ </div>
18
+ </div>
19
+ </div>
assets/mask_template.png ADDED
camera.bat ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+ setlocal enabledelayedexpansion
3
+
4
+ REM Set the default source image path
5
+ set "default_src_image=assets\examples\source\s12.jpg"
6
+ set "src_image=%default_src_image%"
7
+ set "animal_param="
8
+ set "paste_back="
9
+
10
+ REM Parse named arguments
11
+ :parse_args
12
+ if "%~1"=="" goto end_parse_args
13
+ if /i "%~1"=="--src_image" (
14
+ set "src_image=%~2"
15
+ shift
16
+ ) else if /i "%~1"=="--animal" (
17
+ set "animal_param=--animal"
18
+ ) else if /i "%~1"=="--paste_back" (
19
+ set "paste_back=--paste_back"
20
+ )
21
+ shift
22
+ goto parse_args
23
+ :end_parse_args
24
+
25
+ echo source image: [!src_image!]
26
+ echo use animal: [!animal_param!]
27
+ echo paste_back: [!paste_back!]
28
+
29
+ REM Run the Python command
30
+ .\venv\python.exe .\run.py --cfg configs/trt_infer.yaml --realtime --dri_video 0 --src_image !src_image! !animal_param! !paste_back!
31
+
32
+ endlocal
configs/onnx_infer.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models:
2
+ warping_spade:
3
+ name: "WarpingSpadeModel"
4
+ predict_type: "ort"
5
+ model_path: "./checkpoints/liveportrait_onnx/warping_spade.onnx"
6
+ motion_extractor:
7
+ name: "MotionExtractorModel"
8
+ predict_type: "ort"
9
+ model_path: "./checkpoints/liveportrait_onnx/motion_extractor.onnx"
10
+ landmark:
11
+ name: "LandmarkModel"
12
+ predict_type: "ort"
13
+ model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
14
+ face_analysis:
15
+ name: "FaceAnalysisModel"
16
+ predict_type: "ort"
17
+ model_path:
18
+ - "./checkpoints/liveportrait_onnx/retinaface_det_static.onnx"
19
+ - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx"
20
+ app_feat_extractor:
21
+ name: "AppearanceFeatureExtractorModel"
22
+ predict_type: "ort"
23
+ model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx"
24
+ stitching:
25
+ name: "StitchingModel"
26
+ predict_type: "ort"
27
+ model_path: "./checkpoints/liveportrait_onnx/stitching.onnx"
28
+ stitching_eye_retarget:
29
+ name: "StitchingModel"
30
+ predict_type: "ort"
31
+ model_path: "./checkpoints/liveportrait_onnx/stitching_eye.onnx"
32
+ stitching_lip_retarget:
33
+ name: "StitchingModel"
34
+ predict_type: "ort"
35
+ model_path: "./checkpoints/liveportrait_onnx/stitching_lip.onnx"
36
+
37
+ animal_models:
38
+ warping_spade:
39
+ name: "WarpingSpadeModel"
40
+ predict_type: "ort"
41
+ model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade.onnx"
42
+ motion_extractor:
43
+ name: "MotionExtractorModel"
44
+ predict_type: "ort"
45
+ model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor.onnx"
46
+ app_feat_extractor:
47
+ name: "AppearanceFeatureExtractorModel"
48
+ predict_type: "ort"
49
+ model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor.onnx"
50
+ stitching:
51
+ name: "StitchingModel"
52
+ predict_type: "ort"
53
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching.onnx"
54
+ stitching_eye_retarget:
55
+ name: "StitchingModel"
56
+ predict_type: "ort"
57
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye.onnx"
58
+ stitching_lip_retarget:
59
+ name: "StitchingModel"
60
+ predict_type: "ort"
61
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip.onnx"
62
+ landmark:
63
+ name: "LandmarkModel"
64
+ predict_type: "ort"
65
+ model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
66
+ face_analysis:
67
+ name: "FaceAnalysisModel"
68
+ predict_type: "ort"
69
+ model_path:
70
+ - "./checkpoints/liveportrait_onnx/retinaface_det_static.onnx"
71
+ - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx"
72
+
73
+ joyvasa_models:
74
+ motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
75
+ audio_model_path: "checkpoints/chinese-hubert-base"
76
+ motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
77
+
78
+ crop_params:
79
+ src_dsize: 512
80
+ src_scale: 2.3
81
+ src_vx_ratio: 0.0
82
+ src_vy_ratio: -0.125
83
+ dri_scale: 2.2
84
+ dri_vx_ratio: 0.0
85
+ dri_vy_ratio: -0.1
86
+
87
+
88
+ infer_params:
89
+ flag_crop_driving_video: False
90
+ flag_normalize_lip: True
91
+ flag_source_video_eye_retargeting: False
92
+ flag_video_editing_head_rotation: False
93
+ flag_eye_retargeting: False
94
+ flag_lip_retargeting: False
95
+ flag_stitching: True
96
+ flag_relative_motion: True
97
+ flag_pasteback: True
98
+ flag_do_crop: True
99
+ flag_do_rot: True
100
+
101
+ # NOT EXPORTED PARAMS
102
+ lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
103
+ source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
104
+ driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
105
+ anchor_frame: 0 # TO IMPLEMENT
106
+ mask_crop_path: "./assets/mask_template.png"
107
+ driving_multiplier: 1.0
108
+ animation_region: "all"
109
+
110
+ cfg_mode: "incremental"
111
+ cfg_scale: 1.2
112
+
113
+ source_max_dim: 1280 # the max dim of height and width of source image
114
+ source_division: 2 # make sure the height and width of source image can be divided by this number
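
These YAML files are consumed with OmegaConf attribute access, as `run.py` below does (`OmegaConf.load(args.cfg)`); a short sketch of inspecting and overriding the values above, assuming it is run from the repository root:

```python
# Sketch: reading this config the way run.py does (OmegaConf attribute access).
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/onnx_infer.yaml")
print(cfg.models.warping_spade.model_path)   # ./checkpoints/liveportrait_onnx/warping_spade.onnx
print(cfg.crop_params.src_scale)             # 2.3
print(cfg.infer_params.flag_pasteback)       # True

# Flags can be overridden in code before the pipeline is built,
# e.g. run.py sets flag_pasteback from its --paste_back argument:
cfg.infer_params.flag_pasteback = False
```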
configs/onnx_mp_infer.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models:
2
+ warping_spade:
3
+ name: "WarpingSpadeModel"
4
+ predict_type: "ort"
5
+ model_path: "./checkpoints/liveportrait_onnx/warping_spade.onnx"
6
+ motion_extractor:
7
+ name: "MotionExtractorModel"
8
+ predict_type: "ort"
9
+ model_path: "./checkpoints/liveportrait_onnx/motion_extractor.onnx"
10
+ landmark:
11
+ name: "LandmarkModel"
12
+ predict_type: "ort"
13
+ model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
14
+ face_analysis:
15
+ name: "MediaPipeFaceModel"
16
+ predict_type: "mp"
17
+ app_feat_extractor:
18
+ name: "AppearanceFeatureExtractorModel"
19
+ predict_type: "ort"
20
+ model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx"
21
+ stitching:
22
+ name: "StitchingModel"
23
+ predict_type: "ort"
24
+ model_path: "./checkpoints/liveportrait_onnx/stitching.onnx"
25
+ stitching_eye_retarget:
26
+ name: "StitchingModel"
27
+ predict_type: "ort"
28
+ model_path: "./checkpoints/liveportrait_onnx/stitching_eye.onnx"
29
+ stitching_lip_retarget:
30
+ name: "StitchingModel"
31
+ predict_type: "ort"
32
+ model_path: "./checkpoints/liveportrait_onnx/stitching_lip.onnx"
33
+
34
+ animal_models:
35
+ warping_spade:
36
+ name: "WarpingSpadeModel"
37
+ predict_type: "ort"
38
+ model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade.onnx"
39
+ motion_extractor:
40
+ name: "MotionExtractorModel"
41
+ predict_type: "ort"
42
+ model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor.onnx"
43
+ app_feat_extractor:
44
+ name: "AppearanceFeatureExtractorModel"
45
+ predict_type: "ort"
46
+ model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor.onnx"
47
+ stitching:
48
+ name: "StitchingModel"
49
+ predict_type: "ort"
50
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching.onnx"
51
+ stitching_eye_retarget:
52
+ name: "StitchingModel"
53
+ predict_type: "ort"
54
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye.onnx"
55
+ stitching_lip_retarget:
56
+ name: "StitchingModel"
57
+ predict_type: "ort"
58
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip.onnx"
59
+ landmark:
60
+ name: "LandmarkModel"
61
+ predict_type: "ort"
62
+ model_path: "./checkpoints/liveportrait_onnx/landmark.onnx"
63
+ face_analysis:
64
+ name: "MediaPipeFaceModel"
65
+ predict_type: "mp"
66
+
67
+ joyvasa_models:
68
+ motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
69
+ audio_model_path: "checkpoints/chinese-hubert-base"
70
+ motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
71
+
72
+ crop_params:
73
+ src_dsize: 512
74
+ src_scale: 2.3
75
+ src_vx_ratio: 0.0
76
+ src_vy_ratio: -0.125
77
+ dri_scale: 2.2
78
+ dri_vx_ratio: 0.0
79
+ dri_vy_ratio: -0.1
80
+
81
+
82
+ infer_params:
83
+ flag_crop_driving_video: False
84
+ flag_normalize_lip: True
85
+ flag_source_video_eye_retargeting: False
86
+ flag_video_editing_head_rotation: False
87
+ flag_eye_retargeting: False
88
+ flag_lip_retargeting: False
89
+ flag_stitching: True
90
+ flag_relative_motion: True
91
+ flag_pasteback: True
92
+ flag_do_crop: True
93
+ flag_do_rot: True
94
+
95
+ # NOT EXPORTED PARAMS
96
+ lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
97
+ source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
98
+ driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
99
+ anchor_frame: 0 # TO IMPLEMENT
100
+ mask_crop_path: "./assets/mask_template.png"
101
+ driving_multiplier: 1.0
102
+ animation_region: "all"
103
+
104
+ cfg_mode: "incremental"
105
+ cfg_scale: 1.2
106
+
107
+ source_max_dim: 1280 # the max dim of height and width of source image
108
+ source_division: 2 # make sure the height and width of source image can be divided by this number
configs/trt_infer.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models:
2
+ warping_spade:
3
+ name: "WarpingSpadeModel"
4
+ predict_type: "trt"
5
+ model_path: "./checkpoints/liveportrait_onnx/warping_spade-fix.trt"
6
+ motion_extractor:
7
+ name: "MotionExtractorModel"
8
+ predict_type: "trt"
9
+ model_path: "./checkpoints/liveportrait_onnx/motion_extractor.trt"
10
+ landmark:
11
+ name: "LandmarkModel"
12
+ predict_type: "trt"
13
+ model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
14
+ face_analysis:
15
+ name: "FaceAnalysisModel"
16
+ predict_type: "trt"
17
+ model_path:
18
+ - "./checkpoints/liveportrait_onnx/retinaface_det_static.trt"
19
+ - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.trt"
20
+ app_feat_extractor:
21
+ name: "AppearanceFeatureExtractorModel"
22
+ predict_type: "trt"
23
+ model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.trt"
24
+ stitching:
25
+ name: "StitchingModel"
26
+ predict_type: "trt"
27
+ model_path: "./checkpoints/liveportrait_onnx/stitching.trt"
28
+ stitching_eye_retarget:
29
+ name: "StitchingModel"
30
+ predict_type: "trt"
31
+ model_path: "./checkpoints/liveportrait_onnx/stitching_eye.trt"
32
+ stitching_lip_retarget:
33
+ name: "StitchingModel"
34
+ predict_type: "trt"
35
+ model_path: "./checkpoints/liveportrait_onnx/stitching_lip.trt"
36
+
37
+ animal_models:
38
+ warping_spade:
39
+ name: "WarpingSpadeModel"
40
+ predict_type: "trt"
41
+ model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade-fix-v1.1.trt"
42
+ motion_extractor:
43
+ name: "MotionExtractorModel"
44
+ predict_type: "trt"
45
+ model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor-v1.1.trt"
46
+ app_feat_extractor:
47
+ name: "AppearanceFeatureExtractorModel"
48
+ predict_type: "trt"
49
+ model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor-v1.1.trt"
50
+ stitching:
51
+ name: "StitchingModel"
52
+ predict_type: "trt"
53
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching-v1.1.trt"
54
+ stitching_eye_retarget:
55
+ name: "StitchingModel"
56
+ predict_type: "trt"
57
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye-v1.1.trt"
58
+ stitching_lip_retarget:
59
+ name: "StitchingModel"
60
+ predict_type: "trt"
61
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip-v1.1.trt"
62
+ landmark:
63
+ name: "LandmarkModel"
64
+ predict_type: "trt"
65
+ model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
66
+ face_analysis:
67
+ name: "FaceAnalysisModel"
68
+ predict_type: "trt"
69
+ model_path:
70
+ - "./checkpoints/liveportrait_onnx/retinaface_det_static.trt"
71
+ - "./checkpoints/liveportrait_onnx/face_2dpose_106_static.trt"
72
+
73
+ joyvasa_models:
74
+ motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
75
+ audio_model_path: "checkpoints/chinese-hubert-base"
76
+ motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
77
+
78
+ crop_params:
79
+ src_dsize: 512
80
+ src_scale: 2.3
81
+ src_vx_ratio: 0.0
82
+ src_vy_ratio: -0.125
83
+ dri_scale: 2.2
84
+ dri_vx_ratio: 0.0
85
+ dri_vy_ratio: -0.1
86
+
87
+
88
+ infer_params:
89
+ flag_crop_driving_video: False
90
+ flag_normalize_lip: True
91
+ flag_source_video_eye_retargeting: False
92
+ flag_video_editing_head_rotation: False
93
+ flag_eye_retargeting: False
94
+ flag_lip_retargeting: False
95
+ flag_stitching: True
96
+ flag_relative_motion: True
97
+ flag_pasteback: True
98
+ flag_do_crop: True
99
+ flag_do_rot: True
100
+
101
+ # NOT EXPORTED PARAMS
102
+ lip_normalize_threshold: 0.1 # threshold for flag_normalize_lip
103
+ source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
104
+ driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
105
+ anchor_frame: 0 # TO IMPLEMENT
106
+ mask_crop_path: "./assets/mask_template.png"
107
+ driving_multiplier: 1.0
108
+ animation_region: "all"
109
+
110
+ cfg_mode: "incremental"
111
+ cfg_scale: 1.2
112
+
113
+ source_max_dim: 1280 # the max dim of height and width of source image
114
+ source_division: 2 # make sure the height and width of source image can be divided by this number
configs/trt_mp_infer.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models:
2
+ warping_spade:
3
+ name: "WarpingSpadeModel"
4
+ predict_type: "trt"
5
+ model_path: "./checkpoints/liveportrait_onnx/warping_spade-fix.trt"
6
+ motion_extractor:
7
+ name: "MotionExtractorModel"
8
+ predict_type: "trt"
9
+ model_path: "./checkpoints/liveportrait_onnx/motion_extractor.trt"
10
+ landmark:
11
+ name: "LandmarkModel"
12
+ predict_type: "trt"
13
+ model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
14
+ face_analysis:
15
+ name: "MediaPipeFaceModel"
16
+ predict_type: "mp"
17
+ app_feat_extractor:
18
+ name: "AppearanceFeatureExtractorModel"
19
+ predict_type: "trt"
20
+ model_path: "./checkpoints/liveportrait_onnx/appearance_feature_extractor.trt"
21
+ stitching:
22
+ name: "StitchingModel"
23
+ predict_type: "trt"
24
+ model_path: "./checkpoints/liveportrait_onnx/stitching.trt"
25
+ stitching_eye_retarget:
26
+ name: "StitchingModel"
27
+ predict_type: "trt"
28
+ model_path: "./checkpoints/liveportrait_onnx/stitching_eye.trt"
29
+ stitching_lip_retarget:
30
+ name: "StitchingModel"
31
+ predict_type: "trt"
32
+ model_path: "./checkpoints/liveportrait_onnx/stitching_lip.trt"
33
+
34
+ animal_models:
35
+ warping_spade:
36
+ name: "WarpingSpadeModel"
37
+ predict_type: "trt"
38
+ model_path: "./checkpoints/liveportrait_animal_onnx/warping_spade-fix-v1.1.trt"
39
+ motion_extractor:
40
+ name: "MotionExtractorModel"
41
+ predict_type: "trt"
42
+ model_path: "./checkpoints/liveportrait_animal_onnx/motion_extractor-v1.1.trt"
43
+ app_feat_extractor:
44
+ name: "AppearanceFeatureExtractorModel"
45
+ predict_type: "trt"
46
+ model_path: "./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor-v1.1.trt"
47
+ stitching:
48
+ name: "StitchingModel"
49
+ predict_type: "trt"
50
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching-v1.1.trt"
51
+ stitching_eye_retarget:
52
+ name: "StitchingModel"
53
+ predict_type: "trt"
54
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_eye-v1.1.trt"
55
+ stitching_lip_retarget:
56
+ name: "StitchingModel"
57
+ predict_type: "trt"
58
+ model_path: "./checkpoints/liveportrait_animal_onnx/stitching_lip-v1.1.trt"
59
+ landmark:
60
+ name: "LandmarkModel"
61
+ predict_type: "trt"
62
+ model_path: "./checkpoints/liveportrait_onnx/landmark.trt"
63
+ face_analysis:
64
+ name: "MediaPipeFaceModel"
65
+ predict_type: "mp"
66
+
67
+ joyvasa_models:
68
+ motion_model_path: "checkpoints/JoyVASA/motion_generator/motion_generator_hubert_chinese.pt"
69
+ audio_model_path: "checkpoints/chinese-hubert-base"
70
+ motion_template_path: "checkpoints/JoyVASA/motion_template/motion_template.pkl"
71
+
72
+ crop_params:
73
+ src_dsize: 512
74
+ src_scale: 2.3
75
+ src_vx_ratio: 0.0
76
+ src_vy_ratio: -0.125
77
+ dri_scale: 2.2
78
+ dri_vx_ratio: 0.0
79
+ dri_vy_ratio: -0.1
80
+
81
+
82
+ infer_params:
83
+ flag_crop_driving_video: False
84
+ flag_normalize_lip: True
85
+ flag_source_video_eye_retargeting: False
86
+ flag_video_editing_head_rotation: False
87
+ flag_eye_retargeting: False
88
+ flag_lip_retargeting: False
89
+ flag_stitching: True
90
+ flag_relative_motion: True
91
+ flag_pasteback: True
92
+ flag_do_crop: True
93
+ flag_do_rot: True
94
+ animation_region: "all"
95
+
96
+ # NOT EXPORTED PARAMS
97
+ lip_normalize_threshold: 0.03 # threshold for flag_normalize_lip
98
+ source_video_eye_retargeting_threshold: 0.18 # threshold for eyes retargeting if the input is a source video
99
+ driving_smooth_observation_variance: 1e-7 # smooth strength scalar for the animated video when the input is a source video, the larger the number, the smoother the animated video; too much smoothness would result in loss of motion accuracy
100
+ anchor_frame: 0 # TO IMPLEMENT
101
+ mask_crop_path: "./assets/mask_template.png"
102
+ driving_multiplier: 1.0
103
+
104
+ cfg_mode: "incremental"
105
+ cfg_scale: 1.2
106
+
107
+ source_max_dim: 1280 # the max dim of height and width of source image
108
+ source_division: 2 # make sure the height and width of source image can be divided by this number
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ffmpeg-python
2
+ omegaconf
3
+ onnx
4
+ pycuda
5
+ numpy
6
+ opencv-python
7
+ gradio
8
+ scikit-image
9
+ insightface
10
+ huggingface_hub[cli]
11
+ mediapipe
12
+ torchgeometry
13
+ soundfile
14
+ munch
15
+ phonemizer
16
+ kokoro>=0.3.4
17
+ misaki[ja]
18
+ misaki[zh]
requirements_macos.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ffmpeg-python
2
+ omegaconf
3
+ onnx
4
+ onnxruntime
5
+ numpy
6
+ opencv-python
7
+ gradio
8
+ scikit-image
9
+ insightface
10
+ huggingface_hub[cli]
11
+ mediapipe
12
+ torchgeometry
13
+ soundfile
14
+ munch
15
+ phonemizer
16
+ kokoro>=0.3.4
17
+ misaki[ja]
18
+ misaki[zh]
requirements_win.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ffmpeg-python
2
+ omegaconf
3
+ onnx
4
+ numpy
5
+ opencv-python
6
+ gradio
7
+ scikit-image
8
+ insightface
9
+ huggingface_hub[cli]
10
+ mediapipe
11
+ torchgeometry
12
+ soundfile
13
+ munch
14
+ phonemizer
15
+ kokoro>=0.3.4
16
+ misaki[ja]
17
+ misaki[zh]
run.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author : wenshao
3
+ # @Email : [email protected]
4
+ # @Project : FasterLivePortrait
5
+ # @FileName: run.py
6
+
7
+ """
8
+ # video
9
+ python run.py \
10
+ --src_image assets/examples/driving/d13.mp4 \
11
+ --dri_video assets/examples/driving/d11.mp4 \
12
+ --cfg configs/trt_infer.yaml \
13
+ --paste_back \
14
+ --animal
15
+ # pkl
16
+ python run.py \
17
+ --src_image assets/examples/source/s12.jpg \
18
+ --dri_video ./results/2024-09-13-081710/d0.mp4.pkl \
19
+ --cfg configs/trt_infer.yaml \
20
+ --paste_back \
21
+ --animal
22
+ """
23
+ import os
24
+ import argparse
25
+ import pdb
26
+ import subprocess
27
+ import ffmpeg
28
+ import cv2
29
+ import time
30
+ import numpy as np
31
+ import os
32
+ import datetime
33
+ import platform
34
+ import pickle
35
+ from omegaconf import OmegaConf
36
+ from tqdm import tqdm
37
+ from colorama import Fore, Back, Style
38
+ from src.pipelines.faster_live_portrait_pipeline import FasterLivePortraitPipeline
39
+ from src.utils.utils import video_has_audio
40
+
41
+ if platform.system().lower() == 'windows':
42
+ FFMPEG = "third_party/ffmpeg-7.0.1-full_build/bin/ffmpeg.exe"
43
+ else:
44
+ FFMPEG = "ffmpeg"
45
+
46
+
47
+ def run_with_video(args):
48
+ print(Fore.RED + 'Render, Q > exit, S > Stitching, Z > RelativeMotion, X > AnimationRegion, C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale, Space > Webcam as source, R > SwitchRealtimeWebcamUpdate' + Style.RESET_ALL)
49
+ infer_cfg = OmegaConf.load(args.cfg)
50
+ infer_cfg.infer_params.flag_pasteback = args.paste_back
51
+
52
+ pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=args.animal)
53
+ ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
54
+ if not ret:
55
+ print(f"no face in {args.src_image}! exit!")
56
+ exit(1)
57
+ if not args.dri_video or not os.path.exists(args.dri_video):
58
+ # read frame from camera if no driving video input
59
+ vcap = cv2.VideoCapture(0)
60
+ if not vcap.isOpened():
61
+ print("no camera found! exit!")
62
+ exit(1)
63
+ else:
64
+ vcap = cv2.VideoCapture(args.dri_video)
65
+ fps = int(vcap.get(cv2.CAP_PROP_FPS))
66
+ h, w = pipe.src_imgs[0].shape[:2]
67
+ save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
68
+ os.makedirs(save_dir, exist_ok=True)
69
+
70
+ # render output video
71
+ if not args.realtime:
72
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
73
+ vsave_crop_path = os.path.join(save_dir,
74
+ f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-crop.mp4")
75
+ vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512 * 2, 512))
76
+ vsave_org_path = os.path.join(save_dir,
77
+ f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-org.mp4")
78
+ vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
79
+
80
+ infer_times = []
81
+ motion_lst = []
82
+ c_eyes_lst = []
83
+ c_lip_lst = []
84
+
85
+ frame_ind = 0
86
+ while vcap.isOpened():
87
+ ret, frame = vcap.read()
88
+ if not ret:
89
+ break
90
+ t0 = time.time()
91
+ first_frame = frame_ind == 0
92
+ dri_crop, out_crop, out_org, dri_motion_info = pipe.run(frame, pipe.src_imgs[0], pipe.src_infos[0],
93
+ first_frame=first_frame)
94
+ frame_ind += 1
95
+ if out_crop is None:
96
+ print(f"no face in driving frame:{frame_ind}")
97
+ continue
98
+
99
+ motion_lst.append(dri_motion_info[0])
100
+ c_eyes_lst.append(dri_motion_info[1])
101
+ c_lip_lst.append(dri_motion_info[2])
102
+
103
+ infer_times.append(time.time() - t0)
104
+ # print(time.time() - t0)
105
+ dri_crop = cv2.resize(dri_crop, (512, 512))
106
+ out_crop = np.concatenate([dri_crop, out_crop], axis=1)
107
+ out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
108
+ if not args.realtime:
109
+ vout_crop.write(out_crop)
110
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
111
+ vout_org.write(out_org)
112
+ else:
113
+ if infer_cfg.infer_params.flag_pasteback:
114
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
115
+ cv2.imshow('Render', out_org)
116
+ else:
117
+ # image show in realtime mode
118
+ cv2.imshow('Render', out_crop)
119
+ # Press the 'q' key to exit the loop
120
+ if cv2.waitKey(1) & 0xFF == ord('q'):
121
+ break
122
+ vcap.release()
123
+ if not args.realtime:
124
+ vout_crop.release()
125
+ vout_org.release()
126
+ if video_has_audio(args.dri_video):
127
+ vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
128
+ subprocess.call(
129
+ [FFMPEG, "-i", vsave_crop_path, "-i", args.dri_video,
130
+ "-b:v", "10M", "-c:v",
131
+ "libx264", "-map", "0:v", "-map", "1:a",
132
+ "-c:a", "aac",
133
+ "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
134
+ vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
135
+ subprocess.call(
136
+ [FFMPEG, "-i", vsave_org_path, "-i", args.dri_video,
137
+ "-b:v", "10M", "-c:v",
138
+ "libx264", "-map", "0:v", "-map", "1:a",
139
+ "-c:a", "aac",
140
+ "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
141
+
142
+ print(vsave_crop_path_new)
143
+ print(vsave_org_path_new)
144
+ else:
145
+ print(vsave_crop_path)
146
+ print(vsave_org_path)
147
+ else:
148
+ cv2.destroyAllWindows()
149
+
150
+ print(
151
+ "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
152
+ np.mean(infer_times) * 1000))
153
+ # save driving motion to pkl
154
+ template_dct = {
155
+ 'n_frames': len(motion_lst),
156
+ 'output_fps': fps,
157
+ 'motion': motion_lst,
158
+ 'c_eyes_lst': c_eyes_lst,
159
+ 'c_lip_lst': c_lip_lst,
160
+ }
161
+ template_pkl_path = os.path.join(save_dir,
162
+ f"{os.path.basename(args.dri_video)}.pkl")
163
+ with open(template_pkl_path, "wb") as fw:
164
+ pickle.dump(template_dct, fw)
165
+ print(f"save driving motion pkl file at : {template_pkl_path}")
166
+
167
+
168
+ def run_with_pkl(args):
169
+ infer_cfg = OmegaConf.load(args.cfg)
170
+ infer_cfg.infer_params.flag_pasteback = args.paste_back
171
+
172
+ pipe = FasterLivePortraitPipeline(cfg=infer_cfg, is_animal=args.animal)
173
+ ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
174
+ if not ret:
175
+ print(f"no face in {args.src_image}! exit!")
176
+ return
177
+ with open(args.dri_video, "rb") as fin:
178
+ dri_motion_infos = pickle.load(fin)
179
+
180
+ fps = int(dri_motion_infos["output_fps"])
181
+ h, w = pipe.src_imgs[0].shape[:2]
182
+ save_dir = f"./results/{datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
183
+ os.makedirs(save_dir, exist_ok=True)
184
+
185
+ # render output video
186
+ if not args.realtime:
187
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
188
+ vsave_crop_path = os.path.join(save_dir,
189
+ f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-crop.mp4")
190
+ vout_crop = cv2.VideoWriter(vsave_crop_path, fourcc, fps, (512, 512))
191
+ vsave_org_path = os.path.join(save_dir,
192
+ f"{os.path.basename(args.src_image)}-{os.path.basename(args.dri_video)}-org.mp4")
193
+ vout_org = cv2.VideoWriter(vsave_org_path, fourcc, fps, (w, h))
194
+
195
+ infer_times = []
196
+ motion_lst = dri_motion_infos["motion"]
197
+ c_eyes_lst = dri_motion_infos["c_eyes_lst"] if "c_eyes_lst" in dri_motion_infos else dri_motion_infos[
198
+ "c_d_eyes_lst"]
199
+ c_lip_lst = dri_motion_infos["c_lip_lst"] if "c_lip_lst" in dri_motion_infos else dri_motion_infos["c_d_lip_lst"]
200
+
201
+ frame_num = len(motion_lst)
202
+ for frame_ind in tqdm(range(frame_num)):
203
+ t0 = time.time()
204
+ first_frame = frame_ind == 0
205
+ dri_motion_info_ = [motion_lst[frame_ind], c_eyes_lst[frame_ind], c_lip_lst[frame_ind]]
206
+ out_crop, out_org = pipe.run_with_pkl(dri_motion_info_, pipe.src_imgs[0], pipe.src_infos[0],
207
+ first_frame=first_frame)
208
+ if out_crop is None:
209
+ print(f"no face in driving frame:{frame_ind}")
210
+ continue
211
+
212
+ infer_times.append(time.time() - t0)
213
+ # print(time.time() - t0)
214
+ out_crop = cv2.cvtColor(out_crop, cv2.COLOR_RGB2BGR)
215
+ if not args.realtime:
216
+ vout_crop.write(out_crop)
217
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
218
+ vout_org.write(out_org)
219
+ else:
220
+ if infer_cfg.infer_params.flag_pasteback:
221
+ out_org = cv2.cvtColor(out_org, cv2.COLOR_RGB2BGR)
222
+ cv2.imshow('Render, Q > exit, S > Stitching, Z > RelativeMotion, X > AnimationRegion, C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale, Space > Webcam as source, R > SwitchRealtimeWebcamUpdate', out_org)
223
+ else:
224
+ # image show in realtime mode
225
+ cv2.imshow('Render, Q > exit, S > Stitching, Z > RelativeMotion, X > AnimationRegion, C > CropDrivingVideo, KL > AdjustSourceScale, NM > AdjustDriverScale, Space > Webcam as source, R > SwitchRealtimeWebcamUpdate', out_crop)
226
+ # Press 'q' to exit the loop, 'r' to toggle realtime webcam-source updates, and the spacebar to toggle using the webcam as the source
227
+ k = cv2.waitKey(1) & 0xFF
228
+ if k == ord('q'):
229
+ break
230
+ # Key for Interesting Params
231
+ if k == ord('s'):
232
+ infer_cfg.infer_params.flag_stitching = not infer_cfg.infer_params.flag_stitching
233
+ print('flag_stitching:'+str(infer_cfg.infer_params.flag_stitching))
234
+ if k == ord('z'):
235
+ infer_cfg.infer_params.flag_relative_motion = not infer_cfg.infer_params.flag_relative_motion
236
+ print('flag_relative_motion:'+str(infer_cfg.infer_params.flag_relative_motion))
237
+ if k == ord('x'):
238
+ if infer_cfg.infer_params.animation_region == "all": infer_cfg.infer_params.animation_region = "exp"; print('animation_region = "exp"')
239
+ else: infer_cfg.infer_params.animation_region = "all"; print('animation_region = "all"')
240
+ if k == ord('c'):
241
+ infer_cfg.infer_params.flag_crop_driving_video = not infer_cfg.infer_params.flag_crop_driving_video
242
+ print('flag_crop_driving_video:'+str(infer_cfg.infer_params.flag_crop_driving_video))
243
+ if k == ord('v'):
244
+ infer_cfg.infer_params.flag_pasteback = not infer_cfg.infer_params.flag_pasteback
245
+ print('flag_pasteback:'+str(infer_cfg.infer_params.flag_pasteback))
246
+
247
+ if k == ord('a'):
248
+ infer_cfg.infer_params.flag_normalize_lip = not infer_cfg.infer_params.flag_normalize_lip
249
+ print('flag_normalize_lip:'+str(infer_cfg.infer_params.flag_normalize_lip))
250
+ if k == ord('d'):
251
+ infer_cfg.infer_params.flag_source_video_eye_retargeting = not infer_cfg.infer_params.flag_source_video_eye_retargeting
252
+ print('flag_source_video_eye_retargeting:'+str(infer_cfg.infer_params.flag_source_video_eye_retargeting))
253
+ if k == ord('f'):
254
+ infer_cfg.infer_params.flag_video_editing_head_rotation = not infer_cfg.infer_params.flag_video_editing_head_rotation
255
+ print('flag_video_editing_head_rotation:'+str(infer_cfg.infer_params.flag_video_editing_head_rotation))
256
+ if k == ord('g'):
257
+ infer_cfg.infer_params.flag_eye_retargeting = not infer_cfg.infer_params.flag_eye_retargeting
258
+ print('flag_eye_retargeting:'+str(infer_cfg.infer_params.flag_eye_retargeting))
259
+
260
+ if k == ord('k'):
261
+ infer_cfg.crop_params.src_scale -= 0.1
262
+ ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
263
+ print('src_scale:'+str(infer_cfg.crop_params.src_scale))
264
+ if k == ord('l'):
265
+ infer_cfg.crop_params.src_scale += 0.1
266
+ ret = pipe.prepare_source(args.src_image, realtime=args.realtime)
267
+ print('src_scale:'+str(infer_cfg.crop_params.src_scale))
268
+ if k == ord('n'):
269
+ infer_cfg.crop_params.dri_scale -= 0.1
270
+ print('dri_scale:'+str(infer_cfg.crop_params.dri_scale))
271
+ if k == ord('m'):
272
+ infer_cfg.crop_params.dri_scale += 0.1
273
+ print('dri_scale:'+str(infer_cfg.crop_params.dri_scale))
274
+
275
+ if not args.realtime:
276
+ vout_crop.release()
277
+ vout_org.release()
278
+ if video_has_audio(args.dri_video):
279
+ vsave_crop_path_new = os.path.splitext(vsave_crop_path)[0] + "-audio.mp4"
280
+ subprocess.call(
281
+ [FFMPEG, "-i", vsave_crop_path, "-i", args.dri_video,
282
+ "-b:v", "10M", "-c:v",
283
+ "libx264", "-map", "0:v", "-map", "1:a",
284
+ "-c:a", "aac",
285
+ "-pix_fmt", "yuv420p", vsave_crop_path_new, "-y", "-shortest"])
286
+ vsave_org_path_new = os.path.splitext(vsave_org_path)[0] + "-audio.mp4"
287
+ subprocess.call(
288
+ [FFMPEG, "-i", vsave_org_path, "-i", args.dri_video,
289
+ "-b:v", "10M", "-c:v",
290
+ "libx264", "-map", "0:v", "-map", "1:a",
291
+ "-c:a", "aac",
292
+ "-pix_fmt", "yuv420p", vsave_org_path_new, "-y", "-shortest"])
293
+
294
+ print(vsave_crop_path_new)
295
+ print(vsave_org_path_new)
296
+ else:
297
+ print(vsave_crop_path)
298
+ print(vsave_org_path)
299
+ else:
300
+ cv2.destroyAllWindows()
301
+
302
+ print(
303
+ "inference median time: {} ms/frame, mean time: {} ms/frame".format(np.median(infer_times) * 1000,
304
+ np.mean(infer_times) * 1000))
305
+
306
+
307
+ if __name__ == '__main__':
308
+ parser = argparse.ArgumentParser(description='Faster Live Portrait Pipeline')
309
+ parser.add_argument('--src_image', required=False, type=str, default="assets/examples/source/s12.jpg",
310
+ help='source image')
311
+ parser.add_argument('--dri_video', required=False, type=str, default="assets/examples/driving/d14.mp4",
312
+ help='driving video')
313
+ parser.add_argument('--cfg', required=False, type=str, default="configs/onnx_infer.yaml", help='inference config')
314
+ parser.add_argument('--realtime', action='store_true', help='realtime inference')
315
+ parser.add_argument('--animal', action='store_true', help='use animal model')
316
+ parser.add_argument('--paste_back', action='store_true', default=False, help='paste back to origin image')
317
+ args, unknown = parser.parse_known_args()
318
+
319
+ if args.dri_video.endswith(".pkl"):
320
+ run_with_pkl(args)
321
+ else:
322
+ run_with_video(args)
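
The driving-motion template that `run_with_video` writes is a plain pickle whose keys come from `template_dct` above; a small sketch of reading one back (the path is just the example used in the module docstring):

```python
# Sketch: inspecting a driving-motion template saved by run_with_video().
import pickle

with open("./results/2024-09-13-081710/d0.mp4.pkl", "rb") as f:  # example path
    tpl = pickle.load(f)

print(tpl["n_frames"], tpl["output_fps"])
motion = tpl["motion"]        # per-frame driving motion info
c_eyes = tpl["c_eyes_lst"]    # per-frame eye ratios
c_lip = tpl["c_lip_lst"]      # per-frame lip ratios
assert len(motion) == len(c_eyes) == len(c_lip) == tpl["n_frames"]
# run_with_pkl() consumes these three lists frame by frame, in this order.
```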
scripts/all_onnx2trt.bat ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+
3
+ REM warping+spade model
4
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\warping_spade-fix.onnx
5
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\warping_spade-fix.onnx
6
+
7
+ REM landmark model
8
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\landmark.onnx
9
+
10
+ REM motion_extractor model
11
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\motion_extractor.onnx -p fp32
12
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\motion_extractor.onnx -p fp32
13
+
14
+ REM face_analysis model
15
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\retinaface_det_static.onnx
16
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\face_2dpose_106_static.onnx
17
+
18
+ REM appearance_extractor model
19
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\appearance_feature_extractor.onnx
20
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\appearance_feature_extractor.onnx
21
+
22
+ REM stitching model
23
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\stitching.onnx
24
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\stitching_eye.onnx
25
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_onnx\stitching_lip.onnx
26
+
27
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\stitching.onnx
28
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\stitching_eye.onnx
29
+ .\venv\python.exe scripts\onnx2trt.py -o .\checkpoints\liveportrait_animal_onnx\stitching_lip.onnx
scripts/all_onnx2trt.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # warping+spade model
4
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/warping_spade-fix.onnx
5
+ # landmark model
6
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/landmark.onnx
7
+ # motion_extractor model
8
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/motion_extractor.onnx -p fp32
9
+ # face_analysis model
10
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/retinaface_det_static.onnx
11
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/face_2dpose_106_static.onnx
12
+ # appearance_extractor model
13
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/appearance_feature_extractor.onnx
14
+ # stitching model
15
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/stitching.onnx
16
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/stitching_eye.onnx
17
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_onnx/stitching_lip.onnx
scripts/all_onnx2trt_animal.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # warping+spade model
4
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/warping_spade-fix-v1.1.onnx
5
+ # motion_extractor model
6
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/motion_extractor-v1.1.onnx -p fp32
7
+ # appearance_extractor model
8
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/appearance_feature_extractor-v1.1.onnx
9
+ # stitching model
10
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/stitching-v1.1.onnx
11
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/stitching_eye-v1.1.onnx
12
+ python scripts/onnx2trt.py -o ./checkpoints/liveportrait_animal_onnx/stitching_lip-v1.1.onnx
scripts/onnx2trt.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import os
19
+ import pdb
20
+ import sys
21
+ import logging
22
+ import argparse
23
+ import platform
24
+
25
+ import tensorrt as trt
26
+ import ctypes
27
+ import numpy as np
28
+
29
+ logging.basicConfig(level=logging.INFO)
30
+ logging.getLogger("EngineBuilder").setLevel(logging.INFO)
31
+ log = logging.getLogger("EngineBuilder")
32
+
33
+
34
+ def load_plugins(logger: trt.Logger):
35
+ # Load the plugin library
36
+ if platform.system().lower() == 'linux':
37
+ ctypes.CDLL("./checkpoints/liveportrait_onnx/libgrid_sample_3d_plugin.so", mode=ctypes.RTLD_GLOBAL)
38
+ else:
39
+ ctypes.CDLL("./checkpoints/liveportrait_onnx/grid_sample_3d_plugin.dll", mode=ctypes.RTLD_GLOBAL, winmode=0)
40
+ # Initialize TensorRT's built-in plugin registry
41
+ trt.init_libnvinfer_plugins(logger, "")
42
+
43
+
44
+ class EngineBuilder:
45
+ """
46
+ Parses an ONNX graph and builds a TensorRT engine from it.
47
+ """
48
+
49
+ def __init__(self, verbose=False):
50
+ """
51
+ :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger.
52
+ """
53
+ self.trt_logger = trt.Logger(trt.Logger.INFO)
54
+ if verbose:
55
+ self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE
56
+
57
+ trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
58
+
59
+ self.builder = trt.Builder(self.trt_logger)
60
+ self.config = self.builder.create_builder_config()
61
+ self.config.max_workspace_size = 12 * (2 ** 30) # 12 GB
62
+
63
+ profile = self.builder.create_optimization_profile()
64
+
65
+ # for face_2dpose_106.onnx
66
+ # profile.set_shape("data", (1, 3, 192, 192), (1, 3, 192, 192), (1, 3, 192, 192))
67
+ # for retinaface_det.onnx
68
+ # profile.set_shape("input.1", (1, 3, 512, 512), (1, 3, 512, 512), (1, 3, 512, 512))
69
+
70
+ self.config.add_optimization_profile(profile)
71
+ # Enforce strict type constraints
72
+ self.config.set_flag(trt.BuilderFlag.STRICT_TYPES)
73
+
74
+ self.batch_size = None
75
+ self.network = None
76
+ self.parser = None
77
+
78
+ # Load the custom plugins
79
+ load_plugins(self.trt_logger)
80
+
81
+ def create_network(self, onnx_path):
82
+ """
83
+ Parse the ONNX graph and create the corresponding TensorRT network definition.
84
+ :param onnx_path: The path to the ONNX graph to load.
85
+ """
86
+ network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
87
+
88
+ self.network = self.builder.create_network(network_flags)
89
+ self.parser = trt.OnnxParser(self.network, self.trt_logger)
90
+
91
+ onnx_path = os.path.realpath(onnx_path)
92
+ with open(onnx_path, "rb") as f:
93
+ if not self.parser.parse(f.read()):
94
+ log.error("Failed to load ONNX file: {}".format(onnx_path))
95
+ for error in range(self.parser.num_errors):
96
+ log.error(self.parser.get_error(error))
97
+ sys.exit(1)
98
+
99
+ inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
100
+ outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
101
+
102
+ log.info("Network Description")
103
+ for input in inputs:
104
+ self.batch_size = input.shape[0]
105
+ log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
106
+ for output in outputs:
107
+ log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
108
+ # assert self.batch_size > 0
109
+ self.builder.max_batch_size = 1
110
+
111
+ def create_engine(
112
+ self,
113
+ engine_path,
114
+ precision
115
+ ):
116
+ """
117
+ Build the TensorRT engine and serialize it to disk.
118
+ :param engine_path: The path where to serialize the engine to.
119
+ :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'.
120
+ """
121
+ engine_path = os.path.realpath(engine_path)
122
+ engine_dir = os.path.dirname(engine_path)
123
+ os.makedirs(engine_dir, exist_ok=True)
124
+ log.info("Building {} Engine in {}".format(precision, engine_path))
125
+
126
+ if precision == "fp16":
127
+ if not self.builder.platform_has_fast_fp16:
128
+ log.warning("FP16 is not supported natively on this platform/device")
129
+ else:
130
+ self.config.set_flag(trt.BuilderFlag.FP16)
131
+
132
+ with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
133
+ log.info("Serializing engine to file: {:}".format(engine_path))
134
+ f.write(engine.serialize())
135
+
136
+
137
+ def main(args):
138
+ builder = EngineBuilder(args.verbose)
139
+ builder.create_network(args.onnx)
140
+ builder.create_engine(
141
+ args.engine,
142
+ args.precision
143
+ )
144
+
145
+
146
+ if __name__ == "__main__":
147
+ parser = argparse.ArgumentParser()
148
+ parser.add_argument("-o", "--onnx", required=True, help="The input ONNX model file to load")
149
+ parser.add_argument("-e", "--engine", help="The output path for the TRT engine")
150
+ parser.add_argument(
151
+ "-p",
152
+ "--precision",
153
+ default="fp16",
154
+ choices=["fp32", "fp16", "int8"],
155
+ help="The precision mode to build in, either 'fp32', 'fp16' or 'int8', default: 'fp16'",
156
+ )
157
+ parser.add_argument("-v", "--verbose", action="store_true", help="Enable more verbose log output")
158
+ args = parser.parse_args()
159
+ if args.engine is None:
160
+ args.engine = args.onnx.replace(".onnx", ".trt")
161
+ main(args)
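
A minimal sketch of loading an engine serialized by this script, using the standard TensorRT Python runtime API. It is illustrative only; at runtime the engines are consumed through the repository's own predictor classes (the `predict_type: "trt"` entries in the configs).

```python
# Sketch: deserializing an engine built by onnx2trt.py (standard TensorRT runtime API).
import ctypes
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
# Engines that use the grid_sample_3d plugin (e.g. warping_spade) need the same
# plugin library loaded at inference time as at build time.
ctypes.CDLL("./checkpoints/liveportrait_onnx/libgrid_sample_3d_plugin.so", mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(logger, "")

with open("./checkpoints/liveportrait_onnx/landmark.trt", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()  # bind buffers and execute via the usual TRT API
```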
scripts/start_api.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+ python api.py
src/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author : wenshao
3
+ # @Email : [email protected]
4
+ # @Project : FasterLivePortrait
5
+ # @FileName: __init__.py.py
src/models/JoyVASA/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/12/15
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
src/models/JoyVASA/common.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ class PositionalEncoding(nn.Module):
9
+ def __init__(self, d_model, dropout=0.1, max_len=600):
10
+ super().__init__()
11
+ self.dropout = nn.Dropout(p=dropout)
12
+ # vanilla sinusoidal encoding
13
+ pe = torch.zeros(max_len, d_model)
14
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
15
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
16
+ pe[:, 0::2] = torch.sin(position * div_term)
17
+ pe[:, 1::2] = torch.cos(position * div_term)
18
+ pe = pe.unsqueeze(0)
19
+ self.register_buffer('pe', pe)
20
+
21
+ def forward(self, x):
22
+ x = x + self.pe[:, :x.shape[1], :]
23
+ return self.dropout(x)
24
+
25
+
26
+ def enc_dec_mask(T, S, frame_width=2, expansion=0, device='cuda'):
27
+ mask = torch.ones(T, S)
28
+ for i in range(T):
29
+ mask[i, max(0, (i - expansion) * frame_width):(i + expansion + 1) * frame_width] = 0
30
+ return (mask == 1).to(device=device)
31
+
32
+
33
+ def pad_audio(audio, audio_unit=320, pad_threshold=80):
34
+ batch_size, audio_len = audio.shape
35
+ n_units = audio_len // audio_unit
36
+ side_len = math.ceil((audio_unit * n_units + pad_threshold - audio_len) / 2)
37
+ if side_len >= 0:
38
+ reflect_len = side_len // 2
39
+ replicate_len = side_len % 2
40
+ if reflect_len > 0:
41
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
42
+ audio = F.pad(audio, (reflect_len, reflect_len), mode='reflect')
43
+ if replicate_len > 0:
44
+ audio = F.pad(audio, (1, 1), mode='replicate')
45
+
46
+ return audio
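
A small usage sketch (not part of the file) showing what the two helpers used by the denoising model produce; the shapes follow the defaults in `dit_talking_head.py` (100 motion frames, feature dim 512).

```python
# Sketch: shapes produced by the helpers above.
import torch
from src.models.JoyVASA.common import PositionalEncoding, enc_dec_mask

pos_enc = PositionalEncoding(d_model=512)
x = torch.zeros(2, 100, 512)                 # (batch, n_motions, feature_dim)
print(pos_enc(x).shape)                      # torch.Size([2, 100, 512])

# Alignment mask between 100 motion frames and 200 audio units
# (frame_width=2 audio units per motion frame); True marks blocked positions.
mask = enc_dec_mask(T=100, S=200, frame_width=2, expansion=0, device='cpu')
print(mask.shape, mask.dtype)                # torch.Size([100, 200]) torch.bool
```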
src/models/JoyVASA/dit_talking_head.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import platform
7
+ from .common import PositionalEncoding, enc_dec_mask, pad_audio
8
+ from tqdm import tqdm
9
+
10
+
11
+ class DiffusionSchedule(nn.Module):
12
+ def __init__(self, num_steps, mode='linear', beta_1=1e-4, beta_T=0.02, s=0.008):
13
+ super().__init__()
14
+
15
+ if mode == 'linear':
16
+ betas = torch.linspace(beta_1, beta_T, num_steps)
17
+ elif mode == 'quadratic':
18
+ betas = torch.linspace(beta_1 ** 0.5, beta_T ** 0.5, num_steps) ** 2
19
+ elif mode == 'sigmoid':
20
+ betas = torch.sigmoid(torch.linspace(-5, 5, num_steps)) * (beta_T - beta_1) + beta_1
21
+ elif mode == 'cosine':
22
+ steps = num_steps + 1
23
+ x = torch.linspace(0, num_steps, steps)
24
+ alpha_bars = torch.cos(((x / num_steps) + s) / (1 + s) * torch.pi * 0.5) ** 2
25
+ alpha_bars = alpha_bars / alpha_bars[0]
26
+ betas = 1 - (alpha_bars[1:] / alpha_bars[:-1])
27
+ betas = torch.clip(betas, 0.0001, 0.999)
28
+ else:
29
+ raise ValueError(f'Unknown diffusion schedule {mode}!')
30
+ betas = torch.cat([torch.zeros(1), betas], dim=0) # Padding beta_0 = 0
31
+
32
+ alphas = 1 - betas
33
+ log_alphas = torch.log(alphas)
34
+ for i in range(1, log_alphas.shape[0]): # 1 to T
35
+ log_alphas[i] += log_alphas[i - 1]
36
+ alpha_bars = log_alphas.exp()
37
+
38
+ sigmas_flex = torch.sqrt(betas)
39
+ sigmas_inflex = torch.zeros_like(sigmas_flex)
40
+ for i in range(1, sigmas_flex.shape[0]):
41
+ sigmas_inflex[i] = ((1 - alpha_bars[i - 1]) / (1 - alpha_bars[i])) * betas[i]
42
+ sigmas_inflex = torch.sqrt(sigmas_inflex)
43
+
44
+ self.num_steps = num_steps
45
+ self.register_buffer('betas', betas)
46
+ self.register_buffer('alphas', alphas)
47
+ self.register_buffer('alpha_bars', alpha_bars)
48
+ self.register_buffer('sigmas_flex', sigmas_flex)
49
+ self.register_buffer('sigmas_inflex', sigmas_inflex)
50
+
51
+ def uniform_sample_t(self, batch_size):
52
+ ts = torch.randint(1, self.num_steps + 1, (batch_size,))
53
+ return ts.tolist()
54
+
55
+ def get_sigmas(self, t, flexibility=0):
56
+ assert 0 <= flexibility <= 1
57
+ sigmas = self.sigmas_flex[t] * flexibility + self.sigmas_inflex[t] * (1 - flexibility)
58
+ return sigmas
59
+
60
+
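
A usage sketch for the schedule above (not part of the file); it only shows the shapes involved, and assumes the package import path `src.models.JoyVASA.dit_talking_head`.

```python
# Sketch: sampling diffusion steps and noise levels from DiffusionSchedule.
import torch
from src.models.JoyVASA.dit_talking_head import DiffusionSchedule

sched = DiffusionSchedule(num_steps=500, mode='cosine')
t = sched.uniform_sample_t(batch_size=4)               # e.g. [382, 17, 250, 499]
sigmas = sched.get_sigmas(torch.tensor(t), flexibility=0.5)
print(len(t), sigmas.shape)                            # 4 torch.Size([4])
```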
61
+ class DitTalkingHead(nn.Module):
62
+ def __init__(self, device='cuda', target="sample", architecture="decoder",
63
+ motion_feat_dim=76, fps=25, n_motions=100, n_prev_motions=10,
64
+ audio_model="hubert", feature_dim=512, n_diff_steps=500, diff_schedule="cosine",
65
+ cfg_mode="incremental", guiding_conditions="audio,", audio_encoder_path=''):
66
+ super().__init__()
67
+
68
+ # Model parameters
69
+ self.target = target # whether to predict the original signal or the noise
70
+ self.architecture = architecture
71
+ self.motion_feat_dim = motion_feat_dim # motion feature dimension
72
+ self.fps = fps
73
+ self.n_motions = n_motions # current window of 100 motion frames (window_length, T_w)
74
+ self.n_prev_motions = n_prev_motions # 10 previous motion frames (T_p)
75
+ self.feature_dim = feature_dim
76
+
77
+ # Audio encoder
78
+ self.audio_model = audio_model
79
+ if self.audio_model == 'wav2vec2':
80
+ print("using wav2vec2 audio encoder ...")
81
+ from .wav2vec2 import Wav2Vec2Model
82
+ self.audio_encoder = Wav2Vec2Model.from_pretrained(audio_encoder_path)
83
+ # wav2vec 2.0 weights initialization
84
+ self.audio_encoder.feature_extractor._freeze_parameters()
85
+
86
+ frozen_layers = [0, 1]
87
+ for name, param in self.audio_encoder.named_parameters():
88
+ if name.startswith("feature_projection"):
89
+ param.requires_grad = False
90
+ if name.startswith("encoder.layers"):
91
+ layer = int(name.split(".")[2])
92
+ if layer in frozen_layers:
93
+ param.requires_grad = False
94
+ elif self.audio_model == "wav2vec2_ori":
95
+ from .wav2vec2 import Wav2Vec2Model
96
+ self.audio_encoder = Wav2Vec2Model.from_pretrained(audio_encoder_path)
97
+ # wav2vec 2.0 weights initialization
98
+ self.audio_encoder.feature_extractor._freeze_parameters()
99
+ elif self.audio_model == 'hubert':  # empirically, the HuBERT feature extractor works better
100
+ from .hubert import HubertModel
101
+ # from hubert import HubertModel
102
+ self.audio_encoder = HubertModel.from_pretrained(audio_encoder_path)
103
+ self.audio_encoder.feature_extractor._freeze_parameters()
104
+ # print("hubert-en: ", self.audio_encoder)
105
+
106
+ frozen_layers = [0, 1]
107
+ for name, param in self.audio_encoder.named_parameters():
108
+ if name.startswith("feature_projection"):
109
+ param.requires_grad = False
110
+ if name.startswith("encoder.layers"):
111
+ layer = int(name.split(".")[2])
112
+ if layer in frozen_layers:
113
+ param.requires_grad = False
114
+ elif self.audio_model == 'hubert_zh':  # empirically, the HuBERT feature extractor works better
115
+ print("using hubert chinese")
116
+ from .hubert import HubertModel
117
+ # from hubert import HubertModel
118
+ self.audio_encoder = HubertModel.from_pretrained(audio_encoder_path)
119
+ self.audio_encoder.feature_extractor._freeze_parameters()
120
+
121
+ frozen_layers = [0, 1]
122
+ for name, param in self.audio_encoder.named_parameters():
123
+ if name.startswith("feature_projection"):
124
+ param.requires_grad = False
125
+ if name.startswith("encoder.layers"):
126
+ layer = int(name.split(".")[2])
127
+ if layer in frozen_layers:
128
+ param.requires_grad = False
129
+ elif self.audio_model == 'hubert_zh_ori':  # empirically, the HuBERT feature extractor works better
130
+ print("using hubert chinese ori")
131
+ from .hubert import HubertModel
132
+ self.audio_encoder = HubertModel.from_pretrained(audio_encoder_path)
133
+ self.audio_encoder.feature_extractor._freeze_parameters()
134
+ else:
135
+ raise ValueError(f'Unknown audio model {self.audio_model}!')
136
+
137
+ if architecture == 'decoder':
138
+ self.audio_feature_map = nn.Linear(768, feature_dim)
139
+ self.start_audio_feat = nn.Parameter(torch.randn(1, self.n_prev_motions, feature_dim))
140
+ else:
141
+ raise ValueError(f'Unknown architecture {architecture}!')
142
+
143
+ self.start_motion_feat = nn.Parameter(torch.randn(1, self.n_prev_motions, self.motion_feat_dim)) # 1, 10, 76
144
+
145
+ # Diffusion model
146
+ self.denoising_net = DenoisingNetwork(device=device, n_motions=self.n_motions,
147
+ n_prev_motions=self.n_prev_motions,
148
+ motion_feat_dim=self.motion_feat_dim, feature_dim=feature_dim)
149
+ # diffusion schedule
150
+ self.diffusion_sched = DiffusionSchedule(n_diff_steps, diff_schedule)
151
+
152
+ # Classifier-free settings
153
+ self.cfg_mode = cfg_mode
154
+ guiding_conditions = guiding_conditions.split(',') if guiding_conditions else []
155
+ self.guiding_conditions = [cond for cond in guiding_conditions if cond in ['audio']]
156
+ if 'audio' in self.guiding_conditions:
157
+ audio_feat_dim = feature_dim
158
+ self.null_audio_feat = nn.Parameter(torch.randn(1, 1, audio_feat_dim)) # 1, 1, 512
159
+
160
+ self.to(device)
161
+
162
+ @property
163
+ def device(self):
164
+ return next(self.parameters()).device
165
+
166
+ def forward(self, motion_feat, audio_or_feat, prev_motion_feat=None, prev_audio_feat=None, time_step=None,
167
+ indicator=None):
168
+ """
169
+ Args:
170
+ motion_feat: (N, L, d_coef) motion coefficients or features
171
+ audio_or_feat: (N, L_audio) raw audio or audio feature
172
+ prev_motion_feat: (N, n_prev_motions, d_motion) previous motion coefficients or feature
173
+ prev_audio_feat: (N, n_prev_motions, d_audio) previous audio features
174
+ time_step: (N,)
175
+ indicator: (N, L) 0/1 indicator of real (unpadded) motion coefficients
176
+
177
+ Returns:
178
+ eps: (N, L, d_motion) noise used in the forward process; motion_feat_target: (N, L_p + L, d_motion) denoising-network prediction; detached motion_feat and audio features
179
+ """
180
+ batch_size = motion_feat.shape[0]
181
+
182
+ # Load audio features
183
+ if audio_or_feat.ndim == 2:  # raw audio
184
+ # Extract audio features
185
+ assert audio_or_feat.shape[1] == 16000 * self.n_motions / self.fps, \
186
+ f'Incorrect audio length {audio_or_feat.shape[1]}'
187
+ audio_feat_saved = self.extract_audio_feature(audio_or_feat) # (N, L, feature_dim)
188
+ elif audio_or_feat.ndim == 3:  # precomputed audio features
189
+ assert audio_or_feat.shape[1] == self.n_motions, f'Incorrect audio feature length {audio_or_feat.shape[1]}'
190
+ audio_feat_saved = audio_or_feat
191
+ else:
192
+ raise ValueError(f'Incorrect audio input shape {audio_or_feat.shape}')
193
+ audio_feat = audio_feat_saved.clone()
194
+
195
+ # Previous motion features
196
+ if prev_motion_feat is None:
197
+ prev_motion_feat = self.start_motion_feat.expand(batch_size, -1, -1) # (N, n_prev_motions, d_motion)
198
+
199
+ # Previous audio features
200
+ if prev_audio_feat is None:
201
+ # (N, n_prev_motions, feature_dim)
202
+ prev_audio_feat = self.start_audio_feat.expand(batch_size, -1, -1)
203
+
204
+ # Classifier-free guidance
205
+ if len(self.guiding_conditions) > 0:
206
+ assert len(self.guiding_conditions) <= 2, 'Only support 1 or 2 CFG conditions!'
207
+ if len(self.guiding_conditions) == 1 or self.cfg_mode == 'independent':
208
+ null_cond_prob = 0.5 if len(self.guiding_conditions) >= 2 else 0.1
209
+ if 'audio' in self.guiding_conditions:
210
+ mask_audio = torch.rand(batch_size, device=self.device) < null_cond_prob
211
+ audio_feat = torch.where(mask_audio.view(-1, 1, 1),
212
+ self.null_audio_feat.expand(batch_size, self.n_motions, -1),
213
+ audio_feat)
214
+ else:
215
+ # len(self.guiding_conditions) > 1 and self.cfg_mode == 'incremental'
216
+ # full (0.45), w/o style (0.45), w/o style or audio (0.1)
217
+ mask_flag = torch.rand(batch_size, device=self.device)
218
+ if 'audio' in self.guiding_conditions:
219
+ mask_audio = mask_flag > 0.9
220
+ audio_feat = torch.where(mask_audio.view(-1, 1, 1),
221
+ self.null_audio_feat.expand(batch_size, self.n_motions, -1),
222
+ audio_feat)
223
+
224
+ if time_step is None:
225
+ # Sample time step
226
+ time_step = self.diffusion_sched.uniform_sample_t(batch_size) # (N,)
227
+
228
+ # The forward diffusion process
229
+ alpha_bar = self.diffusion_sched.alpha_bars[time_step] # (N,)
230
+ c0 = torch.sqrt(alpha_bar).view(-1, 1, 1) # (N, 1, 1)
231
+ c1 = torch.sqrt(1 - alpha_bar).view(-1, 1, 1) # (N, 1, 1)
232
+
233
+ eps = torch.randn_like(motion_feat) # (N, L, d_motion)
234
+ motion_feat_noisy = c0 * motion_feat + c1 * eps
235
+
236
+ # The reverse diffusion process
237
+ motion_feat_target = self.denoising_net(motion_feat_noisy, audio_feat,
238
+ prev_motion_feat, prev_audio_feat, time_step, indicator)
239
+
240
+ return eps, motion_feat_target, motion_feat.detach(), audio_feat_saved.detach()
241
+
242
+ def extract_audio_feature(self, audio, frame_num=None):
243
+ frame_num = frame_num or self.n_motions
244
+
245
+ # # Strategy 1: resample during audio feature extraction
246
+ # hidden_states = self.audio_encoder(pad_audio(audio), self.fps, frame_num=frame_num).last_hidden_state # (N, L, 768)
247
+
248
+ # Strategy 2: resample after audio feature extraction (BackResample)
249
+ hidden_states = self.audio_encoder(pad_audio(audio), self.fps,
250
+ frame_num=frame_num * 2).last_hidden_state # (N, 2L, 768)
251
+ hidden_states = hidden_states.transpose(1, 2) # (N, 768, 2L)
252
+ hidden_states = F.interpolate(hidden_states, size=frame_num, align_corners=False, mode='linear') # (N, 768, L)
253
+ hidden_states = hidden_states.transpose(1, 2) # (N, L, 768)
254
+
255
+ audio_feat = self.audio_feature_map(hidden_states) # (N, L, feature_dim)
256
+ return audio_feat
257
+
258
+ @torch.no_grad()
259
+ def sample(self, audio_or_feat, prev_motion_feat=None, prev_audio_feat=None,
260
+ motion_at_T=None, indicator=None, cfg_mode=None, cfg_cond=None, cfg_scale=1.15, flexibility=0,
261
+ dynamic_threshold=None, ret_traj=False):
262
+ # Check and convert inputs
263
+ batch_size = audio_or_feat.shape[0]
264
+
265
+ # Check CFG conditions
266
+ if cfg_mode is None: # Use default CFG mode
267
+ cfg_mode = self.cfg_mode
268
+ if cfg_cond is None: # Use default CFG conditions
269
+ cfg_cond = self.guiding_conditions
270
+ cfg_cond = [c for c in cfg_cond if c in ['audio', ]]
271
+
272
+ if not isinstance(cfg_scale, list):
273
+ cfg_scale = [cfg_scale] * len(cfg_cond)
274
+
275
+ # sort cfg_cond and cfg_scale
276
+ if len(cfg_cond) > 0:
277
+ cfg_cond, cfg_scale = zip(*sorted(zip(cfg_cond, cfg_scale), key=lambda x: ['audio', ].index(x[0])))
278
+ else:
279
+ cfg_cond, cfg_scale = [], []
280
+
281
+ if audio_or_feat.ndim == 2:
282
+ # Extract audio features
283
+ assert audio_or_feat.shape[1] == 16000 * self.n_motions / self.fps, \
284
+ f'Incorrect audio length {audio_or_feat.shape[1]}'
285
+ audio_feat = self.extract_audio_feature(audio_or_feat) # (N, L, feature_dim)
286
+ elif audio_or_feat.ndim == 3:
287
+ assert audio_or_feat.shape[1] == self.n_motions, f'Incorrect audio feature length {audio_or_feat.shape[1]}'
288
+ audio_feat = audio_or_feat
289
+ else:
290
+ raise ValueError(f'Incorrect audio input shape {audio_or_feat.shape}')
291
+
292
+ if prev_motion_feat is None:
293
+ prev_motion_feat = self.start_motion_feat.expand(batch_size, -1, -1) # (N, n_prev_motions, d_motion)
294
+ if prev_audio_feat is None:
295
+ # (N, n_prev_motions, feature_dim)
296
+ prev_audio_feat = self.start_audio_feat.expand(batch_size, -1, -1)
297
+
298
+ if motion_at_T is None:
299
+ motion_at_T = torch.randn((batch_size, self.n_motions, self.motion_feat_dim)).to(self.device)
300
+
301
+ # Prepare input for the reverse diffusion process (including optional classifier-free guidance)
302
+ if 'audio' in cfg_cond:
303
+ audio_feat_null = self.null_audio_feat.expand(batch_size, self.n_motions, -1)
304
+ else:
305
+ audio_feat_null = audio_feat
306
+
307
+ audio_feat_in = [audio_feat_null]
308
+ for cond in cfg_cond:
309
+ if cond == 'audio':
310
+ audio_feat_in.append(audio_feat)
311
+
312
+ n_entries = len(audio_feat_in)
313
+ audio_feat_in = torch.cat(audio_feat_in, dim=0)
314
+ prev_motion_feat_in = torch.cat([prev_motion_feat] * n_entries, dim=0)
315
+ prev_audio_feat_in = torch.cat([prev_audio_feat] * n_entries, dim=0)
316
+ indicator_in = torch.cat([indicator] * n_entries, dim=0) if indicator is not None else None
317
+
318
+ traj = {self.diffusion_sched.num_steps: motion_at_T}
319
+ for t in tqdm(range(self.diffusion_sched.num_steps, 0, -1)):
320
+ if t > 1:
321
+ z = torch.randn_like(motion_at_T)
322
+ else:
323
+ z = torch.zeros_like(motion_at_T)
324
+
325
+ alpha = self.diffusion_sched.alphas[t]
326
+ alpha_bar = self.diffusion_sched.alpha_bars[t]
327
+ alpha_bar_prev = self.diffusion_sched.alpha_bars[t - 1]
328
+ sigma = self.diffusion_sched.get_sigmas(t, flexibility)
329
+
330
+ motion_at_t = traj[t]
331
+ motion_in = torch.cat([motion_at_t] * n_entries, dim=0)
332
+ step_in = torch.tensor([t] * batch_size, device=self.device)
333
+ step_in = torch.cat([step_in] * n_entries, dim=0)
334
+
335
+ results = self.denoising_net(motion_in, audio_feat_in, prev_motion_feat_in,
336
+ prev_audio_feat_in, step_in, indicator_in)
337
+
338
+ # Apply thresholding if specified
339
+ if dynamic_threshold:
340
+ dt_ratio, dt_min, dt_max = dynamic_threshold
341
+ abs_results = results[:, -self.n_motions:].reshape(batch_size * n_entries, -1).abs()
342
+ s = torch.quantile(abs_results, dt_ratio, dim=1)
343
+ s = torch.clamp(s, min=dt_min, max=dt_max)
344
+ s = s[..., None, None]
345
+ results = torch.clamp(results, min=-s, max=s)
346
+
347
+ results = results.chunk(n_entries)
348
+
349
+ # Unconditional target (CFG) or the conditional target (non-CFG)
350
+ target_theta = results[0][:, -self.n_motions:]
351
+ # Classifier-free Guidance (optional)
352
+ for i in range(0, n_entries - 1):
353
+ if cfg_mode == 'independent':
354
+ target_theta += cfg_scale[i] * (
355
+ results[i + 1][:, -self.n_motions:] - results[0][:, -self.n_motions:])
356
+ elif cfg_mode == 'incremental':
357
+ target_theta += cfg_scale[i] * (
358
+ results[i + 1][:, -self.n_motions:] - results[i][:, -self.n_motions:])
359
+ else:
360
+ raise NotImplementedError(f'Unknown cfg_mode {cfg_mode}')
361
+
362
+ if self.target == 'noise':
363
+ c0 = 1 / torch.sqrt(alpha)
364
+ c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)
365
+ motion_next = c0 * (motion_at_t - c1 * target_theta) + sigma * z
366
+ elif self.target == 'sample':
367
+ c0 = (1 - alpha_bar_prev) * torch.sqrt(alpha) / (1 - alpha_bar)
368
+ c1 = (1 - alpha) * torch.sqrt(alpha_bar_prev) / (1 - alpha_bar)
369
+ motion_next = c0 * motion_at_t + c1 * target_theta + sigma * z
370
+ else:
371
+ raise ValueError('Unknown target type: {}'.format(self.target))
372
+
373
+ traj[t - 1] = motion_next.detach() # Stop gradient and save trajectory.
374
+ traj[t] = traj[t].cpu() # Move previous output to CPU memory.
375
+ if not ret_traj:
376
+ del traj[t]
377
+
378
+ if ret_traj:
379
+ return traj, motion_at_T, audio_feat
380
+ else:
381
+ return traj[0], motion_at_T, audio_feat
382
+
383
+
384
+ class DenoisingNetwork(nn.Module):
385
+ def __init__(self, device='cuda', motion_feat_dim=76,
386
+ use_indicator=None, architecture="decoder", feature_dim=512, n_heads=8,
387
+ n_layers=8, mlp_ratio=4, align_mask_width=1, no_use_learnable_pe=True, n_prev_motions=10,
388
+ n_motions=100, n_diff_steps=500, ):
389
+ super().__init__()
390
+
391
+ # Model parameters
392
+ self.motion_feat_dim = motion_feat_dim
393
+ self.use_indicator = use_indicator
394
+
395
+ # Transformer
396
+ self.architecture = architecture
397
+ self.feature_dim = feature_dim
398
+ self.n_heads = n_heads
399
+ self.n_layers = n_layers
400
+ self.mlp_ratio = mlp_ratio
401
+ self.align_mask_width = align_mask_width
402
+ self.use_learnable_pe = not no_use_learnable_pe
403
+
404
+ # sequence length
405
+ self.n_prev_motions = n_prev_motions
406
+ self.n_motions = n_motions
407
+
408
+ # Temporal embedding for the diffusion time step
409
+ self.TE = PositionalEncoding(self.feature_dim, max_len=n_diff_steps + 1)
410
+ self.diff_step_map = nn.Sequential(
411
+ nn.Linear(self.feature_dim, self.feature_dim),
412
+ nn.GELU(),
413
+ nn.Linear(self.feature_dim, self.feature_dim)
414
+ )
415
+
416
+ if self.use_learnable_pe:
417
+ # Learnable positional encoding
418
+ self.PE = nn.Parameter(torch.randn(1, 1 + self.n_prev_motions + self.n_motions, self.feature_dim))
419
+ else:
420
+ self.PE = PositionalEncoding(self.feature_dim)
421
+
422
+ # Transformer decoder
423
+ if self.architecture == 'decoder':
424
+ self.feature_proj = nn.Linear(self.motion_feat_dim + (1 if self.use_indicator else 0),
425
+ self.feature_dim)
426
+ decoder_layer = nn.TransformerDecoderLayer(
427
+ d_model=self.feature_dim, nhead=self.n_heads, dim_feedforward=self.mlp_ratio * self.feature_dim,
428
+ activation='gelu', batch_first=True
429
+ )
430
+ self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=self.n_layers)
431
+ if self.align_mask_width > 0:
432
+ motion_len = self.n_prev_motions + self.n_motions
433
+ alignment_mask = enc_dec_mask(motion_len, motion_len, frame_width=1,
434
+ expansion=self.align_mask_width - 1)
435
+ # print(f"alignment_mask: ", alignment_mask.shape)
436
+ # alignment_mask = F.pad(alignment_mask, (0, 0, 1, 0), value=False)
437
+ self.register_buffer('alignment_mask', alignment_mask)
438
+ else:
439
+ self.alignment_mask = None
440
+ else:
441
+ raise ValueError(f'Unknown architecture: {self.architecture}')
442
+
443
+ # Motion decoder
444
+ self.motion_dec = nn.Sequential(
445
+ nn.Linear(self.feature_dim, self.feature_dim // 2),
446
+ nn.GELU(),
447
+ nn.Linear(self.feature_dim // 2, self.motion_feat_dim),
448
+ # nn.Tanh()  # a tanh activation was also tried here
449
+ # nn.Softmax()
450
+ )
451
+
452
+ self.to(device)
453
+
454
+ @property
455
+ def device(self):
456
+ return next(self.parameters()).device
457
+
458
+ def forward(self, motion_feat, audio_feat, prev_motion_feat, prev_audio_feat, step, indicator=None):
459
+ """
460
+ Args:
461
+ motion_feat: (N, L, d_motion). Noisy motion feature
462
+ audio_feat: (N, L, feature_dim)
463
+ prev_motion_feat: (N, L_p, d_motion). Padded previous motion coefficients or feature
464
+ prev_audio_feat: (N, L_p, d_audio). Padded previous audio features
465
+ step: (N,)
466
+ indicator: (N, L). 0/1 indicator for the real (unpadded) motion feature
467
+
468
+ Returns:
469
+ motion_feat_target: (N, L_p + L, d_motion)
470
+ """
471
+ motion_feat = motion_feat.to(audio_feat.dtype)
472
+ # Diffusion time step embedding
473
+ diff_step_embedding = self.diff_step_map(self.TE.pe[0, step]).unsqueeze(1) # (N, 1, diff_step_dim)
474
+
475
+ if indicator is not None:
476
+ indicator = torch.cat([torch.zeros((indicator.shape[0], self.n_prev_motions), device=indicator.device),
477
+ indicator], dim=1) # (N, L_p + L)
478
+ indicator = indicator.unsqueeze(-1) # (N, L_p + L, 1)
479
+
480
+ # Concat features and embeddings
481
+ if self.architecture == 'decoder':
482
+ # print("prev_motion_feat: ", prev_motion_feat.shape, "motion_feat: ", motion_feat.shape)
483
+ feats_in = torch.cat([prev_motion_feat, motion_feat], dim=1) # (N, L_p + L, d_motion)
484
+ else:
485
+ raise ValueError(f'Unknown architecture: {self.architecture}')
486
+ if self.use_indicator:
487
+ feats_in = torch.cat([feats_in, indicator], dim=-1) # (N, L_p + L, d_motion + d_audio + 1)
488
+ feats_in = self.feature_proj(feats_in) # (N, L_p + L, feature_dim)
489
+ # feats_in = torch.cat([person_feat, feats_in], dim=1) # (N, 1 + L_p + L, feature_dim)
490
+
491
+ if self.use_learnable_pe:
492
+ # feats_in = feats_in + self.PE
493
+ feats_in = feats_in + self.PE + diff_step_embedding
494
+ else:
495
+ # feats_in = self.PE(feats_in)
496
+ feats_in = self.PE(feats_in) + diff_step_embedding
497
+
498
+ # Transformer
499
+ if self.architecture == 'decoder':
500
+ audio_feat_in = torch.cat([prev_audio_feat, audio_feat], dim=1) # (N, L_p + L, d_audio)
501
+ # print(f"feats_in: {feats_in.shape}, audio_feat_in: {audio_feat_in.shape}, memory_mask: {self.alignment_mask.shape}")
502
+ feat_out = self.transformer(feats_in, audio_feat_in, memory_mask=self.alignment_mask)
503
+ else:
504
+ raise ValueError(f'Unknown architecture: {self.architecture}')
505
+
506
+ # Decode predicted motion feature noise / sample
507
+ # motion_feat_target = self.motion_dec(feat_out[:, 1:]) # (N, L_p + L, d_motion)
508
+ motion_feat_target = self.motion_dec(feat_out) # (N, L_p + L, d_motion)
509
+
510
+ return motion_feat_target
511
+
512
+
513
+ if __name__ == "__main__":
514
+ device = "cuda"
515
+ motion_feat_dim = 76
516
+ n_motions = 100 # L
517
+ n_prev_motions = 10 # L_p
518
+
519
+ L_audio = int(16000 * n_motions / 25) # 64000
520
+ d_audio = 768
521
+
522
+ N = 5
523
+ feature_dim = 512
524
+
525
+ motion_feat = torch.ones((N, n_motions, motion_feat_dim)).to(device)
526
+ prev_motion_feat = torch.ones((N, n_prev_motions, motion_feat_dim)).to(device)
527
+
528
+ audio_or_feat = torch.ones((N, L_audio)).to(device)
529
+ prev_audio_feat = torch.ones((N, n_prev_motions, d_audio)).to(device)
530
+
531
+ time_step = torch.ones(N, dtype=torch.long).to(device)
532
+
533
+ model = DitTalkingHead().to(device)
534
+
535
+ z = model(motion_feat, audio_or_feat, prev_motion_feat=None,
536
+ prev_audio_feat=None, time_step=None, indicator=None)
537
+ traj, motion_at_T, audio_feat = z[0], z[1], z[2]
538
+ print(motion_at_T.shape, audio_feat.shape)
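For readers skimming this diff, the core of `DiffusionSchedule` and `DitTalkingHead.forward` above is a cosine beta schedule plus the standard closed-form forward-diffusion step. Below is a minimal standalone sketch of that math (assuming only PyTorch; shapes mirror the defaults `n_motions=100`, `motion_feat_dim=76`, `n_diff_steps=500`), not a piece of the repo's API:

```python
import torch

# Cosine beta schedule, as in DiffusionSchedule(mode='cosine') above.
def cosine_betas(num_steps: int, s: float = 0.008) -> torch.Tensor:
    x = torch.linspace(0, num_steps, num_steps + 1)
    alpha_bars = torch.cos(((x / num_steps) + s) / (1 + s) * torch.pi * 0.5) ** 2
    alpha_bars = alpha_bars / alpha_bars[0]
    betas = 1 - (alpha_bars[1:] / alpha_bars[:-1])
    return torch.clip(betas, 0.0001, 0.999)

betas = torch.cat([torch.zeros(1), cosine_betas(500)])  # pad beta_0 = 0
alpha_bars = torch.cumprod(1 - betas, dim=0)            # equivalent to the cumulative-log form above

# Forward diffusion q(x_t | x_0): mix clean motion features with Gaussian noise,
# matching c0 * motion_feat + c1 * eps in DitTalkingHead.forward.
x0 = torch.randn(2, 100, 76)                            # (N, L, d_motion)
t = torch.randint(1, 501, (2,))                         # sampled diffusion steps
c0 = alpha_bars[t].sqrt().view(-1, 1, 1)
c1 = (1 - alpha_bars[t]).sqrt().view(-1, 1, 1)
x_t = c0 * x0 + c1 * torch.randn_like(x0)               # noisy motion features
```

The reverse loop in `sample()` then walks t = T … 1 with either the noise-prediction or sample-prediction update and, when enabled, blends conditional and unconditional predictions for classifier-free guidance.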
src/models/JoyVASA/helper.py ADDED
@@ -0,0 +1,32 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/12/15
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: helper.py
7
+ import os.path as osp
8
+
9
+
10
+ class NullableArgs:
11
+ def __init__(self, namespace):
12
+ for key, value in namespace.__dict__.items():
13
+ setattr(self, key, value)
14
+
15
+ def __getattr__(self, key):
16
+ # when an attribute lookup has not found the attribute
17
+ if key == 'align_mask_width':
18
+ if 'use_alignment_mask' in self.__dict__:
19
+ return 1 if self.use_alignment_mask else 0
20
+ else:
21
+ return 0
22
+ if key == 'no_head_pose':
23
+ return not self.predict_head_pose
24
+ if key == 'no_use_learnable_pe':
25
+ return not self.use_learnable_pe
26
+
27
+ return None
28
+
29
+
30
+ def make_abs_path(fn):
31
+ # return osp.join(osp.dirname(osp.realpath(__file__)), fn)
32
+ return osp.abspath(osp.join(osp.dirname(osp.realpath(__file__)), fn))
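`NullableArgs` above is a small compatibility shim: it copies an argparse-style namespace and, via `__getattr__`, maps options that older checkpoints lack onto derived defaults (or `None`). A usage sketch with hypothetical values, assuming the module is importable under the path added in this commit:

```python
from argparse import Namespace
from src.models.JoyVASA.helper import NullableArgs  # path as added in this commit

legacy = Namespace(use_alignment_mask=True, predict_head_pose=False, use_learnable_pe=False)
args = NullableArgs(legacy)

print(args.use_alignment_mask)  # True  (copied from the namespace)
print(args.align_mask_width)    # 1     (derived from use_alignment_mask)
print(args.no_head_pose)        # True  (derived from predict_head_pose)
print(args.some_new_option)     # None  (unknown attributes fall back to None)
```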
src/models/JoyVASA/hubert.py ADDED
@@ -0,0 +1,51 @@
1
+ from transformers import HubertModel
2
+ from transformers.modeling_outputs import BaseModelOutput
3
+
4
+ from .wav2vec2 import linear_interpolation
5
+
6
+ _CONFIG_FOR_DOC = 'HubertConfig'
7
+
8
+
9
+ class HubertModel(HubertModel):
10
+ def __init__(self, config):
11
+ super().__init__(config)
12
+
13
+ def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
14
+ output_hidden_states=None, return_dict=None, frame_num=None):
15
+ self.config.output_attentions = True
16
+
17
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
18
+ output_hidden_states = (
19
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
20
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
21
+
22
+ extract_features = self.feature_extractor(input_values) # (N, C, L)
23
+ # Resample the audio feature @ 50 fps to `output_fps`.
24
+ if frame_num is not None:
25
+ extract_features_len = round(frame_num * 50 / output_fps)
26
+ extract_features = extract_features[:, :, :extract_features_len]
27
+ extract_features = linear_interpolation(extract_features, 50, output_fps, output_len=frame_num)
28
+ extract_features = extract_features.transpose(1, 2) # (N, L, C)
29
+
30
+ if attention_mask is not None:
31
+ # compute reduced attention_mask corresponding to feature vectors
32
+ attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
33
+
34
+ hidden_states = self.feature_projection(extract_features)
35
+ hidden_states = self._mask_hidden_states(hidden_states)
36
+
37
+ encoder_outputs = self.encoder(
38
+ hidden_states,
39
+ attention_mask=attention_mask,
40
+ output_attentions=output_attentions,
41
+ output_hidden_states=output_hidden_states,
42
+ return_dict=return_dict,
43
+ )
44
+
45
+ hidden_states = encoder_outputs[0]
46
+
47
+ if not return_dict:
48
+ return (hidden_states,) + encoder_outputs[1:]
49
+
50
+ return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
51
+ attentions=encoder_outputs.attentions, )
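`hubert.py` above imports `linear_interpolation` from the sibling `wav2vec2.py` (added next): HuBERT frame features arrive at roughly 50 Hz and are linearly resampled to the video frame rate so that one feature row corresponds to one video frame. A self-contained sketch of that resampling step (assuming only PyTorch; `resample_features` is a hypothetical name mirroring `linear_interpolation`):

```python
import torch
import torch.nn.functional as F

def resample_features(features: torch.Tensor, input_fps: float, output_fps: float,
                      output_len: int = None) -> torch.Tensor:
    """features: (N, C, L) sampled at input_fps; returns (N, C, output_len)."""
    if output_len is None:
        output_len = int(features.shape[2] / float(input_fps) * output_fps)
    return F.interpolate(features, size=output_len, align_corners=False, mode='linear')

feats_50hz = torch.randn(1, 768, 200)                     # ~4 s of HuBERT features @ 50 Hz
feats_25fps = resample_features(feats_50hz, 50, 25, output_len=100)
print(feats_25fps.shape)                                  # torch.Size([1, 768, 100])
```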
src/models/JoyVASA/wav2vec2.py ADDED
@@ -0,0 +1,119 @@
1
+ from packaging import version
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import transformers
8
+ from transformers import Wav2Vec2Model
9
+ from transformers.modeling_outputs import BaseModelOutput
10
+
11
+ _CONFIG_FOR_DOC = 'Wav2Vec2Config'
12
+
13
+
14
+ # the implementation of Wav2Vec2Model is borrowed from
15
+ # https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
16
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
17
+ def _compute_mask_indices(shape: Tuple[int, int], mask_prob: float, mask_length: int,
18
+ attention_mask: Optional[torch.Tensor] = None, min_masks: int = 0, ) -> np.ndarray:
19
+ bsz, all_sz = shape
20
+ mask = np.full((bsz, all_sz), False)
21
+
22
+ all_num_mask = int(mask_prob * all_sz / float(mask_length) + np.random.rand())
23
+ all_num_mask = max(min_masks, all_num_mask)
24
+ mask_idcs = []
25
+ padding_mask = attention_mask.ne(1) if attention_mask is not None else None
26
+ for i in range(bsz):
27
+ if padding_mask is not None:
28
+ sz = all_sz - padding_mask[i].long().sum().item()
29
+ num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
30
+ num_mask = max(min_masks, num_mask)
31
+ else:
32
+ sz = all_sz
33
+ num_mask = all_num_mask
34
+
35
+ lengths = np.full(num_mask, mask_length)
36
+
37
+ if sum(lengths) == 0:
38
+ lengths[0] = min(mask_length, sz - 1)
39
+
40
+ min_len = min(lengths)
41
+ if sz - min_len <= num_mask:
42
+ min_len = sz - num_mask - 1
43
+
44
+ mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
45
+ mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
46
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
47
+
48
+ min_len = min([len(m) for m in mask_idcs])
49
+ for i, mask_idc in enumerate(mask_idcs):
50
+ if len(mask_idc) > min_len:
51
+ mask_idc = np.random.choice(mask_idc, min_len, replace=False)
52
+ mask[i, mask_idc] = True
53
+ return mask
54
+
55
+
56
+ # linear interpolation layer
57
+ def linear_interpolation(features, input_fps, output_fps, output_len=None):
58
+ # features: (N, C, L)
59
+ seq_len = features.shape[2] / float(input_fps)
60
+ if output_len is None:
61
+ output_len = int(seq_len * output_fps)
62
+ output_features = F.interpolate(features, size=output_len, align_corners=False, mode='linear')
63
+ return output_features
64
+
65
+
66
+ class Wav2Vec2Model(Wav2Vec2Model):
67
+ def __init__(self, config):
68
+ super().__init__(config)
69
+ self.is_old_version = version.parse(transformers.__version__) < version.parse('4.7.0')
70
+
71
+ def forward(self, input_values, output_fps=25, attention_mask=None, output_attentions=None,
72
+ output_hidden_states=None, return_dict=None, frame_num=None):
73
+ self.config.output_attentions = True
74
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
75
+ output_hidden_states = (
76
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
77
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
78
+
79
+ hidden_states = self.feature_extractor(input_values) # (N, C, L)
80
+ # Resample the audio feature @ 50 fps to `output_fps`.
81
+ if frame_num is not None:
82
+ hidden_states_len = round(frame_num * 50 / output_fps)
83
+ hidden_states = hidden_states[:, :, :hidden_states_len]
84
+ hidden_states = linear_interpolation(hidden_states, 50, output_fps, output_len=frame_num)
85
+ hidden_states = hidden_states.transpose(1, 2) # (N, L, C)
86
+
87
+ if attention_mask is not None:
88
+ output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
89
+ attention_mask = torch.zeros(hidden_states.shape[:2], dtype=hidden_states.dtype,
90
+ device=hidden_states.device)
91
+ attention_mask[(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)] = 1
92
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
93
+
94
+ if self.is_old_version:
95
+ hidden_states = self.feature_projection(hidden_states)
96
+ else:
97
+ hidden_states = self.feature_projection(hidden_states)[0]
98
+
99
+ if self.config.apply_spec_augment and self.training:
100
+ batch_size, sequence_length, hidden_size = hidden_states.size()
101
+ if self.config.mask_time_prob > 0:
102
+ mask_time_indices = _compute_mask_indices((batch_size, sequence_length), self.config.mask_time_prob,
103
+ self.config.mask_time_length, attention_mask=attention_mask,
104
+ min_masks=2, )
105
+ hidden_states[torch.from_numpy(mask_time_indices)] = self.masked_spec_embed.to(hidden_states.dtype)
106
+ if self.config.mask_feature_prob > 0:
107
+ mask_feature_indices = _compute_mask_indices((batch_size, hidden_size), self.config.mask_feature_prob,
108
+ self.config.mask_feature_length, )
109
+ mask_feature_indices = torch.from_numpy(mask_feature_indices).to(hidden_states.device)
110
+ hidden_states[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0
111
+ encoder_outputs = self.encoder(hidden_states, attention_mask=attention_mask,
112
+ output_attentions=output_attentions, output_hidden_states=output_hidden_states,
113
+ return_dict=return_dict, )
114
+ hidden_states = encoder_outputs[0]
115
+ if not return_dict:
116
+ return (hidden_states,) + encoder_outputs[1:]
117
+
118
+ return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_outputs.hidden_states,
119
+ attentions=encoder_outputs.attentions, )
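`_compute_mask_indices` in `wav2vec2.py` above builds SpecAugment-style boolean masks: spans of `mask_length` consecutive positions are selected per example so that their hidden states can be replaced with the learned mask embedding during training. A quick inspection sketch, assuming the function above is in scope:

```python
import numpy as np

# (batch, seq_len) boolean array; overlapping spans are merged via np.unique.
mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.065, mask_length=10, min_masks=2)
print(mask.shape)        # (2, 100)
print(mask.sum(axis=1))  # at most min_masks * mask_length = 20 positions per example here
```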
src/models/XPose/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py.py
src/models/XPose/config_model/UniPose_SwinT.py ADDED
@@ -0,0 +1,125 @@
1
+ _base_ = ['coco_transformer.py']
2
+
3
+ use_label_enc = True
4
+
5
+ num_classes=2
6
+
7
+ lr = 0.0001
8
+ param_dict_type = 'default'
9
+ lr_backbone = 1e-05
10
+ lr_backbone_names = ['backbone.0']
11
+ lr_linear_proj_names = ['reference_points', 'sampling_offsets']
12
+ lr_linear_proj_mult = 0.1
13
+ ddetr_lr_param = False
14
+ batch_size = 2
15
+ weight_decay = 0.0001
16
+ epochs = 12
17
+ lr_drop = 11
18
+ save_checkpoint_interval = 100
19
+ clip_max_norm = 0.1
20
+ onecyclelr = False
21
+ multi_step_lr = False
22
+ lr_drop_list = [33, 45]
23
+
24
+
25
+ modelname = 'UniPose'
26
+ frozen_weights = None
27
+ backbone = 'swin_T_224_1k'
28
+
29
+
30
+ dilation = False
31
+ position_embedding = 'sine'
32
+ pe_temperatureH = 20
33
+ pe_temperatureW = 20
34
+ return_interm_indices = [1, 2, 3]
35
+ backbone_freeze_keywords = None
36
+ enc_layers = 6
37
+ dec_layers = 6
38
+ unic_layers = 0
39
+ pre_norm = False
40
+ dim_feedforward = 2048
41
+ hidden_dim = 256
42
+ dropout = 0.0
43
+ nheads = 8
44
+ num_queries = 900
45
+ query_dim = 4
46
+ num_patterns = 0
47
+ pdetr3_bbox_embed_diff_each_layer = False
48
+ pdetr3_refHW = -1
49
+ random_refpoints_xy = False
50
+ fix_refpoints_hw = -1
51
+ dabdetr_yolo_like_anchor_update = False
52
+ dabdetr_deformable_encoder = False
53
+ dabdetr_deformable_decoder = False
54
+ use_deformable_box_attn = False
55
+ box_attn_type = 'roi_align'
56
+ dec_layer_number = None
57
+ num_feature_levels = 4
58
+ enc_n_points = 4
59
+ dec_n_points = 4
60
+ decoder_layer_noise = False
61
+ dln_xy_noise = 0.2
62
+ dln_hw_noise = 0.2
63
+ add_channel_attention = False
64
+ add_pos_value = False
65
+ two_stage_type = 'standard'
66
+ two_stage_pat_embed = 0
67
+ two_stage_add_query_num = 0
68
+ two_stage_bbox_embed_share = False
69
+ two_stage_class_embed_share = False
70
+ two_stage_learn_wh = False
71
+ two_stage_default_hw = 0.05
72
+ two_stage_keep_all_tokens = False
73
+ num_select = 50
74
+ transformer_activation = 'relu'
75
+ batch_norm_type = 'FrozenBatchNorm2d'
76
+ masks = False
77
+
78
+ decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
79
+ matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
80
+ decoder_module_seq = ['sa', 'ca', 'ffn']
81
+ nms_iou_threshold = -1
82
+
83
+ dec_pred_bbox_embed_share = True
84
+ dec_pred_class_embed_share = True
85
+
86
+
87
+ use_dn = True
88
+ dn_number = 100
89
+ dn_box_noise_scale = 1.0
90
+ dn_label_noise_ratio = 0.5
91
+ dn_label_coef=1.0
92
+ dn_bbox_coef=1.0
93
+ embed_init_tgt = True
94
+ dn_labelbook_size = 2000
95
+
96
+ match_unstable_error = True
97
+
98
+ # for ema
99
+ use_ema = True
100
+ ema_decay = 0.9997
101
+ ema_epoch = 0
102
+
103
+ use_detached_boxes_dec_out = False
104
+
105
+ max_text_len = 256
106
+ shuffle_type = None
107
+
108
+ use_text_enhancer = True
109
+ use_fusion_layer = True
110
+
111
+ use_checkpoint = False # True
112
+ use_transformer_ckpt = True
113
+ text_encoder_type = 'bert-base-uncased'
114
+
115
+ use_text_cross_attention = True
116
+ text_dropout = 0.0
117
+ fusion_dropout = 0.0
118
+ fusion_droppath = 0.1
119
+
120
+ num_body_points=68
121
+ binary_query_selection = False
122
+ use_cdn = True
123
+ ffn_extra_layernorm = False
124
+
125
+ fix_size=False
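`UniPose_SwinT.py` is a flat Python config whose `_base_` list points at sibling files such as `coco_transformer.py`, in the mmdetection/mmengine style. The repo presumably ships its own config loader; the following is only a hypothetical minimal sketch of how such `_base_`-style files compose into a single dict:

```python
import runpy
from pathlib import Path

def load_config(path) -> dict:
    path = Path(path)
    # Execute the config module and keep its plain top-level names (plus _base_).
    cfg = {k: v for k, v in runpy.run_path(str(path)).items()
           if not k.startswith('_') or k == '_base_'}
    merged = {}
    for base in cfg.pop('_base_', []):
        merged.update(load_config(path.parent / base))  # bases first, then overrides
    merged.update(cfg)
    return merged

cfg = load_config('src/models/XPose/config_model/UniPose_SwinT.py')
print(cfg['backbone'], cfg['num_body_points'], cfg['data_aug_max_size'])
# swin_T_224_1k 68 1333  (the last value inherited from coco_transformer.py)
```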
src/models/XPose/config_model/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py.py
src/models/XPose/config_model/coco_transformer.py ADDED
@@ -0,0 +1,8 @@
1
+ data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
2
+ data_aug_max_size = 1333
3
+ data_aug_scales2_resize = [400, 500, 600]
4
+ data_aug_scales2_crop = [384, 600]
5
+
6
+
7
+ data_aug_scale_overlap = None
8
+
src/models/XPose/models/UniPose/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # ------------------------------------------------------------------------
2
+ # Conditional DETR
3
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copied from DETR (https://github.com/facebookresearch/detr)
7
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ from .unipose import build_unipose
src/models/XPose/models/UniPose/attention.py ADDED
@@ -0,0 +1,373 @@
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Conditional DETR
12
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from codes in torch.nn
16
+ # ------------------------------------------------------------------------
17
+
18
+ """
19
+ MultiheadAttention that support query, key, and value to have different dimensions.
20
+ Query, key, and value projections are removed.
21
+
22
+ Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
23
+ and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
24
+ """
25
+
26
+ import warnings
27
+ import torch
28
+ from torch.nn.modules.linear import Linear
29
+ from torch.nn.init import constant_
30
+ from torch.nn.modules.module import Module
31
+ from torch._jit_internal import Optional, Tuple
32
+ try:
33
+ from torch.overrides import has_torch_function, handle_torch_function
34
+ except:
35
+ from torch._overrides import has_torch_function, handle_torch_function
36
+ from torch.nn.functional import linear, pad, softmax, dropout
37
+ Tensor = torch.Tensor
38
+
39
+ class MultiheadAttention(Module):
40
+ r"""Allows the model to jointly attend to information
41
+ from different representation subspaces.
42
+ See reference: Attention Is All You Need
43
+ .. math::
44
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
45
+ \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
46
+ Args:
47
+ embed_dim: total dimension of the model.
48
+ num_heads: parallel attention heads.
49
+ dropout: a Dropout layer on attn_output_weights. Default: 0.0.
50
+ bias: add bias as module parameter. Default: True.
51
+ add_bias_kv: add bias to the key and value sequences at dim=0.
52
+ add_zero_attn: add a new batch of zeros to the key and
53
+ value sequences at dim=1.
54
+ kdim: total number of features in key. Default: None.
55
+ vdim: total number of features in value. Default: None.
56
+ Note: if kdim and vdim are None, they will be set to embed_dim such that
57
+ query, key, and value have the same number of features.
58
+ Examples::
59
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
60
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
61
+ """
62
+ bias_k: Optional[torch.Tensor]
63
+ bias_v: Optional[torch.Tensor]
64
+
65
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
66
+ super(MultiheadAttention, self).__init__()
67
+ self.embed_dim = embed_dim
68
+ self.kdim = kdim if kdim is not None else embed_dim
69
+ self.vdim = vdim if vdim is not None else embed_dim
70
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
71
+
72
+ self.num_heads = num_heads
73
+ self.dropout = dropout
74
+ self.head_dim = embed_dim // num_heads
75
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
76
+
77
+ vdim = vdim if vdim is not None else embed_dim
78
+ self.out_proj = Linear(vdim , vdim)
79
+
80
+ self.in_proj_bias = None
81
+ self.in_proj_weight = None
82
+ self.bias_k = self.bias_v = None
83
+ self.q_proj_weight = None
84
+ self.k_proj_weight = None
85
+ self.v_proj_weight = None
86
+
87
+ self.add_zero_attn = add_zero_attn
88
+
89
+ self._reset_parameters()
90
+
91
+ def _reset_parameters(self):
92
+ constant_(self.out_proj.bias, 0.)
93
+
94
+ def __setstate__(self, state):
95
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
96
+ if '_qkv_same_embed_dim' not in state:
97
+ state['_qkv_same_embed_dim'] = True
98
+
99
+ super(MultiheadAttention, self).__setstate__(state)
100
+
101
+ def forward(self, query, key, value, key_padding_mask=None,
102
+ need_weights=True, attn_mask=None):
103
+ # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
104
+ r"""
105
+ Args:
106
+ query, key, value: map a query and a set of key-value pairs to an output.
107
+ See "Attention Is All You Need" for more details.
108
+ key_padding_mask: if provided, specified padding elements in the key will
109
+ be ignored by the attention. When given a binary mask and a value is True,
110
+ the corresponding value on the attention layer will be ignored. When given
111
+ a byte mask and a value is non-zero, the corresponding value on the attention
112
+ layer will be ignored
113
+ need_weights: output attn_output_weights.
114
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
115
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
116
+ Shape:
117
+ - Inputs:
118
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
119
+ the embedding dimension.
120
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
121
+ the embedding dimension.
122
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
123
+ the embedding dimension.
124
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
125
+ If a ByteTensor is provided, the non-zero positions will be ignored while the position
126
+ with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
127
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
128
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
129
+ 3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length,
130
+ S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
131
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
132
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
133
+ is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
134
+ is provided, it will be added to the attention weight.
135
+ - Outputs:
136
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
137
+ E is the embedding dimension.
138
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
139
+ L is the target sequence length, S is the source sequence length.
140
+ """
141
+ if not self._qkv_same_embed_dim:
142
+ return multi_head_attention_forward(
143
+ query, key, value, self.embed_dim, self.num_heads,
144
+ self.in_proj_weight, self.in_proj_bias,
145
+ self.bias_k, self.bias_v, self.add_zero_attn,
146
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
147
+ training=self.training,
148
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
149
+ attn_mask=attn_mask, use_separate_proj_weight=True,
150
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
151
+ v_proj_weight=self.v_proj_weight, out_dim=self.vdim)
152
+ else:
153
+ return multi_head_attention_forward(
154
+ query, key, value, self.embed_dim, self.num_heads,
155
+ self.in_proj_weight, self.in_proj_bias,
156
+ self.bias_k, self.bias_v, self.add_zero_attn,
157
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
158
+ training=self.training,
159
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
160
+ attn_mask=attn_mask, out_dim=self.vdim)
161
+
162
+
163
+ def multi_head_attention_forward(query: Tensor,
164
+ key: Tensor,
165
+ value: Tensor,
166
+ embed_dim_to_check: int,
167
+ num_heads: int,
168
+ in_proj_weight: Tensor,
169
+ in_proj_bias: Tensor,
170
+ bias_k: Optional[Tensor],
171
+ bias_v: Optional[Tensor],
172
+ add_zero_attn: bool,
173
+ dropout_p: float,
174
+ out_proj_weight: Tensor,
175
+ out_proj_bias: Tensor,
176
+ training: bool = True,
177
+ key_padding_mask: Optional[Tensor] = None,
178
+ need_weights: bool = True,
179
+ attn_mask: Optional[Tensor] = None,
180
+ use_separate_proj_weight: bool = False,
181
+ q_proj_weight: Optional[Tensor] = None,
182
+ k_proj_weight: Optional[Tensor] = None,
183
+ v_proj_weight: Optional[Tensor] = None,
184
+ static_k: Optional[Tensor] = None,
185
+ static_v: Optional[Tensor] = None,
186
+ out_dim: Optional[Tensor] = None
187
+ ) -> Tuple[Tensor, Optional[Tensor]]:
188
+ r"""
189
+ Args:
190
+ query, key, value: map a query and a set of key-value pairs to an output.
191
+ See "Attention Is All You Need" for more details.
192
+ embed_dim_to_check: total dimension of the model.
193
+ num_heads: parallel attention heads.
194
+ in_proj_weight, in_proj_bias: input projection weight and bias.
195
+ bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
196
+ add_zero_attn: add a new batch of zeros to the key and
197
+ value sequences at dim=1.
198
+ dropout_p: probability of an element to be zeroed.
199
+ out_proj_weight, out_proj_bias: the output projection weight and bias.
200
+ training: apply dropout if is ``True``.
201
+ key_padding_mask: if provided, specified padding elements in the key will
202
+ be ignored by the attention. This is a binary mask. When the value is True,
203
+ the corresponding value on the attention layer will be filled with -inf.
204
+ need_weights: output attn_output_weights.
205
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
206
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
207
+ use_separate_proj_weight: the function accept the proj. weights for query, key,
208
+ and value in different forms. If false, in_proj_weight will be used, which is
209
+ a combination of q_proj_weight, k_proj_weight, v_proj_weight.
210
+ q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
211
+ static_k, static_v: static key and value used for attention operators.
212
+ Shape:
213
+ Inputs:
214
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
215
+ the embedding dimension.
216
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
217
+ the embedding dimension.
218
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
219
+ the embedding dimension.
220
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
221
+ If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
222
+ will be unchanged. If a BoolTensor is provided, the positions with the
223
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
224
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
225
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
226
+ S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
227
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
228
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
229
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
230
+ is provided, it will be added to the attention weight.
231
+ - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
232
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
233
+ - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
234
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
235
+ Outputs:
236
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
237
+ E is the embedding dimension.
238
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
239
+ L is the target sequence length, S is the source sequence length.
240
+ """
241
+ if not torch.jit.is_scripting():
242
+ tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
243
+ out_proj_weight, out_proj_bias)
244
+ if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
245
+ return handle_torch_function(
246
+ multi_head_attention_forward, tens_ops, query, key, value,
247
+ embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
248
+ bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
249
+ out_proj_bias, training=training, key_padding_mask=key_padding_mask,
250
+ need_weights=need_weights, attn_mask=attn_mask,
251
+ use_separate_proj_weight=use_separate_proj_weight,
252
+ q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
253
+ v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
254
+ tgt_len, bsz, embed_dim = query.size()
255
+ assert embed_dim == embed_dim_to_check
256
+ # allow MHA to have different sizes for the feature dimension
257
+ assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
258
+
259
+ head_dim = embed_dim // num_heads
260
+ v_head_dim = out_dim // num_heads
261
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
262
+ scaling = float(head_dim) ** -0.5
263
+
264
+ q = query * scaling
265
+ k = key
266
+ v = value
267
+
268
+ if attn_mask is not None:
269
+ assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
270
+ attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
271
+ 'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
272
+ if attn_mask.dtype == torch.uint8:
273
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
274
+ attn_mask = attn_mask.to(torch.bool)
275
+
276
+ if attn_mask.dim() == 2:
277
+ attn_mask = attn_mask.unsqueeze(0)
278
+ if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
279
+ raise RuntimeError('The size of the 2D attn_mask is not correct.')
280
+ elif attn_mask.dim() == 3:
281
+ if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
282
+ raise RuntimeError('The size of the 3D attn_mask is not correct.')
283
+ else:
284
+ raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
285
+ # attn_mask's dim is 3 now.
286
+
287
+ # convert ByteTensor key_padding_mask to bool
288
+ if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
289
+ warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
290
+ key_padding_mask = key_padding_mask.to(torch.bool)
291
+
292
+ if bias_k is not None and bias_v is not None:
293
+ if static_k is None and static_v is None:
294
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
295
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
296
+ if attn_mask is not None:
297
+ attn_mask = pad(attn_mask, (0, 1))
298
+ if key_padding_mask is not None:
299
+ key_padding_mask = pad(key_padding_mask, (0, 1))
300
+ else:
301
+ assert static_k is None, "bias cannot be added to static key."
302
+ assert static_v is None, "bias cannot be added to static value."
303
+ else:
304
+ assert bias_k is None
305
+ assert bias_v is None
306
+
307
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
308
+ if k is not None:
309
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
310
+ if v is not None:
311
+ v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)
312
+
313
+ if static_k is not None:
314
+ assert static_k.size(0) == bsz * num_heads
315
+ assert static_k.size(2) == head_dim
316
+ k = static_k
317
+
318
+ if static_v is not None:
319
+ assert static_v.size(0) == bsz * num_heads
320
+ assert static_v.size(2) == v_head_dim
321
+ v = static_v
322
+
323
+ src_len = k.size(1)
324
+
325
+ if key_padding_mask is not None:
326
+ assert key_padding_mask.size(0) == bsz
327
+ assert key_padding_mask.size(1) == src_len
328
+
329
+ if add_zero_attn:
330
+ src_len += 1
331
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
332
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
333
+ if attn_mask is not None:
334
+ attn_mask = pad(attn_mask, (0, 1))
335
+ if key_padding_mask is not None:
336
+ key_padding_mask = pad(key_padding_mask, (0, 1))
337
+
338
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
339
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
340
+
341
+ if attn_mask is not None:
342
+ if attn_mask.dtype == torch.bool:
343
+ attn_output_weights.masked_fill_(attn_mask, float('-inf'))
344
+ else:
345
+ attn_output_weights += attn_mask
346
+
347
+
348
+ if key_padding_mask is not None:
349
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
350
+ attn_output_weights = attn_output_weights.masked_fill(
351
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
352
+ float('-inf'),
353
+ )
354
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
355
+
356
+ # attn_output_weights = softmax(
357
+ # attn_output_weights, dim=-1)
358
+ attn_output_weights = softmax(
359
+ attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
360
+ attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
361
+
362
+ attn_output = torch.bmm(attn_output_weights, v)
363
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
364
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
365
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
366
+
367
+ if need_weights:
368
+ # average attention weights over heads
369
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
370
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
371
+ else:
372
+ return attn_output, None
373
+
src/models/XPose/models/UniPose/backbone.py ADDED
@@ -0,0 +1,211 @@
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # Conditional DETR
8
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Copied from DETR (https://github.com/facebookresearch/detr)
12
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
13
+ # ------------------------------------------------------------------------
14
+
15
+ """
16
+ Backbone modules.
17
+ """
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torchvision
22
+ from torch import nn
23
+ from torchvision.models._utils import IntermediateLayerGetter
24
+ from typing import Dict, List
25
+
26
+ from ...util.misc import NestedTensor, is_main_process
27
+
28
+ from .position_encoding import build_position_encoding
29
+ from .swin_transformer import build_swin_transformer
30
+
31
+ class FrozenBatchNorm2d(torch.nn.Module):
32
+ """
33
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
34
+
35
+ Copy-paste from torchvision.misc.ops with added eps before rqsrt,
36
+ without which any other models than torchvision.models.resnet[18,34,50,101]
37
+ produce nans.
38
+ """
39
+
40
+ def __init__(self, n):
41
+ super(FrozenBatchNorm2d, self).__init__()
42
+ self.register_buffer("weight", torch.ones(n))
43
+ self.register_buffer("bias", torch.zeros(n))
44
+ self.register_buffer("running_mean", torch.zeros(n))
45
+ self.register_buffer("running_var", torch.ones(n))
46
+
47
+ def _load_from_state_dict(
48
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
49
+ ):
50
+ num_batches_tracked_key = prefix + "num_batches_tracked"
51
+ if num_batches_tracked_key in state_dict:
52
+ del state_dict[num_batches_tracked_key]
53
+
54
+ super(FrozenBatchNorm2d, self)._load_from_state_dict(
55
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
56
+ )
57
+
58
+ def forward(self, x):
59
+ # move reshapes to the beginning
60
+ # to make it fuser-friendly
61
+ w = self.weight.reshape(1, -1, 1, 1)
62
+ b = self.bias.reshape(1, -1, 1, 1)
63
+ rv = self.running_var.reshape(1, -1, 1, 1)
64
+ rm = self.running_mean.reshape(1, -1, 1, 1)
65
+ eps = 1e-5
66
+ scale = w * (rv + eps).rsqrt()
67
+ bias = b - rm * scale
68
+ return x * scale + bias
69
+
70
+
71
+ class BackboneBase(nn.Module):
72
+ def __init__(
73
+ self,
74
+ backbone: nn.Module,
75
+ train_backbone: bool,
76
+ num_channels: int,
77
+ return_interm_indices: list,
78
+ ):
79
+ super().__init__()
80
+ for name, parameter in backbone.named_parameters():
81
+ if (
82
+ not train_backbone
83
+ or "layer2" not in name
84
+ and "layer3" not in name
85
+ and "layer4" not in name
86
+ ):
87
+ parameter.requires_grad_(False)
88
+
89
+ return_layers = {}
90
+ for idx, layer_index in enumerate(return_interm_indices):
91
+ return_layers.update(
92
+ {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}
93
+ )
94
+
95
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
96
+ self.num_channels = num_channels
97
+
98
+ def forward(self, tensor_list: NestedTensor):
99
+ xs = self.body(tensor_list.tensors)
100
+ out: Dict[str, NestedTensor] = {}
101
+ for name, x in xs.items():
102
+ m = tensor_list.mask
103
+ assert m is not None
104
+ mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
105
+ out[name] = NestedTensor(x, mask)
106
+ # import ipdb; ipdb.set_trace()
107
+ return out
108
+
109
+
110
+ class Backbone(BackboneBase):
111
+ """ResNet backbone with frozen BatchNorm."""
112
+
113
+ def __init__(
114
+ self,
115
+ name: str,
116
+ train_backbone: bool,
117
+ dilation: bool,
118
+ return_interm_indices: list,
119
+ batch_norm=FrozenBatchNorm2d,
120
+ ):
121
+ if name in ["resnet18", "resnet34", "resnet50", "resnet101"]:
122
+ backbone = getattr(torchvision.models, name)(
123
+ replace_stride_with_dilation=[False, False, dilation],
124
+ pretrained=is_main_process(),
125
+ norm_layer=batch_norm,
126
+ )
127
+ else:
128
+ raise NotImplementedError("Unsupported backbone name: {}".format(name))
129
+ # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
130
+ assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
131
+ assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
132
+ num_channels_all = [256, 512, 1024, 2048]
133
+ num_channels = num_channels_all[4 - len(return_interm_indices) :]
134
+ super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
135
+
136
+
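+ # Example of the channel slicing above: with return_interm_indices=[1, 2, 3] the slice
+ # num_channels_all[4 - 3:] yields [512, 1024, 2048], i.e. the layer2-layer4 outputs of a ResNet-50/101.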
137
+ class Joiner(nn.Sequential):
138
+ def __init__(self, backbone, position_embedding):
139
+ super().__init__(backbone, position_embedding)
140
+
141
+ def forward(self, tensor_list: NestedTensor):
142
+ xs = self[0](tensor_list)
143
+ out: List[NestedTensor] = []
144
+ pos = []
145
+ for name, x in xs.items():
146
+ out.append(x)
147
+ # position encoding
148
+ pos.append(self[1](x).to(x.tensors.dtype))
149
+
150
+ return out, pos
151
+
152
+
153
+ def build_backbone(args):
154
+ """
155
+ Useful args:
156
+ - backbone: backbone name
157
+ - lr_backbone:
158
+ - dilation
159
+ - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
160
+ - backbone_freeze_keywords:
161
+ - use_checkpoint: for swin only for now
162
+
163
+ """
164
+ position_embedding = build_position_encoding(args)
165
+ train_backbone = True
166
+ if not train_backbone:
167
+ raise ValueError("Please set lr_backbone > 0")
168
+ return_interm_indices = args.return_interm_indices
169
+ assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
170
+ args.backbone_freeze_keywords
171
+ use_checkpoint = getattr(args, "use_checkpoint", False)
172
+
173
+ if args.backbone in ["resnet50", "resnet101"]:
174
+ backbone = Backbone(
175
+ args.backbone,
176
+ train_backbone,
177
+ args.dilation,
178
+ return_interm_indices,
179
+ batch_norm=FrozenBatchNorm2d,
180
+ )
181
+ bb_num_channels = backbone.num_channels
182
+ elif args.backbone in [
183
+ "swin_T_224_1k",
184
+ "swin_B_224_22k",
185
+ "swin_B_384_22k",
186
+ "swin_L_224_22k",
187
+ "swin_L_384_22k",
188
+ ]:
189
+ pretrain_img_size = int(args.backbone.split("_")[-2])
190
+ backbone = build_swin_transformer(
191
+ args.backbone,
192
+ pretrain_img_size=pretrain_img_size,
193
+ out_indices=tuple(return_interm_indices),
194
+ dilation=False,
195
+ use_checkpoint=use_checkpoint,
196
+ )
197
+
198
+ bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
199
+ else:
200
+ raise NotImplementedError("Unknown backbone {}".format(args.backbone))
201
+
202
+ assert len(bb_num_channels) == len(
203
+ return_interm_indices
204
+ ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
205
+
206
+ model = Joiner(backbone, position_embedding)
207
+ model.num_channels = bb_num_channels
208
+ assert isinstance(
209
+ bb_num_channels, List
210
+ ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
211
+ return model
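+
+ # Minimal usage sketch (hypothetical argparse-style namespace; only the fields read in this
+ # file are listed, plus whatever build_position_encoding expects from `args`):
+ # args = SimpleNamespace(backbone="resnet50", dilation=False, return_interm_indices=[1, 2, 3],
+ #                        backbone_freeze_keywords=None, use_checkpoint=False)
+ # model = build_backbone(args)  # Joiner(backbone, pos_embedding); model.num_channels == [512, 1024, 2048]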
src/models/XPose/models/UniPose/deformable_transformer.py ADDED
@@ -0,0 +1,1230 @@
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # DINO
12
+ # Copyright (c) 2022 IDEA. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from DETR (https://github.com/facebookresearch/detr)
16
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
17
+ # ------------------------------------------------------------------------
18
+
19
+ import math
20
+ import copy
21
+ import torch
22
+ import torch.utils.checkpoint as checkpoint
23
+ from torch import nn, Tensor
24
+ from typing import Optional
25
+ from ...util.misc import inverse_sigmoid
26
+
27
+ from .transformer_vanilla import TransformerEncoderLayer
28
+ from .fuse_modules import BiAttentionBlock
29
+ from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
30
+ from .ops.modules import MSDeformAttn
31
+
32
+
33
+ class DeformableTransformer(nn.Module):
34
+
35
+ def __init__(self, d_model=256, nhead=8,
36
+ num_queries=300,
37
+ num_encoder_layers=6,
38
+ num_unicoder_layers=0,
39
+ num_decoder_layers=6,
40
+ dim_feedforward=2048, dropout=0.0,
41
+ activation="relu", normalize_before=False,
42
+ return_intermediate_dec=False, query_dim=4,
43
+ num_patterns=0,
44
+ modulate_hw_attn=False,
45
+ # for deformable encoder
46
+ deformable_encoder=False,
47
+ deformable_decoder=False,
48
+ num_feature_levels=1,
49
+ enc_n_points=4,
50
+ dec_n_points=4,
51
+ use_deformable_box_attn=False,
52
+ box_attn_type='roi_align',
53
+ # init query
54
+ learnable_tgt_init=False,
55
+ decoder_query_perturber=None,
56
+ add_channel_attention=False,
57
+ add_pos_value=False,
58
+ random_refpoints_xy=False,
59
+ # two stage
60
+ two_stage_type='no',
61
+ two_stage_pat_embed=0,
62
+ two_stage_add_query_num=0,
63
+ two_stage_learn_wh=False,
64
+ two_stage_keep_all_tokens=False,
65
+ # evo of #anchors
66
+ dec_layer_number=None,
67
+ rm_enc_query_scale=True,
68
+ rm_dec_query_scale=True,
69
+ rm_self_attn_layers=None,
70
+ key_aware_type=None,
71
+ # layer share
72
+ layer_share_type=None,
73
+ # for detach
74
+ rm_detach=None,
75
+ decoder_sa_type='ca',
76
+ module_seq=['sa', 'ca', 'ffn'],
77
+ # for dn
78
+ embed_init_tgt=False,
79
+
80
+ use_detached_boxes_dec_out=False,
81
+ use_text_enhancer=False,
82
+ use_fusion_layer=False,
83
+ use_checkpoint=False,
84
+ use_transformer_ckpt=False,
85
+ use_text_cross_attention=False,
86
+ text_dropout=0.1,
87
+ fusion_dropout=0.1,
88
+ fusion_droppath=0.0,
89
+
90
+ binary_query_selection=False,
91
+ ffn_extra_layernorm=False,
92
+ ):
93
+ super().__init__()
94
+ self.num_feature_levels = num_feature_levels
95
+ self.num_encoder_layers = num_encoder_layers
96
+ self.num_unicoder_layers = num_unicoder_layers
97
+ self.num_decoder_layers = num_decoder_layers
98
+ self.deformable_encoder = deformable_encoder
99
+ self.deformable_decoder = deformable_decoder
100
+ self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
101
+ self.num_queries = num_queries
102
+ self.random_refpoints_xy = random_refpoints_xy
103
+ self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
104
+ self.ffn_extra_layernorm = ffn_extra_layernorm
105
+ assert query_dim == 4
106
+
107
+ self.binary_query_selection = binary_query_selection
108
+ if self.binary_query_selection:
109
+ self.binary_query_selection_layer = nn.Linear(d_model, 1)
110
+ # assert not binary_query_selection, 'binary_query_selection not implemented yet'
111
+
112
+ if num_feature_levels > 1:
113
+ assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
114
+ if use_deformable_box_attn:
115
+ assert deformable_encoder or deformable_decoder
116
+
117
+ assert layer_share_type in [None, 'encoder', 'decoder', 'both']
118
+ if layer_share_type in ['encoder', 'both']:
119
+ enc_layer_share = True
120
+ else:
121
+ enc_layer_share = False
122
+ if layer_share_type in ['decoder', 'both']:
123
+ dec_layer_share = True
124
+ else:
125
+ dec_layer_share = False
126
+ assert layer_share_type is None
127
+
128
+ self.decoder_sa_type = decoder_sa_type
129
+ assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
130
+
131
+ # choose encoder layer type
132
+ if deformable_encoder:
133
+ encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
134
+ dropout, activation,
135
+ num_feature_levels, nhead, enc_n_points,
136
+ add_channel_attention=add_channel_attention,
137
+ use_deformable_box_attn=use_deformable_box_attn,
138
+ box_attn_type=box_attn_type)
139
+ else:
140
+ raise NotImplementedError
141
+
142
+ if use_text_enhancer:
143
+ text_enhance_layer = TransformerEncoderLayer(
144
+ d_model=d_model,
145
+ nhead=nhead // 2,
146
+ dim_feedforward=dim_feedforward // 2,
147
+ dropout=text_dropout
148
+ )
149
+ else:
150
+ text_enhance_layer = None
151
+
152
+ if use_fusion_layer:
153
+ feature_fusion_layer = BiAttentionBlock(
154
+ v_dim=d_model,
155
+ l_dim=d_model,
156
+ embed_dim=dim_feedforward // 2,
157
+ num_heads=nhead // 2,
158
+ dropout=fusion_dropout,
159
+ drop_path=fusion_droppath
160
+ )
161
+ else:
162
+ feature_fusion_layer = None
163
+
164
+ encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
165
+ assert encoder_norm is None
166
+ self.encoder = TransformerEncoder(
167
+ encoder_layer, num_encoder_layers, d_model=d_model,
168
+ num_queries=num_queries,
169
+ enc_layer_share=enc_layer_share,
170
+ text_enhance_layer=text_enhance_layer,
171
+ feature_fusion_layer=feature_fusion_layer,
172
+ use_checkpoint=use_checkpoint,
173
+ use_transformer_ckpt=use_transformer_ckpt,
174
+ )
175
+
176
+ # choose decoder layer type
177
+ if deformable_decoder:
178
+ decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
179
+ dropout, activation,
180
+ num_feature_levels, nhead, dec_n_points,
181
+ use_text_cross_attention=use_text_cross_attention,
182
+ ffn_extra_layernorm=ffn_extra_layernorm, )
183
+
184
+ else:
185
+ raise NotImplementedError
186
+
187
+ decoder_norm = nn.LayerNorm(d_model)
188
+ self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
189
+ return_intermediate=return_intermediate_dec,
190
+ d_model=d_model, query_dim=query_dim,
191
+ modulate_hw_attn=modulate_hw_attn,
192
+ num_feature_levels=num_feature_levels,
193
+ deformable_decoder=deformable_decoder,
194
+ decoder_query_perturber=decoder_query_perturber,
195
+ dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
196
+ dec_layer_share=dec_layer_share,
197
+ use_detached_boxes_dec_out=use_detached_boxes_dec_out
198
+ )
199
+
200
+ self.d_model = d_model
201
+ self.nhead = nhead
202
+ self.dec_layers = num_decoder_layers
203
+ self.num_queries = num_queries # useful for single stage model only
204
+ self.num_patterns = num_patterns
205
+ if not isinstance(num_patterns, int):
206
+ Warning("num_patterns should be int but {}".format(type(num_patterns)))
207
+ self.num_patterns = 0
208
+
209
+ if num_feature_levels > 1:
210
+ if self.num_encoder_layers > 0:
211
+ self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
212
+ else:
213
+ self.level_embed = None
214
+
215
+ self.learnable_tgt_init = learnable_tgt_init
216
+ assert learnable_tgt_init, "learnable_tgt_init must be True"
217
+ self.embed_init_tgt = embed_init_tgt
218
+ if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
219
+ self.tgt_embed = nn.Embedding(self.num_queries, d_model)
220
+ nn.init.normal_(self.tgt_embed.weight.data)
221
+ else:
222
+ self.tgt_embed = None
223
+
224
+ # for two stage
225
+ self.two_stage_type = two_stage_type
226
+ self.two_stage_pat_embed = two_stage_pat_embed
227
+ self.two_stage_add_query_num = two_stage_add_query_num
228
+ self.two_stage_learn_wh = two_stage_learn_wh
229
+ assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
230
+ if two_stage_type == 'standard':
231
+ # anchor selection at the output of encoder
232
+ self.enc_output = nn.Linear(d_model, d_model)
233
+ self.enc_output_norm = nn.LayerNorm(d_model)
234
+
235
+ if two_stage_pat_embed > 0:
236
+ self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
237
+ nn.init.normal_(self.pat_embed_for_2stage)
238
+
239
+ if two_stage_add_query_num > 0:
240
+ self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)
241
+
242
+ if two_stage_learn_wh:
243
+ # import ipdb; ipdb.set_trace()
244
+ self.two_stage_wh_embedding = nn.Embedding(1, 2)
245
+ else:
246
+ self.two_stage_wh_embedding = None
247
+
248
+ if two_stage_type == 'no':
249
+ self.init_ref_points(num_queries) # init self.refpoint_embed
250
+
251
+ self.enc_out_class_embed = None
252
+ self.enc_out_bbox_embed = None
253
+
254
+ # evolution of anchors
255
+ self.dec_layer_number = dec_layer_number
256
+ if dec_layer_number is not None:
257
+ if self.two_stage_type != 'no' or num_patterns == 0:
258
+ assert dec_layer_number[
259
+ 0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
260
+ else:
261
+ assert dec_layer_number[
262
+ 0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"
263
+
264
+ self._reset_parameters()
265
+
266
+ self.rm_self_attn_layers = rm_self_attn_layers
267
+ if rm_self_attn_layers is not None:
268
+ # assert len(rm_self_attn_layers) == num_decoder_layers
269
+ print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
270
+ for lid, dec_layer in enumerate(self.decoder.layers):
271
+ if lid in rm_self_attn_layers:
272
+ dec_layer.rm_self_attn_modules()
273
+
274
+ self.rm_detach = rm_detach
275
+ if self.rm_detach:
276
+ assert isinstance(rm_detach, list)
277
+ assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])
278
+ self.decoder.rm_detach = rm_detach
279
+
280
+ def _reset_parameters(self):
281
+ for p in self.parameters():
282
+ if p.dim() > 1:
283
+ nn.init.xavier_uniform_(p)
284
+ for m in self.modules():
285
+ if isinstance(m, MSDeformAttn):
286
+ m._reset_parameters()
287
+ if self.num_feature_levels > 1 and self.level_embed is not None:
288
+ nn.init.normal_(self.level_embed)
289
+
290
+ if self.two_stage_learn_wh:
291
+ nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))
292
+
293
+ def get_valid_ratio(self, mask):
294
+ _, H, W = mask.shape
295
+ valid_H = torch.sum(~mask[:, :, 0], 1)
296
+ valid_W = torch.sum(~mask[:, 0, :], 1)
297
+ valid_ratio_h = valid_H.float() / H
298
+ valid_ratio_w = valid_W.float() / W
299
+ valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
300
+ return valid_ratio
301
+
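+ # Worked example for get_valid_ratio: for a 4x6 mask whose bottom row and right two columns
+ # are padding, valid_H = 3 and valid_W = 4, so it returns [4/6, 3/4] per sample, ordered
+ # (width ratio, height ratio) to match the (x, y) layout of the reference points.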
302
+ def init_ref_points(self, use_num_queries):
303
+ self.refpoint_embed = nn.Embedding(use_num_queries, 4)
304
+
305
+ if self.random_refpoints_xy:
306
+ # import ipdb; ipdb.set_trace()
307
+ self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
308
+ self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
309
+ self.refpoint_embed.weight.data[:, :2].requires_grad = False
310
+
311
+ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
312
+ dn_meta=None,targets=None,kpt_embed=None):
313
+ """
314
+ Input:
315
+ - srcs: List of multi features [bs, ci, hi, wi]
316
+ - masks: List of multi masks [bs, hi, wi]
317
+ - refpoint_embed: [bs, num_dn, 4]. None in infer
318
+ - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
319
+ - tgt: [bs, num_dn, d_model]. None in infer
320
+
321
+ """
322
+ # if self.two_stage_type != 'no' and self.two_stage_add_query_num == 0:
323
+ # assert refpoint_embed is None
324
+
325
+ # prepare input for encoder
326
+ src_flatten = []
327
+ mask_flatten = []
328
+ lvl_pos_embed_flatten = []
329
+ spatial_shapes = []
330
+ for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
331
+ bs, c, h, w = src.shape
332
+ spatial_shape = (h, w)
333
+ spatial_shapes.append(spatial_shape)
334
+
335
+ src = src.flatten(2).transpose(1, 2) # bs, hw, c
336
+ mask = mask.flatten(1) # bs, hw
337
+ pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c
338
+ if self.num_feature_levels > 1 and self.level_embed is not None:
339
+ lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
340
+ else:
341
+ lvl_pos_embed = pos_embed
342
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
343
+ src_flatten.append(src)
344
+ mask_flatten.append(mask)
345
+ src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c
346
+ mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw}
347
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c
348
+ spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
349
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
350
+ valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
351
+
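+ # Shape note for the flattening above (illustrative): with two levels of 32x32 and 16x16,
+ # src_flatten is (bs, 1024 + 256, c) and level_start_index == [0, 1024], the offset at
+ # which each level begins along the flattened spatial axis.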
352
+ # two stage
353
+ enc_topk_proposals = enc_refpoint_embed = None
354
+
355
+ #########################################################
356
+ # Begin Encoder
357
+ #########################################################
358
+ memory, memory_text = self.encoder(
359
+ src_flatten,
360
+ pos=lvl_pos_embed_flatten,
361
+ level_start_index=level_start_index,
362
+ spatial_shapes=spatial_shapes,
363
+ valid_ratios=valid_ratios,
364
+ key_padding_mask=mask_flatten,
365
+ memory_text=text_dict['encoded_text'],
366
+ text_attention_mask=~text_dict['text_token_mask'],
367
+ # we ~ the mask . False means use the token; True means pad the token
368
+ position_ids=text_dict['position_ids'],
369
+ text_self_attention_masks=text_dict['text_self_attention_masks'],
370
+ )
371
+ #########################################################
372
+ # End Encoder
373
+ # - memory: bs, \sum{hw}, c
374
+ # - mask_flatten: bs, \sum{hw}
375
+ # - lvl_pos_embed_flatten: bs, \sum{hw}, c
376
+ # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
377
+ # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
378
+ #########################################################
379
+ text_dict['encoded_text'] = memory_text
380
+
381
+ if self.two_stage_type == 'standard':
382
+ if self.two_stage_learn_wh:
383
+ input_hw = self.two_stage_wh_embedding.weight[0]
384
+ else:
385
+ input_hw = None
386
+ output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
387
+ input_hw)
388
+ output_memory = self.enc_output_norm(self.enc_output(output_memory))
389
+
390
+ if self.two_stage_pat_embed > 0:
391
+ bs, nhw, _ = output_memory.shape
392
+ # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
393
+ output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
394
+ _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
395
+ output_memory = output_memory + _pats
396
+ output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)
397
+
398
+ if self.two_stage_add_query_num > 0:
399
+ assert refpoint_embed is not None
400
+ output_memory = torch.cat((output_memory, tgt), dim=1)
401
+ output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)
402
+
403
+ if self.binary_query_selection:
404
+ topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
405
+ else:
406
+ if text_dict is not None:
407
+ enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
408
+ else:
409
+ enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
410
+
411
+ topk_logits = enc_outputs_class_unselected.max(-1)[0]
412
+ enc_outputs_coord_unselected = self.enc_out_bbox_embed(
413
+ output_memory) + output_proposals # (bs, \sum{hw}, 4) unsigmoid
414
+ topk = self.num_queries
415
+
416
+ topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] # bs, nq
417
+
418
+ # gather boxes
419
+ refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
420
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # unsigmoid
421
+ refpoint_embed_ = refpoint_embed_undetach.detach()
422
+ init_box_proposal = torch.gather(output_proposals, 1,
423
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid() # sigmoid
424
+
425
+ # gather tgt
426
+ tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
427
+ if self.embed_init_tgt:
428
+ tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, d_model
429
+ else:
430
+ tgt_ = tgt_undetach.detach()
431
+
432
+ if refpoint_embed is not None:
433
+ refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
434
+ tgt = torch.cat([tgt, tgt_], dim=1)
435
+ else:
436
+ refpoint_embed, tgt = refpoint_embed_, tgt_
437
+
438
+ elif self.two_stage_type == 'no':
439
+ tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, d_model
440
+ refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, 4
441
+
442
+ if refpoint_embed is not None:
443
+ refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
444
+ tgt = torch.cat([tgt, tgt_], dim=1)
445
+ else:
446
+ refpoint_embed, tgt = refpoint_embed_, tgt_
447
+
448
+ if self.num_patterns > 0:
449
+ tgt_embed = tgt.repeat(1, self.num_patterns, 1)
450
+ refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
451
+ tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
452
+ 1) # 1, n_q*n_pat, d_model
453
+ tgt = tgt_embed + tgt_pat
454
+
455
+ init_box_proposal = refpoint_embed_.sigmoid()
456
+
457
+ else:
458
+ raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
459
+ #########################################################
460
+ # End preparing tgt
461
+ # - tgt: bs, NQ, d_model
462
+ # - refpoint_embed(unsigmoid): bs, NQ, d_model
463
+ #########################################################
464
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
465
+ # if refpoint_embed.isnan().any() | refpoint_embed.isinf().any():
466
+ # import ipdb; ipdb.set_trace()
467
+ # if tgt.isnan().any() | tgt.isinf().any():
468
+ # import ipdb; ipdb.set_trace()
469
+
470
+ #########################################################
471
+ # Begin Decoder
472
+ #########################################################
473
+ hs, references = self.decoder(
474
+ tgt=tgt.transpose(0, 1),
475
+ memory=memory.transpose(0, 1),
476
+ memory_key_padding_mask=mask_flatten,
477
+ pos=lvl_pos_embed_flatten.transpose(0, 1),
478
+ refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
479
+ level_start_index=level_start_index,
480
+ spatial_shapes=spatial_shapes,
481
+ valid_ratios=valid_ratios, tgt_mask=attn_mask,
482
+ tgt_mask2=attn_mask2,
483
+ memory_text=text_dict['encoded_text'],
484
+ text_attention_mask=~text_dict['text_token_mask'],
485
+ text_dict=text_dict,
486
+ dn_meta=dn_meta,
487
+ targets=targets,
488
+ kpt_embed=kpt_embed
489
+ # we ~ the mask . False means use the token; True means pad the token
490
+ )
491
+ #########################################################
492
+ # End Decoder
493
+ # hs: n_dec, bs, nq, d_model
494
+ # references: n_dec+1, bs, nq, query_dim
495
+ #########################################################
496
+
497
+ #########################################################
498
+ # Begin postprocess
499
+ #########################################################
500
+ if self.two_stage_type == 'standard':
501
+ if self.two_stage_keep_all_tokens:
502
+ hs_enc = output_memory.unsqueeze(0)
503
+ ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
504
+ init_box_proposal = output_proposals
505
+ # import ipdb; ipdb.set_trace()
506
+ else:
507
+ hs_enc = tgt_undetach.unsqueeze(0)
508
+ ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
509
+ else:
510
+ hs_enc = ref_enc = None
511
+ #########################################################
512
+ # End postprocess
513
+ # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
514
+ # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
515
+ #########################################################
516
+
517
+ return hs, references, hs_enc, ref_enc, init_box_proposal
518
+ # hs: (n_dec, bs, nq, d_model)
519
+ # references: sigmoid coordinates. (n_dec+1, bs, nq, 4)
520
+ # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
521
+ # ref_enc: sigmoid coordinates. \
522
+ # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
523
+
524
+
525
+ class TransformerEncoder(nn.Module):
526
+
527
+ def __init__(self,
528
+ encoder_layer, num_layers, d_model=256,
529
+ num_queries=300,
530
+ enc_layer_share=False,
531
+ text_enhance_layer=None,
532
+ feature_fusion_layer=None,
533
+ use_checkpoint=False,
534
+ use_transformer_ckpt=False,
535
+ ):
536
+ """Deformable encoder stack with optional per-layer text enhancement and vision-language fusion.
537
+
538
+ Args:
539
+ encoder_layer (nn.Module): deformable encoder layer that is cloned num_layers times.
540
+ num_layers (int): number of encoder layers.
541
+ text_enhance_layer (nn.Module, optional): text self-attention layer applied at each depth. Defaults to None.
542
+ d_model (int, optional): feature dimension. Defaults to 256.
543
+ num_queries (int, optional): number of queries. Defaults to 300.
544
+ enc_layer_share (bool, optional): share a single layer instance across all depths. Defaults to False.
545
+
546
+ """
547
+ super().__init__()
548
+ # prepare layers
549
+ self.layers = []
550
+ self.text_layers = []
551
+ self.fusion_layers = []
552
+ if num_layers > 0:
553
+ self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)
554
+
555
+ if text_enhance_layer is not None:
556
+ self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)
557
+ if feature_fusion_layer is not None:
558
+ self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)
559
+ else:
560
+ self.layers = []
561
+ del encoder_layer
562
+
563
+ if text_enhance_layer is not None:
564
+ self.text_layers = []
565
+ del text_enhance_layer
566
+ if feature_fusion_layer is not None:
567
+ self.fusion_layers = []
568
+ del feature_fusion_layer
569
+
570
+ self.query_scale = None
571
+ self.num_queries = num_queries
572
+ self.num_layers = num_layers
573
+ self.d_model = d_model
574
+
575
+ self.use_checkpoint = use_checkpoint
576
+ self.use_transformer_ckpt = use_transformer_ckpt
577
+
578
+ @staticmethod
579
+ def get_reference_points(spatial_shapes, valid_ratios, device):
580
+ reference_points_list = []
581
+ for lvl, (H_, W_) in enumerate(spatial_shapes):
582
+ ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
583
+ torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)
584
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
585
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
586
+ ref = torch.stack((ref_x, ref_y), -1)
587
+ reference_points_list.append(ref)
588
+ reference_points = torch.cat(reference_points_list, 1)
589
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
590
+ return reference_points
591
+
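+ # The reference points above are the normalized centers of every location of every level,
+ # scaled by valid_ratios so each point is expressed relative to the un-padded region of each
+ # level; the result has shape [bs, sum(hi*wi), num_levels, 2].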
592
+ def forward(self,
593
+ # for images
594
+ src: Tensor,
595
+ pos: Tensor,
596
+ spatial_shapes: Tensor,
597
+ level_start_index: Tensor,
598
+ valid_ratios: Tensor,
599
+ key_padding_mask: Tensor,
600
+ # for texts
601
+ memory_text: Tensor = None,
602
+ text_attention_mask: Tensor = None,
603
+ pos_text: Tensor = None,
604
+ text_self_attention_masks: Tensor = None,
605
+ position_ids: Tensor = None,
606
+ ):
607
+ """
608
+ Input:
609
+ - src: [bs, sum(hi*wi), 256]
610
+ - pos: pos embed for src. [bs, sum(hi*wi), 256]
611
+ - spatial_shapes: h,w of each level [num_level, 2]
612
+ - level_start_index: [num_level] start point of level in sum(hi*wi).
613
+ - valid_ratios: [bs, num_level, 2]
614
+ - key_padding_mask: [bs, sum(hi*wi)]
615
+
616
+ - memory_text: bs, n_text, 256
617
+ - text_attention_mask: bs, n_text
618
+ False for no padding; True for padding
619
+ - pos_text: bs, n_text, 256
620
+
621
+ - position_ids: bs, n_text
622
+ Intermedia:
623
+ - reference_points: [bs, sum(hi*wi), num_level, 2]
624
+ Outpus:
625
+ - output: [bs, sum(hi*wi), 256]
626
+ """
627
+
628
+ output = src
629
+
630
+ # preparation and reshape
631
+ if self.num_layers > 0:
632
+ reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
633
+
634
+ if self.text_layers:
635
+ # generate pos_text
636
+ bs, n_text, text_dim = memory_text.shape
637
+ if pos_text is None and position_ids is None:
638
+ pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs,
639
+ 1,
640
+ 1)
641
+ pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)
642
+ if position_ids is not None:
643
+ pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)
644
+
645
+ # main process
646
+ for layer_id, layer in enumerate(self.layers):
647
+ # if output.isnan().any() or memory_text.isnan().any():
648
+ # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
649
+ # import ipdb; ipdb.set_trace()
650
+ if self.fusion_layers:
651
+ if self.use_checkpoint:
652
+ output, memory_text = checkpoint.checkpoint(
653
+ self.fusion_layers[layer_id],
654
+ output,
655
+ memory_text,
656
+ key_padding_mask,
657
+ text_attention_mask
658
+ )
659
+ else:
660
+ output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
661
+ attention_mask_v=key_padding_mask,
662
+ attention_mask_l=text_attention_mask)
663
+
664
+ if self.text_layers:
665
+ memory_text = self.text_layers[layer_id](
666
+ src=memory_text.transpose(0, 1),
667
+ src_mask=~text_self_attention_masks, # note we use ~ for mask here
668
+ src_key_padding_mask=text_attention_mask,
669
+ pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
670
+ ).transpose(0, 1)
671
+
672
+ # main process
673
+ if self.use_transformer_ckpt:
674
+ output = checkpoint.checkpoint(
675
+ layer,
676
+ output,
677
+ pos,
678
+ reference_points,
679
+ spatial_shapes,
680
+ level_start_index,
681
+ key_padding_mask
682
+ )
683
+ else:
684
+ output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
685
+ level_start_index=level_start_index, key_padding_mask=key_padding_mask)
686
+
687
+ return output, memory_text
688
+
689
+
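+ # Design note: when use_checkpoint / use_transformer_ckpt is enabled, the fusion and
+ # deformable layers run under torch.utils.checkpoint, trading forward recomputation during
+ # backward for a large reduction in stored activations on these long flattened sequences.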
690
+ class TransformerDecoder(nn.Module):
691
+
692
+ def __init__(self, decoder_layer, num_layers, norm=None,
693
+ return_intermediate=False,
694
+ d_model=256, query_dim=4,
695
+ modulate_hw_attn=False,
696
+ num_feature_levels=1,
697
+ deformable_decoder=False,
698
+ decoder_query_perturber=None,
699
+ dec_layer_number=None, # number of queries each layer in decoder
700
+ rm_dec_query_scale=False,
701
+ dec_layer_share=False,
702
+ dec_layer_dropout_prob=None,
703
+ use_detached_boxes_dec_out=False,
704
+ num_box_decoder_layers=2,
705
+ num_body_points=68,
706
+ ):
707
+ super().__init__()
708
+ if num_layers > 0:
709
+ self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
710
+ else:
711
+ self.layers = []
712
+ self.num_layers = num_layers
713
+ self.norm = norm
714
+ self.return_intermediate = return_intermediate
715
+ assert return_intermediate, "support return_intermediate only"
716
+ self.query_dim = query_dim
717
+ assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
718
+ self.num_feature_levels = num_feature_levels
719
+ self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
720
+
721
+ self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
722
+ if not deformable_decoder:
723
+ self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
724
+ else:
725
+ self.query_pos_sine_scale = None
726
+
727
+ if rm_dec_query_scale:
728
+ self.query_scale = None
729
+ else:
730
+ raise NotImplementedError
731
+ self.query_scale = MLP(d_model, d_model, d_model, 2)
732
+ self.bbox_embed = None
733
+ self.class_embed = None
734
+ self.pose_embed = None
735
+ self.pose_hw_embed = None
736
+ self.d_model = d_model
737
+ self.modulate_hw_attn = modulate_hw_attn
738
+ self.deformable_decoder = deformable_decoder
739
+
740
+ if not deformable_decoder and modulate_hw_attn:
741
+ self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
742
+ else:
743
+ self.ref_anchor_head = None
744
+
745
+ self.decoder_query_perturber = decoder_query_perturber
746
+ self.box_pred_damping = None
747
+
748
+ self.dec_layer_number = dec_layer_number
749
+ if dec_layer_number is not None:
750
+ assert isinstance(dec_layer_number, list)
751
+ assert len(dec_layer_number) == num_layers
752
+ # assert dec_layer_number[0] ==
753
+
754
+ self.dec_layer_dropout_prob = dec_layer_dropout_prob
755
+ if dec_layer_dropout_prob is not None:
756
+ assert isinstance(dec_layer_dropout_prob, list)
757
+ assert len(dec_layer_dropout_prob) == num_layers
758
+ for i in dec_layer_dropout_prob:
759
+ assert 0.0 <= i <= 1.0
760
+
761
+ self.rm_detach = None
762
+ self.num_body_points = num_body_points
763
+
764
+ self.hw = nn.Embedding(17, 2)
765
+ self.num_box_decoder_layers = num_box_decoder_layers
766
+ self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
767
+ self.hw_append = nn.Embedding(self.num_body_points-17, 2)
768
+
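+ # Layout note: after the box-decoder layers, every group of (num_body_points + 1) queries is
+ # one box token followed by its keypoint tokens for each of the 50 kept instances; kpt_index
+ # keeps every index that is not a multiple of (num_body_points + 1), i.e. the keypoint tokens
+ # only. The 17 / (num_body_points - 17) split of the h-w embeddings presumably corresponds to
+ # the 17 COCO body keypoints plus UniPose's extra points.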
769
+ def forward(self, tgt, memory,
770
+ tgt_mask: Optional[Tensor] = None,
771
+ tgt_mask2: Optional[Tensor] = None,
772
+ memory_mask: Optional[Tensor] = None,
773
+ tgt_key_padding_mask: Optional[Tensor] = None,
774
+ memory_key_padding_mask: Optional[Tensor] = None,
775
+ pos: Optional[Tensor] = None,
776
+ refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2
777
+ # for memory
778
+ level_start_index: Optional[Tensor] = None, # num_levels
779
+ spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
780
+ valid_ratios: Optional[Tensor] = None,
781
+ # for text
782
+ memory_text: Optional[Tensor] = None,
783
+ text_attention_mask: Optional[Tensor] = None,
784
+ text_dict: Optional[Tensor] = None,
785
+ dn_meta: Optional[Tensor] = None,
786
+ targets: Optional[Tensor] = None,
787
+ kpt_embed: Optional[Tensor] = None
788
+ ):
789
+ """
790
+ Input:
791
+ - tgt: nq, bs, d_model
792
+ - memory: hw, bs, d_model
793
+ - pos: hw, bs, d_model
794
+ - refpoints_unsigmoid: nq, bs, 2/4
795
+ - valid_ratios/spatial_shapes: bs, nlevel, 2
796
+ """
797
+
798
+ output = tgt
799
+ output += self.hw.weight[0, 0] * 0.0
800
+
801
+
802
+ intermediate = []
803
+ reference_points = refpoints_unsigmoid.sigmoid()
804
+ ref_points = [reference_points]
805
+ effect_num_dn = dn_meta['pad_size'] if self.training else 0
806
+ inter_select_number = 50
807
+ for layer_id, layer in enumerate(self.layers):
808
+
809
+ if reference_points.shape[-1] == 4:
810
+ reference_points_input = reference_points[:, :, None] \
811
+ * torch.cat([valid_ratios, valid_ratios], -1)[None, :] # nq, bs, nlevel, 4
812
+ else:
813
+ assert reference_points.shape[-1] == 2
814
+ reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
815
+ query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2
816
+
817
+ # conditional query
818
+ raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256
819
+ pos_scale = self.query_scale(output) if self.query_scale is not None else 1
820
+ query_pos = pos_scale * raw_query_pos
821
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
822
+ # if query_pos.isnan().any() | query_pos.isinf().any():
823
+ # import ipdb; ipdb.set_trace()
824
+
825
+ # main process
826
+ output = layer(
827
+ tgt=output,
828
+ tgt_query_pos=query_pos,
829
+ tgt_query_sine_embed=query_sine_embed,
830
+ tgt_key_padding_mask=tgt_key_padding_mask,
831
+ tgt_reference_points=reference_points_input,
832
+
833
+ memory_text=memory_text,
834
+ text_attention_mask=text_attention_mask,
835
+
836
+ memory=memory,
837
+ memory_key_padding_mask=memory_key_padding_mask,
838
+ memory_level_start_index=level_start_index,
839
+ memory_spatial_shapes=spatial_shapes,
840
+ memory_pos=pos,
841
+
842
+ self_attn_mask=tgt_mask,
843
+ cross_attn_mask=memory_mask
844
+ )
845
+ if output.isnan().any() | output.isinf().any():
846
+ print(f"output of decoder layer {layer_id} contains nan/inf")
847
+ try:
848
+ num_nan = output.isnan().sum().item()
849
+ num_inf = output.isinf().sum().item()
850
+ print(f"num_nan {num_nan}, num_inf {num_inf}")
851
+ except Exception as e:
852
+ print(e)
853
+
854
+
855
+
856
+
857
+ intermediate.append(self.norm(output))
858
+ # iter update
859
+ if layer_id < self.num_box_decoder_layers:
860
+ reference_before_sigmoid = inverse_sigmoid(reference_points)
861
+ delta_unsig = self.bbox_embed[layer_id](output)
862
+ outputs_unsig = delta_unsig + reference_before_sigmoid
863
+ new_reference_points = outputs_unsig.sigmoid()
864
+
865
+ # select # ref points as anchors
866
+ if layer_id == self.num_box_decoder_layers - 1:
867
+ dn_output = output[:effect_num_dn]
868
+ dn_new_reference_points = new_reference_points[:effect_num_dn]
869
+ class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
870
+ effect_num_dn:].transpose(0, 1)
871
+ topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]
872
+ new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
873
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
874
+ new_output_for_box = torch.gather(output[effect_num_dn:], 0,
875
+ topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
876
+ keypoint_embed=kpt_embed.transpose(0, 1)
877
+
878
+ new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)
879
+ delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
880
+ keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
881
+ num_queries, _, bs, _ = keypoint_xy.shape
882
+ aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)
883
+ keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
884
+ keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
885
+ new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
886
+ new_reference_points = torch.cat(
887
+ (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
888
+ output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
889
+ new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
890
+ output = torch.cat((dn_output, output), dim=0)
891
+ tgt_mask = tgt_mask2
892
+
893
+ if layer_id >= self.num_box_decoder_layers:
894
+ reference_before_sigmoid = inverse_sigmoid(reference_points)
895
+ output_bbox_dn = output[:effect_num_dn]
896
+ output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]
897
+ reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
898
+ reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][
899
+ 0::(self.num_body_points + 1)]
900
+ delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
901
+ delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
902
+ outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
903
+ outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
904
+ new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
905
+ new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
906
+ output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))
907
+ delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
908
+ outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,
909
+ device=output.device)).clone() ##
910
+ delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
911
+ outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
912
+ outputs_unsig[..., 2:] += delta_hw_unsig
913
+ new_reference_points_for_keypoint = outputs_unsig.sigmoid()
914
+ bs = new_reference_points_for_box_norm.shape[1]
915
+ new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),
916
+ new_reference_points_for_keypoint.view(-1, self.num_body_points,
917
+ bs, 4)), dim=1).flatten(0,
918
+ 1)
919
+ new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)
920
+
921
+ if self.rm_detach and 'dec' in self.rm_detach:
922
+ reference_points = new_reference_points
923
+ else:
924
+ reference_points = new_reference_points.detach()
925
+
926
+ # if layer_id != self.num_layers - 1:
927
+ if self.use_detached_boxes_dec_out:
928
+ ref_points.append(reference_points)
929
+ else:
930
+ ref_points.append(new_reference_points)
931
+
932
+ return [
933
+ [itm_out.transpose(0, 1) for itm_out in intermediate],
934
+ [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
935
+ ]
936
+
937
+
938
+ class DeformableTransformerEncoderLayer(nn.Module):
939
+ def __init__(self,
940
+ d_model=256, d_ffn=1024,
941
+ dropout=0.1, activation="relu",
942
+ n_levels=4, n_heads=8, n_points=4,
943
+ add_channel_attention=False,
944
+ use_deformable_box_attn=False,
945
+ box_attn_type='roi_align',
946
+ ):
947
+ super().__init__()
948
+
949
+ # self attention
950
+ self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
951
+ self.dropout1 = nn.Dropout(dropout)
952
+ self.norm1 = nn.LayerNorm(d_model)
953
+
954
+ # ffn
955
+ self.linear1 = nn.Linear(d_model, d_ffn)
956
+ self.activation = _get_activation_fn(activation, d_model=d_ffn)
957
+ self.dropout2 = nn.Dropout(dropout)
958
+ self.linear2 = nn.Linear(d_ffn, d_model)
959
+ self.dropout3 = nn.Dropout(dropout)
960
+ self.norm2 = nn.LayerNorm(d_model)
961
+
962
+ # channel attention
963
+ self.add_channel_attention = add_channel_attention
964
+ if add_channel_attention:
965
+ self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
966
+ self.norm_channel = nn.LayerNorm(d_model)
967
+
968
+ @staticmethod
969
+ def with_pos_embed(tensor, pos):
970
+ return tensor if pos is None else tensor + pos
971
+
972
+ def forward_ffn(self, src):
973
+ src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
974
+ src = src + self.dropout3(src2)
975
+ src = self.norm2(src)
976
+ return src
977
+
978
+ def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
979
+ # self attention
980
+ # import ipdb; ipdb.set_trace()
981
+ src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index,
982
+ key_padding_mask)
983
+ src = src + self.dropout1(src2)
984
+ src = self.norm1(src)
985
+
986
+ # ffn
987
+ src = self.forward_ffn(src)
988
+
989
+ # channel attn
990
+ if self.add_channel_attention:
991
+ src = self.norm_channel(src + self.activ_channel(src))
992
+
993
+ return src
994
+
995
+
996
+ class DeformableTransformerDecoderLayer(nn.Module):
997
+ def __init__(self, d_model=256, d_ffn=1024,
998
+ dropout=0.1, activation="relu",
999
+ n_levels=4, n_heads=8, n_points=4,
1000
+ use_text_feat_guide=False,
1001
+ use_text_cross_attention=False,
1002
+ ffn_extra_layernorm=False
1003
+ ):
1004
+ super().__init__()
1005
+
1006
+ # cross attention
1007
+ # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
1008
+ self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
1009
+ self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
1010
+ self.norm1 = nn.LayerNorm(d_model)
1011
+
1012
+ # cross attention text
1013
+ if use_text_cross_attention:
1014
+ self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
1015
+ self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
1016
+ self.catext_norm = nn.LayerNorm(d_model)
1017
+
1018
+ # self attention
1019
+ self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
1020
+ self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
1021
+ self.norm2 = nn.LayerNorm(d_model)
1022
+
1023
+ # ffn
1024
+ self.linear1 = nn.Linear(d_model, d_ffn)
1025
+ self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
1026
+ self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
1027
+ self.linear2 = nn.Linear(d_ffn, d_model)
1028
+ self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
1029
+ self.norm3 = nn.LayerNorm(d_model)
1030
+ if ffn_extra_layernorm:
1031
+ raise NotImplementedError('ffn_extra_layernorm not implemented')
1032
+ self.norm_ext = nn.LayerNorm(d_ffn)
1033
+ else:
1034
+ self.norm_ext = None
1035
+
1036
+ self.key_aware_proj = None
1037
+ self.use_text_feat_guide = use_text_feat_guide
1038
+ assert not use_text_feat_guide
1039
+ self.use_text_cross_attention = use_text_cross_attention
1040
+
1041
+ def rm_self_attn_modules(self):
1042
+ self.self_attn = None
1043
+ self.dropout2 = None
1044
+ self.norm2 = None
1045
+
1046
+ @staticmethod
1047
+ def with_pos_embed(tensor, pos):
1048
+ return tensor if pos is None else tensor + pos
1049
+
1050
+ def forward_ffn(self, tgt, ipdb_flag=False):
1051
+
1052
+ with torch.cuda.amp.autocast(enabled=False):
1053
+ tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
1054
+
1055
+ tgt = tgt + self.dropout4(tgt2)
1056
+ tgt = self.norm3(tgt)
1057
+ return tgt
1058
+
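+ # Note: the FFN above is forced to run in fp32 (autocast disabled), a common guard against
+ # overflow in the wide d_ffn intermediate activations when the rest of the model runs under AMP.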
1059
+ def forward(self,
1060
+ # for tgt
1061
+ tgt: Optional[Tensor], # nq, bs, d_model
1062
+ tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
1063
+ tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
1064
+ tgt_key_padding_mask: Optional[Tensor] = None,
1065
+ tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
1066
+
1067
+ memory_text: Optional[Tensor] = None, # bs, num_token, d_model
1068
+ text_attention_mask: Optional[Tensor] = None, # bs, num_token
1069
+
1070
+ # for memory
1071
+ memory: Optional[Tensor] = None, # hw, bs, d_model
1072
+ memory_key_padding_mask: Optional[Tensor] = None,
1073
+ memory_level_start_index: Optional[Tensor] = None, # num_levels
1074
+ memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
1075
+ memory_pos: Optional[Tensor] = None, # pos for memory
1076
+
1077
+ # sa
1078
+ self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
1079
+ cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
1080
+ ):
1081
+ """
1082
+ Input:
1083
+ - tgt/tgt_query_pos: nq, bs, d_model
1084
+ -
1085
+ """
1086
+ assert cross_attn_mask is None
1087
+
1088
+ # self attention
1089
+ if self.self_attn is not None:
1090
+ # import ipdb; ipdb.set_trace()
1091
+ q = k = self.with_pos_embed(tgt, tgt_query_pos)
1092
+ tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
1093
+ tgt = tgt + self.dropout2(tgt2)
1094
+ tgt = self.norm2(tgt)
1095
+
1096
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
1097
+ # if tgt.isnan().any() | tgt.isinf().any() :
1098
+ # import ipdb; ipdb.set_trace()
1099
+
1100
+ if self.use_text_cross_attention:
1101
+ tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),
1102
+ memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
1103
+ tgt = tgt + self.catext_dropout(tgt2)
1104
+ tgt = self.catext_norm(tgt)
1105
+
1106
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
1107
+ # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
1108
+ # import ipdb; ipdb.set_trace()
1109
+
1110
+ # if tgt.isnan().any() | tgt.isinf().any() :
1111
+ # import ipdb; ipdb.set_trace()
1112
+
1113
+ tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
1114
+ tgt_reference_points.transpose(0, 1).contiguous(),
1115
+ memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
1116
+ memory_key_padding_mask).transpose(0, 1)
1117
+ tgt = tgt + self.dropout1(tgt2)
1118
+ tgt = self.norm1(tgt)
1119
+
1120
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
1121
+ # tgtk = tgt.clone()
1122
+ # if tgt.isnan().any() | tgt.isinf().any() :
1123
+ # import ipdb; ipdb.set_trace()
1124
+
1125
+ # ffn
1126
+ tgt = self.forward_ffn(tgt)
1127
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
1128
+ # if tgt.isnan().any() | tgt.isinf().any() :
1129
+ # tgtk = self.forward_ffn(tgtk, ipdb_flag=True)
1130
+ # import ipdb; ipdb.set_trace()
1131
+
1132
+ return tgt
1133
+
1134
+
1135
+ def _get_clones(module, N, layer_share=False):
1136
+ # import ipdb; ipdb.set_trace()
1137
+ if layer_share:
1138
+ return nn.ModuleList([module for i in range(N)])
1139
+ else:
1140
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
1141
+
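+ # Note: with layer_share=True the returned ModuleList holds N references to the same module
+ # instance (shared weights); with layer_share=False each entry is an independent deep copy.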
1142
+
1143
+ def build_deformable_transformer(args):
1144
+ decoder_query_perturber = None
1145
+ if args.decoder_layer_noise:
1146
+ from .utils import RandomBoxPerturber
1147
+ decoder_query_perturber = RandomBoxPerturber(
1148
+ x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
1149
+ w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)
1150
+
1151
+ use_detached_boxes_dec_out = False
1152
+ try:
1153
+ use_detached_boxes_dec_out = args.use_detached_boxes_dec_out
1154
+ except:
1155
+ use_detached_boxes_dec_out = False
1156
+
1157
+ binary_query_selection = False
1158
+ try:
1159
+ binary_query_selection = args.binary_query_selection
1160
+ except:
1161
+ binary_query_selection = False
1162
+
1163
+ ffn_extra_layernorm = False
1164
+ try:
1165
+ ffn_extra_layernorm = args.ffn_extra_layernorm
1166
+ except:
1167
+ print('ffn_extra_layernorm not found, set to False')
1168
+ ffn_extra_layernorm = False
1169
+
1170
+ return DeformableTransformer(
1171
+ d_model=args.hidden_dim,
1172
+ dropout=args.dropout,
1173
+ nhead=args.nheads,
1174
+ num_queries=args.num_queries,
1175
+ dim_feedforward=args.dim_feedforward,
1176
+ num_encoder_layers=args.enc_layers,
1177
+ num_unicoder_layers=args.unic_layers,
1178
+ num_decoder_layers=args.dec_layers,
1179
+ normalize_before=args.pre_norm,
1180
+ return_intermediate_dec=True,
1181
+ query_dim=args.query_dim,
1182
+ activation=args.transformer_activation,
1183
+ num_patterns=args.num_patterns,
1184
+ modulate_hw_attn=True,
1185
+
1186
+ deformable_encoder=True,
1187
+ deformable_decoder=True,
1188
+ num_feature_levels=args.num_feature_levels,
1189
+ enc_n_points=args.enc_n_points,
1190
+ dec_n_points=args.dec_n_points,
1191
+ use_deformable_box_attn=args.use_deformable_box_attn,
1192
+ box_attn_type=args.box_attn_type,
1193
+
1194
+ learnable_tgt_init=True,
1195
+ decoder_query_perturber=decoder_query_perturber,
1196
+
1197
+ add_channel_attention=args.add_channel_attention,
1198
+ add_pos_value=args.add_pos_value,
1199
+ random_refpoints_xy=args.random_refpoints_xy,
1200
+
1201
+ # two stage
1202
+ two_stage_type=args.two_stage_type, # ['no', 'standard', 'early']
1203
+ two_stage_pat_embed=args.two_stage_pat_embed,
1204
+ two_stage_add_query_num=args.two_stage_add_query_num,
1205
+ two_stage_learn_wh=args.two_stage_learn_wh,
1206
+ two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
1207
+ dec_layer_number=args.dec_layer_number,
1208
+ rm_self_attn_layers=None,
1209
+ key_aware_type=None,
1210
+ layer_share_type=None,
1211
+
1212
+ rm_detach=None,
1213
+ decoder_sa_type=args.decoder_sa_type,
1214
+ module_seq=args.decoder_module_seq,
1215
+
1216
+ embed_init_tgt=args.embed_init_tgt,
1217
+ use_detached_boxes_dec_out=use_detached_boxes_dec_out,
1218
+ use_text_enhancer=args.use_text_enhancer,
1219
+ use_fusion_layer=args.use_fusion_layer,
1220
+ use_checkpoint=args.use_checkpoint,
1221
+ use_transformer_ckpt=args.use_transformer_ckpt,
1222
+ use_text_cross_attention=args.use_text_cross_attention,
1223
+
1224
+ text_dropout=args.text_dropout,
1225
+ fusion_dropout=args.fusion_dropout,
1226
+ fusion_droppath=args.fusion_droppath,
1227
+
1228
+ binary_query_selection=binary_query_selection,
1229
+ ffn_extra_layernorm=ffn_extra_layernorm,
1230
+ )
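+
+ # Usage sketch (hypothetical config: every attribute read above must exist on `args`, e.g.
+ # args.hidden_dim=256, args.nheads=8, args.enc_layers=args.dec_layers=6,
+ # args.num_feature_levels=4, args.two_stage_type='standard', ...):
+ # transformer = build_deformable_transformer(args)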
src/models/XPose/models/UniPose/fuse_modules.py ADDED
@@ -0,0 +1,276 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ # from timm.models.layers import DropPath
6
+ from src.models.util import DropPath
7
+
8
+
9
+ class FeatureResizer(nn.Module):
10
+ """
11
+ This class takes as input a set of embeddings of dimension C1 and outputs a set of
12
+ embedding of dimension C2, after a linear transformation, dropout and normalization (LN).
13
+ """
14
+
15
+ def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
16
+ super().__init__()
17
+ self.do_ln = do_ln
18
+ # Object feature encoding
19
+ self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
20
+ self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
21
+ self.dropout = nn.Dropout(dropout)
22
+
23
+ def forward(self, encoder_features):
24
+ x = self.fc(encoder_features)
25
+ if self.do_ln:
26
+ x = self.layer_norm(x)
27
+ output = self.dropout(x)
28
+ return output
29
+
30
+
31
+ def l1norm(X, dim, eps=1e-8):
32
+ """L1-normalize columns of X
33
+ """
34
+ norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
35
+ X = torch.div(X, norm)
36
+ return X
37
+
38
+
39
+ def l2norm(X, dim, eps=1e-8):
40
+ """L2-normalize columns of X
41
+ """
42
+ norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
43
+ X = torch.div(X, norm)
44
+ return X
45
+
46
+
47
+ def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
48
+ """
49
+ query: (n_context, queryL, d)
50
+ context: (n_context, sourceL, d)
51
+ """
52
+ batch_size_q, queryL = query.size(0), query.size(1)
53
+ batch_size, sourceL = context.size(0), context.size(1)
54
+
55
+ # Get attention
56
+ # --> (batch, d, queryL)
57
+ queryT = torch.transpose(query, 1, 2)
58
+
59
+ # (batch, sourceL, d)(batch, d, queryL)
60
+ # --> (batch, sourceL, queryL)
61
+ attn = torch.bmm(context, queryT)
62
+ if raw_feature_norm == "softmax":
63
+ # --> (batch*sourceL, queryL)
64
+ attn = attn.view(batch_size * sourceL, queryL)
65
+ attn = nn.Softmax(dim=1)(attn)  # explicit dim (the implicit 2-D default): softmax over queryL
66
+ # --> (batch, sourceL, queryL)
67
+ attn = attn.view(batch_size, sourceL, queryL)
68
+ elif raw_feature_norm == "l2norm":
69
+ attn = l2norm(attn, 2)
70
+ elif raw_feature_norm == "clipped_l2norm":
71
+ attn = nn.LeakyReLU(0.1)(attn)
72
+ attn = l2norm(attn, 2)
73
+ else:
74
+ raise ValueError("unknown first norm type:", raw_feature_norm)
75
+ # --> (batch, queryL, sourceL)
76
+ attn = torch.transpose(attn, 1, 2).contiguous()
77
+ # --> (batch*queryL, sourceL)
78
+ attn = attn.view(batch_size * queryL, sourceL)
79
+ attn = nn.Softmax(dim=1)(attn * smooth)  # softmax over sourceL
80
+ # --> (batch, queryL, sourceL)
81
+ attn = attn.view(batch_size, queryL, sourceL)
82
+ # --> (batch, sourceL, queryL)
83
+ attnT = torch.transpose(attn, 1, 2).contiguous()
84
+
85
+ # --> (batch, d, sourceL)
86
+ contextT = torch.transpose(context, 1, 2)
87
+ # (batch x d x sourceL)(batch x sourceL x queryL)
88
+ # --> (batch, d, queryL)
89
+ weightedContext = torch.bmm(contextT, attnT)
90
+ # --> (batch, queryL, d)
91
+ weightedContext = torch.transpose(weightedContext, 1, 2)
92
+
93
+ return weightedContext, attnT
94
+
95
+
96
+ class BiMultiHeadAttention(nn.Module):
97
+ def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
98
+ super(BiMultiHeadAttention, self).__init__()
99
+
100
+ self.embed_dim = embed_dim
101
+ self.num_heads = num_heads
102
+ self.head_dim = embed_dim // num_heads
103
+ self.v_dim = v_dim
104
+ self.l_dim = l_dim
105
+
106
+ assert (
107
+ self.head_dim * self.num_heads == self.embed_dim
108
+ ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
109
+ self.scale = self.head_dim ** (-0.5)
110
+ self.dropout = dropout
111
+
112
+ self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
113
+ self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
114
+ self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
115
+ self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
116
+
117
+ self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
118
+ self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
119
+
120
+ self.stable_softmax_2d = True
121
+ self.clamp_min_for_underflow = True
122
+ self.clamp_max_for_overflow = True
123
+
124
+ self._reset_parameters()
125
+
126
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
127
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
128
+
129
+ def _reset_parameters(self):
130
+ nn.init.xavier_uniform_(self.v_proj.weight)
131
+ self.v_proj.bias.data.fill_(0)
132
+ nn.init.xavier_uniform_(self.l_proj.weight)
133
+ self.l_proj.bias.data.fill_(0)
134
+ nn.init.xavier_uniform_(self.values_v_proj.weight)
135
+ self.values_v_proj.bias.data.fill_(0)
136
+ nn.init.xavier_uniform_(self.values_l_proj.weight)
137
+ self.values_l_proj.bias.data.fill_(0)
138
+ nn.init.xavier_uniform_(self.out_v_proj.weight)
139
+ self.out_v_proj.bias.data.fill_(0)
140
+ nn.init.xavier_uniform_(self.out_l_proj.weight)
141
+ self.out_l_proj.bias.data.fill_(0)
142
+
143
+ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
144
+ """_summary_
145
+
146
+ Args:
147
+ v (_type_): bs, n_img, dim
148
+ l (_type_): bs, n_text, dim
149
+ attention_mask_v (_type_, optional): _description_. bs, n_img
150
+ attention_mask_l (_type_, optional): _description_. bs, n_text
151
+
152
+ Returns:
153
+ _type_: _description_
154
+ """
155
+ # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
156
+ # import ipdb; ipdb.set_trace()
157
+ bsz, tgt_len, _ = v.size()
158
+
159
+ query_states = self.v_proj(v) * self.scale
160
+ key_states = self._shape(self.l_proj(l), -1, bsz)
161
+ value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
162
+ value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
163
+
164
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
165
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
166
+ key_states = key_states.view(*proj_shape)
167
+ value_v_states = value_v_states.view(*proj_shape)
168
+ value_l_states = value_l_states.view(*proj_shape)
169
+
170
+ src_len = key_states.size(1)
171
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt
172
+
173
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
174
+ raise ValueError(
175
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
176
+ )
177
+
178
+ if self.stable_softmax_2d:
179
+ attn_weights = attn_weights - attn_weights.max()
180
+
181
+ if self.clamp_min_for_underflow:
182
+ attn_weights = torch.clamp(attn_weights,
183
+ min=-50000) # Do not increase -50000, data type half has quite limited range
184
+ if self.clamp_max_for_overflow:
185
+ attn_weights = torch.clamp(attn_weights,
186
+ max=50000) # Do not increase 50000, data type half has quite limited range
187
+
188
+ attn_weights_T = attn_weights.transpose(1, 2)
189
+ attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
190
+ 0])
191
+ if self.clamp_min_for_underflow:
192
+ attn_weights_l = torch.clamp(attn_weights_l,
193
+ min=-50000) # Do not increase -50000, data type half has quite limited range
194
+ if self.clamp_max_for_overflow:
195
+ attn_weights_l = torch.clamp(attn_weights_l,
196
+ max=50000) # Do not increase 50000, data type half has quite limited range
197
+
198
+ # mask vison for language
199
+ if attention_mask_v is not None:
200
+ attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
201
+ attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))
202
+
203
+ attn_weights_l = attn_weights_l.softmax(dim=-1)
204
+
205
+ # mask language for vision
206
+ if attention_mask_l is not None:
207
+ attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
208
+ attn_weights.masked_fill_(attention_mask_l, float('-inf'))
209
+ attn_weights_v = attn_weights.softmax(dim=-1)
210
+
211
+ attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
212
+ attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
213
+
214
+ attn_output_v = torch.bmm(attn_probs_v, value_l_states)
215
+ attn_output_l = torch.bmm(attn_probs_l, value_v_states)
216
+
217
+ if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
218
+ raise ValueError(
219
+ f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
220
+ )
221
+
222
+ if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
223
+ raise ValueError(
224
+ f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
225
+ )
226
+
227
+ attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
228
+ attn_output_v = attn_output_v.transpose(1, 2)
229
+ attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
230
+
231
+ attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
232
+ attn_output_l = attn_output_l.transpose(1, 2)
233
+ attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
234
+
235
+ attn_output_v = self.out_v_proj(attn_output_v)
236
+ attn_output_l = self.out_l_proj(attn_output_l)
237
+
238
+ return attn_output_v, attn_output_l
239
+
240
+
241
+ # Bi-Direction MHA (text->image, image->text)
242
+ class BiAttentionBlock(nn.Module):
243
+ def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
244
+ drop_path=.0, init_values=1e-4, cfg=None):
245
+ """
246
+ Inputs:
247
+ embed_dim - Dimensionality of input and attention feature vectors
248
+ hidden_dim - Dimensionality of hidden layer in feed-forward network
249
+ (usually 2-4x larger than embed_dim)
250
+ num_heads - Number of heads to use in the Multi-Head Attention block
251
+ dropout - Amount of dropout to apply in the feed-forward network
252
+ """
253
+ super(BiAttentionBlock, self).__init__()
254
+
255
+ # pre layer norm
256
+ self.layer_norm_v = nn.LayerNorm(v_dim)
257
+ self.layer_norm_l = nn.LayerNorm(l_dim)
258
+ self.attn = BiMultiHeadAttention(v_dim=v_dim,
259
+ l_dim=l_dim,
260
+ embed_dim=embed_dim,
261
+ num_heads=num_heads,
262
+ dropout=dropout)
263
+
264
+ # add layer scale for training stability
265
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
266
+ self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=False)
267
+ self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=False)
268
+
269
+ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
270
+ v = self.layer_norm_v(v)
271
+ l = self.layer_norm_l(l)
272
+ delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l)
273
+ # v, l = v + delta_v, l + delta_l
274
+ v = v + self.drop_path(self.gamma_v * delta_v)
275
+ l = l + self.drop_path(self.gamma_l * delta_l)
276
+ return v, l
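
For orientation, a minimal usage sketch of the BiAttentionBlock defined above, assuming the repository package is importable (it pulls DropPath from src.models.util); the dimensions and mask are illustrative only:

import torch

bs, n_img, n_text = 2, 100, 16
v_dim, l_dim, embed_dim, num_heads = 256, 768, 256, 8

block = BiAttentionBlock(v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim,
                         num_heads=num_heads, dropout=0.1, drop_path=0.0)

v = torch.randn(bs, n_img, v_dim)                       # visual tokens
l = torch.randn(bs, n_text, l_dim)                      # text tokens
pad_mask_l = torch.zeros(bs, n_text, dtype=torch.bool)  # True = padded text position to ignore

v_out, l_out = block(v, l, attention_mask_l=pad_mask_l)
print(v_out.shape, l_out.shape)  # torch.Size([2, 100, 256]) torch.Size([2, 16, 768])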
src/models/XPose/models/UniPose/mask_generate.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+
3
+
4
+ def prepare_for_mask(kpt_mask):
5
+
6
+
7
+ tgt_size2 = 50 * 69
8
+ attn_mask2 = torch.ones(kpt_mask.shape[0], 8, tgt_size2, tgt_size2, device=kpt_mask.device) < 0
9
+ group_bbox_kpt = 69
10
+ num_group = 50
11
+ for matchj in range(num_group * group_bbox_kpt):
12
+ sj = (matchj // group_bbox_kpt) * group_bbox_kpt
13
+ ej = (matchj // group_bbox_kpt + 1)*group_bbox_kpt
14
+ if sj > 0:
15
+ attn_mask2[:,:,matchj, :sj] = True
16
+ if ej < num_group * group_bbox_kpt:
17
+ attn_mask2[:,:,matchj, ej:] = True
18
+
19
+
20
+ bs, length = kpt_mask.shape
21
+ equal_mask = kpt_mask[:, :, None] == kpt_mask[:, None, :]
22
+ equal_mask= equal_mask.unsqueeze(1).repeat(1,8,1,1)
23
+ for idx in range(num_group):
24
+ start_idx = idx * length
25
+ end_idx = (idx + 1) * length
26
+ attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][equal_mask] = False
27
+ attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][~equal_mask] = True
28
+
29
+
30
+
31
+
32
+ input_query_label = None
33
+ input_query_bbox = None
34
+ attn_mask = None
35
+ dn_meta = None
36
+
37
+ return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0,1), dn_meta
38
+
39
+
40
+ def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
41
+
42
+ if dn_meta and dn_meta['pad_size'] > 0:
43
+
44
+ output_known_class = [outputs_class_i[:, :dn_meta['pad_size'], :] for outputs_class_i in outputs_class]
45
+ output_known_coord = [outputs_coord_i[:, :dn_meta['pad_size'], :] for outputs_coord_i in outputs_coord]
46
+
47
+ outputs_class = [outputs_class_i[:, dn_meta['pad_size']:, :] for outputs_class_i in outputs_class]
48
+ outputs_coord = [outputs_coord_i[:, dn_meta['pad_size']:, :] for outputs_coord_i in outputs_coord]
49
+
50
+ out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1]}
51
+ if aux_loss:
52
+ out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord)
53
+ dn_meta['output_known_lbs_bboxes'] = out
54
+ return outputs_class, outputs_coord
55
+
56
+
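
prepare_for_mask restricts attention to a token's own group (50 groups of 69 tokens), and inside a group only tokens with the same kpt_mask visibility flag may attend to each other (True = blocked). A small self-contained illustration of that rule with 2 groups of 3 tokens and the per-head dimension dropped; sizes are chosen only for readability:

import torch

num_group, group_len = 2, 3              # stand-ins for 50 and 69
total = num_group * group_len
kpt_mask = torch.tensor([[1, 1, 0]])     # visibility flags, shape (bs, group_len)

attn_mask = torch.zeros(1, total, total, dtype=torch.bool)

# 1) Block attention across different groups (block-diagonal structure).
for j in range(total):
    g = j // group_len
    attn_mask[:, j, :g * group_len] = True
    attn_mask[:, j, (g + 1) * group_len:] = True

# 2) Inside each group, only tokens with the same visibility flag see each other.
equal = kpt_mask[:, :, None] == kpt_mask[:, None, :]
for g in range(num_group):
    s, e = g * group_len, (g + 1) * group_len
    attn_mask[:, s:e, s:e] = ~equal

print(attn_mask[0].int())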
src/models/XPose/models/UniPose/ops/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
src/models/XPose/models/UniPose/ops/functions/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from .ms_deform_attn_func import MSDeformAttnFunction
10
+
src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py ADDED
@@ -0,0 +1,61 @@
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch.autograd import Function
16
+ from torch.autograd.function import once_differentiable
17
+
18
+ import MultiScaleDeformableAttention as MSDA
19
+
20
+
21
+ class MSDeformAttnFunction(Function):
22
+ @staticmethod
23
+ def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
24
+ ctx.im2col_step = im2col_step
25
+ output = MSDA.ms_deform_attn_forward(
26
+ value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
27
+ ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
28
+ return output
29
+
30
+ @staticmethod
31
+ @once_differentiable
32
+ def backward(ctx, grad_output):
33
+ value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
34
+ grad_value, grad_sampling_loc, grad_attn_weight = \
35
+ MSDA.ms_deform_attn_backward(
36
+ value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
37
+
38
+ return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
39
+
40
+
41
+ def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
42
+ # for debug and test only,
43
+ # need to use cuda version instead
44
+ N_, S_, M_, D_ = value.shape
45
+ _, Lq_, M_, L_, P_, _ = sampling_locations.shape
46
+ value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
47
+ sampling_grids = 2 * sampling_locations - 1
48
+ sampling_value_list = []
49
+ for lid_, (H_, W_) in enumerate(value_spatial_shapes):
50
+ # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
51
+ value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
52
+ # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
53
+ sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
54
+ # N_*M_, D_, Lq_, P_
55
+ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
56
+ mode='bilinear', padding_mode='zeros', align_corners=False)
57
+ sampling_value_list.append(sampling_value_l_)
58
+ # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
59
+ attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
60
+ output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
61
+ return output.transpose(1, 2).contiguous()
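
ms_deform_attn_core_pytorch is the pure-PyTorch fallback for the CUDA kernel above. The sketch below only checks the tensor layout it expects, with random inputs and illustrative sizes; importing this module still requires the compiled MultiScaleDeformableAttention extension, the fallback itself is plain PyTorch:

import torch

N, M, D = 2, 8, 32                       # batch, heads, channels per head
spatial_shapes = [(32, 32), (16, 16)]    # (H, W) of each feature level
L = len(spatial_shapes)
S = sum(h * w for h, w in spatial_shapes)
Lq, P = 300, 4                           # queries, sampling points per level

value = torch.randn(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
attention_weights = torch.softmax(torch.randn(N, Lq, M, L * P), dim=-1).view(N, Lq, M, L, P)

out = ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([2, 300, 256]) == (N, Lq, M * D)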