Spaces · Running on L40S

Commit 258fd02 ("Add application file"), committed by hainazhu
Parent(s): 51fab49

This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +10 -33
- .gitignore +3 -0
- Dockerfile +13 -0
- LICENSE +211 -0
- README.md +63 -6
- app.py +140 -0
- codeclm/models/__init__.py +11 -0
- codeclm/models/builders.py +139 -0
- codeclm/models/codeclm.py +303 -0
- codeclm/models/levo.py +224 -0
- codeclm/models/llama/__init__.py +90 -0
- codeclm/models/llama/configuration_llama.py +182 -0
- codeclm/models/llama/convert_llama_weights_to_hf.py +318 -0
- codeclm/models/llama/modeling_llama.py +1243 -0
- codeclm/models/llama/tokenization_llama.py +426 -0
- codeclm/models/llama/tokenization_llama_fast.py +264 -0
- codeclm/models/lm_levo.py +546 -0
- codeclm/modules/conditioners.py +883 -0
- codeclm/modules/pattern.py +351 -0
- codeclm/modules/streaming.py +112 -0
- codeclm/tokenizer/Flow1dVAE/audio.py +304 -0
- codeclm/tokenizer/Flow1dVAE/cal_token_stat.py +19 -0
- codeclm/tokenizer/Flow1dVAE/compare_model_weight.py +13 -0
- codeclm/tokenizer/Flow1dVAE/configs/models/transformer2D_wocross_inch112_1x4_multi_large.json +26 -0
- codeclm/tokenizer/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json +14 -0
- codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_and_sep_npy.py +121 -0
- codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_sep.py +94 -0
- codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x2.py +70 -0
- codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py +46 -0
- codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py +86 -0
- codeclm/tokenizer/Flow1dVAE/generate_1rvq.py +283 -0
- codeclm/tokenizer/Flow1dVAE/generate_2rvq.py +294 -0
- codeclm/tokenizer/Flow1dVAE/generate_4rvq.py +293 -0
- codeclm/tokenizer/Flow1dVAE/generate_septoken.py +302 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/MusicSoundMixedDataset.py +1278 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_429.py +372 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined.py +830 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined_withset.py +994 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song.py +313 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_20s.py +313 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_new_429.py +313 -0
- codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_stock.py +461 -0
- codeclm/tokenizer/Flow1dVAE/libs/fsq/fsq.py +236 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/core_vq.py +366 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize.py +268 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize2.py +290 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3.py +299 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer.py +303 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer_freezelayer1.py +301 -0
- codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer_return_layer.py +305 -0
.gitattributes
CHANGED
```diff
@@ -1,35 +1,12 @@
-
-
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
+third_party/demucs/ckpt/htdemucs.pth filter=lfs diff=lfs merge=lfs -text
+ckpt/100000_dpo.pt filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-
-
-
-
-
-
-*.
-*.
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpt/vae/autoencoder_music_1320k.ckpt filter=lfs diff=lfs merge=lfs -text
+ckpt/models--lengyue233--content-vec-best/blobs/d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e filter=lfs diff=lfs merge=lfs -text
+codeclm/tokenizer/Flow1dVAE/third_party/wespeaker/voxceleb_resnet34_LM/voxceleb_resnet34_LM.onnx filter=lfs diff=lfs merge=lfs -text
+codeclm/tokenizer/Flow1dVAE/third_party/wespeaker/voxceleb_resnet34_LM/voxceleb_resnet34_LM.pt filter=lfs diff=lfs merge=lfs -text
+third_party/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiFALSE-cp39-cp39-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ckpt/60000_alnew.pt filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
```
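For context, entries of this form are what `git lfs track` writes into `.gitattributes`. Adding another large-file pattern would look roughly like this; the `*.flac` pattern is purely illustrative and not part of the commit:

```bash
git lfs track "*.flac"   # appends '*.flac filter=lfs diff=lfs merge=lfs -text' to .gitattributes
git add .gitattributes
```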
.gitignore
ADDED
```text
launchs/
**/__pycache__
sample/generated/
```
Dockerfile
ADDED
```dockerfile
FROM juhayna/song-generation-levo:v0.1

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
```
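For local testing, the image can be built and run roughly as follows. This is a sketch: the tag name is arbitrary, and only port 7860 comes from the Dockerfile's `CMD` above:

```bash
# Build from the repository root (tag name is illustrative)
docker build -t levo-space .
# Run with GPU access and map the app port used in the Dockerfile
docker run --gpus all -p 7860:7860 levo-space
```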
LICENSE
ADDED
```text
Tencent is pleased to support the open source community by making SongGeneration available.

Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.

SongGeneration is licensed under the License Terms of SongGeneration except for the third-party components listed below, which is licensed under different terms. SongGeneration does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.


License Terms of SongGeneration:
--------------------------------------------------------------------

Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

- You agree to use the SongGeneration only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances.

- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

For avoidance of doubts, "Software" means the SongGeneration inference-enabling code and the weights made available under this license excluding any pre-trained data and other AI components.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


Other dependencies and licenses:


Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. stable_audio_tools
Copyright (c) 2023 Stability AI


Terms of the MIT:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

For the license of other third party components, please refer to the following URL:
https://github.com/Stability-AI/stable-audio-tools/tree/main/LICENSES


Open Source Software Licensed under the MIT License:
--------------------------------------------------------------------
1. demucs
Copyright (c) Meta Platforms, Inc. and affiliates.


A copy of the MIT is included in this file.


Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torch
From PyTorch:

Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

From Caffe2:

Copyright (c) 2016-present, Facebook Inc. All rights reserved.

All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.

All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.

All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.

All contributions by Kakao Brain:
Copyright 2019-2020 Kakao Brain

All contributions by Cruise LLC:
Copyright (c) 2022 Cruise LLC.
All rights reserved.

All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.

All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.

Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.

All rights reserved.


Terms of the BSD 3-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/pytorch/blob/v2.0.1/NOTICE


Open Source Software Licensed under the BSD 2-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torchaudio
Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
All rights reserved.


Terms of the BSD 2-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/audio/blob/v2.0.2/LICENSE


Open Source Software License under the Apache License Version 2.0:
--------------------------------------------------------------------
1. huggingface-hub
Copyright (c) huggingface-hub original author and authors

2. transformers
Copyright 2018- The Hugging Face team. All rights reserved.


Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License

Version 2.0, January 2004

http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

You must give any other recipients of the Work or Derivative Works a copy of this License; and

You must cause any modified files to carry prominent notices stating that You changed the files; and

You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS
```
README.md
CHANGED
The Space front matter changes as follows:

```diff
@@ -1,11 +1,68 @@
 ---
-title:
+title: LeVo Song Generation
-emoji:
+emoji: 🎵
 colorFrom: purple
-colorTo:
+colorTo: gray
 sdk: docker
-
-short_description: Demo interface for the LeVo song generation model.
+app_port: 7860
 ---
 
-
```

The body of the README is entirely new:

````markdown
# SongGeneration

This repository is the official code repository for LeVo: High-Quality Song Generation with Multi-Preference Alignment. You can find our paper [here](https://arxiv.org/). The demo page is available [here](https://levo-demo.github.io/).

In this repository, we provide the SongGeneration model, inference scripts, and a checkpoint trained on the Million Song Dataset. Specifically, we have released the model and inference code corresponding to the SFT + auto-DPO version.

## Installation

## Start from scratch
You can install the necessary dependencies using the `requirements.txt` file with Python 3.8.12:

```bash
pip install -r requirements.txt
```

Then install FlashAttention from the prebuilt wheel:

```bash
wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl -P /home/
pip install /home/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
```

## Start with Docker
```bash
docker pull juhayna/song-generation-levo:v0.1
docker run -it --gpus all --network=host juhayna/song-generation-levo:v0.1 /bin/bash
```

## Inference

Please note that both folders below must be downloaded completely for the model to load correctly. They are sourced from [here](https://huggingface.co/waytan22/SongGeneration):

- Save `ckpt` to the root directory
- Save `third_party` to the root directory

Then run inference with the following command:

```bash
sh generate.sh sample/lyric.jsonl sample/generate
```
- Input keys in `sample/lyric.jsonl`:
  - `idx`: name of the generated song file
  - `descriptions`: text description; can be None, or specify gender, timbre, genre, mood, instrument and BPM
  - `prompt_audio_path`: reference audio path; can be None, or a path to a 10-second song audio clip
  - `gt_lyric`: lyrics; they must follow the format `[Structure] Text`; supported structures can be found in `conf/vocab.yaml`

- Outputs written to `sample/generate`:
  - `audio`: generated audio files
  - `jsonl`: output jsonls
  - `token`: tokens corresponding to the generated audio files

## Note

Since the model is trained on data longer than 1 minute, if the given lyrics are too short, the model will automatically fill in the lyrics to extend the duration.

## License

The code and weights in this repository are released under the license found in the [LICENSE](LICENSE) file.
````
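For reference, a single line of the `sample/lyric.jsonl` input described in the README above might look like the following. The key names come from the README; the values, the `null` convention for empty fields, and the segment separator are illustrative assumptions (valid structure tags live in `conf/vocab.yaml`):

```json
{"idx": "demo_song_001", "descriptions": "female, dark, pop, sad, piano and drums, the bpm is 125.", "prompt_audio_path": null, "gt_lyric": "[intro-short] ; [verse] an example lyric line. another example line. ; [outro-short]"}
```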
app.py
ADDED
```python
import os
import gradio as gr
import json
import numpy as np
from datetime import datetime
import sys
import librosa

EXAMPLE_DESC = """female, dark, pop, sad, piano and drums, the bpm is 125."""
EXAMPLE_LYRICS = """
[intro-short]

[verse]
夜晚的街灯闪烁.
我漫步在熟悉的角落.
回忆像潮水般涌来.
你的笑容如此清晰.
在心头无法抹去.
那些曾经的甜蜜.
如今只剩我独自回忆.

[bridge]
手机屏幕亮起.
是你发来的消息.
简单的几个字.
却让我泪流满面.
曾经的拥抱温暖.
如今却变得遥远.
我多想回到从前.
重新拥有你的陪伴.

[chorus]
回忆的温度还在.
你却已不在.
我的心被爱填满.
却又被思念刺痛.
R&B的节奏奏响.
我的心却在流浪.
没有你的日子.
我该如何继续向前.

[outro-short]
""".strip()


# Mock song-generation function
def generate_song(description, lyric, prompt_audio=None):
    # Simulate the generation process here; replace with your model call in a real application
    print(f"Generating song with description: {description}")
    print(f"Lyrics provided: {lyric}")
    if prompt_audio is not None:
        print("Using prompt audio for generation")

    # Load example audio from a file
    audio_path = "./sample/example.mp3"
    audio_data, sample_rate = librosa.load(audio_path, sr=None)  # keep the original sample rate

    # Build the JSON input configuration
    input_config = {
        "description": description,
        "lyric": lyric,
        "has_prompt_audio": prompt_audio is not None,
        "timestamp": datetime.now().isoformat(),
    }

    return (sample_rate, audio_data), json.dumps(input_config, indent=2)

# Build the Gradio interface
with gr.Blocks(title="LeVo Demo Space") as demo:
    gr.Markdown("# 🎵 LeVo Demo Space")
    gr.Markdown("Demo interface for the LeVo song generation model. Provide a description, lyrics, and optionally an audio prompt to generate a custom song.")

    with gr.Row():
        with gr.Column():
            description = gr.Textbox(
                label="Song Description",
                placeholder="Describe the style, mood, and characteristics of the song...",
                lines=1,
                max_lines=2,
                value=EXAMPLE_DESC,
            )
            lyric = gr.Textbox(
                label="Lyrics",
                placeholder="Enter the lyrics for the song...",
                lines=5,
                max_lines=8,
                value=EXAMPLE_LYRICS,
            )

            with gr.Tabs(elem_id="extra-tabs"):
                with gr.Tab("Audio Prompt"):
                    prompt_audio = gr.Audio(
                        label="Prompt Audio (Optional)",
                        type="filepath",
                        elem_id="audio-prompt"
                    )
                with gr.Tab("Advanced Config"):
                    text_prompt = gr.Textbox(
                        label="Text Prompt",
                        placeholder="Enter the Text Prompt, eg: emotional piano pop",
                    )

            generate_btn = gr.Button("Generate Song", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Song", type="numpy")
            output_json = gr.JSON(label="Input Configuration")

    # Example buttons
    examples = gr.Examples(
        examples=[
            ["An uplifting pop song with catchy melodies"],
            ["Melancholic piano ballad"],
        ],
        inputs=[description],
        label="Description examples"
    )

    examples = gr.Examples(
        examples=[
            ["Shine bright like the stars above\nYou're the one that I'm dreaming of"],
            ["The rain keeps falling on my window pane\nReminding me of love that's gone away"],
        ],
        inputs=[lyric],
        label="Lyrics examples"
    )

    # Generate-button click event
    generate_btn.click(
        fn=generate_song,
        inputs=[description, lyric, prompt_audio],
        outputs=[output_audio, output_json]
    )


# Launch the app
if __name__ == "__main__":
    demo.launch()
```
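When executed directly, the script starts the Gradio UI via `demo.launch()`; locally that is simply:

```bash
python app.py   # serves the Gradio demo on Gradio's default local port
```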
codeclm/models/__init__.py
ADDED
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Models for EnCodec, AudioGen, MusicGen, as well as the generic LMModel.
"""
# flake8: noqa
from . import builders
from .codeclm import CodecLM
```
codeclm/models/builders.py
ADDED
```python
"""
All the functions to build the relevant models and modules
from the Hydra config.
"""

import typing as tp

import omegaconf
import torch
from codeclm.utils.utils import dict_from_config
from codeclm.modules.pattern import (
    CodebooksPatternProvider,
    DelayedPatternProvider,
)
from codeclm.modules.conditioners import (
    BaseConditioner,
    QwTokenizerConditioner,
    QwTextConditioner,
    PhonemeTokenizerConditioner,
    QuantizedEmbeddingConditioner,
    ConditionerProvider,
    ConditionFuser,
)


def get_audio_tokenizer_model(checkpoint_path: str, cfg: omegaconf.DictConfig):
    from codeclm.tokenizer.audio_tokenizer import AudioTokenizer
    """Instantiate a compression model."""
    if checkpoint_path is None:
        return None
    if checkpoint_path.startswith('//pretrained/'):
        name = checkpoint_path.split('/', 3)[-1]
        return AudioTokenizer.get_pretrained(name, cfg.vae_config, cfg.vae_model, 'cpu', mode=cfg.mode)
    elif checkpoint_path == "":
        return None
    else:
        name = checkpoint_path
        return AudioTokenizer.get_pretrained(name, cfg.vae_config, cfg.vae_model, 'cpu', mode=cfg.mode)

def get_lm_model(cfg: omegaconf.DictConfig):  # -> LMModel:
    """Instantiate a LM."""
    lm_kwargs = dict_from_config(getattr(cfg, 'lm'))

    # n_q: number of RVQ
    code_depth = lm_kwargs['code_depth']
    q_modeling = lm_kwargs.pop('q_modeling', None)

    # conditioner
    condition_provider = get_conditioner_provider(lm_kwargs["dim"], cfg)

    # codebook pattern: delay
    codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
    if codebooks_pattern_cfg.modeling is None:
        assert q_modeling is not None, \
            "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
        codebooks_pattern_cfg = omegaconf.OmegaConf.create(
            {'modeling': q_modeling, 'delay': {'delays': list(range(code_depth))}}
        )
    pattern_provider = get_codebooks_pattern_provider(code_depth, codebooks_pattern_cfg)

    # condition dropout
    attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
    cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
    cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']

    # condition fuser
    fuser = get_condition_fuser(cfg)
    lm_type = lm_kwargs['lm_type']  # YCY: For consistency, choose different lm.py based on lm_type
    if lm_type == 'Llama':
        from .lm_levo import LmModel
        return LmModel(
            pattern_provider=pattern_provider,
            condition_provider=condition_provider,
            fuser=fuser,
            cfg_dropout=cfg_prob,
            cfg_coef=cfg_coef,
            attribute_dropout=attribute_dropout,
            cfg=cfg,
            **lm_kwargs
        ).to('cpu')
    else:
        raise KeyError(f"Unexpected LM model {lm_type}")


def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditionerProvider:
    """Instantiate a conditioning model."""
    cfg = getattr(cfg, 'conditioners')
    dict_cfg = {} if cfg is None else dict_from_config(cfg)
    conditioners: tp.Dict[str, BaseConditioner] = {}
    condition_provider_args = dict_cfg.pop('args', {})

    for cond, cond_cfg in dict_cfg.items():
        model_type = cond_cfg['model']
        model_args = cond_cfg[model_type]
        if model_type == 'QwTokenizer':
            conditioners[str(cond)] = QwTokenizerConditioner(
                output_dim=output_dim,
                **model_args
            )
        elif model_type == "QwTextTokenizer":
            conditioners[str(cond)] = QwTextConditioner(
                output_dim=output_dim,
                **model_args
            )
        elif model_type == 'PhonemeTokenizer':
            conditioners[str(cond)] = PhonemeTokenizerConditioner(
                output_dim=output_dim,
                **model_args
            )
        elif model_type == "qt_embedding":
            conditioners[str(cond)] = QuantizedEmbeddingConditioner(
                dim=output_dim,
                **model_args
            )
        else:
            raise ValueError(f"Unrecognized conditioning model: {model_type}")
    conditioner = ConditionerProvider(conditioners, **condition_provider_args)
    return conditioner


def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
    """Instantiate a condition fuser object."""
    fuser_cfg = getattr(cfg, 'fuser')
    fuser_methods = ['sum', 'prepend']
    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
    kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
    fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
    return fuser


def get_codebooks_pattern_provider(code_depth: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
    """Instantiate a codebooks pattern provider object."""
    pattern_providers = {
        'delay': DelayedPatternProvider,
    }
    name = cfg.modeling
    kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
    klass = pattern_providers[name]
    return klass(code_depth, **kwargs)
```
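For orientation, these builders are driven entirely by an OmegaConf/Hydra config. A minimal usage sketch follows; the YAML path and the checkpoint path are assumptions about the project's config layout, and only the function signatures and the config keys named in the comments come from the code above:

```python
import omegaconf

from codeclm.models import builders

# Hypothetical config file; the project ships its own configs (e.g. under conf/).
cfg = omegaconf.OmegaConf.load("conf/example.yaml")

# get_lm_model reads cfg.lm, cfg.conditioners, cfg.fuser, cfg.codebooks_pattern,
# cfg.attribute_dropout and cfg.classifier_free_guidance, and returns the LM on CPU.
lm = builders.get_lm_model(cfg)

# get_audio_tokenizer_model takes an explicit checkpoint path plus the same cfg
# (internally it also reads cfg.vae_config, cfg.vae_model and cfg.mode).
audio_tokenizer = builders.get_audio_tokenizer_model("ckpt/audio_tokenizer.pt", cfg)  # path is illustrative
```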
codeclm/models/codeclm.py
ADDED
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Main model for using CodecLM. This will combine all the required components
|
3 |
+
and provide easy access to the generation API.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import typing as tp
|
7 |
+
import warnings
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
from codeclm.tokenizer.audio_tokenizer import AudioTokenizer
|
12 |
+
from .lm_levo import LmModel
|
13 |
+
from ..modules.conditioners import ConditioningAttributes, AudioCondition
|
14 |
+
from ..utils.autocast import TorchAutocast
|
15 |
+
import torch
|
16 |
+
from torch.nn import functional as F
|
17 |
+
import torchaudio
|
18 |
+
# from optim.ema import EMA
|
19 |
+
|
20 |
+
|
21 |
+
MelodyList = tp.List[tp.Optional[torch.Tensor]]
|
22 |
+
MelodyType = tp.Union[torch.Tensor, MelodyList]
|
23 |
+
|
24 |
+
class CodecLM:
|
25 |
+
"""CodecLM main model with convenient generation API.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
name (str): name of the model.
|
29 |
+
compression_model (CompressionModel): Compression model
|
30 |
+
used to map audio to invertible discrete representations.
|
31 |
+
lm (LMModel): Language model over discrete representations.
|
32 |
+
max_duration (float, optional): maximum duration the model can produce,
|
33 |
+
otherwise, inferred from the training params.
|
34 |
+
"""
|
35 |
+
def __init__(self, name: str, audiotokenizer: AudioTokenizer, lm: LmModel,
|
36 |
+
max_duration: tp.Optional[float] = None, seperate_tokenizer: AudioTokenizer = None):
|
37 |
+
self.name = name
|
38 |
+
self.audiotokenizer = audiotokenizer
|
39 |
+
self.lm = lm
|
40 |
+
self.seperate_tokenizer = seperate_tokenizer
|
41 |
+
# import pdb; pdb.set_trace()
|
42 |
+
if max_duration is None:
|
43 |
+
if hasattr(lm, 'cfg'):
|
44 |
+
max_duration = lm.cfg.dataset.segment_duration # type: ignore
|
45 |
+
else:
|
46 |
+
raise ValueError("You must provide max_duration when building directly CodecLM")
|
47 |
+
assert max_duration is not None
|
48 |
+
|
49 |
+
self.max_duration: float = max_duration
|
50 |
+
self.device = next(iter(lm.parameters())).device
|
51 |
+
self.generation_params: dict = {}
|
52 |
+
# self.set_generation_params(duration=15) # 15 seconds by default
|
53 |
+
self.set_generation_params(duration=15, extend_stride=self.max_duration // 2)
|
54 |
+
self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
|
55 |
+
if self.device.type == 'cpu':
|
56 |
+
self.autocast = TorchAutocast(enabled=False)
|
57 |
+
else:
|
58 |
+
self.autocast = TorchAutocast(enabled=False)
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
@property
|
63 |
+
def frame_rate(self) -> float:
|
64 |
+
"""Roughly the number of AR steps per seconds."""
|
65 |
+
return self.audiotokenizer.frame_rate
|
66 |
+
|
67 |
+
@property
|
68 |
+
def sample_rate(self) -> int:
|
69 |
+
"""Sample rate of the generated audio."""
|
70 |
+
return self.audiotokenizer.sample_rate
|
71 |
+
|
72 |
+
@property
|
73 |
+
def audio_channels(self) -> int:
|
74 |
+
"""Audio channels of the generated audio."""
|
75 |
+
return self.audiotokenizer.channels
|
76 |
+
|
77 |
+
def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
|
78 |
+
top_p: float = 0.0, temperature: float = 1.0,
|
79 |
+
duration: float = 30.0, cfg_coef: float = 3.0,
|
80 |
+
extend_stride: float = 18, record_tokens: bool = False,
|
81 |
+
record_window: int = 50):
|
82 |
+
"""Set the generation parameters for CodecLM.
|
83 |
+
|
84 |
+
Args:
|
85 |
+
use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
|
86 |
+
top_k (int, optional): top_k used for sampling. Defaults to 250.
|
87 |
+
top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
|
88 |
+
temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
|
89 |
+
duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
|
90 |
+
cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
|
91 |
+
two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
|
92 |
+
instead of batching together the two. This has some impact on how things
|
93 |
+
are padded but seems to have little impact in practice.
|
94 |
+
extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
|
95 |
+
should we extend the audio each time. Larger values will mean less context is
|
96 |
+
preserved, and shorter value will require extra computations.
|
97 |
+
"""
|
98 |
+
assert extend_stride <= self.max_duration, "Cannot stride by more than max generation duration."
|
99 |
+
self.extend_stride = extend_stride
|
100 |
+
self.duration = duration
|
101 |
+
self.generation_params = {
|
102 |
+
'use_sampling': use_sampling,
|
103 |
+
'temp': temperature,
|
104 |
+
'top_k': top_k,
|
105 |
+
'top_p': top_p,
|
106 |
+
'cfg_coef': cfg_coef,
|
107 |
+
'record_tokens': record_tokens,
|
108 |
+
'record_window': record_window,
|
109 |
+
}
|
110 |
+
|
111 |
+
def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
|
112 |
+
"""Override the default progress callback."""
|
113 |
+
self._progress_callback = progress_callback
|
114 |
+
|
115 |
+
# Inference
|
116 |
+
def generate(self, lyrics: tp.List[str],
|
117 |
+
descriptions: tp.List[str],
|
118 |
+
melody_wavs: torch.Tensor = None,
|
119 |
+
melody_is_wav: bool = True,
|
120 |
+
vocal_wavs: torch.Tensor = None,
|
121 |
+
bgm_wavs: torch.Tensor = None,
|
122 |
+
return_tokens: bool = False,
|
123 |
+
) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
|
124 |
+
"""Generate samples conditioned on text and melody.
|
125 |
+
|
126 |
+
Args:
|
127 |
+
descriptions (list of str): A list of strings used as text conditioning.
|
128 |
+
melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
|
129 |
+
melody conditioning. Should have shape [B, C, T] with B matching the description length,
|
130 |
+
C=1 or 2. It can be [C, T] if there is a single description. It can also be
|
131 |
+
a list of [C, T] tensors.
|
132 |
+
melody_sample_rate: (int): Sample rate of the melody waveforms.
|
133 |
+
progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
|
134 |
+
"""
|
135 |
+
if melody_wavs is not None:
|
136 |
+
if melody_wavs.dim() == 2:
|
137 |
+
melody_wavs = melody_wavs[None]
|
138 |
+
if melody_wavs.dim() != 3:
|
139 |
+
raise ValueError("Melody wavs should have a shape [B, C, T].")
|
140 |
+
melody_wavs = list(melody_wavs)
|
141 |
+
if vocal_wavs is not None:
|
142 |
+
if vocal_wavs.dim() == 2:
|
143 |
+
vocal_wavs = vocal_wavs[None]
|
144 |
+
if vocal_wavs.dim() != 3:
|
145 |
+
raise ValueError("Vocal wavs should have a shape [B, C, T].")
|
146 |
+
vocal_wavs = list(vocal_wavs)
|
147 |
+
if bgm_wavs is not None:
|
148 |
+
if bgm_wavs.dim() == 2:
|
149 |
+
bgm_wavs = bgm_wavs[None]
|
150 |
+
if bgm_wavs.dim() != 3:
|
151 |
+
raise ValueError("BGM wavs should have a shape [B, C, T].")
|
152 |
+
bgm_wavs = list(bgm_wavs)
|
153 |
+
|
154 |
+
texts, audio_qt_embs = self._prepare_tokens_and_attributes(lyrics=lyrics, melody_wavs=melody_wavs, vocal_wavs=vocal_wavs, bgm_wavs=bgm_wavs, melody_is_wav=melody_is_wav)
|
155 |
+
tokens = self._generate_tokens(texts, descriptions, audio_qt_embs)
|
156 |
+
|
157 |
+
if (tokens == self.lm.eos_token_id).any():
|
158 |
+
length = torch.nonzero(torch.eq(tokens, self.lm.eos_token_id))[:,-1].min()
|
159 |
+
tokens = tokens[...,:length]
|
160 |
+
|
161 |
+
if return_tokens:
|
162 |
+
return tokens
|
163 |
+
else:
|
164 |
+
out = self.generate_audio(tokens)
|
165 |
+
return out
|
166 |
+
|
167 |
+
|
168 |
+
@torch.no_grad()
|
169 |
+
def _prepare_tokens_and_attributes(
|
170 |
+
self,
|
171 |
+
lyrics: tp.Sequence[tp.Optional[str]],
|
172 |
+
melody_wavs: tp.Optional[MelodyList] = None,
|
173 |
+
vocal_wavs: tp.Optional[MelodyList] = None,
|
174 |
+
bgm_wavs: tp.Optional[MelodyList] = None,
|
175 |
+
melody_is_wav = True
|
176 |
+
) -> tp.Tuple[tp.List[str], tp.List[torch.Tensor]]:
|
177 |
+
"""Prepare model inputs.
|
178 |
+
|
179 |
+
Args:
|
180 |
+
descriptions (list of str): A list of strings used as text conditioning.
|
181 |
+
prompt (torch.Tensor): A batch of waveforms used for continuation.
|
182 |
+
melody_wavs (torch.Tensor, optional): A batch of waveforms
|
183 |
+
used as melody conditioning. Defaults to None.
|
184 |
+
"""
|
185 |
+
assert len(lyrics) == 1
|
186 |
+
texts = [lyric for lyric in lyrics]
|
187 |
+
audio_qt_embs = []
|
188 |
+
target_melody_token_len = self.lm.cfg.prompt_len * self.audiotokenizer.frame_rate
|
189 |
+
# import pdb; pdb.set_trace()
|
190 |
+
if melody_wavs is None:
|
191 |
+
melody_tokens = torch.full((1,1,target_melody_token_len), 16385, device=self.device).long()
|
192 |
+
elif melody_wavs is not None:
|
193 |
+
if 'prompt_audio' not in self.lm.condition_provider.conditioners:
|
194 |
+
raise RuntimeError("This model doesn't support melody conditioning. "
|
195 |
+
"Use the `melody` model.")
|
196 |
+
assert len(melody_wavs) == len(texts), \
|
                f"number of melody wavs must match number of descriptions! " \
                f"got melody len={len(melody_wavs)}, and descriptions len={len(texts)}"
            if type(melody_wavs) == list:
                melody_wavs = torch.stack(melody_wavs, dim=0)
            melody_wavs = melody_wavs.to(self.device)
            if melody_is_wav:
                melody_tokens, scale = self.audiotokenizer.encode(melody_wavs)
            else:
                melody_tokens = melody_wavs
            if melody_tokens.shape[-1] > target_melody_token_len:
                melody_tokens = melody_tokens[..., :target_melody_token_len]
            elif melody_tokens.shape[-1] < target_melody_token_len:
                melody_tokens = torch.cat([melody_tokens, torch.full((1, 1, target_melody_token_len - melody_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
            if self.seperate_tokenizer is not None:
                if vocal_wavs is not None:
                    if type(vocal_wavs) == list:
                        vocal_wavs = torch.stack(vocal_wavs, dim=0)
                    if bgm_wavs is None:
                        use_bgm = False
                        bgm_wavs = torch.zeros_like(vocal_wavs)
                        bgm_wavs[:, 0] = 1.0
                        bgm_wavs[:, 1:] = torch.randn_like(bgm_wavs[:, 1:]) * 0.0003
                    else:
                        use_bgm = True
                        if type(bgm_wavs) == list:
                            bgm_wavs = torch.stack(bgm_wavs, dim=0)
                    vocal_wavs = vocal_wavs.to(self.device)
                    bgm_wavs = bgm_wavs.to(self.device)
                    vocal_tokens, bgm_tokens = self.seperate_tokenizer.encode(vocal_wavs, bgm_wavs)
                    assert len(vocal_tokens.shape) == len(bgm_tokens.shape) == 3, \
                        f"vocal and bgm tokens should have a shape [B, C, T]! " \
                        f"got vocal len={vocal_tokens.shape}, and bgm len={bgm_tokens.shape}"
                    assert vocal_tokens.shape[-1] == bgm_tokens.shape[-1], \
                        f"vocal and bgm tokens should have the same length! " \
                        f"got vocal len={vocal_tokens.shape[-1]}, and bgm len={bgm_tokens.shape[-1]}"
                    if not use_bgm:
                        bgm_tokens = torch.full_like(bgm_tokens, 16385)
                    if bgm_tokens.shape[-1] > target_melody_token_len:
                        bgm_tokens = bgm_tokens[..., :target_melody_token_len]
                    elif bgm_tokens.shape[-1] < target_melody_token_len:
                        bgm_tokens = torch.cat([bgm_tokens, torch.full((1, 1, target_melody_token_len - bgm_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
                    if vocal_tokens.shape[-1] > target_melody_token_len:
                        vocal_tokens = vocal_tokens[..., :target_melody_token_len]
                    elif vocal_tokens.shape[-1] < target_melody_token_len:
                        vocal_tokens = torch.cat([vocal_tokens, torch.full((1, 1, target_melody_token_len - vocal_tokens.shape[-1]), 16385, device=self.device).long()], dim=-1)
                else:
                    bgm_tokens = torch.full((1, 1, target_melody_token_len), 16385, device=self.device).long()
                    vocal_tokens = torch.full((1, 1, target_melody_token_len), 16385, device=self.device).long()

                melody_tokens = torch.cat([melody_tokens, vocal_tokens, bgm_tokens], dim=1)
            assert melody_tokens.shape[-1] == target_melody_token_len
            audio_qt_embs = melody_tokens.long()
        return texts, audio_qt_embs



    def _generate_tokens(self,
                         texts: tp.Optional[tp.List[str]] = None,
                         descriptions: tp.Optional[tp.List[str]] = None,
                         audio_qt_embs: tp.Optional[tp.List[torch.Tensor]] = None) -> torch.Tensor:
        """Generate discrete audio tokens given audio prompt and/or conditions.

        Args:
            attributes (list of ConditioningAttributes): Conditions used for generation (text/melody).
            prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
        Returns:
            torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
        """
        total_gen_len = int(self.duration * self.frame_rate)
        current_gen_offset: int = 0

        def _progress_callback(generated_tokens: int, tokens_to_generate: int):
            generated_tokens += current_gen_offset
            if self._progress_callback is not None:
                # Note that total_gen_len might be quite wrong depending on the
                # codebook pattern used, but with delay it is almost accurate.
                self._progress_callback(generated_tokens, total_gen_len)
            else:
                print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')

        if self.duration <= self.max_duration:
            # generate by sampling from LM, simple case.
            with self.autocast:
                gen_tokens = self.lm.generate(texts=texts,
                                              descriptions=descriptions,
                                              audio_qt_embs=audio_qt_embs,
                                              max_gen_len=total_gen_len,
                                              **self.generation_params)
        else:
            raise NotImplementedError(f"duration {self.duration} > max duration {self.max_duration}")
        return gen_tokens

    @torch.no_grad()
    def generate_audio(self, gen_tokens: torch.Tensor, prompt=None, vocal_prompt=None, bgm_prompt=None):
        """Generate Audio from tokens"""
        assert gen_tokens.dim() == 3
        if self.seperate_tokenizer is not None:
            gen_tokens_song = gen_tokens[:, [0], :]
            gen_tokens_vocal = gen_tokens[:, [1], :]
            gen_tokens_bgm = gen_tokens[:, [2], :]
            # gen_audio_song = self.audiotokenizer.decode(gen_tokens_song, prompt)
            gen_audio_seperate = self.seperate_tokenizer.decode([gen_tokens_vocal, gen_tokens_bgm], vocal_prompt, bgm_prompt)
            return gen_audio_seperate
        else:
            gen_audio = self.audiotokenizer.decode(gen_tokens, prompt)
            return gen_audio
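Editor's note: a minimal sketch of how the three methods above fit together at inference time. Only `_generate_tokens` and `generate_audio` follow the signatures shown in this file; `prepare_inputs` is a hypothetical placeholder for whatever upstream step produces `(texts, audio_qt_embs)` as returned above.

```python
# A minimal sketch, assuming `model` is a CodecLM-style object exposing the methods above.
# `prepare_inputs` is a hypothetical name used only for illustration.
import torch

def sketch_inference(model, lyrics: str, descriptions=None, melody_wavs=None):
    # Hypothetical preprocessing step yielding (texts, audio_qt_embs) as returned above.
    texts, audio_qt_embs = model.prepare_inputs(lyrics, melody_wavs)  # placeholder name
    with torch.no_grad():
        gen_tokens = model._generate_tokens(texts=texts,
                                            descriptions=descriptions,
                                            audio_qt_embs=audio_qt_embs)  # [B, 3, T] token ids
        audio = model.generate_audio(gen_tokens)  # waveform decoded from vocal/bgm tokens
    return audio
```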
codeclm/models/levo.py
ADDED
@@ -0,0 +1,224 @@

from .llama.modeling_llama import LlamaConfig, CausalLMOutputWithPast, BaseModelOutputWithPast, LlamaDecoderLayer, LlamaRMSNorm
from .llama.modeling_llama import LlamaForCausalLM as LlamaForCausalLM_base
from .llama.modeling_llama import LlamaModel as LlamaModel_base
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Union, Optional, Tuple, List
from packaging import version
import transformers
"""
Wrap the original Llama model for potential customized changes.
"""

"""main class"""
class CausalLM(LlamaForCausalLM_base):
    def __init__(self, config):
        super().__init__(config)
        self.model = LmModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=hidden_states,
            attentions=outputs.attentions,
        )


"""Submodel class"""
class LmModel(LlamaModel_base):
    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        layer_cls = LlamaDecoderLayer  # cross attention decoder layer can be overwritten here

        assert version.parse(transformers.__version__) < version.parse("4.40")

        self.layers = nn.ModuleList([layer_cls(config) for _ in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()
        self.gradient_checkpointing_disable()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            layer_args = (hidden_states, attention_mask, position_ids,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, past_key_value, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer), *layer_args
                )
            else:

                layer_outputs = decoder_layer(*layer_args,
                                              past_key_value=past_key_value,
                                              output_attentions=output_attentions,
                                              use_cache=use_cache)

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

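For orientation, a small sketch of how the wrapper above can be exercised on its own. The tiny hyper-parameters are illustrative only, and the embeddings are passed in directly because `LmModel.forward` consumes `inputs_embeds` rather than embedding `input_ids` itself; note the hard requirement `transformers < 4.40` asserted in `LmModel.__init__`.

```python
# A minimal sketch (illustrative sizes, not the Space's real configuration).
import torch
from codeclm.models.llama import LlamaConfig
from codeclm.models.levo import CausalLM

config = LlamaConfig(
    vocab_size=1024,        # illustrative
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=2,
    num_attention_heads=4,
)
model = CausalLM(config).eval()

# LmModel.forward uses the provided embeddings directly (hidden_states = inputs_embeds).
inputs_embeds = torch.randn(1, 8, config.hidden_size)
with torch.no_grad():
    out = model(inputs_embeds=inputs_embeds, use_cache=False)
print(out.logits.shape)  # torch.Size([1, 8, 1024])
```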
codeclm/models/llama/__init__.py
ADDED
@@ -0,0 +1,90 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from transformers.utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)


_import_structure = {
    "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"],
}

try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_llama"] = ["LlamaTokenizer"]

try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_llama"] = [
        "LlamaForCausalLM",
        "LlamaModel",
        "LlamaPreTrainedModel",
        "LlamaForSequenceClassification",
    ]


if TYPE_CHECKING:
    from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig

    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_llama import LlamaTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_llama_fast import LlamaTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel


else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
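This mirrors the standard transformers lazy-import layout: `_import_structure` names the submodules and their exports, and `_LazyModule` defers loading them until an attribute is first accessed. A small illustration of what that means for this vendored package (import paths assume the directory layout of this commit):

```python
# Cheap: only configuration_llama is materialized on access.
from codeclm.models.llama import LlamaConfig

# This access triggers loading of modeling_llama (and therefore torch).
from codeclm.models.llama import LlamaForCausalLM
```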
codeclm/models/llama/configuration_llama.py
ADDED
@@ -0,0 +1,182 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LLaMA model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        pretraining_tp (`int`, *optional*, defaults to `1`):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.

    Example:

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
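As a quick illustration of `_rope_scaling_validation` above (the values are arbitrary): a well-formed `rope_scaling` dict passes, while a factor that is not a float strictly greater than 1 raises a `ValueError`.

```python
from codeclm.models.llama import LlamaConfig

# Passes validation: exactly two fields, a known type, and a float factor > 1.
cfg = LlamaConfig(rope_scaling={"type": "linear", "factor": 2.0})

# Rejected: the factor must be a float strictly greater than 1.
try:
    LlamaConfig(rope_scaling={"type": "dynamic", "factor": 0.5})
except ValueError as err:
    print(err)
```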
codeclm/models/llama/convert_llama_weights_to_hf.py
ADDED
@@ -0,0 +1,318 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import os
import shutil
import warnings

import torch

from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer


try:
    from transformers import LlamaTokenizerFast
except ImportError as e:
    warnings.warn(e)
    warnings.warn(
        "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
    )
    LlamaTokenizerFast = None

"""
Sample usage:

```
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```

Thereafter, models can be loaded via:

```py
from transformers import LlamaForCausalLM, LlamaTokenizer

model = LlamaForCausalLM.from_pretrained("/output/path")
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
```

Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
"""

NUM_SHARDS = {
    "7B": 1,
    "7Bf": 1,
    "13B": 2,
    "13Bf": 2,
    "34B": 4,
    "30B": 4,
    "65B": 8,
    "70B": 8,
    "70Bf": 8,
}


def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)


def read_json(path):
    with open(path, "r") as f:
        return json.load(f)


def write_json(text, path):
    with open(path, "w") as f:
        json.dump(text, f)


def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True):
    # for backward compatibility, before you needed the repo to be called `my_repo/model_size`
    if not os.path.isfile(os.path.join(input_base_path, "params.json")):
        input_base_path = os.path.join(input_base_path, model_size)

    os.makedirs(model_path, exist_ok=True)
    tmp_model_path = os.path.join(model_path, "tmp")
    os.makedirs(tmp_model_path, exist_ok=True)

    params = read_json(os.path.join(input_base_path, "params.json"))
    num_shards = NUM_SHARDS[model_size]
    n_layers = params["n_layers"]
    n_heads = params["n_heads"]
    n_heads_per_shard = n_heads // num_shards
    dim = params["dim"]
    dims_per_head = dim // n_heads
    base = params.get("rope_theta", 10000.0)
    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
    if base > 10000.0:
        max_position_embeddings = 16384
    else:
        max_position_embeddings = 2048

    tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
    if tokenizer_path is not None:
        tokenizer = tokenizer_class(tokenizer_path)
        tokenizer.save_pretrained(model_path)
    vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000

    if "n_kv_heads" in params:
        num_key_value_heads = params["n_kv_heads"]  # for GQA / MQA
        num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
        key_value_dim = dim // num_key_value_heads
    else:  # compatibility with other checkpoints
        num_key_value_heads = n_heads
        num_local_key_value_heads = n_heads_per_shard
        key_value_dim = dim

    # permute for sliced rotary
    def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
    # Load weights
    if model_size == "7B":
        # Not sharded
        # (The sharded implementation would also work, but this is simpler.)
        loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
    else:
        # Sharded
        loaded = [
            torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
            for i in range(num_shards)
        ]
    param_count = 0
    index_dict = {"weight_map": {}}
    for layer_i in range(n_layers):
        filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
        if model_size == "7B":
            # Unsharded
            state_dict = {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                    loaded[f"layers.{layer_i}.attention.wq.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                    loaded[f"layers.{layer_i}.attention.wk.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
                f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
            }
        else:
            # Sharded
            # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share
            # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is
            # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned.

            state_dict = {
                f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
                    f"layers.{layer_i}.attention_norm.weight"
                ].clone(),
                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
                    f"layers.{layer_i}.ffn_norm.weight"
                ].clone(),
            }
            state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
                torch.cat(
                    [
                        loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
                        for i in range(num_shards)
                    ],
                    dim=0,
                ).reshape(dim, dim)
            )
            state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
                torch.cat(
                    [
                        loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(
                            num_local_key_value_heads, dims_per_head, dim
                        )
                        for i in range(num_shards)
                    ],
                    dim=0,
                ).reshape(key_value_dim, dim),
                num_key_value_heads,
                key_value_dim,
                dim,
            )
            state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
                [
                    loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(
                        num_local_key_value_heads, dims_per_head, dim
                    )
                    for i in range(num_shards)
                ],
                dim=0,
            ).reshape(key_value_dim, dim)

            state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
            )
            state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
            )
            state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
            )
            state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
            )

        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
        for k, v in state_dict.items():
            index_dict["weight_map"][k] = filename
            param_count += v.numel()
        torch.save(state_dict, os.path.join(tmp_model_path, filename))

    filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
    if model_size == "7B":
        # Unsharded
        state_dict = {
            "model.embed_tokens.weight": loaded["tok_embeddings.weight"],
            "model.norm.weight": loaded["norm.weight"],
            "lm_head.weight": loaded["output.weight"],
        }
    else:
        state_dict = {
            "model.norm.weight": loaded[0]["norm.weight"],
            "model.embed_tokens.weight": torch.cat(
                [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
            ),
            "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
        }

    for k, v in state_dict.items():
        index_dict["weight_map"][k] = filename
        param_count += v.numel()
    torch.save(state_dict, os.path.join(tmp_model_path, filename))

    # Write configs
    index_dict["metadata"] = {"total_size": param_count * 2}
    write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
    ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
    multiple_of = params["multiple_of"] if "multiple_of" in params else 256
    config = LlamaConfig(
        hidden_size=dim,
        intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
        num_attention_heads=params["n_heads"],
        num_hidden_layers=params["n_layers"],
        rms_norm_eps=params["norm_eps"],
        num_key_value_heads=num_key_value_heads,
        vocab_size=vocab_size,
        rope_theta=base,
        max_position_embeddings=max_position_embeddings,
    )
    config.save_pretrained(tmp_model_path)

    # Make space so we can load the model properly now.
    del state_dict
    del loaded
    gc.collect()

    print("Loading the checkpoint in a Llama model.")
    model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
    # Avoid saving this as part of the config.
    del model.config._name_or_path
    model.config.torch_dtype = torch.float16
    print("Saving in the Transformers format.")
    model.save_pretrained(model_path, safe_serialization=safe_serialization)
    shutil.rmtree(tmp_model_path)


def write_tokenizer(tokenizer_path, input_tokenizer_path):
    # Initialize the tokenizer based on the `spm` model
    tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
    print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
    tokenizer = tokenizer_class(input_tokenizer_path)
    tokenizer.save_pretrained(tokenizer_path)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        help="Location of LLaMA weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--model_size",
        choices=["7B", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"],
        help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
    )
    parser.add_argument(
        "--output_dir",
        help="Location to write HF model and tokenizer",
    )
    parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
    args = parser.parse_args()
    spm_path = os.path.join(args.input_dir, "tokenizer.model")
    if args.model_size != "tokenizer_only":
        write_model(
            model_path=args.output_dir,
            input_base_path=args.input_dir,
            model_size=args.model_size,
            safe_serialization=args.safe_serialization,
            tokenizer_path=spm_path,
        )
    else:
        write_tokenizer(args.output_dir, spm_path)


if __name__ == "__main__":
    main()
codeclm/models/llama/modeling_llama.py
ADDED
@@ -0,0 +1,1243 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
""" PyTorch LLaMA model."""
|
21 |
+
import math
|
22 |
+
from typing import List, Optional, Tuple, Union
|
23 |
+
|
24 |
+
import torch
|
25 |
+
import torch.nn.functional as F
|
26 |
+
import torch.utils.checkpoint
|
27 |
+
from torch import nn
|
28 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
29 |
+
|
30 |
+
from transformers.activations import ACT2FN
|
31 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
|
32 |
+
from transformers.modeling_utils import PreTrainedModel
|
33 |
+
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
|
34 |
+
from transformers.utils import (
|
35 |
+
add_start_docstrings,
|
36 |
+
add_start_docstrings_to_model_forward,
|
37 |
+
is_flash_attn_available,
|
38 |
+
logging,
|
39 |
+
replace_return_docstrings,
|
40 |
+
)
|
41 |
+
from .configuration_llama import LlamaConfig
|
42 |
+
|
43 |
+
|
44 |
+
if is_flash_attn_available():
|
45 |
+
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
46 |
+
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
|
47 |
+
|
48 |
+
|
49 |
+
logger = logging.get_logger(__name__)
|
50 |
+
|
51 |
+
_CONFIG_FOR_DOC = "LlamaConfig"
|
52 |
+
|
53 |
+
|
54 |
+
def _get_unpad_data(padding_mask):
|
55 |
+
seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
|
56 |
+
indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
|
57 |
+
max_seqlen_in_batch = seqlens_in_batch.max().item()
|
58 |
+
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
|
59 |
+
return (
|
60 |
+
indices,
|
61 |
+
cu_seqlens,
|
62 |
+
max_seqlen_in_batch,
|
63 |
+
)
|
64 |
+
|
65 |
+
|
66 |
+
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
|
67 |
+
def _make_causal_mask(
|
68 |
+
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
69 |
+
):
|
70 |
+
"""
|
71 |
+
Make causal mask used for bi-directional self-attention.
|
72 |
+
"""
|
73 |
+
bsz, tgt_len = input_ids_shape
|
74 |
+
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
|
75 |
+
mask_cond = torch.arange(mask.size(-1), device=device)
|
76 |
+
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
|
77 |
+
mask = mask.to(dtype)
|
78 |
+
|
79 |
+
if past_key_values_length > 0:
|
80 |
+
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
|
81 |
+
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
|
82 |
+
|
83 |
+
|
84 |
+
# Copied from transformers.models.bart.modeling_bart._expand_mask
|
85 |
+
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
86 |
+
"""
|
87 |
+
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
88 |
+
"""
|
89 |
+
bsz, src_len = mask.size()
|
90 |
+
tgt_len = tgt_len if tgt_len is not None else src_len
|
91 |
+
|
92 |
+
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
93 |
+
|
94 |
+
inverted_mask = 1.0 - expanded_mask
|
95 |
+
|
96 |
+
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
97 |
+
|
98 |
+
|
99 |
+
class LlamaRMSNorm(nn.Module):
|
100 |
+
def __init__(self, hidden_size, eps=1e-6):
|
101 |
+
"""
|
102 |
+
LlamaRMSNorm is equivalent to T5LayerNorm
|
103 |
+
"""
|
104 |
+
super().__init__()
|
105 |
+
self.weight = nn.Parameter(torch.ones(hidden_size))
|
106 |
+
self.variance_epsilon = eps
|
107 |
+
|
108 |
+
def forward(self, hidden_states):
|
109 |
+
input_dtype = hidden_states.dtype
|
110 |
+
hidden_states = hidden_states.to(torch.float32)
|
111 |
+
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
112 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
113 |
+
return self.weight * hidden_states.to(input_dtype)
|
114 |
+
|
115 |
+
|
116 |
+
ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
|
117 |
+
|
118 |
+
|
119 |
+
class LlamaRotaryEmbedding(nn.Module):
|
120 |
+
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
|
121 |
+
super().__init__()
|
122 |
+
|
123 |
+
self.dim = dim
|
124 |
+
self.max_position_embeddings = max_position_embeddings
|
125 |
+
self.base = base
|
126 |
+
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
|
127 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
128 |
+
|
129 |
+
# Build here to make `torch.jit.trace` work.
|
130 |
+
self._set_cos_sin_cache(
|
131 |
+
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
|
132 |
+
)
|
133 |
+
|
134 |
+
def _set_cos_sin_cache(self, seq_len, device, dtype):
|
135 |
+
self.max_seq_len_cached = seq_len
|
136 |
+
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
|
137 |
+
|
138 |
+
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
|
139 |
+
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
140 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
141 |
+
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
|
142 |
+
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
|
143 |
+
|
144 |
+
def forward(self, x, seq_len=None):
|
145 |
+
# x: [bs, num_attention_heads, seq_len, head_size]
|
146 |
+
if seq_len > self.max_seq_len_cached:
|
147 |
+
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
|
148 |
+
|
149 |
+
return (
|
150 |
+
self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
|
151 |
+
self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
|
152 |
+
)
|
153 |
+
|
154 |
+
|
155 |
+
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
|
156 |
+
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
|
157 |
+
|
158 |
+
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
159 |
+
self.scaling_factor = scaling_factor
|
160 |
+
super().__init__(dim, max_position_embeddings, base, device)
|
161 |
+
|
162 |
+
def _set_cos_sin_cache(self, seq_len, device, dtype):
|
163 |
+
self.max_seq_len_cached = seq_len
|
164 |
+
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
|
165 |
+
t = t / self.scaling_factor
|
166 |
+
|
167 |
+
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
|
168 |
+
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
169 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
170 |
+
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
|
171 |
+
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
|
172 |
+
|
173 |
+
|
174 |
+
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
|
175 |
+
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
|
176 |
+
|
177 |
+
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
178 |
+
self.scaling_factor = scaling_factor
|
179 |
+
super().__init__(dim, max_position_embeddings, base, device)
|
180 |
+
|
181 |
+
def _set_cos_sin_cache(self, seq_len, device, dtype):
|
182 |
+
self.max_seq_len_cached = seq_len
|
183 |
+
|
184 |
+
if seq_len > self.max_position_embeddings:
|
185 |
+
base = self.base * (
|
186 |
+
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
|
187 |
+
) ** (self.dim / (self.dim - 2))
|
188 |
+
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
|
189 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
190 |
+
|
191 |
+
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
|
192 |
+
|
193 |
+
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
|
194 |
+
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
195 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
196 |
+
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
|
197 |
+
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
|
198 |
+
|
199 |
+
|
200 |
+
def rotate_half(x):
|
201 |
+
"""Rotates half the hidden dims of the input."""
|
202 |
+
x1 = x[..., : x.shape[-1] // 2]
|
203 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
204 |
+
return torch.cat((-x2, x1), dim=-1)
|
205 |
+
|
206 |
+
|
207 |
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
|
208 |
+
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
|
209 |
+
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
|
210 |
+
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
|
211 |
+
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
|
212 |
+
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
|
213 |
+
q_embed = (q * cos) + (rotate_half(q) * sin)
|
214 |
+
k_embed = (k * cos) + (rotate_half(k) * sin)
|
215 |
+
return q_embed, k_embed
|
216 |
+
|
217 |
+
|
218 |
+
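As the repeated comment notes, the cache stores `[freqs, freqs]` rather than interleaving pairs as in the original RoPE formulation; `rotate_half` matches that layout by splitting the head dimension in half and concatenating `(-x2, x1)`. A small shape-level sketch of using the two helpers above (all sizes are illustrative):

```python
import torch

bs, n_heads, seq_len, head_dim = 2, 4, 16, 8
q = torch.randn(bs, n_heads, seq_len, head_dim)
k = torch.randn(bs, n_heads, seq_len, head_dim)

# Build cos/sin caches shaped [1, 1, seq_len, head_dim], as the classes above do.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.einsum("i,j->ij", torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
cos, sin = emb.cos()[None, None], emb.sin()[None, None]

position_ids = torch.arange(seq_len).unsqueeze(0).expand(bs, -1)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```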
class LlamaMLP(nn.Module):
|
219 |
+
def __init__(self, config):
|
220 |
+
super().__init__()
|
221 |
+
self.config = config
|
222 |
+
self.hidden_size = config.hidden_size
|
223 |
+
self.intermediate_size = config.intermediate_size
|
224 |
+
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
225 |
+
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
226 |
+
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
227 |
+
self.act_fn = ACT2FN[config.hidden_act]
|
228 |
+
|
229 |
+
def forward(self, x):
|
230 |
+
if self.config.pretraining_tp > 1:
|
231 |
+
slice = self.intermediate_size // self.config.pretraining_tp
|
232 |
+
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
|
233 |
+
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
|
234 |
+
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
|
235 |
+
|
236 |
+
gate_proj = torch.cat(
|
237 |
+
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
|
238 |
+
)
|
239 |
+
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
|
240 |
+
|
241 |
+
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
|
242 |
+
down_proj = [
|
243 |
+
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
|
244 |
+
]
|
245 |
+
down_proj = sum(down_proj)
|
246 |
+
else:
|
247 |
+
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
248 |
+
|
249 |
+
return down_proj
|
250 |
+
|
251 |
+
|
252 |
+
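When `config.pretraining_tp > 1`, the forward pass above splits `gate_proj`/`up_proj` along their output dimension and `down_proj` along its input dimension, then sums the partial results; up to floating-point accumulation order this reproduces the fused `down_proj(act_fn(gate_proj(x)) * up_proj(x))` path. A small equivalence check with plain linear layers and SiLU (sizes are illustrative, not the real config):

```python
import torch
import torch.nn.functional as F

hidden, inter, tp = 16, 32, 4
x = torch.randn(1, 5, hidden)
gate = torch.nn.Linear(hidden, inter, bias=False)
up = torch.nn.Linear(hidden, inter, bias=False)
down = torch.nn.Linear(inter, hidden, bias=False)

fused = down(F.silu(gate(x)) * up(x))

s = inter // tp
gate_w, up_w, down_w = gate.weight.split(s, 0), up.weight.split(s, 0), down.weight.split(s, 1)
act = F.silu(torch.cat([F.linear(x, w) for w in gate_w], dim=-1))
act = act * torch.cat([F.linear(x, w) for w in up_w], dim=-1)
parts = act.split(s, dim=2)
sliced = sum(F.linear(parts[i], down_w[i]) for i in range(tp))

assert torch.allclose(fused, sliced, atol=1e-5)
```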
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
253 |
+
"""
|
254 |
+
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
255 |
+
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
256 |
+
"""
|
257 |
+
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
258 |
+
if n_rep == 1:
|
259 |
+
return hidden_states
|
260 |
+
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
261 |
+
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
262 |
+
|
263 |
+
|
264 |
+
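The docstring of `repeat_kv` claims equivalence with `torch.repeat_interleave(x, dim=1, repeats=n_rep)`; a quick check of that claim with toy shapes:

```python
import torch

batch, kv_heads, seq, head_dim, n_rep = 2, 2, 5, 4, 3
x = torch.randn(batch, kv_heads, seq, head_dim)

expanded = repeat_kv(x, n_rep)                               # (batch, kv_heads * n_rep, seq, head_dim)
reference = torch.repeat_interleave(x, repeats=n_rep, dim=1)
assert torch.equal(expanded, reference)
```

In `LlamaAttention` below, this is how each key/value head is shared by `num_key_value_groups` query heads.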
class LlamaAttention(nn.Module):
|
265 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
266 |
+
|
267 |
+
def __init__(self, config: LlamaConfig):
|
268 |
+
super().__init__()
|
269 |
+
self.config = config
|
270 |
+
self.hidden_size = config.hidden_size
|
271 |
+
self.num_heads = config.num_attention_heads
|
272 |
+
self.head_dim = self.hidden_size // self.num_heads
|
273 |
+
self.num_key_value_heads = config.num_key_value_heads
|
274 |
+
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
275 |
+
self.max_position_embeddings = config.max_position_embeddings
|
276 |
+
self.rope_theta = config.rope_theta
|
277 |
+
|
278 |
+
if (self.head_dim * self.num_heads) != self.hidden_size:
|
279 |
+
raise ValueError(
|
280 |
+
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
|
281 |
+
f" and `num_heads`: {self.num_heads})."
|
282 |
+
)
|
283 |
+
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
|
284 |
+
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
285 |
+
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
|
286 |
+
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
|
287 |
+
self._init_rope()
|
288 |
+
|
289 |
+
def _init_rope(self):
|
290 |
+
if self.config.rope_scaling is None:
|
291 |
+
self.rotary_emb = LlamaRotaryEmbedding(
|
292 |
+
self.head_dim,
|
293 |
+
max_position_embeddings=self.max_position_embeddings,
|
294 |
+
base=self.rope_theta,
|
295 |
+
)
|
296 |
+
else:
|
297 |
+
scaling_type = self.config.rope_scaling["type"]
|
298 |
+
scaling_factor = self.config.rope_scaling["factor"]
|
299 |
+
if scaling_type == "linear":
|
300 |
+
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
|
301 |
+
self.head_dim,
|
302 |
+
max_position_embeddings=self.max_position_embeddings,
|
303 |
+
scaling_factor=scaling_factor,
|
304 |
+
base=self.rope_theta,
|
305 |
+
)
|
306 |
+
elif scaling_type == "dynamic":
|
307 |
+
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
|
308 |
+
self.head_dim,
|
309 |
+
max_position_embeddings=self.max_position_embeddings,
|
310 |
+
scaling_factor=scaling_factor,
|
311 |
+
base=self.rope_theta,
|
312 |
+
)
|
313 |
+
else:
|
314 |
+
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
|
315 |
+
|
316 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
317 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
318 |
+
|
319 |
+
def forward(
|
320 |
+
self,
|
321 |
+
hidden_states: torch.Tensor,
|
322 |
+
attention_mask: Optional[torch.Tensor] = None,
|
323 |
+
position_ids: Optional[torch.LongTensor] = None,
|
324 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
325 |
+
output_attentions: bool = False,
|
326 |
+
use_cache: bool = False,
|
327 |
+
padding_mask: Optional[torch.LongTensor] = None,
|
328 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
329 |
+
bsz, q_len, _ = hidden_states.size()
|
330 |
+
|
331 |
+
if self.config.pretraining_tp > 1:
|
332 |
+
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
|
333 |
+
query_slices = self.q_proj.weight.split(
|
334 |
+
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
|
335 |
+
)
|
336 |
+
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
|
337 |
+
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
|
338 |
+
|
339 |
+
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
|
340 |
+
query_states = torch.cat(query_states, dim=-1)
|
341 |
+
|
342 |
+
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
|
343 |
+
key_states = torch.cat(key_states, dim=-1)
|
344 |
+
|
345 |
+
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
|
346 |
+
value_states = torch.cat(value_states, dim=-1)
|
347 |
+
|
348 |
+
else:
|
349 |
+
query_states = self.q_proj(hidden_states)
|
350 |
+
key_states = self.k_proj(hidden_states)
|
351 |
+
value_states = self.v_proj(hidden_states)
|
352 |
+
|
353 |
+
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
|
354 |
+
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
355 |
+
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
356 |
+
|
357 |
+
kv_seq_len = key_states.shape[-2]
|
358 |
+
if past_key_value is not None:
|
359 |
+
kv_seq_len += past_key_value[0].shape[-2]
|
360 |
+
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
361 |
+
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
362 |
+
|
363 |
+
if past_key_value is not None:
|
364 |
+
# reuse k, v, self_attention
|
365 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
366 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
367 |
+
|
368 |
+
past_key_value = (key_states, value_states) if use_cache else None
|
369 |
+
|
370 |
+
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
371 |
+
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
372 |
+
|
373 |
+
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
|
374 |
+
|
375 |
+
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
|
376 |
+
raise ValueError(
|
377 |
+
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
|
378 |
+
f" {attn_weights.size()}"
|
379 |
+
)
|
380 |
+
|
381 |
+
if attention_mask is not None:
|
382 |
+
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
|
383 |
+
raise ValueError(
|
384 |
+
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
|
385 |
+
)
|
386 |
+
attn_weights = attn_weights + attention_mask
|
387 |
+
|
388 |
+
# upcast attention to fp32
|
389 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
|
390 |
+
attn_output = torch.matmul(attn_weights, value_states)
|
391 |
+
|
392 |
+
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
|
393 |
+
raise ValueError(
|
394 |
+
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
|
395 |
+
f" {attn_output.size()}"
|
396 |
+
)
|
397 |
+
|
398 |
+
attn_output = attn_output.transpose(1, 2).contiguous()
|
399 |
+
|
400 |
+
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
401 |
+
|
402 |
+
if self.config.pretraining_tp > 1:
|
403 |
+
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
|
404 |
+
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
|
405 |
+
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
|
406 |
+
else:
|
407 |
+
attn_output = self.o_proj(attn_output)
|
408 |
+
|
409 |
+
if not output_attentions:
|
410 |
+
attn_weights = None
|
411 |
+
|
412 |
+
return attn_output, attn_weights, past_key_value
|
413 |
+
|
414 |
+
|
415 |
+
class LlamaFlashAttention2(LlamaAttention):
|
416 |
+
"""
|
417 |
+
Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
|
418 |
+
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
419 |
+
flash attention and deal with padding tokens in case the input contains any of them.
|
420 |
+
"""
|
421 |
+
|
422 |
+
def forward(
|
423 |
+
self,
|
424 |
+
hidden_states: torch.Tensor,
|
425 |
+
attention_mask: Optional[torch.Tensor] = None,
|
426 |
+
position_ids: Optional[torch.LongTensor] = None,
|
427 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
428 |
+
output_attentions: bool = False,
|
429 |
+
use_cache: bool = False,
|
430 |
+
padding_mask: Optional[torch.LongTensor] = None,
|
431 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
432 |
+
# LlamaFlashAttention2 attention does not support output_attentions
|
433 |
+
output_attentions = False
|
434 |
+
|
435 |
+
bsz, q_len, _ = hidden_states.size()
|
436 |
+
|
437 |
+
query_states = self.q_proj(hidden_states)
|
438 |
+
key_states = self.k_proj(hidden_states)
|
439 |
+
value_states = self.v_proj(hidden_states)
|
440 |
+
|
441 |
+
# Flash attention requires the input to have the shape
|
442 |
+
# batch_size x seq_length x num_heads x head_dim
|
443 |
+
# therefore we just need to keep the original shape
|
444 |
+
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
|
445 |
+
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
446 |
+
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
447 |
+
|
448 |
+
kv_seq_len = key_states.shape[-2]
|
449 |
+
if past_key_value is not None:
|
450 |
+
kv_seq_len += past_key_value[0].shape[-2]
|
451 |
+
|
452 |
+
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
453 |
+
|
454 |
+
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
455 |
+
|
456 |
+
if past_key_value is not None:
|
457 |
+
# reuse k, v, self_attention
|
458 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
459 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
460 |
+
|
461 |
+
past_key_value = (key_states, value_states) if use_cache else None
|
462 |
+
|
463 |
+
query_states = query_states.transpose(1, 2)
|
464 |
+
key_states = key_states.transpose(1, 2)
|
465 |
+
value_states = value_states.transpose(1, 2)
|
466 |
+
|
467 |
+
# TODO: llama does not have dropout in the config??
|
468 |
+
# It is recommended to use dropout with FA according to the docs
|
469 |
+
# when training.
|
470 |
+
dropout_rate = 0.0 # if not self.training else self.attn_dropout
|
471 |
+
|
472 |
+
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
|
473 |
+
# therefore the input hidden states get silently cast to float32. Hence, we need to
|
474 |
+
# cast them back to float16 just to be sure everything works as expected.
|
475 |
+
# This might slow down training & inference, so it is recommended not to cast the LayerNorms
|
476 |
+
# in fp32. (LlamaRMSNorm handles it correctly)
|
477 |
+
input_dtype = query_states.dtype
|
478 |
+
if input_dtype == torch.float32:
|
479 |
+
logger.warning_once(
|
480 |
+
"The input hidden states seems to be silently casted in float32, this might be related to"
|
481 |
+
" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
|
482 |
+
" float16."
|
483 |
+
)
|
484 |
+
|
485 |
+
query_states = query_states.to(torch.float16)
|
486 |
+
key_states = key_states.to(torch.float16)
|
487 |
+
value_states = value_states.to(torch.float16)
|
488 |
+
|
489 |
+
attn_output = self._flash_attention_forward(
|
490 |
+
query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate
|
491 |
+
)
|
492 |
+
|
493 |
+
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
|
494 |
+
attn_output = self.o_proj(attn_output)
|
495 |
+
|
496 |
+
if not output_attentions:
|
497 |
+
attn_weights = None
|
498 |
+
|
499 |
+
return attn_output, attn_weights, past_key_value
|
500 |
+
|
501 |
+
def _flash_attention_forward(
|
502 |
+
self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None
|
503 |
+
):
|
504 |
+
"""
|
505 |
+
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
|
506 |
+
first unpads the input, then computes the attention scores and pads the final attention scores.
|
507 |
+
|
508 |
+
Args:
|
509 |
+
query_states (`torch.Tensor`):
|
510 |
+
Input query states to be passed to Flash Attention API
|
511 |
+
key_states (`torch.Tensor`):
|
512 |
+
Input key states to be passed to Flash Attention API
|
513 |
+
value_states (`torch.Tensor`):
|
514 |
+
Input value states to be passed to Flash Attention API
|
515 |
+
padding_mask (`torch.Tensor`):
|
516 |
+
The padding mask: a tensor of size `(batch_size, seq_len)` where 0 stands for the
|
517 |
+
position of padding tokens and 1 for the position of non-padding tokens.
|
518 |
+
dropout (`float`, *optional*):
|
519 |
+
Attention dropout
|
520 |
+
softmax_scale (`float`, *optional*):
|
521 |
+
The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
|
522 |
+
"""
|
523 |
+
# Contains at least one padding token in the sequence
|
524 |
+
if padding_mask is not None:
|
525 |
+
batch_size = query_states.shape[0]
|
526 |
+
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
|
527 |
+
query_states, key_states, value_states, padding_mask, query_length
|
528 |
+
)
|
529 |
+
|
530 |
+
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
|
531 |
+
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
|
532 |
+
|
533 |
+
attn_output_unpad = flash_attn_varlen_func(
|
534 |
+
query_states,
|
535 |
+
key_states,
|
536 |
+
value_states,
|
537 |
+
cu_seqlens_q=cu_seqlens_q,
|
538 |
+
cu_seqlens_k=cu_seqlens_k,
|
539 |
+
max_seqlen_q=max_seqlen_in_batch_q,
|
540 |
+
max_seqlen_k=max_seqlen_in_batch_k,
|
541 |
+
dropout_p=dropout,
|
542 |
+
softmax_scale=softmax_scale,
|
543 |
+
causal=True,
|
544 |
+
)
|
545 |
+
|
546 |
+
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
|
547 |
+
else:
|
548 |
+
attn_output = flash_attn_func(
|
549 |
+
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True
|
550 |
+
)
|
551 |
+
|
552 |
+
return attn_output
|
553 |
+
|
554 |
+
def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length):
|
555 |
+
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask)
|
556 |
+
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
|
557 |
+
|
558 |
+
key_layer = index_first_axis(
|
559 |
+
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
|
560 |
+
)
|
561 |
+
value_layer = index_first_axis(
|
562 |
+
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
|
563 |
+
)
|
564 |
+
if query_length == kv_seq_len:
|
565 |
+
query_layer = index_first_axis(
|
566 |
+
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
|
567 |
+
)
|
568 |
+
cu_seqlens_q = cu_seqlens_k
|
569 |
+
max_seqlen_in_batch_q = max_seqlen_in_batch_k
|
570 |
+
indices_q = indices_k
|
571 |
+
elif query_length == 1:
|
572 |
+
max_seqlen_in_batch_q = 1
|
573 |
+
cu_seqlens_q = torch.arange(
|
574 |
+
batch_size + 1, dtype=torch.int32, device=query_layer.device
|
575 |
+
) # There is a memcpy here, that is very bad.
|
576 |
+
indices_q = cu_seqlens_q[:-1]
|
577 |
+
query_layer = query_layer.squeeze(1)
|
578 |
+
else:
|
579 |
+
# The -q_len: slice assumes left padding.
|
580 |
+
padding_mask = padding_mask[:, -query_length:]
|
581 |
+
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask)
|
582 |
+
|
583 |
+
return (
|
584 |
+
query_layer,
|
585 |
+
key_layer,
|
586 |
+
value_layer,
|
587 |
+
indices_q,
|
588 |
+
(cu_seqlens_q, cu_seqlens_k),
|
589 |
+
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
|
590 |
+
)
|
591 |
+
|
592 |
+
|
593 |
+
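`_flash_attention_forward` above only unpads when a `padding_mask` is present: `_upad_input` drops the padding rows and hands `flash_attn_varlen_func` cumulative sequence lengths instead of a dense mask. A minimal sketch of that bookkeeping (this mirrors the idea behind the `_get_unpad_data` helper used above, not its exact implementation):

```python
import torch
import torch.nn.functional as F

# 1 = real token, 0 = padding (left padding, as the `-query_length:` slice assumes)
padding_mask = torch.tensor([[0, 0, 1, 1, 1],
                             [0, 1, 1, 1, 1]])

seqlens = padding_mask.sum(dim=-1, dtype=torch.int32)        # tokens per sample: [3, 4]
indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

print(cu_seqlens)  # tensor([0, 3, 7], dtype=torch.int32)
print(indices)     # flat positions of the 7 real tokens in the flattened batch
```

`cu_seqlens_q` / `cu_seqlens_k` plus the `max_seqlen_*` values are what the varlen kernel needs to treat the surviving tokens as independent sequences.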
class LlamaDecoderLayer(nn.Module):
|
594 |
+
def __init__(self, config: LlamaConfig):
|
595 |
+
super().__init__()
|
596 |
+
self.hidden_size = config.hidden_size
|
597 |
+
self.self_attn = (
|
598 |
+
LlamaAttention(config=config)
|
599 |
+
if not getattr(config, "_flash_attn_2_enabled", False)
|
600 |
+
else LlamaFlashAttention2(config=config)
|
601 |
+
)
|
602 |
+
self.mlp = LlamaMLP(config)
|
603 |
+
self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
604 |
+
self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
605 |
+
|
606 |
+
def forward(
|
607 |
+
self,
|
608 |
+
hidden_states: torch.Tensor,
|
609 |
+
attention_mask: Optional[torch.Tensor] = None,
|
610 |
+
position_ids: Optional[torch.LongTensor] = None,
|
611 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
612 |
+
output_attentions: Optional[bool] = False,
|
613 |
+
use_cache: Optional[bool] = False,
|
614 |
+
padding_mask: Optional[torch.LongTensor] = None,
|
615 |
+
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
616 |
+
"""
|
617 |
+
Args:
|
618 |
+
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
619 |
+
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
|
620 |
+
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
621 |
+
output_attentions (`bool`, *optional*):
|
622 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
623 |
+
returned tensors for more detail.
|
624 |
+
use_cache (`bool`, *optional*):
|
625 |
+
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
626 |
+
(see `past_key_values`).
|
627 |
+
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
628 |
+
"""
|
629 |
+
|
630 |
+
residual = hidden_states
|
631 |
+
|
632 |
+
hidden_states = self.input_layernorm(hidden_states)
|
633 |
+
|
634 |
+
# Self Attention
|
635 |
+
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
636 |
+
hidden_states=hidden_states,
|
637 |
+
attention_mask=attention_mask,
|
638 |
+
position_ids=position_ids,
|
639 |
+
past_key_value=past_key_value,
|
640 |
+
output_attentions=output_attentions,
|
641 |
+
use_cache=use_cache,
|
642 |
+
padding_mask=padding_mask,
|
643 |
+
)
|
644 |
+
hidden_states = residual + hidden_states
|
645 |
+
|
646 |
+
# Fully Connected
|
647 |
+
residual = hidden_states
|
648 |
+
hidden_states = self.post_attention_layernorm(hidden_states)
|
649 |
+
hidden_states = self.mlp(hidden_states)
|
650 |
+
hidden_states = residual + hidden_states
|
651 |
+
|
652 |
+
outputs = (hidden_states,)
|
653 |
+
|
654 |
+
if output_attentions:
|
655 |
+
outputs += (self_attn_weights,)
|
656 |
+
|
657 |
+
if use_cache:
|
658 |
+
outputs += (present_key_value,)
|
659 |
+
|
660 |
+
return outputs
|
661 |
+
|
662 |
+
|
663 |
+
LLAMA_START_DOCSTRING = r"""
|
664 |
+
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
665 |
+
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
|
666 |
+
etc.)
|
667 |
+
|
668 |
+
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
669 |
+
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
|
670 |
+
and behavior.
|
671 |
+
|
672 |
+
Parameters:
|
673 |
+
config ([`LlamaConfig`]):
|
674 |
+
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
675 |
+
load the weights associated with the model, only the configuration. Check out the
|
676 |
+
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
677 |
+
"""
|
678 |
+
|
679 |
+
|
680 |
+
@add_start_docstrings(
|
681 |
+
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
|
682 |
+
LLAMA_START_DOCSTRING,
|
683 |
+
)
|
684 |
+
class LlamaPreTrainedModel(PreTrainedModel):
|
685 |
+
config_class = LlamaConfig
|
686 |
+
base_model_prefix = "model"
|
687 |
+
supports_gradient_checkpointing = True
|
688 |
+
_no_split_modules = ["LlamaDecoderLayer"]
|
689 |
+
_skip_keys_device_placement = "past_key_values"
|
690 |
+
_supports_flash_attn_2 = True
|
691 |
+
|
692 |
+
def _init_weights(self, module):
|
693 |
+
std = self.config.initializer_range
|
694 |
+
if isinstance(module, nn.Linear):
|
695 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
696 |
+
if module.bias is not None:
|
697 |
+
module.bias.data.zero_()
|
698 |
+
elif isinstance(module, nn.Embedding):
|
699 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
700 |
+
if module.padding_idx is not None:
|
701 |
+
module.weight.data[module.padding_idx].zero_()
|
702 |
+
|
703 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
704 |
+
if isinstance(module, LlamaModel):
|
705 |
+
module.gradient_checkpointing = value
|
706 |
+
|
707 |
+
|
708 |
+
LLAMA_INPUTS_DOCSTRING = r"""
|
709 |
+
Args:
|
710 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
711 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
712 |
+
it.
|
713 |
+
|
714 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
715 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
716 |
+
|
717 |
+
[What are input IDs?](../glossary#input-ids)
|
718 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
719 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
720 |
+
|
721 |
+
- 1 for tokens that are **not masked**,
|
722 |
+
- 0 for tokens that are **masked**.
|
723 |
+
|
724 |
+
[What are attention masks?](../glossary#attention-mask)
|
725 |
+
|
726 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
727 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
728 |
+
|
729 |
+
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
730 |
+
`past_key_values`).
|
731 |
+
|
732 |
+
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
733 |
+
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
734 |
+
information on the default strategy.
|
735 |
+
|
736 |
+
- 1 indicates the head is **not masked**,
|
737 |
+
- 0 indicates the head is **masked**.
|
738 |
+
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
739 |
+
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
740 |
+
config.n_positions - 1]`.
|
741 |
+
|
742 |
+
[What are position IDs?](../glossary#position-ids)
|
743 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
744 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
745 |
+
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
746 |
+
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
747 |
+
|
748 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
749 |
+
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
750 |
+
|
751 |
+
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
752 |
+
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
753 |
+
of shape `(batch_size, sequence_length)`.
|
754 |
+
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
755 |
+
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
756 |
+
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
757 |
+
model's internal embedding lookup matrix.
|
758 |
+
use_cache (`bool`, *optional*):
|
759 |
+
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
760 |
+
`past_key_values`).
|
761 |
+
output_attentions (`bool`, *optional*):
|
762 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
763 |
+
tensors for more detail.
|
764 |
+
output_hidden_states (`bool`, *optional*):
|
765 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
766 |
+
more detail.
|
767 |
+
return_dict (`bool`, *optional*):
|
768 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
769 |
+
"""
|
770 |
+
|
771 |
+
|
772 |
+
@add_start_docstrings(
|
773 |
+
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
|
774 |
+
LLAMA_START_DOCSTRING,
|
775 |
+
)
|
776 |
+
class LlamaModel(LlamaPreTrainedModel):
|
777 |
+
"""
|
778 |
+
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
|
779 |
+
|
780 |
+
Args:
|
781 |
+
config: LlamaConfig
|
782 |
+
"""
|
783 |
+
|
784 |
+
def __init__(self, config: LlamaConfig):
|
785 |
+
super().__init__(config)
|
786 |
+
self.padding_idx = config.pad_token_id
|
787 |
+
self.vocab_size = config.vocab_size
|
788 |
+
|
789 |
+
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
790 |
+
self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
|
791 |
+
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
792 |
+
|
793 |
+
self.gradient_checkpointing = False
|
794 |
+
# Initialize weights and apply final processing
|
795 |
+
self.post_init()
|
796 |
+
|
797 |
+
def get_input_embeddings(self):
|
798 |
+
return self.embed_tokens
|
799 |
+
|
800 |
+
def set_input_embeddings(self, value):
|
801 |
+
self.embed_tokens = value
|
802 |
+
|
803 |
+
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
|
804 |
+
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
|
805 |
+
# create causal mask
|
806 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
807 |
+
combined_attention_mask = None
|
808 |
+
if input_shape[-1] > 1:
|
809 |
+
combined_attention_mask = _make_causal_mask(
|
810 |
+
input_shape,
|
811 |
+
inputs_embeds.dtype,
|
812 |
+
device=inputs_embeds.device,
|
813 |
+
past_key_values_length=past_key_values_length,
|
814 |
+
)
|
815 |
+
|
816 |
+
if attention_mask is not None:
|
817 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
818 |
+
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
|
819 |
+
inputs_embeds.device
|
820 |
+
)
|
821 |
+
combined_attention_mask = (
|
822 |
+
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
823 |
+
)
|
824 |
+
|
825 |
+
return combined_attention_mask
|
826 |
+
|
827 |
+
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
828 |
+
def forward(
|
829 |
+
self,
|
830 |
+
input_ids: torch.LongTensor = None,
|
831 |
+
attention_mask: Optional[torch.Tensor] = None,
|
832 |
+
position_ids: Optional[torch.LongTensor] = None,
|
833 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
834 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
835 |
+
use_cache: Optional[bool] = None,
|
836 |
+
output_attentions: Optional[bool] = None,
|
837 |
+
output_hidden_states: Optional[bool] = None,
|
838 |
+
return_dict: Optional[bool] = None,
|
839 |
+
) -> Union[Tuple, BaseModelOutputWithPast]:
|
840 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
841 |
+
output_hidden_states = (
|
842 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
843 |
+
)
|
844 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
845 |
+
|
846 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
847 |
+
|
848 |
+
# retrieve input_ids and inputs_embeds
|
849 |
+
if input_ids is not None and inputs_embeds is not None:
|
850 |
+
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
851 |
+
elif input_ids is not None:
|
852 |
+
batch_size, seq_length = input_ids.shape
|
853 |
+
elif inputs_embeds is not None:
|
854 |
+
batch_size, seq_length, _ = inputs_embeds.shape
|
855 |
+
else:
|
856 |
+
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
857 |
+
|
858 |
+
seq_length_with_past = seq_length
|
859 |
+
past_key_values_length = 0
|
860 |
+
|
861 |
+
if past_key_values is not None:
|
862 |
+
past_key_values_length = past_key_values[0][0].shape[2]
|
863 |
+
seq_length_with_past = seq_length_with_past + past_key_values_length
|
864 |
+
|
865 |
+
if position_ids is None:
|
866 |
+
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
867 |
+
position_ids = torch.arange(
|
868 |
+
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
|
869 |
+
)
|
870 |
+
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
|
871 |
+
else:
|
872 |
+
position_ids = position_ids.view(-1, seq_length).long()
|
873 |
+
|
874 |
+
if inputs_embeds is None:
|
875 |
+
inputs_embeds = self.embed_tokens(input_ids)
|
876 |
+
# embed positions
|
877 |
+
if attention_mask is None:
|
878 |
+
attention_mask = torch.ones(
|
879 |
+
(batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
|
880 |
+
)
|
881 |
+
padding_mask = None
|
882 |
+
else:
|
883 |
+
if 0 in attention_mask:
|
884 |
+
padding_mask = attention_mask
|
885 |
+
else:
|
886 |
+
padding_mask = None
|
887 |
+
|
888 |
+
attention_mask = self._prepare_decoder_attention_mask(
|
889 |
+
attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
|
890 |
+
)
|
891 |
+
|
892 |
+
hidden_states = inputs_embeds
|
893 |
+
|
894 |
+
if self.gradient_checkpointing and self.training:
|
895 |
+
if use_cache:
|
896 |
+
logger.warning_once(
|
897 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
898 |
+
)
|
899 |
+
use_cache = False
|
900 |
+
|
901 |
+
# decoder layers
|
902 |
+
all_hidden_states = () if output_hidden_states else None
|
903 |
+
all_self_attns = () if output_attentions else None
|
904 |
+
next_decoder_cache = () if use_cache else None
|
905 |
+
|
906 |
+
for idx, decoder_layer in enumerate(self.layers):
|
907 |
+
if output_hidden_states:
|
908 |
+
all_hidden_states += (hidden_states,)
|
909 |
+
|
910 |
+
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
911 |
+
|
912 |
+
if self.gradient_checkpointing and self.training:
|
913 |
+
|
914 |
+
def create_custom_forward(module):
|
915 |
+
def custom_forward(*inputs):
|
916 |
+
# None for past_key_value
|
917 |
+
return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask)
|
918 |
+
|
919 |
+
return custom_forward
|
920 |
+
|
921 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
922 |
+
create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids
|
923 |
+
)
|
924 |
+
else:
|
925 |
+
layer_outputs = decoder_layer(
|
926 |
+
hidden_states,
|
927 |
+
attention_mask=attention_mask,
|
928 |
+
position_ids=position_ids,
|
929 |
+
past_key_value=past_key_value,
|
930 |
+
output_attentions=output_attentions,
|
931 |
+
use_cache=use_cache,
|
932 |
+
padding_mask=padding_mask,
|
933 |
+
)
|
934 |
+
|
935 |
+
hidden_states = layer_outputs[0]
|
936 |
+
|
937 |
+
if use_cache:
|
938 |
+
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
|
939 |
+
|
940 |
+
if output_attentions:
|
941 |
+
all_self_attns += (layer_outputs[1],)
|
942 |
+
|
943 |
+
hidden_states = self.norm(hidden_states)
|
944 |
+
|
945 |
+
# add hidden states from the last decoder layer
|
946 |
+
if output_hidden_states:
|
947 |
+
all_hidden_states += (hidden_states,)
|
948 |
+
|
949 |
+
next_cache = next_decoder_cache if use_cache else None
|
950 |
+
if not return_dict:
|
951 |
+
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
952 |
+
return BaseModelOutputWithPast(
|
953 |
+
last_hidden_state=hidden_states,
|
954 |
+
past_key_values=next_cache,
|
955 |
+
hidden_states=all_hidden_states,
|
956 |
+
attentions=all_self_attns,
|
957 |
+
)
|
958 |
+
|
959 |
+
|
960 |
+
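`LlamaModel._prepare_decoder_attention_mask` merges the causal mask with the expanded padding mask into a single additive float mask of shape `(bsz, 1, tgt_len, src_len)`, which `LlamaAttention` simply adds to the raw attention scores before the softmax. A toy sketch of that combination (the real `_make_causal_mask` / `_expand_mask` helpers live earlier in this file; this only mirrors their effect):

```python
import torch

seq_len = 4
neg = torch.finfo(torch.float32).min

# Causal part: query i may only attend to keys <= i (0 = allowed, `neg` = blocked).
causal = torch.full((seq_len, seq_len), neg).triu(diagonal=1)

# Padding part: block key position 0 for every query (one left-pad token).
key_is_real = torch.tensor([False, True, True, True])
padding = (~key_is_real).float() * neg            # [src_len], broadcasts over query rows

combined = (causal + padding).clamp_min(neg)[None, None]   # [1, 1, tgt_len, src_len]
# In LlamaAttention.forward: attn_weights = attn_weights + combined
```

Query rows that correspond to padding tokens end up uniformly masked; their outputs are typically ignored downstream.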
class LlamaForCausalLM(LlamaPreTrainedModel):
|
961 |
+
_tied_weights_keys = ["lm_head.weight"]
|
962 |
+
|
963 |
+
def __init__(self, config):
|
964 |
+
super().__init__(config)
|
965 |
+
self.model = LlamaModel(config)
|
966 |
+
self.vocab_size = config.vocab_size
|
967 |
+
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
968 |
+
|
969 |
+
# Initialize weights and apply final processing
|
970 |
+
self.post_init()
|
971 |
+
|
972 |
+
def get_input_embeddings(self):
|
973 |
+
return self.model.embed_tokens
|
974 |
+
|
975 |
+
def set_input_embeddings(self, value):
|
976 |
+
self.model.embed_tokens = value
|
977 |
+
|
978 |
+
def get_output_embeddings(self):
|
979 |
+
return self.lm_head
|
980 |
+
|
981 |
+
def set_output_embeddings(self, new_embeddings):
|
982 |
+
self.lm_head = new_embeddings
|
983 |
+
|
984 |
+
def set_decoder(self, decoder):
|
985 |
+
self.model = decoder
|
986 |
+
|
987 |
+
def get_decoder(self):
|
988 |
+
return self.model
|
989 |
+
|
990 |
+
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
991 |
+
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
992 |
+
def forward(
|
993 |
+
self,
|
994 |
+
input_ids: torch.LongTensor = None,
|
995 |
+
attention_mask: Optional[torch.Tensor] = None,
|
996 |
+
position_ids: Optional[torch.LongTensor] = None,
|
997 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
998 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
999 |
+
labels: Optional[torch.LongTensor] = None,
|
1000 |
+
use_cache: Optional[bool] = None,
|
1001 |
+
output_attentions: Optional[bool] = None,
|
1002 |
+
output_hidden_states: Optional[bool] = None,
|
1003 |
+
return_dict: Optional[bool] = None,
|
1004 |
+
) -> Union[Tuple, CausalLMOutputWithPast]:
|
1005 |
+
r"""
|
1006 |
+
Args:
|
1007 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1008 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
1009 |
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
1010 |
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
1011 |
+
|
1012 |
+
Returns:
|
1013 |
+
|
1014 |
+
Example:
|
1015 |
+
|
1016 |
+
```python
|
1017 |
+
>>> from transformers import AutoTokenizer, LlamaForCausalLM
|
1018 |
+
|
1019 |
+
>>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
|
1020 |
+
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
1021 |
+
|
1022 |
+
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
1023 |
+
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
1024 |
+
|
1025 |
+
>>> # Generate
|
1026 |
+
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
1027 |
+
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
1028 |
+
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
|
1029 |
+
```"""
|
1030 |
+
|
1031 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
1032 |
+
output_hidden_states = (
|
1033 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
1034 |
+
)
|
1035 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1036 |
+
|
1037 |
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
1038 |
+
outputs = self.model(
|
1039 |
+
input_ids=input_ids,
|
1040 |
+
attention_mask=attention_mask,
|
1041 |
+
position_ids=position_ids,
|
1042 |
+
past_key_values=past_key_values,
|
1043 |
+
inputs_embeds=inputs_embeds,
|
1044 |
+
use_cache=use_cache,
|
1045 |
+
output_attentions=output_attentions,
|
1046 |
+
output_hidden_states=output_hidden_states,
|
1047 |
+
return_dict=return_dict,
|
1048 |
+
)
|
1049 |
+
|
1050 |
+
hidden_states = outputs[0]
|
1051 |
+
if self.config.pretraining_tp > 1:
|
1052 |
+
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
|
1053 |
+
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
|
1054 |
+
logits = torch.cat(logits, dim=-1)
|
1055 |
+
else:
|
1056 |
+
logits = self.lm_head(hidden_states)
|
1057 |
+
logits = logits.float()
|
1058 |
+
|
1059 |
+
loss = None
|
1060 |
+
if labels is not None:
|
1061 |
+
# Shift so that tokens < n predict n
|
1062 |
+
shift_logits = logits[..., :-1, :].contiguous()
|
1063 |
+
shift_labels = labels[..., 1:].contiguous()
|
1064 |
+
# Flatten the tokens
|
1065 |
+
loss_fct = CrossEntropyLoss()
|
1066 |
+
shift_logits = shift_logits.view(-1, self.config.vocab_size)
|
1067 |
+
shift_labels = shift_labels.view(-1)
|
1068 |
+
# Enable model parallelism
|
1069 |
+
shift_labels = shift_labels.to(shift_logits.device)
|
1070 |
+
loss = loss_fct(shift_logits, shift_labels)
|
1071 |
+
|
1072 |
+
if not return_dict:
|
1073 |
+
output = (logits,) + outputs[1:]
|
1074 |
+
return (loss,) + output if loss is not None else output
|
1075 |
+
|
1076 |
+
return CausalLMOutputWithPast(
|
1077 |
+
loss=loss,
|
1078 |
+
logits=logits,
|
1079 |
+
past_key_values=outputs.past_key_values,
|
1080 |
+
hidden_states=outputs.hidden_states,
|
1081 |
+
attentions=outputs.attentions,
|
1082 |
+
)
|
1083 |
+
|
1084 |
+
def prepare_inputs_for_generation(
|
1085 |
+
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
|
1086 |
+
):
|
1087 |
+
if past_key_values:
|
1088 |
+
input_ids = input_ids[:, -1:]
|
1089 |
+
|
1090 |
+
position_ids = kwargs.get("position_ids", None)
|
1091 |
+
if attention_mask is not None and position_ids is None:
|
1092 |
+
# create position_ids on the fly for batch generation
|
1093 |
+
position_ids = attention_mask.long().cumsum(-1) - 1
|
1094 |
+
position_ids.masked_fill_(attention_mask == 0, 1)
|
1095 |
+
if past_key_values:
|
1096 |
+
position_ids = position_ids[:, -1].unsqueeze(-1)
|
1097 |
+
|
1098 |
+
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
1099 |
+
if inputs_embeds is not None and past_key_values is None:
|
1100 |
+
model_inputs = {"inputs_embeds": inputs_embeds}
|
1101 |
+
else:
|
1102 |
+
model_inputs = {"input_ids": input_ids}
|
1103 |
+
|
1104 |
+
model_inputs.update(
|
1105 |
+
{
|
1106 |
+
"position_ids": position_ids,
|
1107 |
+
"past_key_values": past_key_values,
|
1108 |
+
"use_cache": kwargs.get("use_cache"),
|
1109 |
+
"attention_mask": attention_mask,
|
1110 |
+
}
|
1111 |
+
)
|
1112 |
+
return model_inputs
|
1113 |
+
|
1114 |
+
@staticmethod
|
1115 |
+
def _reorder_cache(past_key_values, beam_idx):
|
1116 |
+
reordered_past = ()
|
1117 |
+
for layer_past in past_key_values:
|
1118 |
+
reordered_past += (
|
1119 |
+
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
|
1120 |
+
)
|
1121 |
+
return reordered_past
|
1122 |
+
|
1123 |
+
|
1124 |
+
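The loss computation in `LlamaForCausalLM.forward` shifts logits and labels by one position so that the logit at step `t` is scored against the token at step `t + 1`, with `-100` labels ignored by `CrossEntropyLoss`. A tiny sketch of that shift (vocabulary size and token ids are made up):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 10, 5
logits = torch.randn(1, seq_len, vocab_size)
labels = torch.tensor([[3, 7, 2, 9, -100]])       # -100 = ignored position

shift_logits = logits[..., :-1, :].contiguous()   # predictions made at steps 0..3
shift_labels = labels[..., 1:].contiguous()       # targets are the next tokens, steps 1..4

loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
```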
@add_start_docstrings(
|
1125 |
+
"""
|
1126 |
+
The LLaMa Model transformer with a sequence classification head on top (linear layer).
|
1127 |
+
|
1128 |
+
[`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
|
1129 |
+
(e.g. GPT-2) do.
|
1130 |
+
|
1131 |
+
Since it does classification on the last token, it needs to know the position of the last token. If a
|
1132 |
+
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
|
1133 |
+
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
|
1134 |
+
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
|
1135 |
+
each row of the batch).
|
1136 |
+
""",
|
1137 |
+
LLAMA_START_DOCSTRING,
|
1138 |
+
)
|
1139 |
+
class LlamaForSequenceClassification(LlamaPreTrainedModel):
|
1140 |
+
def __init__(self, config):
|
1141 |
+
super().__init__(config)
|
1142 |
+
self.num_labels = config.num_labels
|
1143 |
+
self.model = LlamaModel(config)
|
1144 |
+
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
|
1145 |
+
|
1146 |
+
# Initialize weights and apply final processing
|
1147 |
+
self.post_init()
|
1148 |
+
|
1149 |
+
def get_input_embeddings(self):
|
1150 |
+
return self.model.embed_tokens
|
1151 |
+
|
1152 |
+
def set_input_embeddings(self, value):
|
1153 |
+
self.model.embed_tokens = value
|
1154 |
+
|
1155 |
+
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
1156 |
+
def forward(
|
1157 |
+
self,
|
1158 |
+
input_ids: torch.LongTensor = None,
|
1159 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1160 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1161 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
1162 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1163 |
+
labels: Optional[torch.LongTensor] = None,
|
1164 |
+
use_cache: Optional[bool] = None,
|
1165 |
+
output_attentions: Optional[bool] = None,
|
1166 |
+
output_hidden_states: Optional[bool] = None,
|
1167 |
+
return_dict: Optional[bool] = None,
|
1168 |
+
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
|
1169 |
+
r"""
|
1170 |
+
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
1171 |
+
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
1172 |
+
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
1173 |
+
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
1174 |
+
"""
|
1175 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1176 |
+
|
1177 |
+
transformer_outputs = self.model(
|
1178 |
+
input_ids,
|
1179 |
+
attention_mask=attention_mask,
|
1180 |
+
position_ids=position_ids,
|
1181 |
+
past_key_values=past_key_values,
|
1182 |
+
inputs_embeds=inputs_embeds,
|
1183 |
+
use_cache=use_cache,
|
1184 |
+
output_attentions=output_attentions,
|
1185 |
+
output_hidden_states=output_hidden_states,
|
1186 |
+
return_dict=return_dict,
|
1187 |
+
)
|
1188 |
+
hidden_states = transformer_outputs[0]
|
1189 |
+
logits = self.score(hidden_states)
|
1190 |
+
|
1191 |
+
if input_ids is not None:
|
1192 |
+
batch_size = input_ids.shape[0]
|
1193 |
+
else:
|
1194 |
+
batch_size = inputs_embeds.shape[0]
|
1195 |
+
|
1196 |
+
if self.config.pad_token_id is None and batch_size != 1:
|
1197 |
+
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
|
1198 |
+
if self.config.pad_token_id is None:
|
1199 |
+
sequence_lengths = -1
|
1200 |
+
else:
|
1201 |
+
if input_ids is not None:
|
1202 |
+
sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
|
1203 |
+
logits.device
|
1204 |
+
)
|
1205 |
+
else:
|
1206 |
+
sequence_lengths = -1
|
1207 |
+
|
1208 |
+
pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
|
1209 |
+
|
1210 |
+
loss = None
|
1211 |
+
if labels is not None:
|
1212 |
+
labels = labels.to(logits.device)
|
1213 |
+
if self.config.problem_type is None:
|
1214 |
+
if self.num_labels == 1:
|
1215 |
+
self.config.problem_type = "regression"
|
1216 |
+
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
|
1217 |
+
self.config.problem_type = "single_label_classification"
|
1218 |
+
else:
|
1219 |
+
self.config.problem_type = "multi_label_classification"
|
1220 |
+
|
1221 |
+
if self.config.problem_type == "regression":
|
1222 |
+
loss_fct = MSELoss()
|
1223 |
+
if self.num_labels == 1:
|
1224 |
+
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
|
1225 |
+
else:
|
1226 |
+
loss = loss_fct(pooled_logits, labels)
|
1227 |
+
elif self.config.problem_type == "single_label_classification":
|
1228 |
+
loss_fct = CrossEntropyLoss()
|
1229 |
+
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
|
1230 |
+
elif self.config.problem_type == "multi_label_classification":
|
1231 |
+
loss_fct = BCEWithLogitsLoss()
|
1232 |
+
loss = loss_fct(pooled_logits, labels)
|
1233 |
+
if not return_dict:
|
1234 |
+
output = (pooled_logits,) + transformer_outputs[1:]
|
1235 |
+
return ((loss,) + output) if loss is not None else output
|
1236 |
+
|
1237 |
+
return SequenceClassifierOutputWithPast(
|
1238 |
+
loss=loss,
|
1239 |
+
logits=pooled_logits,
|
1240 |
+
past_key_values=transformer_outputs.past_key_values,
|
1241 |
+
hidden_states=transformer_outputs.hidden_states,
|
1242 |
+
attentions=transformer_outputs.attentions,
|
1243 |
+
)
|
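`LlamaForSequenceClassification` pools the classification logits at the last non-padding token of every row. When a `pad_token_id` is configured, that position is found with the argmax-over-equality trick used in the forward above; rows without any padding wrap around to index `-1`, i.e. the final position. A toy check of that index computation (pad id and token ids are illustrative):

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 9, 4, 0, 0],    # three real tokens -> last real index 2
                          [7, 2, 6, 8, 1]])   # no padding -> argmax is 0, index becomes -1

sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
print(sequence_lengths)                       # tensor([ 2, -1])

logits = torch.randn(2, 5, 3)                 # [batch, seq_len, num_labels]
pooled = logits[torch.arange(2), sequence_lengths]
print(pooled.shape)                           # torch.Size([2, 3]); -1 picks the last position
```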
codeclm/models/llama/tokenization_llama.py
ADDED
@@ -0,0 +1,426 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
|
21 |
+
"""Tokenization classes for LLaMA."""
|
22 |
+
import os
|
23 |
+
from shutil import copyfile
|
24 |
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
25 |
+
|
26 |
+
import sentencepiece as spm
|
27 |
+
|
28 |
+
from transformers.convert_slow_tokenizer import import_protobuf
|
29 |
+
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
30 |
+
from transformers.utils import logging
|
31 |
+
|
32 |
+
|
33 |
+
if TYPE_CHECKING:
|
34 |
+
from transformers.tokenization_utils_base import TextInput
|
35 |
+
|
36 |
+
logger = logging.get_logger(__name__)
|
37 |
+
|
38 |
+
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
|
39 |
+
|
40 |
+
PRETRAINED_VOCAB_FILES_MAP = {
|
41 |
+
"vocab_file": {
|
42 |
+
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
|
43 |
+
},
|
44 |
+
"tokenizer_file": {
|
45 |
+
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
|
46 |
+
},
|
47 |
+
}
|
48 |
+
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
49 |
+
"hf-internal-testing/llama-tokenizer": 2048,
|
50 |
+
}
|
51 |
+
SPIECE_UNDERLINE = "▁"
|
52 |
+
|
53 |
+
B_INST, E_INST = "[INST]", "[/INST]"
|
54 |
+
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
|
55 |
+
|
56 |
+
# fmt: off
|
57 |
+
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
|
58 |
+
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
|
59 |
+
that your responses are socially unbiased and positive in nature.
|
60 |
+
|
61 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
|
62 |
+
correct. If you don't know the answer to a question, please don't share false information."""
|
63 |
+
# fmt: on
|
64 |
+
|
65 |
+
|
66 |
+
class LlamaTokenizer(PreTrainedTokenizer):
|
67 |
+
"""
|
68 |
+
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
|
69 |
+
no padding token in the original model.
|
70 |
+
|
71 |
+
Args:
|
72 |
+
vocab_file (`str`):
|
73 |
+
Path to the vocabulary file.
|
74 |
+
legacy (`bool`, *optional*):
|
75 |
+
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
|
76 |
+
and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
|
77 |
+
example:
|
78 |
+
|
79 |
+
- `legacy=True`:
|
80 |
+
```python
|
81 |
+
>>> from transformers import T5Tokenizer
|
82 |
+
|
83 |
+
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
|
84 |
+
>>> tokenizer.encode("Hello <extra_id_0>.")
|
85 |
+
[8774, 32099, 3, 5, 1]
|
86 |
+
```
|
87 |
+
- `legacy=False`:
|
88 |
+
```python
|
89 |
+
>>> from transformers import T5Tokenizer
|
90 |
+
|
91 |
+
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
|
92 |
+
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
|
93 |
+
[8774, 32099, 5, 1]
|
94 |
+
```
|
95 |
+
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
|
96 |
+
|
97 |
+
"""
|
98 |
+
|
99 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
100 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
101 |
+
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
102 |
+
model_input_names = ["input_ids", "attention_mask"]
|
103 |
+
|
104 |
+
def __init__(
|
105 |
+
self,
|
106 |
+
vocab_file,
|
107 |
+
unk_token="<unk>",
|
108 |
+
bos_token="<s>",
|
109 |
+
eos_token="</s>",
|
110 |
+
pad_token=None,
|
111 |
+
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
112 |
+
add_bos_token=True,
|
113 |
+
add_eos_token=False,
|
114 |
+
clean_up_tokenization_spaces=False,
|
115 |
+
use_default_system_prompt=True,
|
116 |
+
spaces_between_special_tokens=False,
|
117 |
+
legacy=None,
|
118 |
+
**kwargs,
|
119 |
+
):
|
120 |
+
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
121 |
+
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
122 |
+
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
123 |
+
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
124 |
+
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
125 |
+
|
126 |
+
if legacy is None:
|
127 |
+
logger.warning_once(
|
128 |
+
f"You are using the default legacy behaviour of the {self.__class__}. This is"
|
129 |
+
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
|
130 |
+
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
|
131 |
+
" means, and thouroughly read the reason why this was added as explained in"
|
132 |
+
" https://github.com/huggingface/transformers/pull/24565"
|
133 |
+
)
|
134 |
+
legacy = True
|
135 |
+
|
136 |
+
self.legacy = legacy
|
137 |
+
self.vocab_file = vocab_file
|
138 |
+
self.add_bos_token = add_bos_token
|
139 |
+
self.add_eos_token = add_eos_token
|
140 |
+
self.use_default_system_prompt = use_default_system_prompt
|
141 |
+
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
|
142 |
+
|
143 |
+
super().__init__(
|
144 |
+
bos_token=bos_token,
|
145 |
+
eos_token=eos_token,
|
146 |
+
unk_token=unk_token,
|
147 |
+
pad_token=pad_token,
|
148 |
+
add_bos_token=add_bos_token,
|
149 |
+
add_eos_token=add_eos_token,
|
150 |
+
sp_model_kwargs=self.sp_model_kwargs,
|
151 |
+
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
152 |
+
use_default_system_prompt=use_default_system_prompt,
|
153 |
+
spaces_between_special_tokens=spaces_between_special_tokens,
|
154 |
+
legacy=legacy,
|
155 |
+
**kwargs,
|
156 |
+
)
|
157 |
+
|
158 |
+
@property
|
159 |
+
def unk_token_length(self):
|
160 |
+
return len(self.sp_model.encode(str(self.unk_token)))
|
161 |
+
|
162 |
+
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
|
163 |
+
def get_spm_processor(self, from_slow=False):
|
164 |
+
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
165 |
+
if self.legacy or from_slow: # no dependency on protobuf
|
166 |
+
tokenizer.Load(self.vocab_file)
|
167 |
+
return tokenizer
|
168 |
+
|
169 |
+
with open(self.vocab_file, "rb") as f:
|
170 |
+
sp_model = f.read()
|
171 |
+
model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
|
172 |
+
model = model_pb2.ModelProto.FromString(sp_model)
|
173 |
+
normalizer_spec = model_pb2.NormalizerSpec()
|
174 |
+
normalizer_spec.add_dummy_prefix = False
|
175 |
+
model.normalizer_spec.MergeFrom(normalizer_spec)
|
176 |
+
sp_model = model.SerializeToString()
|
177 |
+
tokenizer.LoadFromSerializedProto(sp_model)
|
178 |
+
return tokenizer
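As a rough orientation for the proto patching above, a sketch with the plain `sentencepiece` API; `tokenizer.model` is a placeholder path and the printed pieces depend on the actual model file:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tokenizer.model")  # placeholder path
# With the model's default add_dummy_prefix=True, a leading '▁' is inserted:
print(sp.encode("Hey", out_type=str))  # typically ['▁Hey']
# After serializing the proto with add_dummy_prefix=False (as done above) and
# reloading it via LoadFromSerializedProto, the same call yields pieces without
# the automatic leading '▁', which is what the non-legacy tokenize path relies on.
```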
|
179 |
+
|
180 |
+
def __getstate__(self):
|
181 |
+
state = self.__dict__.copy()
|
182 |
+
state["sp_model"] = None
|
183 |
+
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
|
184 |
+
return state
|
185 |
+
|
186 |
+
def __setstate__(self, d):
|
187 |
+
self.__dict__ = d
|
188 |
+
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
189 |
+
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
|
190 |
+
|
191 |
+
@property
|
192 |
+
def vocab_size(self):
|
193 |
+
"""Returns vocab size"""
|
194 |
+
return self.sp_model.get_piece_size()
|
195 |
+
|
196 |
+
def get_vocab(self):
|
197 |
+
"""Returns vocab as a dict"""
|
198 |
+
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
199 |
+
vocab.update(self.added_tokens_encoder)
|
200 |
+
return vocab
|
201 |
+
|
202 |
+
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
|
203 |
+
def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
|
204 |
+
"""
|
205 |
+
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
|
206 |
+
first token is special.
|
207 |
+
"""
|
208 |
+
if self.legacy or len(text) == 0:
|
209 |
+
return super().tokenize(text, **kwargs)
|
210 |
+
|
211 |
+
tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
|
212 |
+
|
213 |
+
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
|
214 |
+
tokens = tokens[1:]
|
215 |
+
return tokens
|
216 |
+
|
217 |
+
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
|
218 |
+
def _tokenize(self, text, **kwargs):
|
219 |
+
"""
|
220 |
+
Returns a tokenized string.
|
221 |
+
|
222 |
+
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
|
223 |
+
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
|
224 |
+
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
|
225 |
+
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
|
226 |
+
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
|
227 |
+
"""
|
228 |
+
tokens = self.sp_model.encode(text, out_type=str)
|
229 |
+
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
|
230 |
+
return tokens
|
231 |
+
|
232 |
+
# 1. Encode string + prefix ex: "<unk> Hey"
|
233 |
+
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
|
234 |
+
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
|
235 |
+
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
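A small illustration of the prepend-and-strip trick described in the docstring; `tokenizer` is an already-constructed `LlamaTokenizer` and the piece strings are indicative only:

```python
text = " Hey"
pieces = tokenizer.sp_model.encode(tokenizer.unk_token + text, out_type=str)
# e.g. the pieces for "<unk>" followed by ['▁Hey']; dropping the first
# unk_token_length pieces leaves the pieces for `text` with its leading
# '▁' preserved, which is exactly what _tokenize returns.
clean = pieces[tokenizer.unk_token_length:]
```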
|
236 |
+
|
237 |
+
def _convert_token_to_id(self, token):
|
238 |
+
"""Converts a token (str) in an id using the vocab."""
|
239 |
+
return self.sp_model.piece_to_id(token)
|
240 |
+
|
241 |
+
def _convert_id_to_token(self, index):
|
242 |
+
"""Converts an index (integer) in a token (str) using the vocab."""
|
243 |
+
token = self.sp_model.IdToPiece(index)
|
244 |
+
return token
|
245 |
+
|
246 |
+
def convert_tokens_to_string(self, tokens):
|
247 |
+
"""Converts a sequence of tokens (string) in a single string."""
|
248 |
+
# since we manually add the prefix space, we have to remove it when decoding
|
249 |
+
if tokens[0].startswith(SPIECE_UNDERLINE):
|
250 |
+
tokens[0] = tokens[0][1:]
|
251 |
+
|
252 |
+
current_sub_tokens = []
|
253 |
+
out_string = ""
|
254 |
+
prev_is_special = False
|
255 |
+
for i, token in enumerate(tokens):
|
256 |
+
# make sure that special tokens are not decoded using sentencepiece model
|
257 |
+
if token in self.all_special_tokens:
|
258 |
+
if not prev_is_special and i != 0 and self.legacy:
|
259 |
+
out_string += " "
|
260 |
+
out_string += self.sp_model.decode(current_sub_tokens) + token
|
261 |
+
prev_is_special = True
|
262 |
+
current_sub_tokens = []
|
263 |
+
else:
|
264 |
+
current_sub_tokens.append(token)
|
265 |
+
prev_is_special = False
|
266 |
+
out_string += self.sp_model.decode(current_sub_tokens)
|
267 |
+
return out_string
|
268 |
+
|
269 |
+
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
270 |
+
"""
|
271 |
+
Save the vocabulary and special tokens file to a directory.
|
272 |
+
|
273 |
+
Args:
|
274 |
+
save_directory (`str`):
|
275 |
+
The directory in which to save the vocabulary.
|
276 |
+
|
277 |
+
Returns:
|
278 |
+
`Tuple(str)`: Paths to the files saved.
|
279 |
+
"""
|
280 |
+
if not os.path.isdir(save_directory):
|
281 |
+
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
282 |
+
return
|
283 |
+
out_vocab_file = os.path.join(
|
284 |
+
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
285 |
+
)
|
286 |
+
|
287 |
+
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
|
288 |
+
copyfile(self.vocab_file, out_vocab_file)
|
289 |
+
elif not os.path.isfile(self.vocab_file):
|
290 |
+
with open(out_vocab_file, "wb") as fi:
|
291 |
+
content_spiece_model = self.sp_model.serialized_model_proto()
|
292 |
+
fi.write(content_spiece_model)
|
293 |
+
|
294 |
+
return (out_vocab_file,)
|
295 |
+
|
296 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
297 |
+
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
298 |
+
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
299 |
+
|
300 |
+
output = bos_token_id + token_ids_0 + eos_token_id
|
301 |
+
|
302 |
+
if token_ids_1 is not None:
|
303 |
+
output = output + bos_token_id + token_ids_1 + eos_token_id
|
304 |
+
|
305 |
+
return output
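A short usage sketch of the method above under the default `add_bos_token=True, add_eos_token=False` settings (the token ids 10, 11, 20, 21 are placeholders):

```python
ids = tokenizer.build_inputs_with_special_tokens([10, 11])
# -> [tokenizer.bos_token_id, 10, 11]
pair = tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])
# -> [tokenizer.bos_token_id, 10, 11, tokenizer.bos_token_id, 20, 21]
```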
|
306 |
+
|
307 |
+
def get_special_tokens_mask(
|
308 |
+
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
309 |
+
) -> List[int]:
|
310 |
+
"""
|
311 |
+
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
312 |
+
special tokens using the tokenizer `prepare_for_model` method.
|
313 |
+
|
314 |
+
Args:
|
315 |
+
token_ids_0 (`List[int]`):
|
316 |
+
List of IDs.
|
317 |
+
token_ids_1 (`List[int]`, *optional*):
|
318 |
+
Optional second list of IDs for sequence pairs.
|
319 |
+
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
320 |
+
Whether or not the token list is already formatted with special tokens for the model.
|
321 |
+
|
322 |
+
Returns:
|
323 |
+
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
324 |
+
"""
|
325 |
+
if already_has_special_tokens:
|
326 |
+
return super().get_special_tokens_mask(
|
327 |
+
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
|
328 |
+
)
|
329 |
+
|
330 |
+
bos_token_id = [1] if self.add_bos_token else []
|
331 |
+
eos_token_id = [1] if self.add_eos_token else []
|
332 |
+
|
333 |
+
if token_ids_1 is None:
|
334 |
+
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
|
335 |
+
return (
|
336 |
+
bos_token_id
|
337 |
+
+ ([0] * len(token_ids_0))
|
338 |
+
+ eos_token_id
|
339 |
+
+ bos_token_id
|
340 |
+
+ ([0] * len(token_ids_1))
|
341 |
+
+ eos_token_id
|
342 |
+
)
|
343 |
+
|
344 |
+
def create_token_type_ids_from_sequences(
|
345 |
+
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
346 |
+
) -> List[int]:
|
347 |
+
"""
|
348 |
+
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
|
349 |
+
sequence pair mask has the following format:
|
350 |
+
|
351 |
+
```
|
352 |
+
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
353 |
+
| first sequence | second sequence |
|
354 |
+
```
|
355 |
+
|
356 |
+
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
357 |
+
|
358 |
+
Args:
|
359 |
+
token_ids_0 (`List[int]`):
|
360 |
+
List of ids.
|
361 |
+
token_ids_1 (`List[int]`, *optional*):
|
362 |
+
Optional second list of IDs for sequence pairs.
|
363 |
+
|
364 |
+
Returns:
|
365 |
+
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
366 |
+
"""
|
367 |
+
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
368 |
+
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
369 |
+
|
370 |
+
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
|
371 |
+
|
372 |
+
if token_ids_1 is not None:
|
373 |
+
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
|
374 |
+
|
375 |
+
return output
|
376 |
+
|
377 |
+
@property
|
378 |
+
def default_chat_template(self):
|
379 |
+
"""
|
380 |
+
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
|
381 |
+
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
|
382 |
+
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
|
383 |
+
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
|
384 |
+
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
|
385 |
+
to fine-tune a model with more flexible role ordering!
|
386 |
+
|
387 |
+
The output should look something like:
|
388 |
+
|
389 |
+
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos> <bos>[INST] Prompt [/INST] Answer <eos>
|
390 |
+
<bos>[INST] Prompt [/INST]
|
391 |
+
"""
|
392 |
+
|
393 |
+
template = (
|
394 |
+
"{% if messages[0]['role'] == 'system' %}"
|
395 |
+
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
|
396 |
+
"{% set system_message = messages[0]['content'] %}"
|
397 |
+
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
|
398 |
+
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
|
399 |
+
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
|
400 |
+
"{% else %}"
|
401 |
+
"{% set loop_messages = messages %}"
|
402 |
+
"{% set system_message = false %}"
|
403 |
+
"{% endif %}"
|
404 |
+
"{% for message in loop_messages %}" # Loop over all non-system messages
|
405 |
+
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
|
406 |
+
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
|
407 |
+
"{% endif %}"
|
408 |
+
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
|
409 |
+
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
|
410 |
+
"{% else %}"
|
411 |
+
"{% set content = message['content'] %}"
|
412 |
+
"{% endif %}"
|
413 |
+
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
|
414 |
+
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
|
415 |
+
"{% elif message['role'] == 'system' %}"
|
416 |
+
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
|
417 |
+
"{% elif message['role'] == 'assistant' %}"
|
418 |
+
"{{ ' ' + content.strip() + ' ' + eos_token }}"
|
419 |
+
"{% endif %}"
|
420 |
+
"{% endfor %}"
|
421 |
+
)
|
422 |
+
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
|
423 |
+
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
|
424 |
+
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
|
425 |
+
|
426 |
+
return template
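A hedged sketch of rendering the template above through `apply_chat_template` (message contents are placeholders; the commented string only shows the expected shape, following the docstring):

```python
messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Hi"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# "<s>[INST] <<SYS>>\nYou are a terse assistant.\n<</SYS>>\n\nHi [/INST]"
```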
|
codeclm/models/llama/tokenization_llama_fast.py
ADDED
@@ -0,0 +1,264 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2020 The HuggingFace Inc. team.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import os
|
16 |
+
from shutil import copyfile
|
17 |
+
from typing import Optional, Tuple
|
18 |
+
|
19 |
+
from tokenizers import processors
|
20 |
+
|
21 |
+
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
|
22 |
+
from transformers.utils import is_sentencepiece_available, logging
|
23 |
+
from transformers.utils.versions import require_version
|
24 |
+
|
25 |
+
|
26 |
+
require_version("tokenizers>=0.13.3")
|
27 |
+
|
28 |
+
if is_sentencepiece_available():
|
29 |
+
from .tokenization_llama import LlamaTokenizer
|
30 |
+
else:
|
31 |
+
LlamaTokenizer = None
|
32 |
+
|
33 |
+
logger = logging.get_logger(__name__)
|
34 |
+
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
|
35 |
+
|
36 |
+
PRETRAINED_VOCAB_FILES_MAP = {
|
37 |
+
"vocab_file": {
|
38 |
+
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
|
39 |
+
},
|
40 |
+
"tokenizer_file": {
|
41 |
+
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
|
42 |
+
},
|
43 |
+
}
|
44 |
+
B_INST, E_INST = "[INST]", "[/INST]"
|
45 |
+
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
|
46 |
+
|
47 |
+
# fmt: off
|
48 |
+
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
|
49 |
+
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
|
50 |
+
that your responses are socially unbiased and positive in nature.
|
51 |
+
|
52 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
|
53 |
+
correct. If you don't know the answer to a question, please don't share false information."""
|
54 |
+
# fmt: on
|
55 |
+
|
56 |
+
|
57 |
+
class LlamaTokenizerFast(PreTrainedTokenizerFast):
|
58 |
+
"""
|
59 |
+
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
|
60 |
+
|
61 |
+
This uses notably ByteFallback and no normalization.
|
62 |
+
|
63 |
+
```
|
64 |
+
from transformers import LlamaTokenizerFast
|
65 |
+
|
66 |
+
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
|
67 |
+
tokenizer.encode("Hello this is a test")
|
68 |
+
>>> [1, 15043, 445, 338, 263, 1243]
|
69 |
+
```
|
70 |
+
|
71 |
+
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
|
72 |
+
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
|
73 |
+
values of the first token and final token of an encoded sequence will not be correct). For more details, check out the
|
74 |
+
[post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
|
75 |
+
|
76 |
+
|
77 |
+
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
|
78 |
+
refer to this superclass for more information regarding those methods.
|
79 |
+
|
80 |
+
Args:
|
81 |
+
vocab_file (`str`):
|
82 |
+
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
|
83 |
+
contains the vocabulary necessary to instantiate a tokenizer.
|
84 |
+
tokenizer_file (`str`):
|
85 |
+
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
|
86 |
+
contains everything needed to load the tokenizer.
|
87 |
+
|
88 |
+
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
|
89 |
+
Whether to clean up spaces after decoding; cleanup consists of removing potential artifacts like extra
|
90 |
+
spaces.
|
91 |
+
|
92 |
+
bos_token (`str`, *optional*, defaults to `"<s>"`):
|
93 |
+
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
|
94 |
+
|
95 |
+
eos_token (`str`, *optional*, defaults to `"</s>"`):
|
96 |
+
The end of sequence token.
|
97 |
+
|
98 |
+
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
99 |
+
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
100 |
+
token instead.
|
101 |
+
"""
|
102 |
+
|
103 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
104 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
105 |
+
slow_tokenizer_class = LlamaTokenizer
|
106 |
+
padding_side = "left"
|
107 |
+
model_input_names = ["input_ids", "attention_mask"]
|
108 |
+
|
109 |
+
def __init__(
|
110 |
+
self,
|
111 |
+
vocab_file=None,
|
112 |
+
tokenizer_file=None,
|
113 |
+
clean_up_tokenization_spaces=False,
|
114 |
+
unk_token="<unk>",
|
115 |
+
bos_token="<s>",
|
116 |
+
eos_token="</s>",
|
117 |
+
add_bos_token=True,
|
118 |
+
add_eos_token=False,
|
119 |
+
use_default_system_prompt=True,
|
120 |
+
**kwargs,
|
121 |
+
):
|
122 |
+
super().__init__(
|
123 |
+
vocab_file=vocab_file,
|
124 |
+
tokenizer_file=tokenizer_file,
|
125 |
+
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
126 |
+
unk_token=unk_token,
|
127 |
+
bos_token=bos_token,
|
128 |
+
eos_token=eos_token,
|
129 |
+
use_default_system_prompt=use_default_system_prompt,
|
130 |
+
**kwargs,
|
131 |
+
)
|
132 |
+
self._add_bos_token = add_bos_token
|
133 |
+
self._add_eos_token = add_eos_token
|
134 |
+
self.update_post_processor()
|
135 |
+
self.use_default_system_prompt = use_default_system_prompt
|
136 |
+
self.vocab_file = vocab_file
|
137 |
+
|
138 |
+
@property
|
139 |
+
def can_save_slow_tokenizer(self) -> bool:
|
140 |
+
return os.path.isfile(self.vocab_file) if self.vocab_file else False
|
141 |
+
|
142 |
+
def update_post_processor(self):
|
143 |
+
"""
|
144 |
+
Updates the underlying post processor with the current `bos_token` and `eos_token`.
|
145 |
+
"""
|
146 |
+
bos = self.bos_token
|
147 |
+
bos_token_id = self.bos_token_id
|
148 |
+
|
149 |
+
eos = self.eos_token
|
150 |
+
eos_token_id = self.eos_token_id
|
151 |
+
|
152 |
+
single = f"{(bos+':0 ') * self.add_bos_token}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
|
153 |
+
pair = f"{single}{(' '+bos+':1') * self.add_bos_token} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
|
154 |
+
|
155 |
+
special_tokens = []
|
156 |
+
if self.add_bos_token:
|
157 |
+
special_tokens.append((bos, bos_token_id))
|
158 |
+
if self.add_eos_token:
|
159 |
+
special_tokens.append((eos, eos_token_id))
|
160 |
+
self._tokenizer.post_processor = processors.TemplateProcessing(
|
161 |
+
single=single, pair=pair, special_tokens=special_tokens
|
162 |
+
)
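A small sketch of how the `add_bos_token` / `add_eos_token` setters below interact with `update_post_processor` (checkpoint name reused from the class docstring):

```python
fast_tok = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
fast_tok.add_eos_token = True            # setter re-runs update_post_processor()
ids = fast_tok("hello")["input_ids"]
assert ids[0] == fast_tok.bos_token_id and ids[-1] == fast_tok.eos_token_id
```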
|
163 |
+
|
164 |
+
@property
|
165 |
+
def add_eos_token(self):
|
166 |
+
return self._add_eos_token
|
167 |
+
|
168 |
+
@property
|
169 |
+
def add_bos_token(self):
|
170 |
+
return self._add_bos_token
|
171 |
+
|
172 |
+
@add_eos_token.setter
|
173 |
+
def add_eos_token(self, value):
|
174 |
+
self._add_eos_token = value
|
175 |
+
self.update_post_processor()
|
176 |
+
|
177 |
+
@add_bos_token.setter
|
178 |
+
def add_bos_token(self, value):
|
179 |
+
self._add_bos_token = value
|
180 |
+
self.update_post_processor()
|
181 |
+
|
182 |
+
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
183 |
+
if not self.can_save_slow_tokenizer:
|
184 |
+
raise ValueError(
|
185 |
+
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
|
186 |
+
"tokenizer."
|
187 |
+
)
|
188 |
+
|
189 |
+
if not os.path.isdir(save_directory):
|
190 |
+
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
191 |
+
return
|
192 |
+
out_vocab_file = os.path.join(
|
193 |
+
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
194 |
+
)
|
195 |
+
|
196 |
+
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
197 |
+
copyfile(self.vocab_file, out_vocab_file)
|
198 |
+
|
199 |
+
return (out_vocab_file,)
|
200 |
+
|
201 |
+
@property
|
202 |
+
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
|
203 |
+
def default_chat_template(self):
|
204 |
+
"""
|
205 |
+
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
|
206 |
+
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
|
207 |
+
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
|
208 |
+
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
|
209 |
+
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
|
210 |
+
to fine-tune a model with more flexible role ordering!
|
211 |
+
|
212 |
+
The output should look something like:
|
213 |
+
|
214 |
+
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos> <bos>[INST] Prompt [/INST] Answer <eos>
|
215 |
+
<bos>[INST] Prompt [/INST]
|
216 |
+
"""
|
217 |
+
|
218 |
+
template = (
|
219 |
+
"{% if messages[0]['role'] == 'system' %}"
|
220 |
+
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
|
221 |
+
"{% set system_message = messages[0]['content'] %}"
|
222 |
+
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
|
223 |
+
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
|
224 |
+
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
|
225 |
+
"{% else %}"
|
226 |
+
"{% set loop_messages = messages %}"
|
227 |
+
"{% set system_message = false %}"
|
228 |
+
"{% endif %}"
|
229 |
+
"{% for message in loop_messages %}" # Loop over all non-system messages
|
230 |
+
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
|
231 |
+
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
|
232 |
+
"{% endif %}"
|
233 |
+
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
|
234 |
+
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
|
235 |
+
"{% else %}"
|
236 |
+
"{% set content = message['content'] %}"
|
237 |
+
"{% endif %}"
|
238 |
+
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
|
239 |
+
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
|
240 |
+
"{% elif message['role'] == 'system' %}"
|
241 |
+
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
|
242 |
+
"{% elif message['role'] == 'assistant' %}"
|
243 |
+
"{{ ' ' + content.strip() + ' ' + eos_token }}"
|
244 |
+
"{% endif %}"
|
245 |
+
"{% endfor %}"
|
246 |
+
)
|
247 |
+
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
|
248 |
+
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
|
249 |
+
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
|
250 |
+
|
251 |
+
return template
|
252 |
+
|
253 |
+
# TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
|
254 |
+
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
|
255 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
256 |
+
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
257 |
+
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
258 |
+
|
259 |
+
output = bos_token_id + token_ids_0 + eos_token_id
|
260 |
+
|
261 |
+
if token_ids_1 is not None:
|
262 |
+
output = output + bos_token_id + token_ids_1 + eos_token_id
|
263 |
+
|
264 |
+
return output
|
codeclm/models/lm_levo.py
ADDED
@@ -0,0 +1,546 @@
1 |
+
|
2 |
+
import torch
|
3 |
+
import math
|
4 |
+
import random
|
5 |
+
import torch.nn as nn
|
6 |
+
import typing as tp
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from dataclasses import dataclass
|
9 |
+
from codeclm.models.levo import CausalLM, LlamaConfig
|
10 |
+
from codeclm.modules.streaming import StreamingModule
|
11 |
+
from codeclm.modules.conditioners import (
|
12 |
+
ConditioningAttributes,
|
13 |
+
AudioCondition,
|
14 |
+
ConditionType,
|
15 |
+
ConditionerProvider,
|
16 |
+
ConditionFuser,
|
17 |
+
ClassifierFreeGuidanceDropoutInference,
|
18 |
+
ClassifierFreeGuidanceDropout,
|
19 |
+
AttributeDropout,
|
20 |
+
)
|
21 |
+
from codeclm.utils.utils import create_norm_fn, init_layer, sample_top_k, sample_top_p, multinomial
|
22 |
+
from codeclm.modules.pattern import CodebooksPatternProvider
|
23 |
+
ConditionTensors = tp.Dict[str, ConditionType]
|
24 |
+
|
25 |
+
@dataclass
|
26 |
+
class LMOutput:
|
27 |
+
# The logits are already re-aligned with the input codes
|
28 |
+
# hence no extra shift is required, e.g. when computing CE
|
29 |
+
logits: torch.Tensor # [B, K, T, card]
|
30 |
+
mask: torch.Tensor # [B, K, T]
|
31 |
+
|
32 |
+
|
33 |
+
class LmModel(StreamingModule):
|
34 |
+
"""Transformer-based language model on multiple streams of codes.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
|
38 |
+
condition_provider (ConditioningProvider): Conditioning provider from metadata.
|
39 |
+
fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
|
40 |
+
code_depth (int): Number of parallel streams to model.
|
41 |
+
code_size (int): Cardinality, vocabulary size.
|
42 |
+
dim (int): Dimension of the transformer encoder.
|
43 |
+
num_heads (int): Number of heads for the transformer encoder.
|
44 |
+
hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
|
45 |
+
norm (str): Normalization method.
|
46 |
+
norm_first (bool): Use pre-norm instead of post-norm.
|
47 |
+
emb_lr (float, optional): Embedding-specific learning rate.
|
48 |
+
bias_proj (bool): Use bias for output projections.
|
49 |
+
weight_init (str, optional): Method for weight initialization.
|
50 |
+
depthwise_init (str, optional): Method for depthwise weight initialization.
|
51 |
+
zero_bias_init (bool): If true and bias in Linears, initialize bias to zeros.
|
52 |
+
cfg_dropout (float): Classifier-free guidance dropout.
|
53 |
+
cfg_coef (float): Classifier-free guidance coefficient.
|
54 |
+
attribute_dropout (dict): Attribute dropout probabilities.
|
55 |
+
two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
|
56 |
+
**kwargs: Additional parameters for the transformer encoder.
|
57 |
+
"""
|
58 |
+
def __init__(self,
|
59 |
+
pattern_provider: CodebooksPatternProvider,
|
60 |
+
condition_provider: ConditionerProvider,
|
61 |
+
fuser: ConditionFuser,
|
62 |
+
code_depth: int = 8,
|
63 |
+
code_size: int = 1024,
|
64 |
+
dim: int = 128,
|
65 |
+
intermediate_size: int = 4096,
|
66 |
+
num_heads: int = 8,
|
67 |
+
norm: str = 'layer_norm', norm_first: bool = False,
|
68 |
+
bias_proj: bool = True,
|
69 |
+
weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
|
70 |
+
zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0,
|
71 |
+
attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {},
|
72 |
+
lm_type = 'Llama',
|
73 |
+
num_layers=16,
|
74 |
+
cfg = None,
|
75 |
+
**kwargs):
|
76 |
+
super().__init__()
|
77 |
+
|
78 |
+
self.cfg_coef = cfg_coef
|
79 |
+
|
80 |
+
self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout,seed=random.randint(0, 9999))
|
81 |
+
self.att_dropout = AttributeDropout(p=attribute_dropout,seed=random.randint(0, 9999))
|
82 |
+
self.condition_provider = condition_provider
|
83 |
+
self.fuser = fuser
|
84 |
+
self.code_size = code_size + 1 # + EOS
|
85 |
+
input_emb_dim = code_size + 2 # EOP
|
86 |
+
self.code_depth = code_depth
|
87 |
+
self.dim = dim
|
88 |
+
self.cfg = cfg
|
89 |
+
self.pattern_provider = pattern_provider
|
90 |
+
self.emb = nn.ModuleList([nn.Embedding(input_emb_dim, dim)])
|
91 |
+
# if 'activation' in kwargs:
|
92 |
+
# kwargs['activation'] = get_activation_fn(kwargs['activation'])
|
93 |
+
|
94 |
+
model_cfg = LlamaConfig(
|
95 |
+
hidden_size=dim,
|
96 |
+
intermediate_size = intermediate_size,
|
97 |
+
num_attention_heads = num_heads,
|
98 |
+
num_hidden_layers = num_layers,
|
99 |
+
num_key_value_heads = num_heads,
|
100 |
+
vocab_size = self.code_size,
|
101 |
+
use_cache=False,
|
102 |
+
max_position_embeddings=8196,
|
103 |
+
_flash_attn_2_enabled=True,
|
104 |
+
rms_norm_eps= 1e-5,
|
105 |
+
rope_theta= 100000.0,
|
106 |
+
use_flash_attn_2=True,
|
107 |
+
attn_implementation="flash_attention_2"
|
108 |
+
)
|
109 |
+
|
110 |
+
self.transformer = CausalLM(model_cfg)
|
111 |
+
self.mlp = nn.Sequential(
|
112 |
+
nn.Linear(dim * 2, dim),
|
113 |
+
nn.GELU(),
|
114 |
+
nn.Linear(dim, dim)
|
115 |
+
)
|
116 |
+
self.layer2_emb = nn.ModuleList([nn.Embedding(input_emb_dim, dim) #, lr=emb_lr)
|
117 |
+
for _ in range(self.code_depth)])
|
118 |
+
sub_model_cfg = LlamaConfig(
|
119 |
+
hidden_size=dim,
|
120 |
+
intermediate_size = intermediate_size,
|
121 |
+
num_attention_heads = num_heads,
|
122 |
+
num_hidden_layers = 12,
|
123 |
+
num_key_value_heads = num_heads,
|
124 |
+
vocab_size = self.code_size,
|
125 |
+
use_cache=False,
|
126 |
+
max_position_embeddings=10000,
|
127 |
+
rms_norm_eps= 1e-5,
|
128 |
+
rope_theta= 500000.0,
|
129 |
+
_flash_attn_2_enabled=True,
|
130 |
+
use_flash_attn_2=True,
|
131 |
+
attn_implementation="flash_attention_2"
|
132 |
+
)
|
133 |
+
self.transformer2 = CausalLM(sub_model_cfg)
|
134 |
+
self.out_norm: tp.Optional[nn.Module] = None
|
135 |
+
if norm_first:
|
136 |
+
self.out_norm = create_norm_fn(norm, dim)
|
137 |
+
# enable EOS prediction
|
138 |
+
if code_depth > 1:
|
139 |
+
self.linears = nn.ModuleList([nn.Linear(dim, self.code_size, bias=False)
|
140 |
+
for _ in range(code_depth - 1)])
|
141 |
+
|
142 |
+
self._init_weights(weight_init, depthwise_init, zero_bias_init)
|
143 |
+
self._fsdp: tp.Optional[nn.Module]
|
144 |
+
self.__dict__['_fsdp'] = None
|
145 |
+
|
146 |
+
self.reset_streaming()
|
147 |
+
|
148 |
+
def _init_weights(self, weight_init: tp.Optional[str],
|
149 |
+
depthwise_init: tp.Optional[str], zero_bias_init: bool):
|
150 |
+
"""Initialization of the transformer module weights.
|
151 |
+
|
152 |
+
Args:
|
153 |
+
weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
|
154 |
+
depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
|
155 |
+
'current' where the depth corresponds to the current layer index or 'global' where the total number
|
156 |
+
of layer is used as depth. If not set, no depthwise initialization strategy is used.
|
157 |
+
zero_bias_init (bool): Whether to initialize bias to zero or not.
|
158 |
+
"""
|
159 |
+
assert depthwise_init is None or depthwise_init in ['current', 'global']
|
160 |
+
assert depthwise_init is None or weight_init is not None, \
|
161 |
+
"If 'depthwise_init' is defined, a 'weight_init' method should be provided."
|
162 |
+
assert not zero_bias_init or weight_init is not None, \
|
163 |
+
"If 'zero_bias_init', a 'weight_init' method should be provided"
|
164 |
+
|
165 |
+
if weight_init is None:
|
166 |
+
return
|
167 |
+
|
168 |
+
for emb_layer in self.emb:
|
169 |
+
init_layer(emb_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
|
170 |
+
|
171 |
+
|
172 |
+
@property
|
173 |
+
def special_token_id(self) -> int:
|
174 |
+
return self.code_size # 10001
|
175 |
+
|
176 |
+
@property
|
177 |
+
def eos_token_id(self) -> int:
|
178 |
+
return self.code_size-1 # 10000
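For readability, the token-id layout implied by `__init__` and the two properties above, written out as a descriptive note (no new behaviour):

```python
# With card = the `code_size` argument passed to __init__ (10000 in the inline comments):
#   0 .. card-1  -> ordinary codec tokens
#   card         -> eos_token_id      (self.code_size - 1 after the "+ 1  # + EOS")
#   card + 1     -> special_token_id  (self.code_size), used for masked pattern steps
# input_emb_dim = card + 2, so the embedding tables also cover the special/EOP id.
```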
|
179 |
+
|
180 |
+
@torch.no_grad()
|
181 |
+
def prepare_condition_tensors(self,
|
182 |
+
batch_size = 1,
|
183 |
+
text: tp.Optional[tp.List[str]] = None,
|
184 |
+
descriptions: tp.Optional[tp.List[str]] = None,
|
185 |
+
audio_qt_emb: tp.Optional[tp.List[torch.Tensor]] = None,
|
186 |
+
prepare_null_condition = False,
|
187 |
+
):
|
188 |
+
if self.training:
|
189 |
+
attributes = []
|
190 |
+
for i in range(batch_size):
|
191 |
+
attr = ConditioningAttributes()
|
192 |
+
if 'description' in self.condition_provider.conditioners:
|
193 |
+
attr["text"]["description"] = ""
|
194 |
+
if text is not None:
|
195 |
+
attr["text"]["description"] = text[i]
|
196 |
+
if 'prompt_audio' in self.condition_provider.conditioners:
|
197 |
+
mask = (audio_qt_emb[[i], :, 0] == 16385).bool().unsqueeze(-1)
|
198 |
+
audio_qt_seq = torch.cat([torch.full_like(audio_qt_emb[i][None][:,:,0], self.eos_token_id).unsqueeze(-1), audio_qt_emb[i][None]], dim=-1)
|
199 |
+
mask = mask.repeat(1, 1, audio_qt_seq.shape[-1])
|
200 |
+
audio_qt_seq[mask] = 16385
|
201 |
+
attr["audio"]['prompt_audio'] = AudioCondition(
|
202 |
+
wav=audio_qt_seq.long(),
|
203 |
+
length=torch.Tensor([audio_qt_seq.shape[-1]]).long(),
|
204 |
+
sample_rate=[self.cfg.sample_rate],)
|
205 |
+
if 'type_info' in self.condition_provider.conditioners:
|
206 |
+
attr["text"]["type_info"] = ""
|
207 |
+
if descriptions is not None:
|
208 |
+
attr["text"]["type_info"] = descriptions[i]
|
209 |
+
attributes.append(attr)
|
210 |
+
# print("before cfg dropout", attributes)
|
211 |
+
attributes = self.cfg_dropout(attributes) # drop ALL conditions
|
212 |
+
# print("after cfg dropout", attributes)
|
213 |
+
attributes = self.att_dropout(attributes) # selectively drop some attributes (text, wav, or more fine-grained)
|
214 |
+
# print("after attribute dropout", attributes)
|
215 |
+
# attribute to discrete tokenized ids
|
216 |
+
tokenized = self.condition_provider.tokenize(attributes)
|
217 |
+
# print("after tokenize", attributes)
|
218 |
+
# discrete tokenized ids to continuous embeddings
|
219 |
+
condition_tensors = self.condition_provider(tokenized)
|
220 |
+
else:
|
221 |
+
conditions = []
|
222 |
+
for i in range(batch_size):
|
223 |
+
attr = ConditioningAttributes()
|
224 |
+
if 'description' in self.condition_provider.conditioners:
|
225 |
+
attr["text"]["description"] = ""
|
226 |
+
if text is not None:
|
227 |
+
attr["text"]["description"] = text[i]
|
228 |
+
if 'prompt_audio' in self.condition_provider.conditioners:
|
229 |
+
mask = (audio_qt_emb[[i], :, 0] == 16385).bool().unsqueeze(-1)
|
230 |
+
audio_qt_seq = torch.cat([torch.full_like(audio_qt_emb[i][None][:,:,0], self.eos_token_id).unsqueeze(-1), audio_qt_emb[i][None]], dim=-1)
|
231 |
+
mask = mask.repeat(1, 1, audio_qt_seq.shape[-1])
|
232 |
+
audio_qt_seq[mask] = 16385
|
233 |
+
attr["audio"]['prompt_audio'] = AudioCondition(
|
234 |
+
wav=audio_qt_seq.long().cuda(),
|
235 |
+
length=torch.Tensor([audio_qt_seq.shape[-1]]).long(),
|
236 |
+
sample_rate=[self.cfg.sample_rate],)
|
237 |
+
if 'type_info' in self.condition_provider.conditioners:
|
238 |
+
attr["text"]["type_info"] = ""
|
239 |
+
if descriptions is not None:
|
240 |
+
attr["text"]["type_info"] = descriptions[i]
|
241 |
+
conditions.append(attr)
|
242 |
+
print("conditions", conditions)
|
243 |
+
if prepare_null_condition:
|
244 |
+
cfg_inference = ClassifierFreeGuidanceDropoutInference()
|
245 |
+
null_conditions = cfg_inference(conditions, condition_types=["audio", "text"],
|
246 |
+
customized=None)
|
247 |
+
conditions = conditions + null_conditions
|
248 |
+
tokenized_conditions = self.condition_provider.tokenize(conditions)
|
249 |
+
condition_tensors = self.condition_provider(tokenized_conditions)
|
250 |
+
return condition_tensors
|
251 |
+
|
252 |
+
def forward(self,
|
253 |
+
sequence: torch.Tensor,
|
254 |
+
condition_tensors: ConditionTensors) -> torch.Tensor:
|
255 |
+
"""Apply language model on sequence and conditions.
|
256 |
+
Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
|
257 |
+
S the sequence steps, return the logits with shape [B, card, K, S].
|
258 |
+
|
259 |
+
Args:
|
260 |
+
indices (torch.Tensor): Indices of the codes to model.
|
261 |
+
condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
|
262 |
+
tensors, see `conditions`.
|
263 |
+
Returns:
|
264 |
+
torch.Tensor: Logits.
|
265 |
+
"""
|
266 |
+
|
267 |
+
# import pdb; pdb.set_trace()
|
268 |
+
B, K, S = sequence.shape
|
269 |
+
assert K == self.code_depth, "Sequence shape must match the specified number of codebooks"
|
270 |
+
input_1 = self.emb[0](sequence[:, 0])
|
271 |
+
input_2 = sum([self.layer2_emb[k](sequence[:, k]) for k in range(1, K)])
|
272 |
+
fused_input1, fused_input2 = self.fuser(input_1, input_2, condition_tensors)
|
273 |
+
output = self.transformer(inputs_embeds=fused_input1,
|
274 |
+
use_cache=self._is_streaming,
|
275 |
+
past_key_values=self._streaming_state.get('past_key_values_1', None))
|
276 |
+
if self._is_streaming:
|
277 |
+
self._streaming_state['past_key_values_1'] = output.past_key_values
|
278 |
+
logits = output.logits # [B, S, card]
|
279 |
+
logits = logits.unsqueeze(1) # [B, 1, S, card]
|
280 |
+
|
281 |
+
# if self.out_norm:
|
282 |
+
# out = self.out_norm(out.to(self.out_norm.weight.data.dtype))
|
283 |
+
if K > 1:
|
284 |
+
fused_input2 = torch.cat([fused_input2, output.hidden_states], dim=-1)
|
285 |
+
fused_input2 = self.mlp(fused_input2)
|
286 |
+
output2 = self.transformer2(inputs_embeds=fused_input2,
|
287 |
+
use_cache=self._is_streaming,
|
288 |
+
past_key_values=self._streaming_state.get('past_key_values_2', None))
|
289 |
+
if self._is_streaming:
|
290 |
+
self._streaming_state['past_key_values_2'] = output2.past_key_values
|
291 |
+
|
292 |
+
res_logits = torch.stack([self.linears[k](output2.hidden_states) for k in range(K - 1)], dim=1) # [B, K, S, card] # [B, K, S, card]
|
293 |
+
logits = torch.cat([logits, res_logits], dim=1) # [B, K, S, card]
|
294 |
+
|
295 |
+
# remove the prefix from the model outputs
|
296 |
+
if len(self.fuser.fuse2cond['prepend']) > 0:
|
297 |
+
logits = logits[:, :, -S:, :]
|
298 |
+
|
299 |
+
return logits # [B, K, S, card]
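A shape-only usage sketch for `forward`; `cond` stands for the dict produced by `prepare_condition_tensors`, and the sizes are placeholders:

```python
B, K, S = 2, lm.code_depth, 100
seq = torch.randint(0, lm.code_size - 1, (B, K, S), device=device)
logits = lm(seq, cond)   # -> [B, K, S, lm.code_size]
```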
|
300 |
+
|
301 |
+
def compute_predictions(self,
|
302 |
+
codes: torch.Tensor,
|
303 |
+
condition_tensors: tp.Optional[ConditionTensors] = None,
|
304 |
+
**kwargs,
|
305 |
+
): # this function is called during training
|
306 |
+
"""Given an input tensor of codes [B, K, T] and list of conditions, runs the model
|
307 |
+
forward using the specified codes interleaving pattern.
|
308 |
+
|
309 |
+
Args:
|
310 |
+
codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
|
311 |
+
K the number of codebooks and T the number of timesteps.
|
312 |
+
condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
|
313 |
+
tensors, see `conditions`.
|
314 |
+
Returns:
|
315 |
+
LMOutput: Language model outputs
|
316 |
+
logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
|
317 |
+
i.e. the first item corresponds to logits to predict the first code, meaning that
|
318 |
+
no additional shifting of codes and logits is required.
|
319 |
+
mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
|
320 |
+
Given the specified interleaving strategies, parts of the logits and codes should
|
321 |
+
not be considered as valid predictions because of invalid context.
|
322 |
+
"""
|
323 |
+
B, K, T = codes.shape
|
324 |
+
codes = codes.contiguous()
|
325 |
+
# map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
|
326 |
+
pattern = self.pattern_provider.get_pattern(T)
|
327 |
+
sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
|
328 |
+
codes, self.special_token_id, keep_only_valid_steps=False
|
329 |
+
)
|
330 |
+
model = self if self._fsdp is None else self._fsdp
|
331 |
+
logits = model(sequence_codes, condition_tensors) # [B, K, S, card]
|
332 |
+
# map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
|
333 |
+
# and provide the corresponding mask over invalid positions of tokens
|
334 |
+
logits = logits.permute(0, 3, 1, 2) # [B, card, K, S]
|
335 |
+
# note: we use nans as special token to make it obvious if we feed unexpected logits
|
336 |
+
logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
|
337 |
+
logits, float('nan'), keep_only_valid_steps=False
|
338 |
+
)
|
339 |
+
logits = logits.permute(0, 2, 3, 1) # [B, K, T, card]
|
340 |
+
logits_mask = logits_mask[None, :, :].expand(B, -1, -1) # [K, T] -> [B, K, T]
|
341 |
+
|
342 |
+
return LMOutput(logits, logits_mask)
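A minimal cross-entropy sketch over the `compute_predictions` output; this is not the repo's training loop (which is not part of this diff), only an illustration of how the returned mask is meant to be used:

```python
out = lm.compute_predictions(codes, condition_tensors)   # codes: [B, K, T]
valid = out.mask.bool()                                   # drop pattern-invalid steps
loss = torch.nn.functional.cross_entropy(out.logits[valid], codes[valid])
```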
|
343 |
+
|
344 |
+
@torch.no_grad()
|
345 |
+
def generate(self, #
|
346 |
+
# conditions: tp.List[ConditioningAttributes] = [],
|
347 |
+
texts = None,
|
348 |
+
descriptions = None,
|
349 |
+
audio_qt_embs = None,
|
350 |
+
num_samples: tp.Optional[int] = None,
|
351 |
+
max_gen_len: int = 256,
|
352 |
+
use_sampling: bool = True,
|
353 |
+
temp: float = 1.0,
|
354 |
+
top_k: int = 250,
|
355 |
+
top_p: float = 0.0,
|
356 |
+
cfg_coef: tp.Optional[float] = None,
|
357 |
+
check: bool = False,
|
358 |
+
record_tokens: bool = True,
|
359 |
+
record_window: int = 150
|
360 |
+
) -> torch.Tensor:
|
361 |
+
"""Generate tokens sampling from the model given a prompt or unconditionally. Generation can
|
362 |
+
be performed in a greedy fashion or using sampling with top-k and top-p strategies.
|
363 |
+
|
364 |
+
Args:
|
365 |
+
texts (list of str, optional): Text prompts fed to the 'description' conditioner, one per sample.
|
366 |
+
descriptions (list of str, optional): Additional texts fed to the 'type_info' conditioner.
audio_qt_embs (list of torch.Tensor, optional): Prompt audio token sequences fed to the 'prompt_audio' conditioner.
|
367 |
+
num_samples (int, optional): Number of samples to generate when no prompt and no conditions are given.
|
368 |
+
max_gen_len (int): Maximum generation length.
|
369 |
+
use_sampling (bool): Whether to use a sampling strategy or not.
|
370 |
+
temp (float): Sampling temperature.
|
371 |
+
top_k (int): K for "top-k" sampling.
|
372 |
+
top_p (float): P for "top-p" sampling.
|
373 |
+
cfg_coef (float, optional): Classifier-free guidance coefficient.
|
374 |
+
check (bool): Whether to apply further checks on generated sequence.
|
375 |
+
record_tokens (bool): Whether to keep a pool of recently sampled tokens and penalize their re-sampling.
record_window (int): Size of that recent-token window.
|
376 |
+
Returns:
|
377 |
+
torch.Tensor: Generated tokens.
|
378 |
+
"""
|
379 |
+
assert not self.training, "generation shouldn't be used in training mode."
|
380 |
+
first_param = next(iter(self.parameters()))
|
381 |
+
device = first_param.device
|
382 |
+
|
383 |
+
# 1) Check input shapes are consistent
|
384 |
+
possible_num_samples = []
|
385 |
+
if num_samples is not None:
|
386 |
+
possible_num_samples.append(num_samples)
|
387 |
+
elif texts:
|
388 |
+
possible_num_samples.append(len(texts))
|
389 |
+
elif audio_qt_embs:
|
390 |
+
possible_num_samples.append(len(audio_qt_embs))
|
391 |
+
else:
|
392 |
+
possible_num_samples.append(1)
|
393 |
+
assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
|
394 |
+
num_samples = possible_num_samples[0]
|
395 |
+
condition_tensors = self.prepare_condition_tensors(batch_size=1, text=texts, descriptions=descriptions, audio_qt_emb=audio_qt_embs, prepare_null_condition=True)
|
396 |
+
# 3) Prepare token pool
|
397 |
+
record_token_pool = None
|
398 |
+
if record_tokens:
|
399 |
+
record_token_pool = []
|
400 |
+
|
401 |
+
# 4) set up startoff patterns
|
402 |
+
start_offset = 0
|
403 |
+
assert start_offset < max_gen_len, f"{start_offset}, {max_gen_len}"
|
404 |
+
pattern = self.pattern_provider.get_pattern(max_gen_len)
|
405 |
+
# this token is used as default value for codes that are not generated yet
|
406 |
+
unknown_token = -1
|
407 |
+
# we generate codes up to the max_gen_len that will be mapped to the pattern sequence
|
408 |
+
B = num_samples
|
409 |
+
gen_codes = torch.full((B, self.code_depth, max_gen_len),
|
410 |
+
unknown_token, dtype=torch.long, device=device)
|
411 |
+
# create the gen_sequence with proper interleaving from the pattern: [B, K, S]
|
412 |
+
gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
|
413 |
+
output_codes = torch.full_like(gen_sequence, self.code_size)
|
414 |
+
# retrieve the start_offset in the sequence:
|
415 |
+
# it is the first sequence step that contains the `start_offset` timestep
|
416 |
+
start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
|
417 |
+
assert start_offset_sequence is not None
|
418 |
+
is_end = torch.zeros((B, self.code_depth, 1)).bool().to(device)
|
419 |
+
ignore_tokens = audio_qt_embs[0][0]
|
420 |
+
# 5) auto-regressive sampling
|
421 |
+
with self.streaming():
|
422 |
+
gen_sequence_len = gen_sequence.shape[-1] # gen_sequence shape is [B, K, S]
|
423 |
+
prev_offset = 0
|
424 |
+
for offset in range(start_offset_sequence, gen_sequence_len):
|
425 |
+
# get current sequence (note that the streaming API is providing the caching over previous offsets)
|
426 |
+
curr_sequence = gen_sequence[..., prev_offset:offset]
|
427 |
+
curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
|
428 |
+
if check:
|
429 |
+
# check coherence between mask and sequence
|
430 |
+
assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
|
431 |
+
# should never happen as gen_sequence is filled progressively
|
432 |
+
assert not (curr_sequence == unknown_token).any()
|
433 |
+
# sample next token from the model, next token shape is [B, K, 1]
|
434 |
+
next_token = self._sample_next_token(
|
435 |
+
curr_sequence, condition_tensors, use_sampling, temp, top_k, top_p,
|
436 |
+
cfg_coef=cfg_coef,
|
437 |
+
sampled_token_pool=record_token_pool[-record_window:] if record_tokens else None,
|
438 |
+
ignore_tokens = ignore_tokens
|
439 |
+
)
|
440 |
+
# ensure the tokens that should be masked are properly set to special_token_id
|
441 |
+
# as the model never output special_token_id
|
442 |
+
valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
|
443 |
+
next_token[~valid_mask] = self.special_token_id
|
444 |
+
# check EOS id: once a stream has ended, keep emitting the special token
|
445 |
+
next_token[is_end] = self.special_token_id
|
446 |
+
is_end = is_end | (next_token == self.eos_token_id)
|
447 |
+
# ensure we don't overwrite prompt tokens, we only write over unknown tokens
|
448 |
+
# (then mask tokens should be left as is as well, which is correct)
|
449 |
+
gen_sequence[..., offset:offset+1] = torch.where(
|
450 |
+
gen_sequence[..., offset:offset+1] == unknown_token,
|
451 |
+
next_token, gen_sequence[..., offset:offset+1])
|
452 |
+
|
453 |
+
# record sampled tokens in a window
|
454 |
+
if record_tokens:
|
455 |
+
record_token_pool.append(next_token.squeeze())
|
456 |
+
if torch.all(is_end):
|
457 |
+
gen_sequence = gen_sequence[..., :offset+1]
|
458 |
+
break
|
459 |
+
|
460 |
+
prev_offset = offset
|
461 |
+
|
462 |
+
# ensure sequence has been entirely filled
|
463 |
+
assert not (gen_sequence == unknown_token).any()
|
464 |
+
max_gen_len = gen_sequence.shape[-1]
|
465 |
+
output_codes[..., :max_gen_len] = gen_sequence
|
466 |
+
# ensure gen_sequence pattern and mask are matching
|
467 |
+
# which means the gen_sequence is valid according to the pattern
|
468 |
+
# assert (gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence,
|
469 |
+
# self.special_token_id)
|
470 |
+
# ).all()
|
471 |
+
# get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
|
472 |
+
out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(output_codes, special_token=unknown_token)
|
473 |
+
# sanity checks over the returned codes and corresponding masks
|
474 |
+
assert (out_codes != unknown_token).all()
|
475 |
+
assert (out_mask == 1).all()
|
476 |
+
# ensure the returned codes are all valid
|
477 |
+
assert (out_codes >= 0).all() and (out_codes <= self.code_size).all()
|
478 |
+
return out_codes
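A hedged inference sketch for `generate`; every argument value is a placeholder and `prompt_codes` stands for a prompt-audio token tensor laid out the way `prepare_condition_tensors` expects:

```python
with torch.no_grad():
    out_codes = lm.generate(
        texts=["an upbeat pop song with female vocals"],
        descriptions=["pop"],
        audio_qt_embs=[prompt_codes],   # placeholder prompt tokens
        max_gen_len=1500,
        temp=0.9,
        top_k=50,
        cfg_coef=1.5,
    )                                    # -> [B, code_depth, T_out]
```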
|
479 |
+
|
480 |
+
def _sample_next_token(self,
|
481 |
+
sequence: torch.Tensor,
|
482 |
+
condition_tensors: ConditionTensors,
|
483 |
+
use_sampling: bool = False,
|
484 |
+
temp: float = 1.0,
|
485 |
+
top_k: int = 0,
|
486 |
+
top_p: float = 0.0,
|
487 |
+
cfg_coef: tp.Optional[float] = None,
|
488 |
+
sampled_token_pool: tp.Optional[list] = None,
|
489 |
+
ignore_tokens: tp.Optional[torch.Tensor] = torch.tensor([])) -> torch.Tensor:
|
490 |
+
"""Sample next token from the model given a sequence and a set of conditions. The model supports
|
491 |
+
multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
|
492 |
+
|
493 |
+
Args:
|
494 |
+
sequence (torch.Tensor): Current sequence of shape [B, K, S]
|
495 |
+
with K corresponding to the number of codebooks and S the number of sequence steps.
|
496 |
+
S = 1 in streaming mode, except for the first step that contains a bigger prompt.
|
497 |
+
condition_tensors (dict[str, ConditionType): Set of conditions. If CFG is used,
|
498 |
+
should be twice the batch size, being the concatenation of the conditions + null conditions.
|
499 |
+
use_sampling (bool): Whether to use a sampling strategy or not.
|
500 |
+
temp (float): Sampling temperature.
|
501 |
+
top_k (int): K for "top-k" sampling.
|
502 |
+
top_p (float): P for "top-p" sampling.
|
503 |
+
cfg_coef (float, optional): classifier free guidance coefficient
|
504 |
+
Returns:
|
505 |
+
next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
|
506 |
+
"""
|
507 |
+
# import pdb; pdb.set_trace()
|
508 |
+
B = sequence.shape[0]
|
509 |
+
cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
|
510 |
+
model = self if self._fsdp is None else self._fsdp
|
511 |
+
|
512 |
+
# Preparing for CFG, predicting both conditional and unconditional logits.
|
513 |
+
sequence = torch.cat([sequence, sequence], dim=0)
|
514 |
+
all_logits = model(sequence, condition_tensors=condition_tensors)
|
515 |
+
cond_logits, uncond_logits = all_logits.split(B, dim=0) # [B, K, T, card]
|
516 |
+
logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
|
517 |
+
|
518 |
+
logits = logits.permute(0, 1, 3, 2) # [B, K, card, T]
|
519 |
+
logits = logits[..., -1] # [B x K x card]
|
520 |
+
|
521 |
+
# add punishment to pre-sampled tokens
|
522 |
+
if sampled_token_pool is not None and len(sampled_token_pool) > 0:
|
523 |
+
sampled_token_pool = torch.stack(sampled_token_pool, -1) # [K, T]
|
524 |
+
for q in range(self.code_depth):
|
525 |
+
# q_count = torch.bincount(sampled_token_pool)
|
526 |
+
q_count = torch.bincount(torch.unique(sampled_token_pool[q]))
|
527 |
+
tmp = min(q_count.shape[-1], self.code_size - 1)
|
528 |
+
logits[:, q, :tmp] /= (1.1 ** q_count[:tmp])
|
529 |
+
|
530 |
+
# Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
|
531 |
+
if(ignore_tokens is not None):
|
532 |
+
logits[0][0][ignore_tokens.to(torch.int)] = float('-inf')
|
533 |
+
if use_sampling and temp > 0.0:
|
534 |
+
probs = torch.softmax(logits / temp, dim=-1)
|
535 |
+
if top_p > 0.0:
|
536 |
+
next_token = sample_top_p(probs, p=top_p)
|
537 |
+
elif top_k > 0:
|
538 |
+
next_token_first = sample_top_k(probs[:,[0],:], k=top_k)
|
539 |
+
next_token_res = sample_top_k(probs[:,1:,:], k=1)
|
540 |
+
next_token = torch.cat([next_token_first,next_token_res], dim = 1)
|
541 |
+
else:
|
542 |
+
next_token = multinomial(probs, num_samples=1)
|
543 |
+
else:
|
544 |
+
next_token = torch.argmax(logits, dim=-1, keepdim=True)
|
545 |
+
|
546 |
+
return next_token
|
codeclm/modules/conditioners.py
ADDED
@@ -0,0 +1,883 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import typing as tp
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from dataclasses import dataclass, field, fields
|
6 |
+
from itertools import chain
|
7 |
+
import warnings
|
8 |
+
import torch.nn.functional as F
|
9 |
+
from torch.nn.utils.rnn import pad_sequence
|
10 |
+
from codeclm.utils.utils import length_to_mask, collate
|
11 |
+
from codeclm.modules.streaming import StreamingModule
|
12 |
+
from collections import defaultdict
|
13 |
+
from copy import deepcopy
|
14 |
+
ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask
|
15 |
+
|
16 |
+
# ================================================================
|
17 |
+
# Condition and Condition attributes definitions
|
18 |
+
# ================================================================
|
19 |
+
class AudioCondition(tp.NamedTuple):
|
20 |
+
wav: torch.Tensor
|
21 |
+
length: torch.Tensor
|
22 |
+
sample_rate: tp.List[int]
|
23 |
+
path: tp.List[tp.Optional[str]] = []
|
24 |
+
seek_time: tp.List[tp.Optional[float]] = []
|
25 |
+
|
26 |
+
@dataclass
|
27 |
+
class ConditioningAttributes:
|
28 |
+
text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
|
29 |
+
audio: tp.Dict[str, AudioCondition] = field(default_factory=dict)
|
30 |
+
|
31 |
+
def __getitem__(self, item):
|
32 |
+
return getattr(self, item)
|
33 |
+
|
34 |
+
@property
|
35 |
+
def text_attributes(self):
|
36 |
+
return self.text.keys()
|
37 |
+
|
38 |
+
@property
|
39 |
+
def audio_attributes(self):
|
40 |
+
return self.audio.keys()
|
41 |
+
|
42 |
+
@property
|
43 |
+
def attributes(self):
|
44 |
+
return {
|
45 |
+
"text": self.text_attributes,
|
46 |
+
"audio": self.audio_attributes,
|
47 |
+
}
|
48 |
+
|
49 |
+
def to_flat_dict(self):
|
50 |
+
return {
|
51 |
+
**{f"text.{k}": v for k, v in self.text.items()},
|
52 |
+
**{f"audio.{k}": v for k, v in self.audio.items()},
|
53 |
+
}
|
54 |
+
|
55 |
+
@classmethod
|
56 |
+
def from_flat_dict(cls, x):
|
57 |
+
out = cls()
|
58 |
+
for k, v in x.items():
|
59 |
+
kind, att = k.split(".")
|
60 |
+
out[kind][att] = v
|
61 |
+
return out
|
62 |
+
|
63 |
+
# ================================================================
|
64 |
+
# Conditioner (tokenize and encode raw conditions) definitions
|
65 |
+
# ================================================================
|
66 |
+
|
67 |
+
class BaseConditioner(nn.Module):
|
68 |
+
"""Base model for all conditioner modules.
|
69 |
+
We allow the output dim to be different than the hidden dim for two reasons:
|
70 |
+
1) keep our LUTs small when the vocab is large;
|
71 |
+
2) make all condition dims consistent.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
dim (int): Hidden dim of the model.
|
75 |
+
output_dim (int): Output dim of the conditioner.
|
76 |
+
"""
|
77 |
+
def __init__(self, dim: int, output_dim: int, input_token = False, padding_idx=0):
|
78 |
+
super().__init__()
|
79 |
+
self.dim = dim
|
80 |
+
self.output_dim = output_dim
|
81 |
+
if input_token:
|
82 |
+
self.output_proj = nn.Embedding(dim, output_dim, padding_idx)
|
83 |
+
else:
|
84 |
+
self.output_proj = nn.Linear(dim, output_dim)
|
85 |
+
|
86 |
+
def tokenize(self, *args, **kwargs) -> tp.Any:
|
87 |
+
"""Should be any part of the processing that will lead to a synchronization
|
88 |
+
point, e.g. BPE tokenization with transfer to the GPU.
|
89 |
+
|
90 |
+
The returned value will be saved and return later when calling forward().
|
91 |
+
"""
|
92 |
+
raise NotImplementedError()
|
93 |
+
|
94 |
+
def forward(self, inputs: tp.Any) -> ConditionType:
|
95 |
+
"""Gets input that should be used as conditioning (e.g, genre, description or a waveform).
|
96 |
+
Outputs a ConditionType, after the input data was embedded as a dense vector.
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
ConditionType:
|
100 |
+
- A tensor of size [B, T, D] where B is the batch size, T is the length of the
|
101 |
+
output embedding and D is the dimension of the embedding.
|
102 |
+
- And a mask indicating where the padding tokens.
|
103 |
+
"""
|
104 |
+
raise NotImplementedError()
|
105 |
+
|
106 |
+
class TextConditioner(BaseConditioner):
|
107 |
+
...
|
108 |
+
|
109 |
+
|
110 |
+
class PhonemeTokenizerConditioner(TextConditioner):
|
111 |
+
def __init__(self,
|
112 |
+
output_dim: int,
|
113 |
+
vocab_list,
|
114 |
+
max_len = 600,
|
115 |
+
max_sentence_per_structure = 50,
|
116 |
+
structure_tokens=None,
|
117 |
+
structure_split_tokens=[','],
|
118 |
+
sentence_split_tokens=['.'],
|
119 |
+
mode='sum',
|
120 |
+
structure_output_dim = 64,
|
121 |
+
sentence_output_dim = 64,
|
122 |
+
max_duration = 120,
|
123 |
+
):
|
124 |
+
|
125 |
+
self.vocab_list = vocab_list
|
126 |
+
self.max_len = max_len
|
127 |
+
self.mode = mode
|
128 |
+
self.max_sentence_per_structure = max_sentence_per_structure
|
129 |
+
voc_size = len(self.vocab_list)
|
130 |
+
|
131 |
+
if structure_tokens is None:
|
132 |
+
structure_tokens = [i for i in vocab_list if len(i) > 1 and i[0] == '[' and i[-1] == ']']
|
133 |
+
self.structure_token_ids = [vocab_list.index(i) for i in structure_tokens if i in vocab_list]
|
134 |
+
self.structure_split_token_ids = [vocab_list.index(i) for i in structure_split_tokens]
|
135 |
+
self.sentence_split_token_ids = [vocab_list.index(i) for i in sentence_split_tokens]
|
136 |
+
|
137 |
+
# here initialize a output_proj (nn.Embedding) layer
|
138 |
+
# By default the first vocab is "" (null)
|
139 |
+
if mode == 'sum':
|
140 |
+
content_output_dim = output_dim
|
141 |
+
sentence_output_dim = output_dim
|
142 |
+
structure_output_dim = output_dim
|
143 |
+
else: # concat'
|
144 |
+
raise NotImplementedError("concat 模式还未实现")
|
145 |
+
# content_output_dim = output_dim - sentence_output_dim - structure_output_dim # by default
|
146 |
+
|
147 |
+
super().__init__(voc_size, content_output_dim, input_token=True, padding_idx=0)
|
148 |
+
self.special_emb = nn.Embedding(voc_size, structure_output_dim, padding_idx=0)
|
149 |
+
|
150 |
+
self.blank_emb = nn.Parameter(torch.zeros(1, output_dim), requires_grad=False)
|
151 |
+
|
152 |
+
# the first index is "empty structure" token
|
153 |
+
self.sentence_idx_in_structure_emb = nn.Embedding(max_sentence_per_structure, sentence_output_dim)
|
154 |
+
self.sentence_reidx_in_structure_emb = nn.Embedding(max_sentence_per_structure, sentence_output_dim)
|
155 |
+
|
156 |
+
print("max_len", self.max_len)
|
157 |
+
print(self.structure_token_ids)
|
158 |
+
|
159 |
+
self.resolution = max_duration / max_len # e.g., 120 / 600 = 0.2s
|
160 |
+
print(self.__class__, f"resolution = {self.resolution}")
|
161 |
+
|
162 |
+
def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
|
163 |
+
inputs = []
|
164 |
+
for xx in x:
|
165 |
+
xx = '' if xx is None else xx
|
166 |
+
vocab_id = [self.vocab_list.index(item) for item in xx.split(" ") if item in self.vocab_list]
|
167 |
+
inputs.append(torch.tensor(vocab_id).long()) # [T]
|
168 |
+
return inputs
|
169 |
+
|
170 |
+
|
171 |
+
def forward(self, batch_tokens: tp.List, structure_dur = None) -> ConditionType:
|
172 |
+
"""
|
173 |
+
Encode token_id into three types of embeddings:
|
174 |
+
1) content embedding: phoneme only (or meaningful contents to be sung out)
|
175 |
+
2) structure embedding: structure / separation embeddings, including structures (verse/chorus/...), separators (. / ,)
|
176 |
+
The two above share the same embedding layer, can be changed to separate embedding layers.
|
177 |
+
3) sentence_idx embedding (per structure):
|
178 |
+
"""
|
179 |
+
embeds_batch = []
|
180 |
+
for b in range(len(batch_tokens)):
|
181 |
+
tokens = batch_tokens[b]
|
182 |
+
content_tokens = torch.zeros_like(tokens)
|
183 |
+
special_tokens = torch.zeros_like(tokens)
|
184 |
+
sentence_idx_in_structure_tokens = torch.zeros_like(tokens)
|
185 |
+
sentence_reidx_in_structure_tokens = torch.zeros_like(tokens)
|
186 |
+
|
187 |
+
current_sentence_in_structure_idx = 1
|
188 |
+
current_structure = 0
|
189 |
+
for i in range(tokens.shape[-1]):
|
190 |
+
token = tokens[i]
|
191 |
+
if token in self.structure_token_ids: # structure token
|
192 |
+
# only update structure token, leave content and sentence index token null (default 0)
|
193 |
+
special_tokens[i] = token
|
194 |
+
content_tokens[i] = token
|
195 |
+
current_structure = token
|
196 |
+
current_sentence_in_structure_idx = 1
|
197 |
+
sentence_idx_in_structure_tokens[i] = 0
|
198 |
+
|
199 |
+
elif token in self.sentence_split_token_ids: # utterance split token
|
200 |
+
# only update structure token, leave content and sentence index token null (default 0)
|
201 |
+
# add up sentence index
|
202 |
+
special_tokens[i] = current_structure
|
203 |
+
content_tokens[i] = token
|
204 |
+
sentence_idx_in_structure_tokens[i] = min(current_sentence_in_structure_idx, self.max_sentence_per_structure - 1)
|
205 |
+
current_sentence_in_structure_idx += 1
|
206 |
+
|
207 |
+
elif token in self.structure_split_token_ids: # structure split token
|
208 |
+
# update structure token (current structure), content token (current token),
|
209 |
+
# blank index token
|
210 |
+
content_tokens[i] = token
|
211 |
+
special_tokens[i] = current_structure
|
212 |
+
sentence_idx_in_structure_tokens[i] = sentence_idx_in_structure_tokens[i-1]
|
213 |
+
else: # content tokens
|
214 |
+
content_tokens[i] = token
|
215 |
+
special_tokens[i] = current_structure
|
216 |
+
sentence_idx_in_structure_tokens[i] = min(current_sentence_in_structure_idx, self.max_sentence_per_structure - 1)
|
217 |
+
# 反推
|
218 |
+
current_sentence_num = sentence_idx_in_structure_tokens[-1]
|
219 |
+
for i in range(tokens.shape[-1]-1,-1,-1):
|
220 |
+
if current_sentence_num != 0:
|
221 |
+
sentence_reidx_in_structure_tokens[i] = min(current_sentence_num + 1 - sentence_idx_in_structure_tokens[i], self.max_sentence_per_structure - 1)
|
222 |
+
if sentence_idx_in_structure_tokens[i] == 0 and i > 0:
|
223 |
+
current_sentence_num = sentence_idx_in_structure_tokens[i-1]
|
224 |
+
|
225 |
+
# print("tokens", tokens.max(), tokens.min())
|
226 |
+
# print("special tokens", special_tokens.max(), special_tokens.min())
|
227 |
+
# print("sentence idx in structure", sentence_idx_in_structure_tokens.max(), sentence_idx_in_structure_tokens.min())
|
228 |
+
device = self.output_proj.weight.device
|
229 |
+
|
230 |
+
# import pdb; pdb.set_trace()
|
231 |
+
content_embeds = self.output_proj(content_tokens.to(device)) # [T, N]
|
232 |
+
structure_embeds = self.output_proj(special_tokens.to(device))
|
233 |
+
# sentence_idx_embeds = self.sentence_idx_in_structure_emb(sentence_idx_in_structure_tokens.to(device))
|
234 |
+
sentence_idx_embeds = self.sentence_idx_in_structure_emb(sentence_idx_in_structure_tokens.to(device)) + self.sentence_reidx_in_structure_emb(sentence_reidx_in_structure_tokens.to(device))
|
235 |
+
|
236 |
+
if self.mode == 'sum':
|
237 |
+
embeds = content_embeds + structure_embeds + sentence_idx_embeds
|
238 |
+
else:
|
239 |
+
embeds = torch.cat((content_embeds, structure_embeds, sentence_idx_embeds), -1) # [T, N]
|
240 |
+
embeds_batch.append(embeds)
|
241 |
+
|
242 |
+
# set batch_size = 1, [B, T, N]
|
243 |
+
if self.max_len is not None:
|
244 |
+
max_len = self.max_len
|
245 |
+
else:
|
246 |
+
max_len = max([e.shape[0] for e in embeds_batch])
|
247 |
+
embeds, mask = self.pad_2d_tensor(embeds_batch, max_len)
|
248 |
+
|
249 |
+
return embeds, embeds, mask
|
250 |
+
|
251 |
+
|
252 |
+
def pad_2d_tensor(self, xs, max_len):
|
253 |
+
new_tensor = []
|
254 |
+
new_mask = []
|
255 |
+
for x in xs:
|
256 |
+
seq_len, dim = x.size()
|
257 |
+
pad_len = max_len - seq_len
|
258 |
+
|
259 |
+
if pad_len > 0:
|
260 |
+
pad_tensor = self.blank_emb.repeat(pad_len, 1).to(x.device) # T, D
|
261 |
+
padded_tensor = torch.cat([x, pad_tensor], dim=0)
|
262 |
+
mask = torch.cat((torch.ones_like(x[:, 0]),
|
263 |
+
torch.zeros_like(pad_tensor[:, 0])), 0) # T
|
264 |
+
elif pad_len < 0:
|
265 |
+
padded_tensor = x[:max_len]
|
266 |
+
mask = torch.ones_like(padded_tensor[:, 0])
|
267 |
+
else:
|
268 |
+
padded_tensor = x
|
269 |
+
mask = torch.ones_like(x[:, 0])
|
270 |
+
|
271 |
+
new_tensor.append(padded_tensor)
|
272 |
+
new_mask.append(mask)
|
273 |
+
# [B, T, D] & [B, T]
|
274 |
+
return torch.stack(new_tensor, 0), torch.stack(new_mask, 0)
|
275 |
+
|
276 |
+
|
277 |
+
class QwTokenizerConditioner(TextConditioner):
|
278 |
+
def __init__(self, output_dim: int,
|
279 |
+
token_path = "",
|
280 |
+
max_len = 300,
|
281 |
+
add_token_list=[]): #""
|
282 |
+
from transformers import Qwen2Tokenizer
|
283 |
+
self.text_tokenizer = Qwen2Tokenizer.from_pretrained(token_path)
|
284 |
+
if add_token_list != []:
|
285 |
+
self.text_tokenizer.add_tokens(add_token_list, special_tokens=True)
|
286 |
+
voc_size = len(self.text_tokenizer.get_vocab())
|
287 |
+
# here initialize a output_proj (nn.Embedding) layer
|
288 |
+
super().__init__(voc_size, output_dim, input_token=True, padding_idx=151643)
|
289 |
+
self.max_len = max_len
|
290 |
+
self.padding_idx =' <|endoftext|>'
|
291 |
+
|
292 |
+
vocab = self.text_tokenizer.get_vocab()
|
293 |
+
# struct是全部的结构
|
294 |
+
struct_tokens = [i for i in add_token_list if i[0]=='[' and i[-1]==']']
|
295 |
+
self.struct_token_ids = [vocab[i] for i in struct_tokens]
|
296 |
+
self.pad_token_idx = 151643
|
297 |
+
|
298 |
+
self.structure_emb = nn.Embedding(200, output_dim, padding_idx=0)
|
299 |
+
# self.split_token_id = vocab["."]
|
300 |
+
print("all structure tokens: ", {self.text_tokenizer.convert_ids_to_tokens(i):i for i in self.struct_token_ids})
|
301 |
+
|
302 |
+
def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
|
303 |
+
x = ['<|im_start|>' + xi if xi is not None else "<|im_start|>" for xi in x]
|
304 |
+
# x = [xi if xi is not None else "" for xi in x]
|
305 |
+
inputs = self.text_tokenizer(x, return_tensors="pt", padding=True)
|
306 |
+
return inputs
|
307 |
+
|
308 |
+
def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
|
309 |
+
"""
|
310 |
+
Add structure embeddings of {verse, chorus, bridge} to text/lyric tokens that
|
311 |
+
belong to these structures accordingly,
|
312 |
+
Then delete or keep these structure embeddings.
|
313 |
+
"""
|
314 |
+
mask = inputs['attention_mask']
|
315 |
+
tokens = inputs['input_ids']
|
316 |
+
B = tokens.shape[0]
|
317 |
+
is_sp_embed = torch.any(torch.stack([tokens == i for i in self.struct_token_ids], dim=-1),dim=-1)
|
318 |
+
|
319 |
+
tp_cover_range = torch.zeros_like(tokens)
|
320 |
+
for b, is_sp in enumerate(is_sp_embed):
|
321 |
+
sp_list = torch.where(is_sp)[0].tolist()
|
322 |
+
sp_list.append(mask[b].sum())
|
323 |
+
for i, st in enumerate(sp_list[:-1]):
|
324 |
+
tp_cover_range[b, st: sp_list[i+1]] = tokens[b, st] - 151645
|
325 |
+
|
326 |
+
if self.max_len is not None:
|
327 |
+
if inputs['input_ids'].shape[-1] > self.max_len:
|
328 |
+
warnings.warn(f"Max len limit ({self.max_len}) Exceed! \
|
329 |
+
{[self.text_tokenizer.convert_ids_to_tokens(i.tolist()) for i in tokens]} will be cut!")
|
330 |
+
tokens = self.pad_2d_tensor(tokens, self.max_len, self.pad_token_idx).to(self.output_proj.weight.device)
|
331 |
+
mask = self.pad_2d_tensor(mask, self.max_len, 0).to(self.output_proj.weight.device)
|
332 |
+
tp_cover_range = self.pad_2d_tensor(tp_cover_range, self.max_len, 0).to(self.output_proj.weight.device)
|
333 |
+
device = self.output_proj.weight.device
|
334 |
+
content_embeds = self.output_proj(tokens.to(device))
|
335 |
+
structure_embeds = self.structure_emb(tp_cover_range.to(device))
|
336 |
+
|
337 |
+
embeds = content_embeds + structure_embeds
|
338 |
+
return embeds, embeds, mask
|
339 |
+
|
340 |
+
def pad_2d_tensor(self, x, max_len, pad_id):
|
341 |
+
batch_size, seq_len = x.size()
|
342 |
+
pad_len = max_len - seq_len
|
343 |
+
|
344 |
+
if pad_len > 0:
|
345 |
+
pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
|
346 |
+
padded_tensor = torch.cat([x, pad_tensor], dim=1)
|
347 |
+
elif pad_len < 0:
|
348 |
+
padded_tensor = x[:, :max_len]
|
349 |
+
else:
|
350 |
+
padded_tensor = x
|
351 |
+
|
352 |
+
return padded_tensor
|
353 |
+
|
354 |
+
|
355 |
+
class QwTextConditioner(TextConditioner):
|
356 |
+
def __init__(self, output_dim: int,
|
357 |
+
token_path = "",
|
358 |
+
max_len = 300): #""
|
359 |
+
|
360 |
+
from transformers import Qwen2Tokenizer
|
361 |
+
self.text_tokenizer = Qwen2Tokenizer.from_pretrained(token_path)
|
362 |
+
voc_size = len(self.text_tokenizer.get_vocab())
|
363 |
+
# here initialize a output_proj (nn.Embedding) layer
|
364 |
+
super().__init__(voc_size, output_dim, input_token=True, padding_idx=151643)
|
365 |
+
|
366 |
+
self.max_len = max_len
|
367 |
+
|
368 |
+
def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
|
369 |
+
x = ['<|im_start|>' + xi if xi is not None else "<|im_start|>" for xi in x]
|
370 |
+
inputs = self.text_tokenizer(x, return_tensors="pt", padding=True)
|
371 |
+
return inputs
|
372 |
+
|
373 |
+
def forward(self, inputs: tp.Dict[str, torch.Tensor], structure_dur = None) -> ConditionType:
|
374 |
+
"""
|
375 |
+
Add structure embeddings of {verse, chorus, bridge} to text/lyric tokens that
|
376 |
+
belong to these structures accordingly,
|
377 |
+
Then delete or keep these structure embeddings.
|
378 |
+
"""
|
379 |
+
mask = inputs['attention_mask']
|
380 |
+
tokens = inputs['input_ids']
|
381 |
+
|
382 |
+
if self.max_len is not None:
|
383 |
+
if inputs['input_ids'].shape[-1] > self.max_len:
|
384 |
+
warnings.warn(f"Max len limit ({self.max_len}) Exceed! \
|
385 |
+
{[self.text_tokenizer.convert_ids_to_tokens(i.tolist()) for i in tokens]} will be cut!")
|
386 |
+
tokens = self.pad_2d_tensor(tokens, self.max_len, 151643).to(self.output_proj.weight.device)
|
387 |
+
mask = self.pad_2d_tensor(mask, self.max_len, 0).to(self.output_proj.weight.device)
|
388 |
+
|
389 |
+
embeds = self.output_proj(tokens)
|
390 |
+
return embeds, embeds, mask
|
391 |
+
|
392 |
+
def pad_2d_tensor(self, x, max_len, pad_id):
|
393 |
+
batch_size, seq_len = x.size()
|
394 |
+
pad_len = max_len - seq_len
|
395 |
+
|
396 |
+
if pad_len > 0:
|
397 |
+
pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
|
398 |
+
padded_tensor = torch.cat([x, pad_tensor], dim=1)
|
399 |
+
elif pad_len < 0:
|
400 |
+
padded_tensor = x[:, :max_len]
|
401 |
+
else:
|
402 |
+
padded_tensor = x
|
403 |
+
|
404 |
+
return padded_tensor
|
405 |
+
|
406 |
+
|
407 |
+
class AudioConditioner(BaseConditioner):
|
408 |
+
...
|
409 |
+
|
410 |
+
class QuantizedEmbeddingConditioner(AudioConditioner):
|
411 |
+
def __init__(self, dim: int,
|
412 |
+
code_size: int,
|
413 |
+
code_depth: int,
|
414 |
+
max_len: int,
|
415 |
+
**kwargs):
|
416 |
+
super().__init__(dim, dim, input_token=True)
|
417 |
+
self.code_depth = code_depth
|
418 |
+
# add 1 for <s> token
|
419 |
+
self.emb = nn.ModuleList([nn.Embedding(code_size+2, dim, padding_idx=code_size+1) for _ in range(code_depth)])
|
420 |
+
# add End-Of-Text embedding
|
421 |
+
self.EOT_emb = nn.Parameter(torch.randn(1, dim), requires_grad=True)
|
422 |
+
self.layer2_EOT_emb = nn.Parameter(torch.randn(1, dim), requires_grad=True)
|
423 |
+
self.output_proj = None
|
424 |
+
self.max_len = max_len
|
425 |
+
self.vocab_size = code_size
|
426 |
+
|
427 |
+
def tokenize(self, x: AudioCondition) -> AudioCondition:
|
428 |
+
"""no extra ops"""
|
429 |
+
# wav, length, sample_rate, path, seek_time = x
|
430 |
+
# assert length is not None
|
431 |
+
return x #AudioCondition(wav, length, sample_rate, path, seek_time)
|
432 |
+
|
433 |
+
def forward(self, x: AudioCondition):
|
434 |
+
wav, lengths, *_ = x
|
435 |
+
B = wav.shape[0]
|
436 |
+
wav = wav.reshape(B, self.code_depth, -1).long()
|
437 |
+
if wav.shape[2] < self.max_len - 1:
|
438 |
+
wav = F.pad(wav, [0, self.max_len - 1 - wav.shape[2]], value=self.vocab_size+1)
|
439 |
+
else:
|
440 |
+
wav = wav[:, :, :self.max_len-1]
|
441 |
+
embeds1 = self.emb[0](wav[:, 0])
|
442 |
+
embeds1 = torch.cat((self.EOT_emb.unsqueeze(0).repeat(B, 1, 1),
|
443 |
+
embeds1), dim=1)
|
444 |
+
embeds2 = sum([self.emb[k](wav[:, k]) for k in range(1, self.code_depth)]) # B,T,D
|
445 |
+
embeds2 = torch.cat((self.layer2_EOT_emb.unsqueeze(0).repeat(B, 1, 1),
|
446 |
+
embeds2), dim=1)
|
447 |
+
lengths = lengths + 1
|
448 |
+
lengths = torch.clamp(lengths, max=self.max_len)
|
449 |
+
|
450 |
+
if lengths is not None:
|
451 |
+
mask = length_to_mask(lengths, max_len=embeds1.shape[1]).int() # type: ignore
|
452 |
+
else:
|
453 |
+
mask = torch.ones((B, self.code_depth), device=embeds1.device, dtype=torch.int)
|
454 |
+
return embeds1, embeds2, mask
|
455 |
+
|
456 |
+
|
457 |
+
# ================================================================
|
458 |
+
# Aggregate all conditions and corresponding conditioners
|
459 |
+
# ================================================================
|
460 |
+
class ConditionerProvider(nn.Module):
|
461 |
+
"""Prepare and provide conditions given all the supported conditioners.
|
462 |
+
|
463 |
+
Args:
|
464 |
+
conditioners (dict): Dictionary of conditioners.
|
465 |
+
device (torch.device or str, optional): Device for conditioners and output condition types.
|
466 |
+
"""
|
467 |
+
def __init__(self, conditioners: tp.Dict[str, BaseConditioner]):
|
468 |
+
super().__init__()
|
469 |
+
self.conditioners = nn.ModuleDict(conditioners)
|
470 |
+
|
471 |
+
@property
|
472 |
+
def text_conditions(self):
|
473 |
+
return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
|
474 |
+
|
475 |
+
@property
|
476 |
+
def audio_conditions(self):
|
477 |
+
return [k for k, v in self.conditioners.items() if isinstance(v, AudioConditioner)]
|
478 |
+
|
479 |
+
@property
|
480 |
+
def has_audio_condition(self):
|
481 |
+
return len(self.audio_conditions) > 0
|
482 |
+
|
483 |
+
def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
|
484 |
+
"""Match attributes/audios with existing conditioners in self, and compute tokenize them accordingly.
|
485 |
+
This should be called before starting any real GPU work to avoid synchronization points.
|
486 |
+
This will return a dict matching conditioner names to their arbitrary tokenized representations.
|
487 |
+
|
488 |
+
Args:
|
489 |
+
inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
|
490 |
+
text and audio conditions.
|
491 |
+
"""
|
492 |
+
assert all([isinstance(x, ConditioningAttributes) for x in inputs]), (
|
493 |
+
"Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]",
|
494 |
+
f" but types were {set([type(x) for x in inputs])}")
|
495 |
+
|
496 |
+
output = {}
|
497 |
+
text = self._collate_text(inputs)
|
498 |
+
audios = self._collate_audios(inputs)
|
499 |
+
|
500 |
+
assert set(text.keys() | audios.keys()).issubset(set(self.conditioners.keys())), (
|
501 |
+
f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
|
502 |
+
f"got {text.keys(), audios.keys()}")
|
503 |
+
|
504 |
+
for attribute, batch in chain(text.items(), audios.items()):
|
505 |
+
output[attribute] = self.conditioners[attribute].tokenize(batch)
|
506 |
+
return output
|
507 |
+
|
508 |
+
def forward(self, tokenized: tp.Dict[str, tp.Any], structure_dur = None) -> tp.Dict[str, ConditionType]:
|
509 |
+
"""Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
|
510 |
+
The output is for example:
|
511 |
+
{
|
512 |
+
"genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
|
513 |
+
"description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
|
514 |
+
...
|
515 |
+
}
|
516 |
+
|
517 |
+
Args:
|
518 |
+
tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
|
519 |
+
"""
|
520 |
+
output = {}
|
521 |
+
for attribute, inputs in tokenized.items():
|
522 |
+
if attribute == 'description' and structure_dur is not None:
|
523 |
+
condition1, condition2, mask = self.conditioners[attribute](inputs, structure_dur = structure_dur)
|
524 |
+
else:
|
525 |
+
condition1, condition2, mask = self.conditioners[attribute](inputs)
|
526 |
+
output[attribute] = (condition1, condition2, mask)
|
527 |
+
return output
|
528 |
+
|
529 |
+
def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
|
530 |
+
"""Given a list of ConditioningAttributes objects, compile a dictionary where the keys
|
531 |
+
are the attributes and the values are the aggregated input per attribute.
|
532 |
+
For example:
|
533 |
+
Input:
|
534 |
+
[
|
535 |
+
ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
|
536 |
+
ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, audio=...),
|
537 |
+
]
|
538 |
+
Output:
|
539 |
+
{
|
540 |
+
"genre": ["Rock", "Hip-hop"],
|
541 |
+
"description": ["A rock song with a guitar solo", "A hip-hop verse"]
|
542 |
+
}
|
543 |
+
|
544 |
+
Args:
|
545 |
+
samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
|
546 |
+
Returns:
|
547 |
+
dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch.
|
548 |
+
"""
|
549 |
+
out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
|
550 |
+
texts = [x.text for x in samples]
|
551 |
+
for text in texts:
|
552 |
+
for condition in self.text_conditions:
|
553 |
+
out[condition].append(text[condition])
|
554 |
+
return out
|
555 |
+
|
556 |
+
def _collate_audios(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, AudioCondition]:
|
557 |
+
"""Generate a dict where the keys are attributes by which we fetch similar audios,
|
558 |
+
and the values are Tensors of audios according to said attributes.
|
559 |
+
|
560 |
+
*Note*: by the time the samples reach this function, each sample should have some audios
|
561 |
+
inside the "audio" attribute. It should be either:
|
562 |
+
1. A real audio
|
563 |
+
2. A null audio due to the sample having no similar audios (nullified by the dataset)
|
564 |
+
3. A null audio due to it being dropped in a dropout module (nullified by dropout)
|
565 |
+
|
566 |
+
Args:
|
567 |
+
samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
|
568 |
+
Returns:
|
569 |
+
dict[str, WavCondition]: A dictionary mapping an attribute name to wavs.
|
570 |
+
"""
|
571 |
+
# import pdb; pdb.set_trace()
|
572 |
+
wavs = defaultdict(list)
|
573 |
+
lengths = defaultdict(list)
|
574 |
+
sample_rates = defaultdict(list)
|
575 |
+
paths = defaultdict(list)
|
576 |
+
seek_times = defaultdict(list)
|
577 |
+
out: tp.Dict[str, AudioCondition] = {}
|
578 |
+
|
579 |
+
for sample in samples:
|
580 |
+
for attribute in self.audio_conditions:
|
581 |
+
wav, length, sample_rate, path, seek_time = sample.audio[attribute]
|
582 |
+
assert wav.dim() == 3, f"Got wav with dim={wav.dim()}, but expected 3 [1, C, T]"
|
583 |
+
assert wav.size(0) == 1, f"Got wav [B, C, T] with shape={wav.shape}, but expected B == 1"
|
584 |
+
wavs[attribute].append(wav.flatten()) # [C*T]
|
585 |
+
lengths[attribute].append(length)
|
586 |
+
sample_rates[attribute].extend(sample_rate)
|
587 |
+
paths[attribute].extend(path)
|
588 |
+
seek_times[attribute].extend(seek_time)
|
589 |
+
|
590 |
+
# stack all wavs to a single tensor
|
591 |
+
for attribute in self.audio_conditions:
|
592 |
+
stacked_wav, _ = collate(wavs[attribute], dim=0)
|
593 |
+
out[attribute] = AudioCondition(
|
594 |
+
stacked_wav.unsqueeze(1),
|
595 |
+
torch.cat(lengths[attribute]), sample_rates[attribute],
|
596 |
+
paths[attribute], seek_times[attribute])
|
597 |
+
|
598 |
+
return out
|
599 |
+
|
600 |
+
|
601 |
+
class ConditionFuser(StreamingModule):
|
602 |
+
"""Condition fuser handles the logic to combine the different conditions
|
603 |
+
to the actual model input.
|
604 |
+
|
605 |
+
Args:
|
606 |
+
fuse2cond (tp.Dict[str, str]): A dictionary that says how to fuse
|
607 |
+
each condition. For example:
|
608 |
+
{
|
609 |
+
"prepend": ["description"],
|
610 |
+
"sum": ["genre", "bpm"],
|
611 |
+
}
|
612 |
+
"""
|
613 |
+
FUSING_METHODS = ["sum", "prepend"] #, "cross", "input_interpolate"] (not support in this simplest version)
|
614 |
+
|
615 |
+
def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]]):
|
616 |
+
super().__init__()
|
617 |
+
assert all([k in self.FUSING_METHODS for k in fuse2cond.keys()]
|
618 |
+
), f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
|
619 |
+
self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
|
620 |
+
self.cond2fuse: tp.Dict[str, str] = {}
|
621 |
+
for fuse_method, conditions in fuse2cond.items():
|
622 |
+
for condition in conditions:
|
623 |
+
self.cond2fuse[condition] = fuse_method
|
624 |
+
|
625 |
+
def forward(
|
626 |
+
self,
|
627 |
+
input1: torch.Tensor,
|
628 |
+
input2: torch.Tensor,
|
629 |
+
conditions: tp.Dict[str, ConditionType]
|
630 |
+
) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
|
631 |
+
"""Fuse the conditions to the provided model input.
|
632 |
+
|
633 |
+
Args:
|
634 |
+
input (torch.Tensor): Transformer input.
|
635 |
+
conditions (dict[str, ConditionType]): Dict of conditions.
|
636 |
+
Returns:
|
637 |
+
tuple[torch.Tensor, torch.Tensor]: The first tensor is the transformer input
|
638 |
+
after the conditions have been fused. The second output tensor is the tensor
|
639 |
+
used for cross-attention or None if no cross attention inputs exist.
|
640 |
+
"""
|
641 |
+
#import pdb; pdb.set_trace()
|
642 |
+
B, T, _ = input1.shape
|
643 |
+
|
644 |
+
if 'offsets' in self._streaming_state:
|
645 |
+
first_step = False
|
646 |
+
offsets = self._streaming_state['offsets']
|
647 |
+
else:
|
648 |
+
first_step = True
|
649 |
+
offsets = torch.zeros(input1.shape[0], dtype=torch.long, device=input1.device)
|
650 |
+
|
651 |
+
assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
|
652 |
+
f"given conditions contain unknown attributes for fuser, " \
|
653 |
+
f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
|
654 |
+
|
655 |
+
# if 'prepend' mode is used,
|
656 |
+
# the concatenation order will be the SAME with the conditions in config:
|
657 |
+
# prepend: ['description', 'prompt_audio'] (then goes the input)
|
658 |
+
fused_input_1 = input1
|
659 |
+
fused_input_2 = input2
|
660 |
+
for fuse_op in self.fuse2cond.keys():
|
661 |
+
fuse_op_conditions = self.fuse2cond[fuse_op]
|
662 |
+
if fuse_op == 'sum' and len(fuse_op_conditions) > 0:
|
663 |
+
for cond in fuse_op_conditions:
|
664 |
+
this_cond_1, this_cond_2, cond_mask = conditions[cond]
|
665 |
+
fused_input_1 += this_cond_1
|
666 |
+
fused_input_2 += this_cond_2
|
667 |
+
elif fuse_op == 'prepend' and len(fuse_op_conditions) > 0:
|
668 |
+
if not first_step:
|
669 |
+
continue
|
670 |
+
reverse_list = deepcopy(fuse_op_conditions)
|
671 |
+
reverse_list.reverse()
|
672 |
+
for cond in reverse_list:
|
673 |
+
this_cond_1, this_cond_2, cond_mask = conditions[cond]
|
674 |
+
fused_input_1 = torch.cat((this_cond_1, fused_input_1), dim=1) # concat along T dim
|
675 |
+
fused_input_2 = torch.cat((this_cond_2, fused_input_2), dim=1) # concat along T dim
|
676 |
+
elif fuse_op not in self.FUSING_METHODS:
|
677 |
+
raise ValueError(f"unknown op ({fuse_op})")
|
678 |
+
|
679 |
+
if self._is_streaming:
|
680 |
+
self._streaming_state['offsets'] = offsets + T
|
681 |
+
|
682 |
+
return fused_input_1, fused_input_2
|
683 |
+
|
684 |
+
|
685 |
+
|
686 |
+
# ================================================================
|
687 |
+
# Condition Dropout
|
688 |
+
# ================================================================
|
689 |
+
class DropoutModule(nn.Module):
|
690 |
+
"""Base module for all dropout modules."""
|
691 |
+
def __init__(self, seed: int = 1234):
|
692 |
+
super().__init__()
|
693 |
+
self.rng = torch.Generator()
|
694 |
+
self.rng.manual_seed(seed)
|
695 |
+
|
696 |
+
|
697 |
+
|
698 |
+
class ClassifierFreeGuidanceDropout(DropoutModule):
|
699 |
+
"""Classifier Free Guidance dropout.
|
700 |
+
All attributes are dropped with the same probability.
|
701 |
+
|
702 |
+
Args:
|
703 |
+
p (float): Probability to apply condition dropout during training.
|
704 |
+
seed (int): Random seed.
|
705 |
+
"""
|
706 |
+
def __init__(self, p: float, seed: int = 1234):
|
707 |
+
super().__init__(seed=seed)
|
708 |
+
self.p = p
|
709 |
+
|
710 |
+
def check(self, sample, condition_type, condition):
|
711 |
+
|
712 |
+
if condition_type not in ['text', 'audio']:
|
713 |
+
raise ValueError("dropout_condition got an unexpected condition type!"
|
714 |
+
f" expected 'text', 'audio' but got '{condition_type}'")
|
715 |
+
|
716 |
+
if condition not in getattr(sample, condition_type):
|
717 |
+
raise ValueError(
|
718 |
+
"dropout_condition received an unexpected condition!"
|
719 |
+
f" expected audio={sample.audio.keys()} and text={sample.text.keys()}"
|
720 |
+
f" but got '{condition}' of type '{condition_type}'!")
|
721 |
+
|
722 |
+
|
723 |
+
def get_null_wav(self, wav, sr=48000) -> AudioCondition:
|
724 |
+
out = wav * 0 + 16385
|
725 |
+
return AudioCondition(
|
726 |
+
wav=out,
|
727 |
+
length=torch.Tensor([0]).long(),
|
728 |
+
sample_rate=[sr],)
|
729 |
+
|
730 |
+
def dropout_condition(self,
|
731 |
+
sample: ConditioningAttributes,
|
732 |
+
condition_type: str,
|
733 |
+
condition: str) -> ConditioningAttributes:
|
734 |
+
"""Utility function for nullifying an attribute inside an ConditioningAttributes object.
|
735 |
+
If the condition is of type "wav", then nullify it using `nullify_condition` function.
|
736 |
+
If the condition is of any other type, set its value to None.
|
737 |
+
Works in-place.
|
738 |
+
"""
|
739 |
+
self.check(sample, condition_type, condition)
|
740 |
+
|
741 |
+
if condition_type == 'audio':
|
742 |
+
audio_cond = sample.audio[condition]
|
743 |
+
depth = audio_cond.wav.shape[1]
|
744 |
+
sample.audio[condition] = self.get_null_wav(audio_cond.wav, sr=audio_cond.sample_rate[0])
|
745 |
+
else:
|
746 |
+
sample.text[condition] = None
|
747 |
+
|
748 |
+
return sample
|
749 |
+
|
750 |
+
def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
|
751 |
+
"""
|
752 |
+
Args:
|
753 |
+
samples (list[ConditioningAttributes]): List of conditions.
|
754 |
+
Returns:
|
755 |
+
list[ConditioningAttributes]: List of conditions after all attributes were set to None.
|
756 |
+
"""
|
757 |
+
# decide on which attributes to drop in a batched fashion
|
758 |
+
# drop = torch.rand(1, generator=self.rng).item() < self.p
|
759 |
+
# if not drop:
|
760 |
+
# return samples
|
761 |
+
|
762 |
+
# nullify conditions of all attributes
|
763 |
+
samples = deepcopy(samples)
|
764 |
+
|
765 |
+
for sample in samples:
|
766 |
+
drop = torch.rand(1, generator=self.rng).item()
|
767 |
+
if drop<self.p:
|
768 |
+
for condition_type in ["audio", "text"]:
|
769 |
+
for condition in sample.attributes[condition_type]:
|
770 |
+
self.dropout_condition(sample, condition_type, condition)
|
771 |
+
return samples
|
772 |
+
|
773 |
+
def __repr__(self):
|
774 |
+
return f"ClassifierFreeGuidanceDropout(p={self.p})"
|
775 |
+
|
776 |
+
|
777 |
+
class ClassifierFreeGuidanceDropoutInference(ClassifierFreeGuidanceDropout):
|
778 |
+
"""Classifier Free Guidance dropout during inference.
|
779 |
+
All attributes are dropped with the same probability.
|
780 |
+
|
781 |
+
Args:
|
782 |
+
p (float): Probability to apply condition dropout during training.
|
783 |
+
seed (int): Random seed.
|
784 |
+
"""
|
785 |
+
def __init__(self, seed: int = 1234):
|
786 |
+
super().__init__(p=1, seed=seed)
|
787 |
+
|
788 |
+
def dropout_condition_customized(self,
|
789 |
+
sample: ConditioningAttributes,
|
790 |
+
condition_type: str,
|
791 |
+
condition: str,
|
792 |
+
customized: list = None) -> ConditioningAttributes:
|
793 |
+
"""Utility function for nullifying an attribute inside an ConditioningAttributes object.
|
794 |
+
If the condition is of type "audio", then nullify it using `nullify_condition` function.
|
795 |
+
If the condition is of any other type, set its value to None.
|
796 |
+
Works in-place.
|
797 |
+
"""
|
798 |
+
self.check(sample, condition_type, condition)
|
799 |
+
|
800 |
+
if condition_type == 'audio':
|
801 |
+
audio_cond = sample.audio[condition]
|
802 |
+
depth = audio_cond.wav.shape[1]
|
803 |
+
sample.audio[condition] = self.get_null_wav(audio_cond.wav, sr=audio_cond.sample_rate[0])
|
804 |
+
else:
|
805 |
+
if customized is None:
|
806 |
+
sample.text[condition] = None
|
807 |
+
else:
|
808 |
+
text_cond = deepcopy(sample.text[condition])
|
809 |
+
if "structure" in customized:
|
810 |
+
for _s in ['[inst]', '[outro]', '[intro]', '[verse]', '[chorus]', '[bridge]']:
|
811 |
+
text_cond = text_cond.replace(_s, "")
|
812 |
+
text_cond = text_cond.replace(' , ', '')
|
813 |
+
text_cond = text_cond.replace(" ", " ")
|
814 |
+
if '.' in customized:
|
815 |
+
text_cond = text_cond.replace(" . ", " ")
|
816 |
+
text_cond = text_cond.replace(".", " ")
|
817 |
+
|
818 |
+
sample.text[condition] = text_cond
|
819 |
+
|
820 |
+
return sample
|
821 |
+
|
822 |
+
def forward(self, samples: tp.List[ConditioningAttributes],
|
823 |
+
condition_types=["wav", "text"],
|
824 |
+
customized=None,
|
825 |
+
) -> tp.List[ConditioningAttributes]:
|
826 |
+
"""
|
827 |
+
100% dropout some condition attributes (description, prompt_wav) or types (text, wav) of
|
828 |
+
samples during inference.
|
829 |
+
|
830 |
+
Args:
|
831 |
+
samples (list[ConditioningAttributes]): List of conditions.
|
832 |
+
Returns:
|
833 |
+
list[ConditioningAttributes]: List of conditions after all attributes were set to None.
|
834 |
+
"""
|
835 |
+
new_samples = deepcopy(samples)
|
836 |
+
for condition_type in condition_types:
|
837 |
+
for sample in new_samples:
|
838 |
+
for condition in sample.attributes[condition_type]:
|
839 |
+
self.dropout_condition_customized(sample, condition_type, condition, customized)
|
840 |
+
return new_samples
|
841 |
+
|
842 |
+
class AttributeDropout(ClassifierFreeGuidanceDropout):
|
843 |
+
"""Dropout with a given probability per attribute.
|
844 |
+
This is different from the behavior of ClassifierFreeGuidanceDropout as this allows for attributes
|
845 |
+
to be dropped out separately. For example, "artist" can be dropped while "genre" remains.
|
846 |
+
This is in contrast to ClassifierFreeGuidanceDropout where if "artist" is dropped "genre"
|
847 |
+
must also be dropped.
|
848 |
+
|
849 |
+
Args:
|
850 |
+
p (tp.Dict[str, float]): A dict mapping between attributes and dropout probability. For example:
|
851 |
+
...
|
852 |
+
"genre": 0.1,
|
853 |
+
"artist": 0.5,
|
854 |
+
"audio": 0.25,
|
855 |
+
...
|
856 |
+
active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
|
857 |
+
seed (int, optional): Random seed.
|
858 |
+
"""
|
859 |
+
def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
|
860 |
+
super().__init__(p=p, seed=seed)
|
861 |
+
self.active_on_eval = active_on_eval
|
862 |
+
# construct dict that return the values from p otherwise 0
|
863 |
+
self.p = {}
|
864 |
+
for condition_type, probs in p.items():
|
865 |
+
self.p[condition_type] = defaultdict(lambda: 0, probs)
|
866 |
+
|
867 |
+
def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
|
868 |
+
"""
|
869 |
+
Args:
|
870 |
+
samples (list[ConditioningAttributes]): List of conditions.
|
871 |
+
Returns:
|
872 |
+
list[ConditioningAttributes]: List of conditions after certain attributes were set to None.
|
873 |
+
"""
|
874 |
+
if not self.training and not self.active_on_eval:
|
875 |
+
return samples
|
876 |
+
|
877 |
+
samples = deepcopy(samples)
|
878 |
+
for condition_type, ps in self.p.items(): # for condition types [text, wav]
|
879 |
+
for condition, p in ps.items(): # for attributes of each type (e.g., [artist, genre])
|
880 |
+
if torch.rand(1, generator=self.rng).item() < p:
|
881 |
+
for sample in samples:
|
882 |
+
self.dropout_condition(sample, condition_type, condition)
|
883 |
+
return samples
|
codeclm/modules/pattern.py
ADDED
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import namedtuple
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from functools import lru_cache
|
4 |
+
import logging
|
5 |
+
import typing as tp
|
6 |
+
|
7 |
+
from abc import ABC, abstractmethod
|
8 |
+
import torch
|
9 |
+
|
10 |
+
LayoutCoord = namedtuple('LayoutCoord', ['t', 'q']) # (timestep, codebook index)
|
11 |
+
PatternLayout = tp.List[tp.List[LayoutCoord]] # Sequence of coordinates
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass
|
16 |
+
class Pattern:
|
17 |
+
"""Base implementation of a pattern over a sequence with multiple codebooks.
|
18 |
+
|
19 |
+
The codebook pattern consists in a layout, defining for each sequence step
|
20 |
+
the list of coordinates of each codebook timestep in the resulting interleaved sequence.
|
21 |
+
The first item of the pattern is always an empty list in order to properly insert a special token
|
22 |
+
to start with. For convenience, we also keep track of ``code_depth`` the number of codebooks used for the pattern
|
23 |
+
and ``timesteps`` the number of timesteps corresponding to the original sequence.
|
24 |
+
|
25 |
+
The pattern provides convenient methods to build and revert interleaved sequences from it:
|
26 |
+
``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
|
27 |
+
to the interleaved sequence of shape [B, K, S] applying the pattern, with S being the batch size,
|
28 |
+
K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
|
29 |
+
for the output sequence. The unfilled positions are replaced with a special token and the built sequence
|
30 |
+
is returned along with a mask indicating valid tokens.
|
31 |
+
``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
|
32 |
+
of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
|
33 |
+
to fill and specify invalid positions if needed.
|
34 |
+
See the dedicated methods for more details.
|
35 |
+
"""
|
36 |
+
# Pattern layout, for each sequence step, we have a list of coordinates
|
37 |
+
# corresponding to the original codebook timestep and position.
|
38 |
+
# The first list is always an empty list in order to properly insert
|
39 |
+
# a special token to start with.
|
40 |
+
layout: PatternLayout
|
41 |
+
timesteps: int
|
42 |
+
code_depth: int
|
43 |
+
|
44 |
+
def __post_init__(self):
|
45 |
+
assert len(self.layout) > 0
|
46 |
+
assert self.layout[0] == []
|
47 |
+
self._validate_layout()
|
48 |
+
self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
|
49 |
+
self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
|
50 |
+
logger.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
|
51 |
+
|
52 |
+
def _validate_layout(self):
|
53 |
+
"""Runs checks on the layout to ensure a valid pattern is defined.
|
54 |
+
A pattern is considered invalid if:
|
55 |
+
- Multiple timesteps for a same codebook are defined in the same sequence step
|
56 |
+
- The timesteps for a given codebook are not in ascending order as we advance in the sequence
|
57 |
+
(this would mean that we have future timesteps before past timesteps).
|
58 |
+
"""
|
59 |
+
q_timesteps = {q: 0 for q in range(self.code_depth)}
|
60 |
+
for s, seq_coords in enumerate(self.layout):
|
61 |
+
if len(seq_coords) > 0:
|
62 |
+
qs = set()
|
63 |
+
for coord in seq_coords:
|
64 |
+
qs.add(coord.q)
|
65 |
+
last_q_timestep = q_timesteps[coord.q]
|
66 |
+
# assert coord.t >= last_q_timestep, \
|
67 |
+
# f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
|
68 |
+
q_timesteps[coord.q] = coord.t
|
69 |
+
# each sequence step contains at max 1 coordinate per codebook
|
70 |
+
assert len(qs) == len(seq_coords), \
|
71 |
+
f"Multiple entries for a same codebook are found at step {s}"
|
72 |
+
|
73 |
+
@property
|
74 |
+
def num_sequence_steps(self):
|
75 |
+
return len(self.layout) - 1
|
76 |
+
|
77 |
+
@property
|
78 |
+
def max_delay(self):
|
79 |
+
max_t_in_seq_coords = 0
|
80 |
+
for seq_coords in self.layout[1:]:
|
81 |
+
for coords in seq_coords:
|
82 |
+
max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
|
83 |
+
return max_t_in_seq_coords - self.timesteps
|
84 |
+
|
85 |
+
@property
|
86 |
+
def valid_layout(self):
|
87 |
+
valid_step = len(self.layout) - self.max_delay
|
88 |
+
return self.layout[:valid_step]
|
89 |
+
|
90 |
+
def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
|
91 |
+
"""Get codebook coordinates in the layout that corresponds to the specified timestep t
|
92 |
+
and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
|
93 |
+
and the actual codebook coordinates.
|
94 |
+
"""
|
95 |
+
assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
|
96 |
+
if q is not None:
|
97 |
+
assert q <= self.code_depth, "provided number of codebooks is greater than the pattern's number of codebooks"
|
98 |
+
coords = []
|
99 |
+
for s, seq_codes in enumerate(self.layout):
|
100 |
+
for code in seq_codes:
|
101 |
+
if code.t == t and (q is None or code.q == q):
|
102 |
+
coords.append((s, code))
|
103 |
+
return coords
|
104 |
+
|
105 |
+
def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
|
106 |
+
return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
|
107 |
+
|
108 |
+
def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
|
109 |
+
steps_with_timesteps = self.get_steps_with_timestep(t, q)
|
110 |
+
return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
|
111 |
+
|
112 |
+
def _build_pattern_sequence_scatter_indexes(self, timesteps: int,
|
113 |
+
code_depth: int,
|
114 |
+
keep_only_valid_steps: bool,
|
115 |
+
device: tp.Union[torch.device, str] = 'cpu'):
|
116 |
+
"""Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.
|
117 |
+
|
118 |
+
Args:
|
119 |
+
timesteps (int): Maximum number of timesteps steps to consider.
|
120 |
+
keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
|
121 |
+
device (torch.device or str): Device for created tensors.
|
122 |
+
Returns:
|
123 |
+
indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
|
124 |
+
mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
|
125 |
+
"""
|
126 |
+
assert code_depth == self.code_depth, f"invalid number of codebooks for the sequence and the pattern: {code_depth} != {self.code_depth}"
|
127 |
+
assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern"
|
128 |
+
# use the proper layout based on whether we limit ourselves to valid steps only or not,
|
129 |
+
# note that using the valid_layout will result in a truncated sequence up to the valid steps
|
130 |
+
ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
|
131 |
+
# single item indexing being super slow with pytorch vs. numpy, so we use numpy here
|
132 |
+
indexes = torch.zeros(code_depth, len(ref_layout), dtype=torch.long).numpy()
|
133 |
+
mask = torch.zeros(code_depth, len(ref_layout), dtype=torch.bool).numpy()
|
134 |
+
# fill indexes with last sequence step value that will correspond to our special token
|
135 |
+
# the last value is code_depth * timesteps as we have flattened z and append special token as the last token
|
136 |
+
# which will correspond to the index: code_depth * timesteps
|
137 |
+
indexes[:] = code_depth * timesteps
|
138 |
+
# iterate over the pattern and fill scattered indexes and mask
|
139 |
+
        for s, sequence_coords in enumerate(ref_layout):
            for coords in sequence_coords:
                if coords.t < timesteps:
                    indexes[coords.q, s] = coords.t + coords.q * timesteps
                    mask[coords.q, s] = 1
        indexes = torch.from_numpy(indexes).to(device)
        mask = torch.from_numpy(mask).to(device)
        return indexes, mask

    def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
        """Build sequence corresponding to the pattern from the input tensor z.
        The sequence is built using up to sequence_steps if specified, and non-pattern
        coordinates are filled with the special token.

        Args:
            z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
            special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
                Steps that are beyond valid steps will be replaced by the special_token in that case.
        Returns:
            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
                corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
        """
        B, K, T = z.shape
        indexes, mask = self._build_pattern_sequence_scatter_indexes(
            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
        )
        z = z.reshape(B, -1)
        # we append the special token as the last index of our flattened z tensor
        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
        values = z[:, indexes.view(-1)]
        values = values.view(B, K, indexes.shape[-1])
        # import pdb; pdb.set_trace()
        return values, indexes, mask

    def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, code_depth: int,
                                                 keep_only_valid_steps: bool = False,
                                                 is_model_output: bool = False,
                                                 device: tp.Union[torch.device, str] = 'cpu'):
        """Builds scatter indexes required to retrieve the original multi-codebook sequence
        from interleaving pattern.

        Args:
            sequence_steps (int): Sequence steps.
            code_depth (int): Number of codebooks.
            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
                Steps that are beyond valid steps will be replaced by the special_token in that case.
            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
            device (torch.device or str): Device for created tensors.
        Returns:
            indexes (torch.Tensor): Indexes for reconstructing the output, of shape [K, T].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
        """
        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
        timesteps = self.timesteps
        assert code_depth == self.code_depth, f"invalid number of codebooks for the sequence and the pattern: {code_depth} != {self.code_depth}"
        assert sequence_steps <= len(ref_layout), \
            f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"

        # ensure we take the appropriate indexes to keep the model output from the first special token as well
        if is_model_output:
            ref_layout = ref_layout[1:]

        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
        indexes = torch.zeros(code_depth, timesteps, dtype=torch.long).numpy()
        mask = torch.zeros(code_depth, timesteps, dtype=torch.bool).numpy()
        # fill indexes with last sequence step value that will correspond to our special token
        indexes[:] = code_depth * sequence_steps
        for s, sequence_codes in enumerate(ref_layout):
            if s < sequence_steps:
                for code in sequence_codes:
                    if code.t < timesteps:
                        indexes[code.q, code.t] = s + code.q * sequence_steps
                        mask[code.q, code.t] = 1
        indexes = torch.from_numpy(indexes).to(device)
        mask = torch.from_numpy(mask).to(device)
        return indexes, mask

    def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
        """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
        The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
        are filled with the special token.

        Args:
            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
        Returns:
            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
                corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
        """
        B, K, S = s.shape
        indexes, mask = self._build_reverted_sequence_scatter_indexes(
            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
        )
        s = s.view(B, -1)
        # we append the special token as the last index of our flattened z tensor
        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
        values = s[:, indexes.view(-1)]
        values = values.view(B, K, indexes.shape[-1])
        return values, indexes, mask

    def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False):
        """Revert model logits obtained on a sequence built from the pattern
        back to a tensor matching the original sequence.

        This method is similar to ``revert_pattern_sequence`` with the following specificities:
        1. It is designed to work with the extra cardinality dimension
        2. We return the logits for the first sequence item that matches the special_token and
        which matching target in the original sequence is the first item of the sequence,
        while we skip the last logits as there is no matching target
        """
        B, card, K, S = logits.shape
        indexes, mask = self._build_reverted_sequence_scatter_indexes(
            S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
        )
        logits = logits.reshape(B, card, -1)
        # we append the special token as the last index of our flattened z tensor
        logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
        values = logits[:, :, indexes.view(-1)]

        values = values.view(B, card, K, indexes.shape[-1])
        return values, indexes, mask


class CodebooksPatternProvider(ABC):
    """Abstraction around providing pattern for interleaving codebooks.

    The CodebooksPatternProvider abstraction allows to implement various strategies to
    define interleaving pattern of sequences composed of multiple codebooks. For a given
    number of codebooks `code_depth`, the pattern provider can generate a specified pattern
    corresponding to a sequence of `T` timesteps with `code_depth` parallel codebooks. This pattern
    can be used to construct a new sequence from the original codes respecting the specified
    pattern. The pattern is defined as a list of list of code coordinates, code coordinate
    being a tuple with the original timestep and codebook to build the new sequence.
    Note that all patterns must start with an empty list that is then used to insert a first
    sequence step of special tokens in the newly generated sequence.

    Args:
        code_depth (int): number of codebooks.
        cached (bool): if True, patterns for a given length are cached. In general
            that should be true for efficiency reason to avoid synchronization points.
    """
    def __init__(self, code_depth: int, cached: bool = True):
        assert code_depth > 0
        self.code_depth = code_depth
        self.get_pattern = lru_cache(100)(self.get_pattern)  # type: ignore

    @abstractmethod
    def get_pattern(self, timesteps: int) -> Pattern:
        """Builds pattern with specific interleaving between codebooks.

        Args:
            timesteps (int): Total number of timesteps.
        """
        raise NotImplementedError()


class DelayedPatternProvider(CodebooksPatternProvider):
    """Provider for delayed pattern across delayed codebooks.
    Codebooks are delayed in the sequence and sequence steps will contain codebooks
    from different timesteps.

    Example:
        Taking timesteps=4 and code_depth=3, delays=None, the multi-codebook sequence:
        [[1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4]]
        The resulting sequence obtained from the returned pattern is:
        [[S, 1, 2, 3, 4],
         [S, S, 1, 2, 3],
         [S, S, S, 1, 2]]
        (with S being a special token)

    Args:
        code_depth (int): Number of codebooks.
        delays (list of int, optional): Delay for each of the codebooks.
            If delays not defined, each codebook is delayed by 1 compared to the previous one.
        flatten_first (int): Flatten the first N timesteps.
        empty_initial (int): Prepend with N empty list of coordinates.
    """
    def __init__(self, code_depth: int, delays: tp.Optional[tp.List[int]] = None,
                 flatten_first: int = 0, empty_initial: int = 0):
        super().__init__(code_depth)
        if delays is None:
            delays = list(range(code_depth))
        self.delays = delays
        self.flatten_first = flatten_first
        self.empty_initial = empty_initial
        assert len(self.delays) == self.code_depth
        assert sorted(self.delays) == self.delays

    def get_pattern(self, timesteps: int) -> Pattern:
        out: PatternLayout = [[]]
        max_delay = max(self.delays)
        if self.empty_initial:
            out += [[] for _ in range(self.empty_initial)]
        if self.flatten_first:
            for t in range(min(timesteps, self.flatten_first)):
                for q in range(self.code_depth):
                    out.append([LayoutCoord(t, q)])
        for t in range(self.flatten_first, timesteps + max_delay):
            v = []
            for q, delay in enumerate(self.delays):
                t_for_q = t - delay
                if t_for_q >= self.flatten_first:
                    v.append(LayoutCoord(t_for_q, q))
            out.append(v)
        return Pattern(out, code_depth=self.code_depth, timesteps=timesteps)
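Note: to make the delay pattern concrete, here is a minimal round-trip sketch. It assumes the `Pattern`, `LayoutCoord` and `DelayedPatternProvider` definitions from this module; the token values and the `SPECIAL_TOKEN` id are made up for illustration and are not part of the repository.

```python
import torch

# Build the 3-codebook delay pattern from the docstring example (timesteps=4).
provider = DelayedPatternProvider(code_depth=3)
pattern = provider.get_pattern(timesteps=4)

z = torch.arange(1, 13).reshape(1, 3, 4)   # [B=1, K=3, T=4] fake codes
SPECIAL_TOKEN = 2048                        # hypothetical special token id

# Interleave: codebook q is shifted right by q steps, holes get SPECIAL_TOKEN.
values, idx, mask = pattern.build_pattern_sequence(z, SPECIAL_TOKEN)

# Revert back to the original [B, K, T] layout; with the plain delay pattern
# every position is recoverable, so the round trip is exact.
recovered, _, rev_mask = pattern.revert_pattern_sequence(values, SPECIAL_TOKEN)
assert rev_mask.all()
assert torch.equal(recovered, z)
```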
codeclm/modules/streaming.py
ADDED
@@ -0,0 +1,112 @@
"""
Streaming module API that should be implemented by all Streaming components,
"""

from contextlib import contextmanager
import typing as tp
from torch import nn
import torch


State = tp.Dict[str, torch.Tensor]

class StreamingModule(nn.Module):
    """Common API for streaming components.

    Each streaming component has a streaming state, which is just a dict[str, Tensor].
    By convention, the first dim of each tensor must be the batch size.
    Don't use dots in the key names, as this would clash with submodules
    (like in state_dict).

    If `self._is_streaming` is True, the component should use and remember
    the proper state inside `self._streaming_state`.

    To set a streaming component in streaming state, use

        with module.streaming():
            ...

    This will automatically reset the streaming state when exiting the context manager.
    This also automatically propagates to all streaming children module.

    Some module might also implement the `StreamingModule.flush` method, although
    this one is trickier, as all parents module must be StreamingModule and implement
    it as well for it to work properly. See `StreamingSequential` after.
    """
    def __init__(self) -> None:
        super().__init__()
        self._streaming_state: State = {}
        self._is_streaming = False

    def _apply_named_streaming(self, fn: tp.Any):
        for name, module in self.named_modules():
            if isinstance(module, StreamingModule):
                fn(name, module)

    def _set_streaming(self, streaming: bool):
        def _set_streaming(name, module):
            module._is_streaming = streaming
        self._apply_named_streaming(_set_streaming)

    @contextmanager
    def streaming(self):
        """Context manager to enter streaming mode. Reset streaming state on exit."""
        self._set_streaming(True)
        try:
            yield
        finally:
            self._set_streaming(False)
            self.reset_streaming()

    def reset_streaming(self):
        """Reset the streaming state."""
        def _reset(name: str, module: StreamingModule):
            module._streaming_state.clear()

        self._apply_named_streaming(_reset)

    def get_streaming_state(self) -> State:
        """Return the streaming state, including that of sub-modules."""
        state: State = {}

        def _add(name: str, module: StreamingModule):
            if name:
                name += "."
            for key, value in module._streaming_state.items():
                state[name + key] = value

        self._apply_named_streaming(_add)
        return state

    def set_streaming_state(self, state: State):
        """Set the streaming state, including that of sub-modules."""
        state = dict(state)

        def _set(name: str, module: StreamingModule):
            if name:
                name += "."
            module._streaming_state.clear()
            for key, value in list(state.items()):
                # complexity is not ideal here, but probably fine.
                if key.startswith(name):
                    local_key = key[len(name):]
                    if '.' not in local_key:
                        module._streaming_state[local_key] = value
                        del state[key]

        self._apply_named_streaming(_set)
        assert len(state) == 0, list(state.keys())

    def flush(self, x: tp.Optional[torch.Tensor] = None):
        """Flush any remaining outputs that were waiting for completion.
        Typically, for convolutions, this will add the final padding
        and process the last buffer.

        This should take an optional argument `x`, which will be provided
        if a module before this one in the streaming pipeline has already
        spitted out a flushed out buffer.
        """
        if x is None:
            return None
        else:
            return self(x)
codeclm/tokenizer/Flow1dVAE/audio.py
ADDED
@@ -0,0 +1,304 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File    : audio.py
@Time    : 2023/8/8 7:18 PM
@Author  : waytan
@Contact : [email protected]
@License : (C)Copyright 2023, Tencent
@Desc    : Audio
"""
import json
import subprocess as sp
import typing as tp
from pathlib import Path

import lameenc
import julius
import torch
import numpy as np
import torchaudio as ta
from contextlib import contextmanager
import tempfile
import os

@contextmanager
def temp_filenames(count: int, delete=True):
    names = []
    try:
        for _ in range(count):
            names.append(tempfile.NamedTemporaryFile(delete=False).name)
        yield names
    finally:
        if delete:
            for name in names:
                os.unlink(name)


def _read_info(path):
    stdout_data = sp.check_output([
        'ffprobe', "-loglevel", "panic",
        str(path), '-print_format', 'json', '-show_format', '-show_streams'
    ])
    return json.loads(stdout_data.decode('utf-8'))


class AudioFile:
    """
    Allows to read audio from any format supported by ffmpeg, as well as resampling or
    converting to mono on the fly. See :method:`read` for more details.
    """
    def __init__(self, path: Path):
        self.path = Path(path)
        self._info = None

    def __repr__(self):
        features = [("path", self.path)]
        features.append(("samplerate", self.samplerate()))
        features.append(("channels", self.channels()))
        features.append(("streams", len(self)))
        features_str = ", ".join(f"{name}={value}" for name, value in features)
        return f"AudioFile({features_str})"

    @property
    def info(self):
        if self._info is None:
            self._info = _read_info(self.path)
        return self._info

    @property
    def duration(self):
        return float(self.info['format']['duration'])

    @property
    def _audio_streams(self):
        return [
            index for index, stream in enumerate(self.info["streams"])
            if stream["codec_type"] == "audio"
        ]

    def __len__(self):
        return len(self._audio_streams)

    def channels(self, stream=0):
        return int(self.info['streams'][self._audio_streams[stream]]['channels'])

    def samplerate(self, stream=0):
        return int(self.info['streams'][self._audio_streams[stream]]['sample_rate'])

    def read(self,
             seek_time=None,
             duration=None,
             streams=slice(None),
             samplerate=None,
             channels=None):
        """
        Slightly more efficient implementation than stempeg,
        in particular, this will extract all stems at once
        rather than having to loop over one file multiple times
        for each stream.

        Args:
            seek_time (float): seek time in seconds or None if no seeking is needed.
            duration (float): duration in seconds to extract or None to extract until the end.
            streams (slice, int or list): streams to extract, can be a single int, a list or
                a slice. If it is a slice or list, the output will be of size [S, C, T]
                with S the number of streams, C the number of channels and T the number of samples.
                If it is an int, the output will be [C, T].
            samplerate (int): if provided, will resample on the fly. If None, no resampling will
                be done. Original sampling rate can be obtained with :method:`samplerate`.
            channels (int): if 1, will convert to mono. We do not rely on ffmpeg for that
                as ffmpeg automatically scale by +3dB to conserve volume when playing on speakers.
                See https://sound.stackexchange.com/a/42710.
                Our definition of mono is simply the average of the two channels. Any other
                value will be ignored.
        """
        streams = np.array(range(len(self)))[streams]
        single = not isinstance(streams, np.ndarray)
        if single:
            streams = [streams]

        if duration is None:
            target_size = None
            query_duration = None
        else:
            target_size = int((samplerate or self.samplerate()) * duration)
            query_duration = float((target_size + 1) / (samplerate or self.samplerate()))

        with temp_filenames(len(streams)) as filenames:
            command = ['ffmpeg', '-y']
            command += ['-loglevel', 'panic']
            if seek_time:
                command += ['-ss', str(seek_time)]
            command += ['-i', str(self.path)]
            for stream, filename in zip(streams, filenames):
                command += ['-map', f'0:{self._audio_streams[stream]}']
                if query_duration is not None:
                    command += ['-t', str(query_duration)]
                command += ['-threads', '1']
                command += ['-f', 'f32le']
                if samplerate is not None:
                    command += ['-ar', str(samplerate)]
                command += [filename]

            sp.run(command, check=True)
            wavs = []
            for filename in filenames:
                wav = np.fromfile(filename, dtype=np.float32)
                wav = torch.from_numpy(wav)
                wav = wav.view(-1, self.channels()).t()
                if channels is not None:
                    wav = convert_audio_channels(wav, channels)
                if target_size is not None:
                    wav = wav[..., :target_size]
                wavs.append(wav)
        wav = torch.stack(wavs, dim=0)
        if single:
            wav = wav[0]
        return wav


def convert_audio_channels(wav, channels=2):
    """Convert audio to the given number of channels."""
    *shape, src_channels, length = wav.shape
    if src_channels == channels:
        pass
    elif channels == 1:
        # Case 1:
        # The caller asked 1-channel audio, but the stream have multiple
        # channels, downmix all channels.
        wav = wav.mean(dim=-2, keepdim=True)
    elif src_channels == 1:
        # Case 2:
        # The caller asked for multiple channels, but the input file have
        # one single channel, replicate the audio over all channels.
        wav = wav.expand(*shape, channels, length)
    elif src_channels >= channels:
        # Case 3:
        # The caller asked for multiple channels, and the input file have
        # more channels than requested. In that case return the first channels.
        wav = wav[..., :channels, :]
    else:
        # Case 4: What is a reasonable choice here?
        raise ValueError('The audio file has less channels than requested but is not mono.')
    return wav


def convert_audio(wav, from_samplerate, to_samplerate, channels):
    """Convert audio from a given samplerate to a target one and target number of channels."""
    wav = convert_audio_channels(wav, channels)
    return julius.resample_frac(wav, from_samplerate, to_samplerate)


def i16_pcm(wav):
    """Convert audio to 16 bits integer PCM format."""
    if wav.dtype.is_floating_point:
        return (wav.clamp_(-1, 1) * (2**15 - 1)).short()
    else:
        return wav


def f32_pcm(wav):
    """Convert audio to float 32 bits PCM format."""
    if wav.dtype.is_floating_point:
        return wav
    else:
        return wav.float() / (2**15 - 1)


def as_dtype_pcm(wav):
    """Convert audio to either f32 pcm or i16 pcm depending on the given dtype."""
    if wav.dtype.is_floating_point:
        return f32_pcm(wav)
    else:
        return i16_pcm(wav)


def encode_mp3(wav, path, samplerate=44100, bitrate=320, verbose=False):
    """Save given audio as mp3. This should work on all OSes."""
    c, _ = wav.shape
    wav = i16_pcm(wav)
    encoder = lameenc.Encoder()
    encoder.set_bit_rate(bitrate)
    encoder.set_in_sample_rate(samplerate)
    encoder.set_channels(c)
    encoder.set_quality(2)  # 2-highest, 7-fastest
    if not verbose:
        encoder.silence()
    wav = wav.data.cpu()
    wav = wav.transpose(0, 1).numpy()
    mp3_data = encoder.encode(wav.tobytes())
    mp3_data += encoder.flush()
    with open(path, "wb") as f:
        f.write(mp3_data)


def prevent_clip(wav, mode='rescale'):
    """
    different strategies for avoiding raw clipping.
    """
    if mode is None or mode == 'none':
        return wav
    assert wav.dtype.is_floating_point, "too late for clipping"
    if mode == 'rescale':
        wav = wav / max(1.01 * wav.abs().max(), 1)
    elif mode == 'clamp':
        wav = wav.clamp(-0.99, 0.99)
    elif mode == 'tanh':
        wav = torch.tanh(wav)
    else:
        raise ValueError(f"Invalid mode {mode}")
    return wav


def save_audio(wav: torch.Tensor,
               path: tp.Union[str, Path],
               samplerate: int,
               bitrate: int = 320,
               clip: tp.Union[str] = 'rescale',
               bits_per_sample: tp.Union[int] = 16,
               as_float: bool = False):
    """Save audio file, automatically preventing clipping if necessary
    based on the given `clip` strategy. If the path ends in `.mp3`, this
    will save as mp3 with the given `bitrate`.
    """
    wav = prevent_clip(wav, mode=clip)
    path = Path(path)
    suffix = path.suffix.lower()
    if suffix == ".mp3":
        encode_mp3(wav, path, samplerate, bitrate, verbose=True)
    elif suffix == ".wav":
        if as_float:
            bits_per_sample = 32
            encoding = 'PCM_F'
        else:
            encoding = 'PCM_S'
        ta.save(str(path), wav, sample_rate=samplerate,
                encoding=encoding, bits_per_sample=bits_per_sample)
    elif suffix == ".flac":
        ta.save(str(path), wav, sample_rate=samplerate, bits_per_sample=bits_per_sample)
    else:
        raise ValueError(f"Invalid suffix for path: {suffix}")


def load_track(track, audio_channels, samplerate):
    errors = {}
    wav = None

    try:
        wav = AudioFile(track).read(
            streams=0,
            samplerate=samplerate,
            channels=audio_channels)
    except sp.CalledProcessError:
        errors['ffmpeg'] = 'FFmpeg could not read the file.'

    if wav is None:
        try:
            wav, sr = ta.load(str(track))
        except RuntimeError as err:
            errors['torchaudio'] = err.args[0]
        else:
            wav = convert_audio(wav, sr, samplerate, audio_channels)

    return wav, errors
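Note: a short usage sketch of the helpers above. The file paths are placeholders, not files that ship with the repository.

```python
import torch

# Decode with ffmpeg first, fall back to torchaudio (see load_track above).
wav, errors = load_track("song.mp3", audio_channels=2, samplerate=48000)
if wav is None:
    raise RuntimeError(f"could not decode input: {errors}")

# Resample to 16 kHz mono for downstream feature extractors.
mono = convert_audio(wav, 48000, 16000, channels=1)

# Write a clipped-safe copy; format is picked from the suffix.
save_audio(wav, "song_copy.flac", samplerate=48000)
```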
codeclm/tokenizer/Flow1dVAE/cal_token_stat.py
ADDED
@@ -0,0 +1,19 @@
import kaldiio
from tqdm import tqdm
import torch

if __name__ == "__main__":
    bar = torch.zeros(1, 16384)
    with open('token.scp', 'r') as f:
        for item_idx, line in tqdm(enumerate(f)):
            idx, pos = line.strip().split()
            codes = kaldiio.load_mat(pos)
            for i0 in range(codes.shape[-1]):
                bar[0, codes[0, 0, i0]] += 1
            if(item_idx % 1000 == 0):
                print("=========")
                print(1 - (bar[0]==0).sum() / bar.shape[-1])
                print("=========")
    print("=========")
    print(1 - (bar[0]==0).sum() / bar.shape[-1])
    print("=========")
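Note: the script above reports the fraction of the 16384-entry codebook that is actually used. The same statistic can be computed for an in-memory code tensor with `torch.bincount`; the sketch below uses random tokens purely for illustration.

```python
import torch

codes = torch.randint(0, 16384, (1, 1, 2500))        # fake [B, K, T] token ids
hist = torch.bincount(codes[0, 0], minlength=16384)  # per-entry occurrence counts
usage = 1 - (hist == 0).sum().item() / hist.numel()  # fraction of entries used
print(f"codebook usage: {usage:.3f}")
```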
codeclm/tokenizer/Flow1dVAE/compare_model_weight.py
ADDED
@@ -0,0 +1,13 @@
import torch
import sys
from safetensors.torch import load_file

if __name__ == "__main__":
    m0, m1 = sys.argv[1], sys.argv[2]
    m0 = load_file(m0)
    m1 = load_file(m1)

    ks = [k for k in m0.keys() if 'bestrq' in k]
    for k in ks:
        print(k, (m0[k] - m1[k]).abs().sum())
ADDED
@@ -0,0 +1,26 @@
|
{
  "_class_name": "Transformer2DModel",
  "_diffusers_version": "0.22.0.dev0",
  "activation_fn": "gelu-approximate",
  "attention_bias": true,
  "attention_head_dim": 72,
  "attention_type": "default",
  "cross_attention_dim": null,
  "double_self_attention": false,
  "dropout": 0.0,
  "in_channels": 96,
  "norm_elementwise_affine": false,
  "norm_eps": 1e-06,
  "norm_num_groups": 32,
  "norm_type": "ada_norm_single",
  "num_attention_heads": 22,
  "num_embeds_ada_norm": 1000,
  "num_layers": 24,
  "num_vector_embeds": null,
  "only_cross_attention": false,
  "out_channels": 32,
  "patch_size": 2,
  "sample_size": 384,
  "upcast_attention": false,
  "use_linear_projection": false
}
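Note: this JSON is a diffusers `Transformer2DModel` config (24 layers, 22 heads, 96 input channels). A minimal sketch of instantiating it, assuming the in-repo path shown and a diffusers version compatible with the one recorded in the config:

```python
import json
from diffusers import Transformer2DModel

with open("codeclm/tokenizer/Flow1dVAE/configs/models/"
          "transformer2D_wocross_inch112_1x4_multi_large.json") as f:
    cfg = json.load(f)

# from_config ignores the bookkeeping keys (_class_name, _diffusers_version).
model = Transformer2DModel.from_config(cfg)
```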
codeclm/tokenizer/Flow1dVAE/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json
ADDED
@@ -0,0 +1,14 @@
{
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.8.0",
  "beta_end": 0.02,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.0015,
  "clip_sample": false,
  "num_train_timesteps": 1000,
  "prediction_type": "sample",
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "trained_betas": null
}
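Note: this is the DDIM sampling scheduler referenced by the `generate_*` scripts (note `prediction_type: "sample"`). A small sketch of loading it, assuming the in-repo path shown; unused keys such as `skip_prk_steps` are simply ignored by diffusers:

```python
import json
from diffusers import DDIMScheduler

with open("codeclm/tokenizer/Flow1dVAE/configs/scheduler/"
          "stable_diffusion_2.1_largenoise_sample.json") as f:
    scheduler = DDIMScheduler.from_config(json.load(f))

scheduler.set_timesteps(50)   # e.g. 50 inference steps
```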
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_and_sep_npy.py
ADDED
@@ -0,0 +1,121 @@
1 |
+
import torch,torchaudio
|
2 |
+
import os,sys,json
|
3 |
+
from tqdm import tqdm
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
|
7 |
+
from generate_septoken import Tango as Tango_sep
|
8 |
+
from generate_2rvq import Tango as Tango_1x2
|
9 |
+
import kaldiio
|
10 |
+
from kaldiio import WriteHelper
|
11 |
+
from audio import AudioFile
|
12 |
+
|
13 |
+
from demucs.models.pretrained import get_model_from_yaml
|
14 |
+
from filelock import FileLock
|
15 |
+
|
16 |
+
# os.path.join(args.model_dir, "htdemucs.pth"), os.path.join(args.model_dir, "htdemucs.yaml")
|
17 |
+
class Separator:
|
18 |
+
def __init__(self, dm_model_path='demucs/ckpt/htdemucs.pth', dm_config_path='demucs/ckpt/htdemucs.yaml', gpu_id=0) -> None:
|
19 |
+
if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
|
20 |
+
self.device = torch.device(f"cuda:{gpu_id}")
|
21 |
+
else:
|
22 |
+
self.device = torch.device("cpu")
|
23 |
+
self.demucs_model = self.init_demucs_model(dm_model_path, dm_config_path)
|
24 |
+
|
25 |
+
def init_demucs_model(self, model_path, config_path):
|
26 |
+
model = get_model_from_yaml(config_path, model_path)
|
27 |
+
model.to(self.device)
|
28 |
+
model.eval()
|
29 |
+
return model
|
30 |
+
|
31 |
+
def load_audio(self, f):
|
32 |
+
a, fs = torchaudio.load(f)
|
33 |
+
if (fs != 48000):
|
34 |
+
a = torchaudio.functional.resample(a, fs, 48000)
|
35 |
+
# if a.shape[-1] >= 48000*10:
|
36 |
+
# a = a[..., :48000*10]
|
37 |
+
# else:
|
38 |
+
# a = torch.cat([a, a], -1)
|
39 |
+
# return a[:, 0:48000*10]
|
40 |
+
return a
|
41 |
+
|
42 |
+
def run(self, audio_path, output_dir='demucs/test_output', ext=".flac"):
|
43 |
+
name, _ = os.path.splitext(os.path.split(audio_path)[-1])
|
44 |
+
output_paths = []
|
45 |
+
# lock_path = os.path.join(output_dir, f"{name}.lock")
|
46 |
+
# with FileLock(lock_path): # 加一个避免多卡访问时死锁
|
47 |
+
for stem in self.demucs_model.sources:
|
48 |
+
output_path = os.path.join(output_dir, f"{name}_{stem}{ext}")
|
49 |
+
if os.path.exists(output_path):
|
50 |
+
output_paths.append(output_path)
|
51 |
+
if len(output_paths) == 1: # 4
|
52 |
+
# drums_path, bass_path, other_path, vocal_path = output_paths
|
53 |
+
vocal_path = output_paths[0]
|
54 |
+
else:
|
55 |
+
lock_path = os.path.join(output_dir, f"{name}_separate.lock")
|
56 |
+
with FileLock(lock_path):
|
57 |
+
drums_path, bass_path, other_path, vocal_path = self.demucs_model.separate(audio_path, output_dir, device=self.device)
|
58 |
+
full_audio = self.load_audio(audio_path)
|
59 |
+
vocal_audio = self.load_audio(vocal_path)
|
60 |
+
minlen = min(full_audio.shape[-1], vocal_audio.shape[-1])
|
61 |
+
# bgm_audio = full_audio[:, 0:minlen] - vocal_audio[:, 0:minlen]
|
62 |
+
bgm_audio = self.load_audio(drums_path) + self.load_audio(bass_path) + self.load_audio(other_path)
|
63 |
+
for path in [drums_path, bass_path, other_path, vocal_path]:
|
64 |
+
os.remove(path)
|
65 |
+
return full_audio, vocal_audio, bgm_audio
|
66 |
+
|
67 |
+
def read_wav(fname, sample_rate=48_000):
|
68 |
+
try:
|
69 |
+
orig_samples, fs = torchaudio.load(fname)
|
70 |
+
except:
|
71 |
+
af = AudioFile(fname)
|
72 |
+
orig_samples = af.read()
|
73 |
+
fs = af.samplerate()
|
74 |
+
orig_samples = orig_samples[0]
|
75 |
+
if(fs!=sample_rate):
|
76 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
|
77 |
+
fs = sample_rate
|
78 |
+
if orig_samples.shape[0] == 1:
|
79 |
+
orig_samples = torch.cat([orig_samples, orig_samples], 0)
|
80 |
+
return orig_samples
|
81 |
+
|
82 |
+
if __name__ == "__main__":
|
83 |
+
# Define Model
|
84 |
+
json_path = sys.argv[1]
|
85 |
+
|
86 |
+
mus_infos = []
|
87 |
+
with open(json_path) as f:
|
88 |
+
for line in f:
|
89 |
+
item = json.loads(line)
|
90 |
+
mus_infos.append(item)
|
91 |
+
|
92 |
+
tango_sep = Tango_sep(model_path="./saved/model_septoken/model_2.safetensors")
|
93 |
+
tango_1x2 = Tango_1x2(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)
|
94 |
+
separator = Separator()
|
95 |
+
|
96 |
+
# Feature extraction loop
|
97 |
+
# for i in tqdm(range(2000)):
|
98 |
+
first_time = True
|
99 |
+
for item in tqdm(mus_infos):
|
100 |
+
if(os.path.exists(item['path'])):
|
101 |
+
full_path = item['path']
|
102 |
+
else:
|
103 |
+
full_path = '/mnt/share/' + item['path']
|
104 |
+
|
105 |
+
full_tensor, vocal_tensor, bgm_tensor = separator.run(full_path)
|
106 |
+
|
107 |
+
# full_tensor = read_wav(full_path)
|
108 |
+
# vocal_tensor = read_wav(vocal_path)
|
109 |
+
# length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
|
110 |
+
# full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
|
111 |
+
# bgm_tensor = full_tensor - vocal_tensor
|
112 |
+
codes_1x2 = tango_1x2.sound2code(full_tensor)
|
113 |
+
codes_vocal, codes_bgm = tango_sep.sound2code(vocal_tensor, bgm_tensor)
|
114 |
+
codes = torch.cat([codes_1x2[:,[0],:], codes_vocal, codes_bgm], 1).cpu().numpy()
|
115 |
+
save_path = full_path.replace('.wav', '.1x1_and_sep.npy').replace('.mp3', '.1x1_and_sep.npy').replace('.flac', '.1x1_and_sep.npy').replace('.ogg', '.1x1_and_sep.npy')
|
116 |
+
assert save_path != full_path, (save_path, full_path)
|
117 |
+
np.save(save_path, codes)
|
118 |
+
|
119 |
+
if(first_time):
|
120 |
+
first_time = False
|
121 |
+
print(codes_vocal.shape, codes_bgm.shape)
|
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x1_sep.py
ADDED
@@ -0,0 +1,94 @@
import torch,torchaudio
import os,sys,json
from tqdm import tqdm

#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_septoken import Tango
import kaldiio
from kaldiio import WriteHelper
from audio import AudioFile

def read_wav(fname, sample_rate=48_000):
    try:
        orig_samples, fs = torchaudio.load(fname)
    except:
        af = AudioFile(fname)
        orig_samples = af.read()
        fs = af.samplerate()
        orig_samples = orig_samples[0]
    if(fs!=sample_rate):
        orig_samples = torchaudio.functional.resample(orig_samples, fs, sample_rate)
        fs = sample_rate
    if orig_samples.shape[0] == 1:
        orig_samples = torch.cat([orig_samples, orig_samples], 0)
    return orig_samples

if __name__ == "__main__":
    # Define Model
    json_path = sys.argv[1]
    outdir = sys.argv[2]

    mus_infos = []
    with open(json_path) as f:
        for line in f:
            item = json.loads(line)
            mus_infos.append(item)

    tango = Tango(model_path="./saved/model_septoken/model_2.safetensors")


    # Feature extraction loop
    # for i in tqdm(range(2000)):
    first_time = True
    with WriteHelper('ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir), write_function="pickle") as writer_vocal, WriteHelper('ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir), write_function="pickle") as writer_bgm:
        print('ark,scp:{}/token_vocal.ark,{}/token_vocal.scp'.format(outdir, outdir))
        print('ark,scp:{}/token_bgm.ark,{}/token_bgm.scp'.format(outdir, outdir))
        for item in tqdm(mus_infos):
            try:
                # if True:
                idx = item['idx']
                # print(idx)
                if(os.path.exists(item['path'])):
                    full_path = item['path']
                else:
                    full_path = '/mnt/share/' + item['path']
                if(os.path.exists(item['vocal_path'])):
                    vocal_path = item['vocal_path']
                    bgm_paths = item['bgm_path']
                else:
                    vocal_path = '/mnt/share/' + item['vocal_path']
                    bgm_paths = ['/mnt/share/' + p for p in item['bgm_path']]
                vocal_tensor = read_wav(vocal_path)
                # full_tensor = read_wav(full_path)
                # length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
                # full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
                # bgm_tensor = full_tensor - vocal_tensor
                bgm_tensor = sum([read_wav(p) for p in bgm_paths])
                codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
                writer_vocal(str(idx), codes_vocal.cpu())
                writer_bgm(str(idx), codes_bgm.cpu())
                if(first_time):
                    first_time = False
                    print(codes_vocal.shape, codes_bgm.shape)
            except:
                print(item['vocal_path'])
                print(item['bgm_path'])
                continue

        # idx = item['idx']
        # # print(idx)
        # full_path = item['path']
        # vocal_path = item['vocal_path']
        # bgm_paths = item['bgm_path']
        # full_tensor = read_wav(full_path)
        # vocal_tensor = read_wav(vocal_path)
        # length = min(full_tensor.shape[-1], vocal_tensor.shape[-1])
        # full_tensor, vocal_tensor = full_tensor[:, 0:length], vocal_tensor[:, 0:length]
        # bgm_tensor = full_tensor - vocal_tensor
        # codes_vocal, codes_bgm = tango.sound2code(vocal_tensor, bgm_tensor)
        # writer_vocal(str(idx), codes_vocal.cpu())
        # writer_bgm(str(idx), codes_bgm.cpu())
        # if(first_time):
        #     first_time = False
        #     print(codes_vocal.shape, codes_bgm.shape)
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x2.py
ADDED
@@ -0,0 +1,70 @@
import torch,torchaudio
import os,sys,json
from tqdm import tqdm

#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_2rvq import Tango
import kaldiio
from kaldiio import WriteHelper
import torch
import subprocess
import time
import sys

def get_gpu_memory():
    _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]

    ACCEPTABLE_AVAILABLE_MEMORY = 1024
    COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

if __name__ == "__main__":
    # Define Model
    json_path = sys.argv[1]
    outdir = sys.argv[2]

    gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES'])
    while True:
        free_mem = get_gpu_memory()
        free_mem = free_mem[gpu_idx]
        if(free_mem > 25_000):
            print("GPU memory {}, run matrix cal".format(free_mem))
            break
        else:
            print("GPU memory {}, sleep 1min".format(free_mem))
            time.sleep(60)

    mus_infos = []
    with open(json_path) as f:
        for line in f:
            item = json.loads(line)
            mus_infos.append(item)

    tango = Tango(model_path = './saved/model_2rvq/model_2_fixed.safetensors', rvq_num=2)


    # Feature extraction loop
    # for i in tqdm(range(2000)):
    with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
        print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
        for item in tqdm(mus_infos):
            try:
                # if True:
                idx = item['idx']
                # print(idx)
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    if(os.path.exists(item['path'])):
                        codes = tango.file2code(item['path'])
                    else:
                        codes = tango.file2code('/mnt/share/' + item['path'])
                writer(str(idx), codes.cpu())
            except:
                print(item['path'])
                continue
            # idx = item['idx']
            # # print(idx)
            # with torch.autocast(device_type="cuda", dtype=torch.float16):
            #     codes = tango.file2code(item['path'])
            # writer(str(idx), codes.cpu())
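Note: the script above waits for free GPU memory by polling `nvidia-smi`. A hedged alternative sketch, not used by the repository, relies on PyTorch reporting free memory directly (assumes a recent PyTorch providing `torch.cuda.mem_get_info`):

```python
import time
import torch

def wait_for_free_mem(min_free_mb: int = 25_000, device: int = 0):
    """Block until the given GPU has more than min_free_mb MiB free."""
    while True:
        free_bytes, _total = torch.cuda.mem_get_info(device)
        if free_bytes // (1024 * 1024) > min_free_mb:
            return
        time.sleep(60)
```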
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4.py
ADDED
@@ -0,0 +1,46 @@
import torch,torchaudio
import os,sys,json
from tqdm import tqdm

#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_4rvq import Tango
import kaldiio
from kaldiio import WriteHelper

if __name__ == "__main__":
    # Define Model
    json_path = sys.argv[1]
    outdir = sys.argv[2]

    mus_infos = []
    with open(json_path) as f:
        for line in f:
            item = json.loads(line)
            mus_infos.append(item)

    tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4)


    # Feature extraction loop
    # for i in tqdm(range(2000)):
    with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
        print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
        for item in tqdm(mus_infos):
            try:
                # if True:
                idx = item['idx']
                # print(idx)
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    if(os.path.exists(item['path'])):
                        codes = tango.file2code(item['path'])
                    else:
                        codes = tango.file2code('/mnt/share/' + item['path'])
                writer(str(idx), codes.cpu())
            except:
                print(item['path'])
                continue
            # idx = item['idx']
            # # print(idx)
            # with torch.autocast(device_type="cuda", dtype=torch.float16):
            #     codes = tango.file2code(item['path'])
            # writer(str(idx), codes.cpu())
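Note: tokens written through `WriteHelper` above can be read back from the generated `.scp` index, mirroring `cal_token_stat.py` earlier in this commit; `exp/token.scp` below is a placeholder for whatever `outdir` was used.

```python
import kaldiio

with open("exp/token.scp") as f:
    for line in f:
        idx, pos = line.strip().split()
        codes = kaldiio.load_mat(pos)   # token array stored by WriteHelper
        print(idx, codes.shape)
        break
```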
codeclm/tokenizer/Flow1dVAE/extract_codes_stereo_7_1x4_ds.py
ADDED
@@ -0,0 +1,86 @@
import torch,torchaudio
import os,sys,json
from tqdm import tqdm

#from codeclm_song_v1.codeclm.semantic_extractor.SpeechDecoder_v01.generate import Tango
from generate_4rvq import Tango
import kaldiio
from kaldiio import WriteHelper
import torch
import subprocess
import time
import sys

def get_gpu_memory():
    _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]

    ACCEPTABLE_AVAILABLE_MEMORY = 1024
    COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = _output_to_list(subprocess.check_output(COMMAND.split()))[1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

if __name__ == "__main__":
    # Define Model
    json_path = sys.argv[1]
    outdir = sys.argv[2]
    ds = int(sys.argv[3])

    gpu_idx = int(os.environ['CUDA_VISIBLE_DEVICES'])
    while True:
        free_mem = get_gpu_memory()
        free_mem = free_mem[gpu_idx]
        if(free_mem > 25_000):
            print("GPU memory {}, run matrix cal".format(free_mem))
            break
        else:
            print("GPU memory {}, sleep 1min".format(free_mem))
            time.sleep(60)

    mus_infos = []
    with open(json_path) as f:
        for line in f:
            item = json.loads(line)
            mus_infos.append(item)

    tango = Tango(model_path = './saved/model_4rvq/model_2_fixed.safetensors', rvq_num=4)


    # Feature extraction loop
    # for i in tqdm(range(2000)):
    with WriteHelper('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir), write_function="pickle") as writer:
        print('ark,scp:{}/token.ark,{}/token.scp'.format(outdir, outdir))
        bar = torch.zeros(4, 16384)
        for item_idx, item in tqdm(enumerate(mus_infos)):
            try:
                # if True:
                idx = item['idx']
                # print(idx)
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    if(os.path.exists(item['path'])):
                        codes = tango.file2code_ds(item['path'], ds)
                    else:
                        codes = tango.file2code_ds('/mnt/share/' + item['path'], ds)
                codes = codes.cpu()
                writer(str(idx), codes)
                for i0 in range(codes.shape[-1]):
                    bar[0, codes[0, 0, i0]] += 1
                    bar[1, codes[0, 1, i0]] += 1
                    bar[2, codes[0, 2, i0]] += 1
                    bar[3, codes[0, 3, i0]] += 1
            except Exception as e:
                print(item['path'])
                # print(e.message, e.args)
                # exit(1)
                continue

            if(item_idx % 1000 == 0):
                print("=========")
                print(1 - (bar[0]==0).sum() / bar.shape[-1])
                print("=========")

        # idx = item['idx']
        # # print(idx)
        # with torch.autocast(device_type="cuda", dtype=torch.float16):
        #     codes = tango.file2code(item['path'])
        # writer(str(idx), codes.cpu())
codeclm/tokenizer/Flow1dVAE/generate_1rvq.py
ADDED
@@ -0,0 +1,283 @@
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
from model_1rvq import PromptCondAudioDiffusion
|
5 |
+
from diffusers import DDIMScheduler, DDPMScheduler
|
6 |
+
import torchaudio
|
7 |
+
import librosa
|
8 |
+
import os
|
9 |
+
import math
|
10 |
+
import numpy as np
|
11 |
+
from tools.get_1dvae_large import get_model
|
12 |
+
import tools.torch_tools as torch_tools
|
13 |
+
from safetensors.torch import load_file
|
14 |
+
|
15 |
+
class Tango:
|
16 |
+
def __init__(self, \
|
17 |
+
model_path, \
|
18 |
+
vae_config="",
|
19 |
+
vae_model="",
|
20 |
+
layer_num=6, \
|
21 |
+
device="cuda:0"):
|
22 |
+
|
23 |
+
self.sample_rate = 48000
|
24 |
+
scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
|
25 |
+
self.device = device
|
26 |
+
|
27 |
+
self.vae = get_model(vae_config, vae_model)
|
28 |
+
self.vae = self.vae.to(device)
|
29 |
+
self.vae=self.vae.eval()
|
30 |
+
self.layer_num = layer_num
|
31 |
+
|
32 |
+
self.MAX_DURATION = 360
|
33 |
+
main_config = {
|
34 |
+
"num_channels":32,
|
35 |
+
"unet_model_name":None,
|
36 |
+
"unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
|
37 |
+
"snr_gamma":None,
|
38 |
+
}
|
39 |
+
self.model = PromptCondAudioDiffusion(**main_config).to(device)
|
40 |
+
if model_path.endswith(".safetensors"):
|
41 |
+
main_weights = load_file(model_path)
|
42 |
+
else:
|
43 |
+
main_weights = torch.load(model_path, map_location=device)
|
44 |
+
self.model.load_state_dict(main_weights, strict=False)
|
45 |
+
print ("Successfully loaded checkpoint from:", model_path)
|
46 |
+
|
47 |
+
self.model.eval()
|
48 |
+
self.model.init_device_dtype(torch.device(device), torch.float32)
|
49 |
+
print("scaling factor: ", self.model.normfeat.std)
|
50 |
+
|
51 |
+
# self.scheduler = DDIMScheduler.from_pretrained( \
|
52 |
+
# scheduler_name, subfolder="scheduler")
|
53 |
+
# self.scheduler = DDPMScheduler.from_pretrained( \
|
54 |
+
# scheduler_name, subfolder="scheduler")
|
55 |
+
# print("Successfully loaded inference scheduler from {}".format(scheduler_name))
|
56 |
+
|
57 |
+
# def sound2sound(self, orig_samples, lyric, st_et, batch_size=1, duration=40.96, steps=200, disable_progress=False,scenario = "start_seg"):
|
58 |
+
# """ Genrate audio without condition. """
|
59 |
+
# with torch.no_grad():
|
60 |
+
# if(orig_samples.shape[-1]<int(duration*48000)+480):
|
61 |
+
# orig_samples = torch.cat([orig_samples, torch.zeros(orig_samples.shape[0], int(duration*48000+480)-orig_samples.shape[-1], \
|
62 |
+
# dtype=orig_samples.dtype, device=orig_samples.device)], -1)
|
63 |
+
|
64 |
+
# orig_samples = orig_samples.to(self.device)
|
65 |
+
# saved_samples = orig_samples[:,0:40*48000].clamp(-1,1)
|
66 |
+
# orig_samples = orig_samples[:,0:40*48000].clamp(-1,1)
|
67 |
+
# max_volume = orig_samples.abs().max(dim=-1)[0]
|
68 |
+
# orig_samples = orig_samples/max_volume.unsqueeze(-1)
|
69 |
+
# print("orig_samples.shape", orig_samples.shape)
|
70 |
+
|
71 |
+
# latent_length = int((st_et[1] - st_et[0]) * 48000) // 1920 + 1
|
72 |
+
|
73 |
+
# true_latents = self.vae.encode_audio(orig_samples).permute(0,2,1)
|
74 |
+
|
75 |
+
# print("true_latents.shape", true_latents.shape)
|
76 |
+
# latents = self.model.inference(orig_samples.repeat(batch_size, 1), [lyric, ]*batch_size, true_latents, latent_length, additional_feats=[], guidance_scale=1.5, num_steps = steps, disable_progress=disable_progress,layer=6, scenario = scenario)
|
77 |
+
# print("latents.shape", latents.shape)
|
78 |
+
# print("latent_length", latent_length)
|
79 |
+
|
80 |
+
# latents = latents[:,:,:latent_length]
|
81 |
+
# audio = self.vae.decode_audio(latents)
|
82 |
+
# print("audio.shape:",audio.shape)
|
83 |
+
# audio = torch.cat((audio, torch.zeros(audio.shape[0],audio.shape[1], 48000*40 - audio.shape[-1], dtype=audio.dtype, device=audio.device)), dim=-1)
|
84 |
+
# print("audio.shape:",audio.shape)
|
85 |
+
# # audio = audio.reshape(audio.shape[0]//2, 2, -1)
|
86 |
+
# # audio = torch.from_numpy(audio)
|
87 |
+
|
88 |
+
# if(saved_samples.shape[-1]<audio.shape[-1]):
|
89 |
+
# saved_samples = torch.cat([saved_samples, torch.zeros(saved_samples.shape[0], audio.shape[-1]-saved_samples.shape[-1], dtype=saved_samples.dtype, device=saved_samples.device)],-1)
|
90 |
+
# else:
|
91 |
+
# saved_samples = saved_samples[:,0:audio.shape[-1]]
|
92 |
+
# output = torch.cat([saved_samples.detach().cpu(),audio[0].detach().cpu()],0)
|
93 |
+
# return output
|
94 |
+
|
95 |
+
@torch.no_grad()
|
96 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
97 |
+
def sound2code(self, orig_samples, batch_size=3):
|
98 |
+
if(orig_samples.ndim == 2):
|
99 |
+
audios = orig_samples.unsqueeze(0).to(self.device)
|
100 |
+
elif(orig_samples.ndim == 3):
|
101 |
+
audios = orig_samples.to(self.device)
|
102 |
+
else:
|
103 |
+
assert orig_samples.ndim in (2,3), orig_samples.shape
|
104 |
+
audios = self.preprocess_audio(audios)
|
105 |
+
audios = audios.squeeze(0)
|
106 |
+
orig_length = audios.shape[-1]
|
107 |
+
min_samples = int(40 * self.sample_rate)
|
108 |
+
# 40秒对应10个token
|
109 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
110 |
+
print("output_len: ", output_len)
|
111 |
+
|
112 |
+
while(audios.shape[-1] < min_samples):
|
113 |
+
audios = torch.cat([audios, audios], -1)
|
114 |
+
int_max_len=audios.shape[-1]//min_samples+1
|
115 |
+
audios = torch.cat([audios, audios], -1)
|
116 |
+
audios=audios[:,:int(int_max_len*(min_samples))]
|
117 |
+
codes_list=[]
|
118 |
+
|
119 |
+
audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
120 |
+
|
121 |
+
for audio_inx in range(0, audio_input.shape[0], batch_size):
|
122 |
+
# import pdb; pdb.set_trace()
|
123 |
+
codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num)
|
124 |
+
codes_list.append(torch.cat(codes, 1))
|
125 |
+
# print("codes_list",codes_list[0].shape)
|
126 |
+
|
127 |
+
codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(1, -1)[None] # B 3 T -> 3 B T
|
128 |
+
codes=codes[:,:,:output_len]
|
129 |
+
|
130 |
+
return codes
|
131 |
+
|
132 |
+
@torch.no_grad()
|
133 |
+
def code2sound(self, codes, prompt=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
|
134 |
+
codes = codes.to(self.device)
|
135 |
+
|
136 |
+
min_samples = int(duration * 25) # 40ms per frame
|
137 |
+
hop_samples = min_samples // 4 * 3
|
138 |
+
ovlp_samples = min_samples - hop_samples
|
139 |
+
hop_frames = hop_samples
|
140 |
+
ovlp_frames = ovlp_samples
|
141 |
+
first_latent = torch.randn(codes.shape[0], min_samples, 64).to(self.device)
|
142 |
+
first_latent_length = 0
|
143 |
+
first_latent_codes_length = 0
|
144 |
+
|
145 |
+
if(isinstance(prompt, torch.Tensor)):
|
146 |
+
# prepare prompt
|
147 |
+
prompt = prompt.to(self.device)
|
148 |
+
if(prompt.ndim == 3):
|
149 |
+
assert prompt.shape[0] == 1, prompt.shape
|
150 |
+
prompt = prompt[0]
|
151 |
+
elif(prompt.ndim == 1):
|
152 |
+
prompt = prompt.unsqueeze(0).repeat(2,1)
|
153 |
+
elif(prompt.ndim == 2):
|
154 |
+
if(prompt.shape[0] == 1):
|
155 |
+
prompt = prompt.repeat(2,1)
|
156 |
+
|
157 |
+
if(prompt.shape[-1] < int(30 * self.sample_rate)):
|
158 |
+
# if less than 30s, just choose the first 10s
|
159 |
+
prompt = prompt[:,:int(10*self.sample_rate)] # limit max length to 10.24
|
160 |
+
else:
|
161 |
+
# else choose from 20.48s which might includes verse or chorus
|
162 |
+
prompt = prompt[:,int(20*self.sample_rate):int(30*self.sample_rate)] # limit max length to 10.24
|
163 |
+
|
164 |
+
true_latent = self.vae.encode_audio(prompt).permute(0,2,1)
|
165 |
+
# print("true_latent.shape", true_latent.shape)
|
166 |
+
# print("first_latent.shape", first_latent.shape)
|
167 |
+
#true_latent.shape torch.Size([1, 250, 64])
|
168 |
+
# first_latent.shape torch.Size([1, 1000, 64])
|
169 |
+
|
170 |
+
first_latent[:,0:true_latent.shape[1],:] = true_latent
|
171 |
+
first_latent_length = true_latent.shape[1]
|
172 |
+
first_latent_codes = self.sound2code(prompt)
|
173 |
+
first_latent_codes_length = first_latent_codes.shape[-1]
|
174 |
+
codes = torch.cat([first_latent_codes, codes], -1)
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
codes_len= codes.shape[-1]
|
180 |
+
target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
|
181 |
+
# target_len = int(codes_len / 100 * 4 * self.sample_rate)
|
182 |
+
# code repeat
|
183 |
+
if(codes_len < min_samples):
|
184 |
+
while(codes.shape[-1] < min_samples):
|
185 |
+
codes = torch.cat([codes, codes], -1)
|
186 |
+
codes = codes[:,:,0:min_samples]
|
187 |
+
codes_len = codes.shape[-1]
|
188 |
+
if((codes_len - ovlp_samples) % hop_samples > 0):
|
189 |
+
len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
|
190 |
+
while(codes.shape[-1] < len_codes):
|
191 |
+
codes = torch.cat([codes, codes], -1)
|
192 |
+
codes = codes[:,:,0:len_codes]
|
193 |
+
latent_length = min_samples
|
194 |
+
latent_list = []
|
195 |
+
spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
|
196 |
+
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
197 |
+
for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
|
198 |
+
codes_input=[]
|
199 |
+
codes_input.append(codes[:,:,sinx:sinx+min_samples])
|
200 |
+
if(sinx == 0):
|
201 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
202 |
+
incontext_length = first_latent_length
|
203 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
204 |
+
latent_list.append(latents)
|
205 |
+
else:
|
206 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
207 |
+
true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
|
208 |
+
print("true_latent.shape", true_latent.shape)
|
209 |
+
len_add_to_1000 = min_samples - true_latent.shape[-2]
|
210 |
+
# print("len_add_to_1000", len_add_to_1000)
|
211 |
+
# exit()
|
212 |
+
incontext_length = true_latent.shape[-2]
|
213 |
+
true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
|
214 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
215 |
+
latent_list.append(latents)
|
216 |
+
|
217 |
+
latent_list = [l.float() for l in latent_list]
|
218 |
+
latent_list[0] = latent_list[0][:,:,first_latent_length:]
|
219 |
+
min_samples = int(min_samples * self.sample_rate // 1000 * 40)
|
220 |
+
hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
|
221 |
+
ovlp_samples = min_samples - hop_samples
|
222 |
+
with torch.no_grad():
|
223 |
+
output = None
|
224 |
+
for i in range(len(latent_list)):
|
225 |
+
latent = latent_list[i]
|
226 |
+
cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
|
227 |
+
|
228 |
+
if output is None:
|
229 |
+
output = cur_output
|
230 |
+
else:
|
231 |
+
ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
|
232 |
+
ov_win = torch.cat([ov_win, 1 - ov_win], -1)
|
233 |
+
print("output.shape", output.shape)
|
234 |
+
print("ov_win.shape", ov_win.shape)
|
235 |
+
output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
|
236 |
+
output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
|
237 |
+
output = output[:, 0:target_len]
|
238 |
+
return output
|
239 |
+
|
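code2sound above stitches consecutively decoded 40 s segments with a linear crossfade over the overlap region (the ov_win ramp). The standalone sketch below reproduces that blending rule for two waveform tensors; it is illustrative only and the tensor sizes are made up.

import torch

def crossfade_concat(prev: torch.Tensor, nxt: torch.Tensor, ovlp: int) -> torch.Tensor:
    # prev, nxt: (channels, samples); the last `ovlp` samples of `prev`
    # describe the same audio as the first `ovlp` samples of `nxt`.
    fade_in = torch.linspace(0.0, 1.0, ovlp)
    fade_out = 1.0 - fade_in
    blended = prev[:, -ovlp:] * fade_out + nxt[:, :ovlp] * fade_in
    return torch.cat([prev[:, :-ovlp], blended, nxt[:, ovlp:]], dim=-1)

a = torch.randn(2, 48000)
b = torch.randn(2, 48000)
print(crossfade_concat(a, b, ovlp=12000).shape)  # torch.Size([2, 84000])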
240 |
+
@torch.no_grad()
|
241 |
+
def preprocess_audio(self, input_audios, threshold=0.8):
|
242 |
+
assert len(input_audios.shape) == 3, input_audios.shape
|
243 |
+
nchan = input_audios.shape[1]
|
244 |
+
input_audios = input_audios.reshape(input_audios.shape[0], -1)
|
245 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
246 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
247 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
248 |
+
return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
|
249 |
+
|
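preprocess_audio above only attenuates clips whose absolute peak exceeds the threshold and leaves quieter material untouched. A single-clip version of that rule, written here purely for illustration (the 0.8 default comes from the signature above):

import torch

def peak_limit(audio: torch.Tensor, threshold: float = 0.8) -> torch.Tensor:
    # audio: (channels, samples). Scale down only when the peak exceeds `threshold`.
    peak = audio.abs().max()
    if peak > threshold:
        audio = audio * (threshold / peak)
    return audio

loud = torch.randn(2, 48000) * 2.0
print(peak_limit(loud).abs().max())  # at most 0.8 after limiting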
250 |
+
@torch.no_grad()
|
251 |
+
def sound2sound(self, sound, prompt=None, steps=50, disable_progress=False):
|
252 |
+
codes = self.sound2code(sound)
|
253 |
+
# print(codes.shape)
|
254 |
+
wave = self.code2sound(codes, prompt, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
|
255 |
+
# print(fname, wave.shape)
|
256 |
+
return wave
|
257 |
+
|
258 |
+
@torch.no_grad()
|
259 |
+
def sound2sound_vae(self, sound, prompt=None, steps=50, disable_progress=False):
|
260 |
+
min_samples = int(40 * 25) # 40ms per frame
|
261 |
+
hop_samples = min_samples // 4 * 3
|
262 |
+
ovlp_samples = min_samples - hop_samples
|
263 |
+
dur = 20
|
264 |
+
|
265 |
+
latent_list = []
|
266 |
+
for i in range(0, sound.shape[-1], dur*48000):
|
267 |
+
if(i+dur*2*48000 > sound.shape[-1]):
|
268 |
+
latent = self.vae.encode_audio(sound.cuda()[None,:,i:])
|
269 |
+
break
|
270 |
+
else:
|
271 |
+
latent = self.vae.encode_audio(sound.cuda()[None,:,i:i+dur*48000])
|
272 |
+
latent_list.append(latent)
|
273 |
+
|
274 |
+
output = None
|
275 |
+
for i in range(len(latent_list)):
|
276 |
+
print(i)
|
277 |
+
latent = latent_list[i]
|
278 |
+
cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
|
279 |
+
if output is None:
|
280 |
+
output = cur_output
|
281 |
+
else:
|
282 |
+
output = torch.cat([output, cur_output], -1)
|
283 |
+
return output
|
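Taken together, the methods above give an encode/decode round trip. A hedged usage sketch: it assumes a Tango instance has already been constructed (see the constructors in generate_2rvq.py / generate_4rvq.py below for the pattern), and "song.flac" / "recon.flac" are placeholder file names, not files shipped with this repository.

import torchaudio

# `tango` is assumed to be an already-constructed Tango instance.
wav, fs = torchaudio.load("song.flac")                 # placeholder input
if fs != tango.sample_rate:
    wav = torchaudio.functional.resample(wav, fs, tango.sample_rate)
if wav.shape[0] == 1:                                  # the tokenizer expects stereo
    wav = wav.repeat(2, 1)
codes = tango.sound2code(wav)                          # 25 Hz token ids
recon = tango.code2sound(codes, prompt=None, num_steps=20)
torchaudio.save("recon.flac", recon, tango.sample_rate)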
codeclm/tokenizer/Flow1dVAE/generate_2rvq.py
ADDED
@@ -0,0 +1,294 @@
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
from model_2rvq import PromptCondAudioDiffusion
|
5 |
+
from diffusers import DDIMScheduler, DDPMScheduler
|
6 |
+
import torchaudio
|
7 |
+
import librosa
|
8 |
+
import os
|
9 |
+
import math
|
10 |
+
import numpy as np
|
11 |
+
# from tools.get_mulan import get_mulan
|
12 |
+
from tools.get_1dvae_large import get_model
|
13 |
+
import tools.torch_tools as torch_tools
|
14 |
+
from safetensors.torch import load_file
|
15 |
+
from audio import AudioFile
|
16 |
+
import kaldiio
|
17 |
+
|
18 |
+
class Tango:
|
19 |
+
def __init__(self, \
|
20 |
+
model_path, \
|
21 |
+
layer_num=6, \
|
22 |
+
rvq_num=1, \
|
23 |
+
device="cuda:0"):
|
24 |
+
|
25 |
+
self.sample_rate = 48000
|
26 |
+
scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
|
27 |
+
self.device = device
|
28 |
+
|
29 |
+
self.vae = get_model()
|
30 |
+
self.vae = self.vae.to(device)
|
31 |
+
self.vae=self.vae.eval()
|
32 |
+
self.layer_num = layer_num
|
33 |
+
|
34 |
+
self.MAX_DURATION = 360
|
35 |
+
main_config = {
|
36 |
+
"num_channels":32,
|
37 |
+
"unet_model_name":None,
|
38 |
+
"unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
|
39 |
+
"snr_gamma":None,
|
40 |
+
}
|
41 |
+
self.rvq_num = rvq_num
|
42 |
+
# print("rvq_num: ", self.rvq_num)
|
43 |
+
# exit()
|
44 |
+
self.model = PromptCondAudioDiffusion(**main_config).to(device)
|
45 |
+
if model_path.endswith(".safetensors"):
|
46 |
+
main_weights = load_file(model_path)
|
47 |
+
else:
|
48 |
+
main_weights = torch.load(model_path, map_location=device)
|
49 |
+
self.model.load_state_dict(main_weights, strict=False)
|
50 |
+
print ("Successfully loaded checkpoint from:", model_path)
|
51 |
+
|
52 |
+
self.model.eval()
|
53 |
+
self.model.init_device_dtype(torch.device(device), torch.float32)
|
54 |
+
print("scaling factor: ", self.model.normfeat.std)
|
55 |
+
|
56 |
+
# self.scheduler = DDIMScheduler.from_pretrained( \
|
57 |
+
# scheduler_name, subfolder="scheduler")
|
58 |
+
# self.scheduler = DDPMScheduler.from_pretrained( \
|
59 |
+
# scheduler_name, subfolder="scheduler")
|
60 |
+
print("Successfully loaded inference scheduler from {}".format(scheduler_name))
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
@torch.no_grad()
|
65 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
66 |
+
def sound2code(self, orig_samples, batch_size=8):
|
67 |
+
if(orig_samples.ndim == 2):
|
68 |
+
audios = orig_samples.unsqueeze(0).to(self.device)
|
69 |
+
elif(orig_samples.ndim == 3):
|
70 |
+
audios = orig_samples.to(self.device)
|
71 |
+
else:
|
72 |
+
assert orig_samples.ndim in (2,3), orig_samples.shape
|
73 |
+
audios = self.preprocess_audio(audios)
|
74 |
+
audios = audios.squeeze(0)
|
75 |
+
orig_length = audios.shape[-1]
|
76 |
+
min_samples = int(40 * self.sample_rate)
|
77 |
+
# 25 tokens per second (40 s of audio -> 1000 tokens)
|
78 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
79 |
+
# print("output_len: ", output_len)
|
80 |
+
|
81 |
+
while(audios.shape[-1] < min_samples):
|
82 |
+
audios = torch.cat([audios, audios], -1)
|
83 |
+
int_max_len=audios.shape[-1]//min_samples+1
|
84 |
+
audios = torch.cat([audios, audios], -1)
|
85 |
+
audios=audios[:,:int(int_max_len*(min_samples))]
|
86 |
+
codes_list=[]
|
87 |
+
|
88 |
+
audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
89 |
+
|
90 |
+
for audio_inx in range(0, audio_input.shape[0], batch_size):
|
91 |
+
# import pdb; pdb.set_trace()
|
92 |
+
codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num)
|
93 |
+
# print("codes",codes[0].shape)
|
94 |
+
|
95 |
+
codes_list.append(torch.cat(codes, 1))
|
96 |
+
# print("codes_list",codes_list[0].shape)
|
97 |
+
|
98 |
+
codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
|
99 |
+
codes=codes[:,:,:output_len]
|
100 |
+
|
101 |
+
return codes
|
102 |
+
|
103 |
+
@torch.no_grad()
|
104 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
105 |
+
def sound2code_ds(self, orig_samples, ds, batch_size=8):
|
106 |
+
if(orig_samples.ndim == 2):
|
107 |
+
audios = orig_samples.unsqueeze(0).to(self.device)
|
108 |
+
elif(orig_samples.ndim == 3):
|
109 |
+
audios = orig_samples.to(self.device)
|
110 |
+
else:
|
111 |
+
assert orig_samples.ndim in (2,3), orig_samples.shape
|
112 |
+
audios = self.preprocess_audio(audios)
|
113 |
+
audios = audios.squeeze(0)
|
114 |
+
orig_length = audios.shape[-1]
|
115 |
+
min_samples = int(40 * self.sample_rate)
|
116 |
+
# 25 tokens per second (40 s of audio -> 1000 tokens)
|
117 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
118 |
+
# print("output_len: ", output_len)
|
119 |
+
|
120 |
+
while(audios.shape[-1] < min_samples):
|
121 |
+
audios = torch.cat([audios, audios], -1)
|
122 |
+
int_max_len=audios.shape[-1]//min_samples+1
|
123 |
+
audios = torch.cat([audios, audios], -1)
|
124 |
+
audios=audios[:,:int(int_max_len*(min_samples))]
|
125 |
+
codes_list=[]
|
126 |
+
|
127 |
+
audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
128 |
+
|
129 |
+
for audio_inx in range(0, audio_input.shape[0], batch_size):
|
130 |
+
# import pdb; pdb.set_trace()
|
131 |
+
codes, _, spk_embeds = self.model.fetch_codes_batch_ds((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num, ds=ds)
|
132 |
+
# print("codes",codes[0].shape)
|
133 |
+
|
134 |
+
codes_list.append(torch.cat(codes, 1))
|
135 |
+
# print("codes_list",codes_list[0].shape)
|
136 |
+
|
137 |
+
codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
|
138 |
+
codes=codes[:,:,:output_len]
|
139 |
+
|
140 |
+
return codes
|
141 |
+
|
142 |
+
@torch.no_grad()
|
143 |
+
def code2sound(self, codes, prompt=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
|
144 |
+
codes = codes.to(self.device)
|
145 |
+
|
146 |
+
min_samples = duration * 25 # 40ms per frame
|
147 |
+
hop_samples = min_samples // 4 * 3
|
148 |
+
ovlp_samples = min_samples - hop_samples
|
149 |
+
hop_frames = hop_samples
|
150 |
+
ovlp_frames = ovlp_samples
|
151 |
+
first_latent = torch.randn(codes.shape[0], min_samples, 64).to(self.device)
|
152 |
+
first_latent_length = 0
|
153 |
+
first_latent_codes_length = 0
|
154 |
+
|
155 |
+
if(isinstance(prompt, torch.Tensor)):
|
156 |
+
# prepare prompt
|
157 |
+
prompt = prompt.to(self.device)
|
158 |
+
if(prompt.ndim == 3):
|
159 |
+
assert prompt.shape[0] == 1, prompt.shape
|
160 |
+
prompt = prompt[0]
|
161 |
+
elif(prompt.ndim == 1):
|
162 |
+
prompt = prompt.unsqueeze(0).repeat(2,1)
|
163 |
+
elif(prompt.ndim == 2):
|
164 |
+
if(prompt.shape[0] == 1):
|
165 |
+
prompt = prompt.repeat(2,1)
|
166 |
+
|
167 |
+
if(prompt.shape[-1] < int(30 * self.sample_rate)):
|
168 |
+
# if less than 30s, just choose the first 10s
|
169 |
+
prompt = prompt[:,:int(10*self.sample_rate)] # keep only the first 10 s as the prompt
|
170 |
+
else:
|
171 |
+
# otherwise take the 20-30 s region, which likely includes a verse or chorus
|
172 |
+
prompt = prompt[:,int(20*self.sample_rate):int(30*self.sample_rate)] # keep a 10 s prompt window
|
173 |
+
|
174 |
+
true_latent = self.vae.encode_audio(prompt).permute(0,2,1)
|
175 |
+
# print("true_latent.shape", true_latent.shape)
|
176 |
+
# print("first_latent.shape", first_latent.shape)
|
177 |
+
#true_latent.shape torch.Size([1, 250, 64])
|
178 |
+
# first_latent.shape torch.Size([1, 1000, 64])
|
179 |
+
|
180 |
+
first_latent[:,0:true_latent.shape[1],:] = true_latent
|
181 |
+
first_latent_length = true_latent.shape[1]
|
182 |
+
first_latent_codes = self.sound2code(prompt)
|
183 |
+
first_latent_codes_length = first_latent_codes.shape[-1]
|
184 |
+
codes = torch.cat([first_latent_codes, codes], -1)
|
185 |
+
|
186 |
+
codes_len= codes.shape[-1]
|
187 |
+
target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
|
188 |
+
# target_len = int(codes_len / 100 * 4 * self.sample_rate)
|
189 |
+
# code repeat
|
190 |
+
if(codes_len < min_samples):
|
191 |
+
while(codes.shape[-1] < min_samples):
|
192 |
+
codes = torch.cat([codes, codes], -1)
|
193 |
+
codes = codes[:,:,0:min_samples]
|
194 |
+
codes_len = codes.shape[-1]
|
195 |
+
if((codes_len - ovlp_samples) % hop_samples > 0):
|
196 |
+
len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
|
197 |
+
while(codes.shape[-1] < len_codes):
|
198 |
+
codes = torch.cat([codes, codes], -1)
|
199 |
+
codes = codes[:,:,0:len_codes]
|
200 |
+
latent_length = min_samples
|
201 |
+
latent_list = []
|
202 |
+
spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
|
203 |
+
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
204 |
+
for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
|
205 |
+
codes_input=[]
|
206 |
+
codes_input.append(codes[:,:,sinx:sinx+min_samples])
|
207 |
+
if(sinx == 0):
|
208 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
209 |
+
incontext_length = first_latent_length
|
210 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
211 |
+
latent_list.append(latents)
|
212 |
+
else:
|
213 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
214 |
+
true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
|
215 |
+
print("true_latent.shape", true_latent.shape)
|
216 |
+
len_add_to_1000 = 1000 - true_latent.shape[-2]
|
217 |
+
# print("len_add_to_1000", len_add_to_1000)
|
218 |
+
# exit()
|
219 |
+
incontext_length = true_latent.shape[-2]
|
220 |
+
true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
|
221 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
222 |
+
latent_list.append(latents)
|
223 |
+
|
224 |
+
latent_list = [l.float() for l in latent_list]
|
225 |
+
latent_list[0] = latent_list[0][:,:,first_latent_length:]
|
226 |
+
min_samples = int(min_samples * self.sample_rate // 1000 * 40)
|
227 |
+
hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
|
228 |
+
ovlp_samples = min_samples - hop_samples
|
229 |
+
with torch.no_grad():
|
230 |
+
output = None
|
231 |
+
for i in range(len(latent_list)):
|
232 |
+
latent = latent_list[i]
|
233 |
+
cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
|
234 |
+
|
235 |
+
if output is None:
|
236 |
+
output = cur_output
|
237 |
+
else:
|
238 |
+
ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
|
239 |
+
ov_win = torch.cat([ov_win, 1 - ov_win], -1)
|
240 |
+
print("output.shape", output.shape)
|
241 |
+
print("ov_win.shape", ov_win.shape)
|
242 |
+
output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
|
243 |
+
output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
|
244 |
+
output = output[:, 0:target_len]
|
245 |
+
return output
|
246 |
+
|
247 |
+
@torch.no_grad()
|
248 |
+
def preprocess_audio(self, input_audios, threshold=0.8):
|
249 |
+
assert len(input_audios.shape) == 3, input_audios.shape
|
250 |
+
nchan = input_audios.shape[1]
|
251 |
+
input_audios = input_audios.reshape(input_audios.shape[0], -1)
|
252 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
253 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
254 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
255 |
+
return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
|
256 |
+
|
257 |
+
@torch.no_grad()
|
258 |
+
def sound2sound(self, sound, prompt=None, steps=50, disable_progress=False):
|
259 |
+
codes = self.sound2code(sound)
|
260 |
+
# print(codes.shape)
|
261 |
+
# exit()
|
262 |
+
wave = self.code2sound(codes, prompt, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
|
263 |
+
# print(fname, wave.shape)
|
264 |
+
return wave
|
265 |
+
|
266 |
+
def file2code(self, fname):
|
267 |
+
try:
|
268 |
+
orig_samples, fs = torchaudio.load(fname)
|
269 |
+
except:
|
270 |
+
af = AudioFile(fname)
|
271 |
+
orig_samples = af.read()
|
272 |
+
fs = af.samplerate()
|
273 |
+
orig_samples = orig_samples[0]
|
274 |
+
if(fs!=self.sample_rate):
|
275 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
|
276 |
+
fs = self.sample_rate
|
277 |
+
if orig_samples.shape[0] == 1:
|
278 |
+
orig_samples = torch.cat([orig_samples, orig_samples], 0)
|
279 |
+
return self.sound2code(orig_samples)
|
280 |
+
|
281 |
+
def file2code_ds(self, fname, ds):
|
282 |
+
try:
|
283 |
+
orig_samples, fs = torchaudio.load(fname)
|
284 |
+
except:
|
285 |
+
af = AudioFile(fname)
|
286 |
+
orig_samples = af.read()
|
287 |
+
fs = af.samplerate()
|
288 |
+
orig_samples = orig_samples[0]
|
289 |
+
if(fs!=self.sample_rate):
|
290 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
|
291 |
+
fs = self.sample_rate
|
292 |
+
if orig_samples.shape[0] == 1:
|
293 |
+
orig_samples = torch.cat([orig_samples, orig_samples], 0)
|
294 |
+
return self.sound2code_ds(orig_samples, ds)
|
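file2code above falls back to the bundled AudioFile reader when torchaudio cannot decode a file, resamples to 48 kHz, and duplicates mono signals to stereo before tokenizing. The same preparation as a standalone helper, shown only as a sketch of that loading path (the function name is made up):

import torch
import torchaudio
from audio import AudioFile  # same fallback reader as imported above

def load_stereo_48k(fname: str, target_sr: int = 48000) -> torch.Tensor:
    try:
        wav, fs = torchaudio.load(fname)
    except Exception:
        af = AudioFile(fname)        # fallback decoder used by file2code
        wav = af.read()[0]
        fs = af.samplerate()
    if fs != target_sr:
        wav = torchaudio.functional.resample(wav, fs, target_sr)
    if wav.shape[0] == 1:            # duplicate mono to stereo
        wav = torch.cat([wav, wav], 0)
    return wav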
codeclm/tokenizer/Flow1dVAE/generate_4rvq.py
ADDED
@@ -0,0 +1,293 @@
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
from model_4rvq import PromptCondAudioDiffusion
|
5 |
+
from diffusers import DDIMScheduler, DDPMScheduler
|
6 |
+
import torchaudio
|
7 |
+
import librosa
|
8 |
+
import os
|
9 |
+
import math
|
10 |
+
import numpy as np
|
11 |
+
# from tools.get_mulan import get_mulan
|
12 |
+
from tools.get_1dvae_large import get_model
|
13 |
+
import tools.torch_tools as torch_tools
|
14 |
+
from safetensors.torch import load_file
|
15 |
+
from audio import AudioFile
|
16 |
+
|
17 |
+
class Tango:
|
18 |
+
def __init__(self, \
|
19 |
+
model_path, \
|
20 |
+
layer_num=6, \
|
21 |
+
rvq_num=1, \
|
22 |
+
device="cuda:0"):
|
23 |
+
|
24 |
+
self.sample_rate = 48000
|
25 |
+
scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
|
26 |
+
self.device = device
|
27 |
+
|
28 |
+
self.vae = get_model()
|
29 |
+
self.vae = self.vae.to(device)
|
30 |
+
self.vae=self.vae.eval()
|
31 |
+
self.layer_num = layer_num
|
32 |
+
|
33 |
+
self.MAX_DURATION = 360
|
34 |
+
main_config = {
|
35 |
+
"num_channels":32,
|
36 |
+
"unet_model_name":None,
|
37 |
+
"unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
|
38 |
+
"snr_gamma":None,
|
39 |
+
}
|
40 |
+
self.rvq_num = rvq_num
|
41 |
+
# print("rvq_num: ", self.rvq_num)
|
42 |
+
# exit()
|
43 |
+
self.model = PromptCondAudioDiffusion(**main_config).to(device)
|
44 |
+
if model_path.endswith(".safetensors"):
|
45 |
+
main_weights = load_file(model_path)
|
46 |
+
else:
|
47 |
+
main_weights = torch.load(model_path, map_location=device)
|
48 |
+
self.model.load_state_dict(main_weights, strict=False)
|
49 |
+
print ("Successfully loaded checkpoint from:", model_path)
|
50 |
+
|
51 |
+
self.model.eval()
|
52 |
+
self.model.init_device_dtype(torch.device(device), torch.float32)
|
53 |
+
print("scaling factor: ", self.model.normfeat.std)
|
54 |
+
|
55 |
+
# self.scheduler = DDIMScheduler.from_pretrained( \
|
56 |
+
# scheduler_name, subfolder="scheduler")
|
57 |
+
# self.scheduler = DDPMScheduler.from_pretrained( \
|
58 |
+
# scheduler_name, subfolder="scheduler")
|
59 |
+
print("Successfully loaded inference scheduler from {}".format(scheduler_name))
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
@torch.no_grad()
|
64 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
65 |
+
def sound2code(self, orig_samples, batch_size=8):
|
66 |
+
if(orig_samples.ndim == 2):
|
67 |
+
audios = orig_samples.unsqueeze(0).to(self.device)
|
68 |
+
elif(orig_samples.ndim == 3):
|
69 |
+
audios = orig_samples.to(self.device)
|
70 |
+
else:
|
71 |
+
assert orig_samples.ndim in (2,3), orig_samples.shape
|
72 |
+
audios = self.preprocess_audio(audios)
|
73 |
+
audios = audios.squeeze(0)
|
74 |
+
orig_length = audios.shape[-1]
|
75 |
+
min_samples = int(40 * self.sample_rate)
|
76 |
+
# 25 tokens per second (40 s of audio -> 1000 tokens)
|
77 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
78 |
+
# print("output_len: ", output_len)
|
79 |
+
|
80 |
+
while(audios.shape[-1] < min_samples):
|
81 |
+
audios = torch.cat([audios, audios], -1)
|
82 |
+
int_max_len=audios.shape[-1]//min_samples+1
|
83 |
+
audios = torch.cat([audios, audios], -1)
|
84 |
+
audios=audios[:,:int(int_max_len*(min_samples))]
|
85 |
+
codes_list=[]
|
86 |
+
|
87 |
+
audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
88 |
+
|
89 |
+
for audio_inx in range(0, audio_input.shape[0], batch_size):
|
90 |
+
# import pdb; pdb.set_trace()
|
91 |
+
codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num)
|
92 |
+
# print("codes",codes[0].shape)
|
93 |
+
|
94 |
+
codes_list.append(torch.cat(codes, 1))
|
95 |
+
# print("codes_list",codes_list[0].shape)
|
96 |
+
|
97 |
+
codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
|
98 |
+
codes=codes[:,:,:output_len]
|
99 |
+
|
100 |
+
return codes
|
101 |
+
|
102 |
+
@torch.no_grad()
|
103 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
104 |
+
def sound2code_ds(self, orig_samples, ds, batch_size=6):
|
105 |
+
if(orig_samples.ndim == 2):
|
106 |
+
audios = orig_samples.unsqueeze(0).to(self.device)
|
107 |
+
elif(orig_samples.ndim == 3):
|
108 |
+
audios = orig_samples.to(self.device)
|
109 |
+
else:
|
110 |
+
assert orig_samples.ndim in (2,3), orig_samples.shape
|
111 |
+
audios = self.preprocess_audio(audios)
|
112 |
+
audios = audios.squeeze(0)
|
113 |
+
orig_length = audios.shape[-1]
|
114 |
+
min_samples = int(40 * self.sample_rate)
|
115 |
+
# 25 tokens per second (40 s of audio -> 1000 tokens)
|
116 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
117 |
+
# print("output_len: ", output_len)
|
118 |
+
|
119 |
+
while(audios.shape[-1] < min_samples):
|
120 |
+
audios = torch.cat([audios, audios], -1)
|
121 |
+
int_max_len=audios.shape[-1]//min_samples+1
|
122 |
+
audios = torch.cat([audios, audios], -1)
|
123 |
+
audios=audios[:,:int(int_max_len*(min_samples))]
|
124 |
+
codes_list=[]
|
125 |
+
|
126 |
+
audio_input = audios.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
127 |
+
|
128 |
+
for audio_inx in range(0, audio_input.shape[0], batch_size):
|
129 |
+
# import pdb; pdb.set_trace()
|
130 |
+
codes, _, spk_embeds = self.model.fetch_codes_batch_ds((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num, rvq_num=self.rvq_num, ds=ds)
|
131 |
+
# print("codes",codes[0].shape)
|
132 |
+
|
133 |
+
codes_list.append(torch.cat(codes, 1))
|
134 |
+
# print("codes_list",codes_list[0].shape)
|
135 |
+
|
136 |
+
codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(self.rvq_num, -1)[None] # B 3 T -> 3 B T
|
137 |
+
codes=codes[:,:,:output_len]
|
138 |
+
|
139 |
+
return codes
|
140 |
+
|
141 |
+
@torch.no_grad()
|
142 |
+
def code2sound(self, codes, prompt=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
|
143 |
+
codes = codes.to(self.device)
|
144 |
+
|
145 |
+
min_samples = duration * 25 # 40ms per frame
|
146 |
+
hop_samples = min_samples // 4 * 3
|
147 |
+
ovlp_samples = min_samples - hop_samples
|
148 |
+
hop_frames = hop_samples
|
149 |
+
ovlp_frames = ovlp_samples
|
150 |
+
first_latent = torch.randn(codes.shape[0], min_samples, 64).to(self.device)
|
151 |
+
first_latent_length = 0
|
152 |
+
first_latent_codes_length = 0
|
153 |
+
|
154 |
+
if(isinstance(prompt, torch.Tensor)):
|
155 |
+
# prepare prompt
|
156 |
+
prompt = prompt.to(self.device)
|
157 |
+
if(prompt.ndim == 3):
|
158 |
+
assert prompt.shape[0] == 1, prompt.shape
|
159 |
+
prompt = prompt[0]
|
160 |
+
elif(prompt.ndim == 1):
|
161 |
+
prompt = prompt.unsqueeze(0).repeat(2,1)
|
162 |
+
elif(prompt.ndim == 2):
|
163 |
+
if(prompt.shape[0] == 1):
|
164 |
+
prompt = prompt.repeat(2,1)
|
165 |
+
|
166 |
+
if(prompt.shape[-1] < int(30 * self.sample_rate)):
|
167 |
+
# if less than 30s, just choose the first 10s
|
168 |
+
prompt = prompt[:,:int(10*self.sample_rate)] # keep only the first 10 s as the prompt
|
169 |
+
else:
|
170 |
+
# otherwise take the 20-30 s region, which likely includes a verse or chorus
|
171 |
+
prompt = prompt[:,int(20*self.sample_rate):int(30*self.sample_rate)] # keep a 10 s prompt window
|
172 |
+
|
173 |
+
true_latent = self.vae.encode_audio(prompt).permute(0,2,1)
|
174 |
+
# print("true_latent.shape", true_latent.shape)
|
175 |
+
# print("first_latent.shape", first_latent.shape)
|
176 |
+
#true_latent.shape torch.Size([1, 250, 64])
|
177 |
+
# first_latent.shape torch.Size([1, 1000, 64])
|
178 |
+
|
179 |
+
first_latent[:,0:true_latent.shape[1],:] = true_latent
|
180 |
+
first_latent_length = true_latent.shape[1]
|
181 |
+
first_latent_codes = self.sound2code(prompt)
|
182 |
+
first_latent_codes_length = first_latent_codes.shape[-1]
|
183 |
+
codes = torch.cat([first_latent_codes, codes], -1)
|
184 |
+
|
185 |
+
codes_len= codes.shape[-1]
|
186 |
+
target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
|
187 |
+
# target_len = int(codes_len / 100 * 4 * self.sample_rate)
|
188 |
+
# code repeat
|
189 |
+
if(codes_len < min_samples):
|
190 |
+
while(codes.shape[-1] < min_samples):
|
191 |
+
codes = torch.cat([codes, codes], -1)
|
192 |
+
codes = codes[:,:,0:min_samples]
|
193 |
+
codes_len = codes.shape[-1]
|
194 |
+
if((codes_len - ovlp_samples) % hop_samples > 0):
|
195 |
+
len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
|
196 |
+
while(codes.shape[-1] < len_codes):
|
197 |
+
codes = torch.cat([codes, codes], -1)
|
198 |
+
codes = codes[:,:,0:len_codes]
|
199 |
+
latent_length = min_samples
|
200 |
+
latent_list = []
|
201 |
+
spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
|
202 |
+
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
203 |
+
for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
|
204 |
+
codes_input=[]
|
205 |
+
codes_input.append(codes[:,:,sinx:sinx+min_samples])
|
206 |
+
if(sinx == 0):
|
207 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
208 |
+
incontext_length = first_latent_length
|
209 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
210 |
+
latent_list.append(latents)
|
211 |
+
else:
|
212 |
+
# print("Processing {} to {}".format(sinx/self.sample_rate, (sinx + min_samples)/self.sample_rate))
|
213 |
+
true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
|
214 |
+
print("true_latent.shape", true_latent.shape)
|
215 |
+
len_add_to_1000 = 1000 - true_latent.shape[-2]
|
216 |
+
# print("len_add_to_1000", len_add_to_1000)
|
217 |
+
# exit()
|
218 |
+
incontext_length = true_latent.shape[-2]
|
219 |
+
true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
|
220 |
+
latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
221 |
+
latent_list.append(latents)
|
222 |
+
|
223 |
+
latent_list = [l.float() for l in latent_list]
|
224 |
+
latent_list[0] = latent_list[0][:,:,first_latent_length:]
|
225 |
+
min_samples = int(min_samples * self.sample_rate // 1000 * 40)
|
226 |
+
hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
|
227 |
+
ovlp_samples = min_samples - hop_samples
|
228 |
+
with torch.no_grad():
|
229 |
+
output = None
|
230 |
+
for i in range(len(latent_list)):
|
231 |
+
latent = latent_list[i]
|
232 |
+
cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
|
233 |
+
|
234 |
+
if output is None:
|
235 |
+
output = cur_output
|
236 |
+
else:
|
237 |
+
ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
|
238 |
+
ov_win = torch.cat([ov_win, 1 - ov_win], -1)
|
239 |
+
print("output.shape", output.shape)
|
240 |
+
print("ov_win.shape", ov_win.shape)
|
241 |
+
output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
|
242 |
+
output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
|
243 |
+
output = output[:, 0:target_len]
|
244 |
+
return output
|
245 |
+
|
246 |
+
@torch.no_grad()
|
247 |
+
def preprocess_audio(self, input_audios, threshold=0.8):
|
248 |
+
assert len(input_audios.shape) == 3, input_audios.shape
|
249 |
+
nchan = input_audios.shape[1]
|
250 |
+
input_audios = input_audios.reshape(input_audios.shape[0], -1)
|
251 |
+
norm_value = torch.ones_like(input_audios[:,0])
|
252 |
+
max_volume = input_audios.abs().max(dim=-1)[0]
|
253 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
254 |
+
return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
|
255 |
+
|
256 |
+
@torch.no_grad()
|
257 |
+
def sound2sound(self, sound, prompt=None, steps=50, disable_progress=False):
|
258 |
+
codes = self.sound2code(sound)
|
259 |
+
# print(codes.shape)
|
260 |
+
# exit()
|
261 |
+
wave = self.code2sound(codes, prompt, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
|
262 |
+
# print(fname, wave.shape)
|
263 |
+
return wave
|
264 |
+
|
265 |
+
def file2code(self, fname):
|
266 |
+
try:
|
267 |
+
orig_samples, fs = torchaudio.load(fname)
|
268 |
+
except:
|
269 |
+
af = AudioFile(fname)
|
270 |
+
orig_samples = af.read()
|
271 |
+
fs = af.samplerate()
|
272 |
+
orig_samples = orig_samples[0]
|
273 |
+
if(fs!=self.sample_rate):
|
274 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
|
275 |
+
fs = self.sample_rate
|
276 |
+
if orig_samples.shape[0] == 1:
|
277 |
+
orig_samples = torch.cat([orig_samples, orig_samples], 0)
|
278 |
+
return self.sound2code(orig_samples)
|
279 |
+
|
280 |
+
def file2code_ds(self, fname, ds):
|
281 |
+
try:
|
282 |
+
orig_samples, fs = torchaudio.load(fname)
|
283 |
+
except:
|
284 |
+
af = AudioFile(fname)
|
285 |
+
orig_samples = af.read()
|
286 |
+
fs = af.samplerate()
|
287 |
+
orig_samples = orig_samples[0]
|
288 |
+
if(fs!=self.sample_rate):
|
289 |
+
orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
|
290 |
+
fs = self.sample_rate
|
291 |
+
if orig_samples.shape[0] == 1:
|
292 |
+
orig_samples = torch.cat([orig_samples, orig_samples], 0)
|
293 |
+
return self.sound2code_ds(orig_samples, ds)
|
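generate_2rvq.py and generate_4rvq.py differ mainly in which quantizer model they import and in the rvq_num they are given; sound2code returns a tensor shaped (1, rvq_num, T) with roughly 25 tokens per second per codebook. A hypothetical shape check; the checkpoint and audio paths are placeholders, and the script has to run from the Flow1dVAE directory so the relative config paths above resolve.

from generate_4rvq import Tango

tango = Tango(model_path="ckpt/model_4rvq.safetensors", rvq_num=4)  # placeholder checkpoint
codes = tango.file2code("song.flac")                                # placeholder audio file
assert codes.ndim == 3 and codes.shape[1] == 4                      # (1, rvq_num, T)
print(codes.shape)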
codeclm/tokenizer/Flow1dVAE/generate_septoken.py
ADDED
@@ -0,0 +1,302 @@
1 |
+
import json
|
2 |
+
import torch
|
3 |
+
from tqdm import tqdm
|
4 |
+
from model_septoken import PromptCondAudioDiffusion
|
5 |
+
from diffusers import DDIMScheduler, DDPMScheduler
|
6 |
+
import torchaudio
|
7 |
+
import librosa
|
8 |
+
import os
|
9 |
+
import math
|
10 |
+
import numpy as np
|
11 |
+
# from tools.get_mulan import get_mulan
|
12 |
+
from tools.get_1dvae_large import get_model
|
13 |
+
import tools.torch_tools as torch_tools
|
14 |
+
from safetensors.torch import load_file
|
15 |
+
from third_party.demucs.models.pretrained import get_model_from_yaml
|
16 |
+
from filelock import FileLock
|
17 |
+
import kaldiio
|
18 |
+
# os.path.join(args.model_dir, "htdemucs.pth"), os.path.join(args.model_dir, "htdemucs.yaml")
|
19 |
+
class Separator:
|
20 |
+
def __init__(self, dm_model_path='demucs/ckpt/htdemucs.pth', dm_config_path='demucs/ckpt/htdemucs.yaml', gpu_id=0) -> None:
|
21 |
+
if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
|
22 |
+
self.device = torch.device(f"cuda:{gpu_id}")
|
23 |
+
else:
|
24 |
+
self.device = torch.device("cpu")
|
25 |
+
self.demucs_model = self.init_demucs_model(dm_model_path, dm_config_path)
|
26 |
+
|
27 |
+
def init_demucs_model(self, model_path, config_path):
|
28 |
+
model = get_model_from_yaml(config_path, model_path)
|
29 |
+
model.to(self.device)
|
30 |
+
model.eval()
|
31 |
+
return model
|
32 |
+
|
33 |
+
def load_audio(self, f):
|
34 |
+
a, fs = torchaudio.load(f)
|
35 |
+
if (fs != 48000):
|
36 |
+
a = torchaudio.functional.resample(a, fs, 48000)
|
37 |
+
# if a.shape[-1] >= 48000*10:
|
38 |
+
# a = a[..., :48000*10]
|
39 |
+
# else:
|
40 |
+
# a = torch.cat([a, a], -1)
|
41 |
+
# return a[:, 0:48000*10]
|
42 |
+
return a
|
43 |
+
|
44 |
+
def run(self, audio_path, output_dir='demucs/test_output', ext=".flac"):
|
45 |
+
name, _ = os.path.splitext(os.path.split(audio_path)[-1])
|
46 |
+
output_paths = []
|
47 |
+
# lock_path = os.path.join(output_dir, f"{name}.lock")
|
48 |
+
# with FileLock(lock_path):  # lock added to avoid deadlocks when several GPUs access the same file
|
49 |
+
for stem in self.demucs_model.sources:
|
50 |
+
output_path = os.path.join(output_dir, f"{name}_{stem}{ext}")
|
51 |
+
if os.path.exists(output_path):
|
52 |
+
output_paths.append(output_path)
|
53 |
+
if len(output_paths) == 1: # only the vocal stem is reused here (originally all 4 stems)
|
54 |
+
# drums_path, bass_path, other_path, vocal_path = output_paths
|
55 |
+
vocal_path = output_paths[0]
|
56 |
+
else:
|
57 |
+
lock_path = os.path.join(output_dir, f"{name}_separate.lock")
|
58 |
+
with FileLock(lock_path):
|
59 |
+
drums_path, bass_path, other_path, vocal_path = self.demucs_model.separate(audio_path, output_dir, device=self.device)
|
60 |
+
full_audio = self.load_audio(audio_path)
|
61 |
+
vocal_audio = self.load_audio(vocal_path)
|
62 |
+
minlen = min(full_audio.shape[-1], vocal_audio.shape[-1])
|
63 |
+
# bgm_audio = full_audio[:, 0:minlen] - vocal_audio[:, 0:minlen]
|
64 |
+
bgm_audio = self.load_audio(drums_path) + self.load_audio(bass_path) + self.load_audio(other_path)
|
65 |
+
for path in [drums_path, bass_path, other_path, vocal_path]:
|
66 |
+
os.remove(path)
|
67 |
+
return full_audio, vocal_audio, bgm_audio
|
68 |
+
|
69 |
+
class Tango:
|
70 |
+
def __init__(self, \
|
71 |
+
model_path, \
|
72 |
+
vae_config,
|
73 |
+
vae_model,
|
74 |
+
layer_vocal=7,\
|
75 |
+
layer_bgm=3,\
|
76 |
+
device="cuda:0"):
|
77 |
+
|
78 |
+
self.sample_rate = 48000
|
79 |
+
scheduler_name = "configs/scheduler/stable_diffusion_2.1_largenoise_sample.json"
|
80 |
+
self.device = device
|
81 |
+
|
82 |
+
self.vae = get_model(vae_config, vae_model)
|
83 |
+
self.vae = self.vae.to(device)
|
84 |
+
self.vae=self.vae.eval()
|
85 |
+
self.layer_vocal=layer_vocal
|
86 |
+
self.layer_bgm=layer_bgm
|
87 |
+
|
88 |
+
self.MAX_DURATION = 360
|
89 |
+
main_config = {
|
90 |
+
"num_channels":32,
|
91 |
+
"unet_model_name":None,
|
92 |
+
"unet_model_config_path":"configs/models/transformer2D_wocross_inch112_1x4_multi_large.json",
|
93 |
+
"snr_gamma":None,
|
94 |
+
}
|
95 |
+
self.model = PromptCondAudioDiffusion(**main_config).to(device)
|
96 |
+
if model_path.endswith(".safetensors"):
|
97 |
+
main_weights = load_file(model_path)
|
98 |
+
else:
|
99 |
+
main_weights = torch.load(model_path, map_location=device)
|
100 |
+
self.model.load_state_dict(main_weights, strict=False)
|
101 |
+
print ("Successfully loaded checkpoint from:", model_path)
|
102 |
+
|
103 |
+
self.model.eval()
|
104 |
+
self.model.init_device_dtype(torch.device(device), torch.float32)
|
105 |
+
print("scaling factor: ", self.model.normfeat.std)
|
106 |
+
|
107 |
+
# self.scheduler = DDIMScheduler.from_pretrained( \
|
108 |
+
# scheduler_name, subfolder="scheduler")
|
109 |
+
# self.scheduler = DDPMScheduler.from_pretrained( \
|
110 |
+
# scheduler_name, subfolder="scheduler")
|
111 |
+
print("Successfully loaded inference scheduler from {}".format(scheduler_name))
|
112 |
+
|
113 |
+
|
114 |
+
@torch.no_grad()
|
115 |
+
@torch.autocast(device_type="cuda", dtype=torch.float32)
|
116 |
+
def sound2code(self, orig_vocal, orig_bgm, batch_size=8):
|
117 |
+
if(orig_vocal.ndim == 2):
|
118 |
+
audios_vocal = orig_vocal.unsqueeze(0).to(self.device)
|
119 |
+
elif(orig_vocal.ndim == 3):
|
120 |
+
audios_vocal = orig_vocal.to(self.device)
|
121 |
+
else:
|
122 |
+
assert orig_vocal.ndim in (2,3), orig_vocal.shape
|
123 |
+
|
124 |
+
if(orig_bgm.ndim == 2):
|
125 |
+
audios_bgm = orig_bgm.unsqueeze(0).to(self.device)
|
126 |
+
elif(orig_bgm.ndim == 3):
|
127 |
+
audios_bgm = orig_bgm.to(self.device)
|
128 |
+
else:
|
129 |
+
assert orig_bgm.ndim in (2,3), orig_bgm.shape
|
130 |
+
|
131 |
+
|
132 |
+
audios_vocal = self.preprocess_audio(audios_vocal)
|
133 |
+
audios_vocal = audios_vocal.squeeze(0)
|
134 |
+
audios_bgm = self.preprocess_audio(audios_bgm)
|
135 |
+
audios_bgm = audios_bgm.squeeze(0)
|
136 |
+
if audios_vocal.shape[-1] > audios_bgm.shape[-1]:
|
137 |
+
audios_vocal = audios_vocal[:,:audios_bgm.shape[-1]]
|
138 |
+
else:
|
139 |
+
audios_bgm = audios_bgm[:,:audios_vocal.shape[-1]]
|
140 |
+
|
141 |
+
|
142 |
+
orig_length = audios_vocal.shape[-1]
|
143 |
+
min_samples = int(40 * self.sample_rate)
|
144 |
+
# 25 tokens per second (40 s of audio -> 1000 tokens)
|
145 |
+
output_len = int(orig_length / float(self.sample_rate) * 25) + 1
|
146 |
+
|
147 |
+
while(audios_vocal.shape[-1] < min_samples):
|
148 |
+
audios_vocal = torch.cat([audios_vocal, audios_vocal], -1)
|
149 |
+
audios_bgm = torch.cat([audios_bgm, audios_bgm], -1)
|
150 |
+
int_max_len=audios_vocal.shape[-1]//min_samples+1
|
151 |
+
audios_vocal = torch.cat([audios_vocal, audios_vocal], -1)
|
152 |
+
audios_bgm = torch.cat([audios_bgm, audios_bgm], -1)
|
153 |
+
audios_vocal=audios_vocal[:,:int(int_max_len*(min_samples))]
|
154 |
+
audios_bgm=audios_bgm[:,:int(int_max_len*(min_samples))]
|
155 |
+
codes_vocal_list=[]
|
156 |
+
codes_bgm_list=[]
|
157 |
+
|
158 |
+
|
159 |
+
|
160 |
+
audio_vocal_input = audios_vocal.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
161 |
+
audio_bgm_input = audios_bgm.reshape(2, -1, min_samples).permute(1, 0, 2).reshape(-1, 2, min_samples)
|
162 |
+
|
163 |
+
for audio_inx in range(0, audio_vocal_input.shape[0], batch_size):
|
164 |
+
[codes_vocal,codes_bgm], _, spk_embeds = self.model.fetch_codes_batch((audio_vocal_input[audio_inx:audio_inx+batch_size]), (audio_bgm_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer_vocal=self.layer_vocal,layer_bgm=self.layer_bgm)
|
165 |
+
codes_vocal_list.append(codes_vocal)
|
166 |
+
codes_bgm_list.append(codes_bgm)
|
167 |
+
|
168 |
+
codes_vocal = torch.cat(codes_vocal_list, 0).permute(1,0,2).reshape(1, -1)[None]
|
169 |
+
codes_bgm = torch.cat(codes_bgm_list, 0).permute(1,0,2).reshape(1, -1)[None]
|
170 |
+
codes_vocal=codes_vocal[:,:,:output_len]
|
171 |
+
codes_bgm=codes_bgm[:,:,:output_len]
|
172 |
+
|
173 |
+
return codes_vocal, codes_bgm
|
174 |
+
|
175 |
+
@torch.no_grad()
|
176 |
+
def code2sound(self, codes, prompt_vocal=None, prompt_bgm=None, duration=40, guidance_scale=1.5, num_steps=20, disable_progress=False):
|
177 |
+
codes_vocal,codes_bgm = codes
|
178 |
+
codes_vocal = codes_vocal.to(self.device)
|
179 |
+
codes_bgm = codes_bgm.to(self.device)
|
180 |
+
|
181 |
+
min_samples = duration * 25 # 40ms per frame
|
182 |
+
hop_samples = min_samples // 4 * 3
|
183 |
+
ovlp_samples = min_samples - hop_samples
|
184 |
+
hop_frames = hop_samples
|
185 |
+
ovlp_frames = ovlp_samples
|
186 |
+
first_latent = torch.randn(codes_vocal.shape[0], min_samples, 64).to(self.device)
|
187 |
+
first_latent_length = 0
|
188 |
+
first_latent_codes_length = 0
|
189 |
+
|
190 |
+
|
191 |
+
if(isinstance(prompt_vocal, torch.Tensor)):
|
192 |
+
# prepare prompt
|
193 |
+
prompt_vocal = prompt_vocal.to(self.device)
|
194 |
+
prompt_bgm = prompt_bgm.to(self.device)
|
195 |
+
if(prompt_vocal.ndim == 3):
|
196 |
+
assert prompt_vocal.shape[0] == 1, prompt_vocal.shape
|
197 |
+
prompt_vocal = prompt_vocal[0]
|
198 |
+
prompt_bgm = prompt_bgm[0]
|
199 |
+
elif(prompt_vocal.ndim == 1):
|
200 |
+
prompt_vocal = prompt_vocal.unsqueeze(0).repeat(2,1)
|
201 |
+
prompt_bgm = prompt_bgm.unsqueeze(0).repeat(2,1)
|
202 |
+
elif(prompt_vocal.ndim == 2):
|
203 |
+
if(prompt_vocal.shape[0] == 1):
|
204 |
+
prompt_vocal = prompt_vocal.repeat(2,1)
|
205 |
+
prompt_bgm = prompt_bgm.repeat(2,1)
|
206 |
+
|
207 |
+
if(prompt_vocal.shape[-1] < int(30 * self.sample_rate)):
|
208 |
+
# if less than 30s, just choose the first 10s
|
209 |
+
prompt_vocal = prompt_vocal[:,:int(10*self.sample_rate)] # keep only the first 10 s as the prompt
|
210 |
+
prompt_bgm = prompt_bgm[:,:int(10*self.sample_rate)] # keep only the first 10 s as the prompt
|
211 |
+
else:
|
212 |
+
# otherwise take the 20-30 s region, which likely includes a verse or chorus
|
213 |
+
prompt_vocal = prompt_vocal[:,int(20*self.sample_rate):int(30*self.sample_rate)] # keep a 10 s prompt window
|
214 |
+
prompt_bgm = prompt_bgm[:,int(20*self.sample_rate):int(30*self.sample_rate)] # keep a 10 s prompt window
|
215 |
+
|
216 |
+
true_latent = self.vae.encode_audio(prompt_vocal+prompt_bgm).permute(0,2,1)
|
217 |
+
|
218 |
+
first_latent[:,0:true_latent.shape[1],:] = true_latent
|
219 |
+
first_latent_length = true_latent.shape[1]
|
220 |
+
first_latent_codes = self.sound2code(prompt_vocal, prompt_bgm)
|
221 |
+
first_latent_codes_vocal = first_latent_codes[0]
|
222 |
+
first_latent_codes_bgm = first_latent_codes[1]
|
223 |
+
first_latent_codes_length = first_latent_codes_vocal.shape[-1]
|
224 |
+
codes_vocal = torch.cat([first_latent_codes_vocal, codes_vocal], -1)
|
225 |
+
codes_bgm = torch.cat([first_latent_codes_bgm, codes_bgm], -1)
|
226 |
+
|
227 |
+
|
228 |
+
codes_len= codes_vocal.shape[-1]
|
229 |
+
target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
|
230 |
+
# target_len = int(codes_len / 100 * 4 * self.sample_rate)
|
231 |
+
# code repeat
|
232 |
+
if(codes_len < min_samples):
|
233 |
+
while(codes_vocal.shape[-1] < min_samples):
|
234 |
+
codes_vocal = torch.cat([codes_vocal, codes_vocal], -1)
|
235 |
+
codes_bgm = torch.cat([codes_bgm, codes_bgm], -1)
|
236 |
+
|
237 |
+
codes_vocal = codes_vocal[:,:,0:min_samples]
|
238 |
+
codes_bgm = codes_bgm[:,:,0:min_samples]
|
239 |
+
codes_len = codes_vocal.shape[-1]
|
240 |
+
if((codes_len - ovlp_samples) % hop_samples > 0):
|
241 |
+
len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
|
242 |
+
while(codes_vocal.shape[-1] < len_codes):
|
243 |
+
codes_vocal = torch.cat([codes_vocal, codes_vocal], -1)
|
244 |
+
codes_bgm = torch.cat([codes_bgm, codes_bgm], -1)
|
245 |
+
codes_vocal = codes_vocal[:,:,0:len_codes]
|
246 |
+
codes_bgm = codes_bgm[:,:,0:len_codes]
|
247 |
+
latent_length = min_samples
|
248 |
+
latent_list = []
|
249 |
+
spk_embeds = torch.zeros([1, 32, 1, 32], device=codes_vocal.device)
|
250 |
+
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
251 |
+
for sinx in range(0, codes_vocal.shape[-1]-hop_samples, hop_samples):
|
252 |
+
codes_vocal_input=codes_vocal[:,:,sinx:sinx+min_samples]
|
253 |
+
codes_bgm_input=codes_bgm[:,:,sinx:sinx+min_samples]
|
254 |
+
if(sinx == 0):
|
255 |
+
incontext_length = first_latent_length
|
256 |
+
latents = self.model.inference_codes([codes_vocal_input,codes_bgm_input], spk_embeds, first_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
257 |
+
latent_list.append(latents)
|
258 |
+
else:
|
259 |
+
true_latent = latent_list[-1][:,:,-ovlp_frames:].permute(0,2,1)
|
260 |
+
len_add_to_1000 = min_samples - true_latent.shape[-2]
|
261 |
+
incontext_length = true_latent.shape[-2]
|
262 |
+
true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], len_add_to_1000, true_latent.shape[-1]).to(self.device)], -2)
|
263 |
+
latents = self.model.inference_codes([codes_vocal_input,codes_bgm_input], spk_embeds, true_latent, latent_length, incontext_length=incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
|
264 |
+
latent_list.append(latents)
|
265 |
+
|
266 |
+
latent_list = [l.float() for l in latent_list]
|
267 |
+
latent_list[0] = latent_list[0][:,:,first_latent_length:]
|
268 |
+
min_samples = int(min_samples * self.sample_rate // 1000 * 40)
|
269 |
+
hop_samples = int(hop_samples * self.sample_rate // 1000 * 40)
|
270 |
+
ovlp_samples = min_samples - hop_samples
|
271 |
+
with torch.no_grad():
|
272 |
+
output = None
|
273 |
+
for i in range(len(latent_list)):
|
274 |
+
latent = latent_list[i]
|
275 |
+
cur_output = self.vae.decode_audio(latent)[0].detach().cpu()
|
276 |
+
|
277 |
+
if output is None:
|
278 |
+
output = cur_output
|
279 |
+
else:
|
280 |
+
ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
|
281 |
+
ov_win = torch.cat([ov_win, 1 - ov_win], -1)
|
282 |
+
output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
|
283 |
+
output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
|
284 |
+
output = output[:, 0:target_len]
|
285 |
+
return output
|
286 |
+
|
287 |
+
@torch.no_grad()
|
288 |
+
def preprocess_audio(self, input_audios_vocal, threshold=0.8):
|
289 |
+
assert len(input_audios_vocal.shape) == 3, input_audios_vocal.shape
|
290 |
+
nchan = input_audios_vocal.shape[1]
|
291 |
+
input_audios_vocal = input_audios_vocal.reshape(input_audios_vocal.shape[0], -1)
|
292 |
+
norm_value = torch.ones_like(input_audios_vocal[:,0])
|
293 |
+
max_volume = input_audios_vocal.abs().max(dim=-1)[0]
|
294 |
+
norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
|
295 |
+
return input_audios_vocal.reshape(input_audios_vocal.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
|
296 |
+
|
297 |
+
@torch.no_grad()
|
298 |
+
def sound2sound(self, orig_vocal,orig_bgm, prompt_vocal=None,prompt_bgm=None, steps=50, disable_progress=False):
|
299 |
+
codes_vocal, codes_bgm = self.sound2code(orig_vocal,orig_bgm)
|
300 |
+
codes=[codes_vocal, codes_bgm]
|
301 |
+
wave = self.code2sound(codes, prompt_vocal,prompt_bgm, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
|
302 |
+
return wave
|
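generate_septoken.py couples a Demucs-based Separator with a two-stream tokenizer: the vocal and accompaniment stems are encoded into separate 25 Hz token sequences and decoded back as a mix. A hedged usage sketch; every file path below is a placeholder, and only the call signatures are taken from the classes above.

from generate_septoken import Separator, Tango

sep = Separator(dm_model_path="ckpt/htdemucs.pth",
                dm_config_path="ckpt/htdemucs.yaml", gpu_id=0)
full, vocal, bgm = sep.run("song.flac", output_dir="sep_out")

tango = Tango(model_path="ckpt/model_septoken.safetensors",
              vae_config="ckpt/vae_config.json",
              vae_model="ckpt/vae_model.safetensors")
codes_vocal, codes_bgm = tango.sound2code(vocal, bgm)   # two 25 Hz token streams
recon = tango.code2sound([codes_vocal, codes_bgm])      # decode the pair back to audio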
codeclm/tokenizer/Flow1dVAE/libs/datasets/MusicSoundMixedDataset.py
ADDED
@@ -0,0 +1,1278 @@
1 |
+
from torch.utils.data import Dataset
|
2 |
+
from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List, Union
|
3 |
+
from beartype import beartype
|
4 |
+
from beartype.door import is_bearable
|
5 |
+
import random
|
6 |
+
import pandas as pd
|
7 |
+
import os
|
8 |
+
from torchaudio.functional import resample
|
9 |
+
import torch
|
10 |
+
import typing as tp
|
11 |
+
from pathlib import Path
|
12 |
+
import torchaudio as ta
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import numpy as np
|
15 |
+
import json
|
16 |
+
import yaml
|
17 |
+
import torchaudio
|
18 |
+
import math
|
19 |
+
import re
|
20 |
+
from loguru import logger
|
21 |
+
import ffmpeg
|
22 |
+
|
23 |
+
class Read_and_PadCrop_Normalized_T(torch.nn.Module):
|
24 |
+
def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
|
25 |
+
|
26 |
+
super().__init__()
|
27 |
+
|
28 |
+
self.n_samples = n_samples
|
29 |
+
self.sample_rate = sample_rate
|
30 |
+
self.randomize = randomize
|
31 |
+
|
32 |
+
def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
|
33 |
+
if self.n_samples < 0: #means not clip
|
34 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
35 |
+
t_start = 0.
|
36 |
+
t_end = 1.0
|
37 |
+
offset = 0
|
38 |
+
else:
|
39 |
+
if(duration<(float(self.n_samples)/self.sample_rate+1)):
|
40 |
+
# print(duration,(float(self.n_samples)/self.sample_rate+1))
|
41 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
42 |
+
t_start = 0.
|
43 |
+
t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
|
44 |
+
offset = 0
|
45 |
+
# print('c1:',chunk.shape)
|
46 |
+
else:
|
47 |
+
offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
48 |
+
t_start = offset / float(cur_sample_rate) / duration
|
49 |
+
t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
|
50 |
+
chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
51 |
+
# print('offset:',offset)
|
52 |
+
# print('c0:',chunk.shape)
|
53 |
+
# Pad with silence if necessary.
|
54 |
+
if(chunk.shape[0]>1):
|
55 |
+
chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
|
56 |
+
else:
|
57 |
+
chunk = chunk[[0],:].float()
|
58 |
+
if(cur_sample_rate!=self.sample_rate):
|
59 |
+
# print('a:',cur_sample_rate,chunk.shape)
|
60 |
+
chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
|
61 |
+
# print('b:',self.sample_rate,chunk.shape)
|
62 |
+
|
63 |
+
if self.n_samples > 0:
|
64 |
+
if chunk.shape[-1] < self.n_samples:
|
65 |
+
chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
|
66 |
+
else:
|
67 |
+
chunk = chunk[:,0:self.n_samples]
|
68 |
+
seconds_start = math.floor(offset / cur_sample_rate)
|
69 |
+
seconds_total = math.floor(duration)
|
70 |
+
|
71 |
+
return (
|
72 |
+
chunk,
|
73 |
+
t_start,
|
74 |
+
t_end,
|
75 |
+
seconds_start,
|
76 |
+
seconds_total
|
77 |
+
)
|
78 |
+
|
79 |
+
class Read_and_PadCrop_Normalized_T_Avoid_Watermark(torch.nn.Module):
|
80 |
+
def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True, w_start = 0, w_interval = 11.3):
|
81 |
+
|
82 |
+
super().__init__()
|
83 |
+
|
84 |
+
self.n_samples = n_samples
|
85 |
+
self.sample_rate = sample_rate
|
86 |
+
self.randomize = randomize
|
87 |
+
|
88 |
+
self.w_start = w_start
|
89 |
+
self.w_interval = w_interval
|
90 |
+
|
91 |
+
def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
|
92 |
+
if self.n_samples < 0: #means not clip
|
93 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
94 |
+
t_start = 0.
|
95 |
+
t_end = 1.0
|
96 |
+
offset = 0
|
97 |
+
else:
|
98 |
+
if(duration<(float(self.n_samples)/self.sample_rate+1)):
|
99 |
+
# print(duration,(float(self.n_samples)/self.sample_rate+1))
|
100 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
101 |
+
t_start = 0.
|
102 |
+
t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
|
103 |
+
offset = 0
|
104 |
+
# print('c1:',chunk.shape)
|
105 |
+
else:
|
106 |
+
n_offset_option = (duration - self.w_start) // self.w_interval
|
107 |
+
if n_offset_option <= 1:
|
108 |
+
offset = 0
|
109 |
+
else:
|
110 |
+
offset = int((random.randint(0,n_offset_option-1) * self.w_interval + self.w_start) * cur_sample_rate)
|
111 |
+
# offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
112 |
+
t_start = offset / float(cur_sample_rate) / duration
|
113 |
+
t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
|
114 |
+
chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
115 |
+
# print('offset:',offset)
|
116 |
+
# print('c0:',chunk.shape)
|
117 |
+
# Pad with silence if necessary.
|
118 |
+
if(chunk.shape[0]>1):
|
119 |
+
chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
|
120 |
+
else:
|
121 |
+
chunk = chunk[[0],:].float()
|
122 |
+
if(cur_sample_rate!=self.sample_rate):
|
123 |
+
# print('a:',cur_sample_rate,chunk.shape)
|
124 |
+
chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
|
125 |
+
# print('b:',self.sample_rate,chunk.shape)
|
126 |
+
|
127 |
+
if self.n_samples > 0:
|
128 |
+
if chunk.shape[-1] < self.n_samples:
|
129 |
+
chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
|
130 |
+
else:
|
131 |
+
chunk = chunk[:,0:self.n_samples]
|
132 |
+
seconds_start = math.floor(offset / cur_sample_rate)
|
133 |
+
seconds_total = math.floor(duration)
|
134 |
+
|
135 |
+
return (
|
136 |
+
chunk,
|
137 |
+
t_start,
|
138 |
+
t_end,
|
139 |
+
seconds_start,
|
140 |
+
seconds_total
|
141 |
+
)
|
142 |
+
|
143 |
+
USE_DUMMY_AUDIO = False  # Set this to True only when testing the code: real data is not read and generated silent audio is used instead
|
144 |
+
if USE_DUMMY_AUDIO:
|
145 |
+
logger.warning("USE_DUMMY_AUDIO flag is True; do not use it for training or testing!")
|
146 |
+
|
147 |
+
class SafeAudioReader:
|
148 |
+
"""
|
149 |
+
This class is an adaptor around Read_and_PadCrop_Normalized_T that makes audio reading safe (any read error falls back to silence).
|
150 |
+
"""
|
151 |
+
def __init__(self,
|
152 |
+
duration: float, # length (in seconds) of the returned audio
|
153 |
+
sample_rate: int, # sample rate of the returned audio; the source is resampled if its rate differs
|
154 |
+
randomize: bool = True,
|
155 |
+
use_avoid_watermark_policy = False,
|
156 |
+
):
|
157 |
+
self.n_samples = int(sample_rate * duration)
|
158 |
+
self.reader = (
|
159 |
+
Read_and_PadCrop_Normalized_T_Avoid_Watermark if use_avoid_watermark_policy \
|
160 |
+
else Read_and_PadCrop_Normalized_T
|
161 |
+
)(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
|
162 |
+
|
163 |
+
# NOTE: this is the core function; every dataset reads audio through this call!
|
164 |
+
def __call__(self,
|
165 |
+
filepath: os.PathLike, # path to the audio file
|
166 |
+
origin_sample_rate: Optional[int] = None, # actual sample rate as recorded in the json metadata; probed from the file header if not given
|
167 |
+
origin_duration: float = None, # actual duration as recorded in the json metadata; probed from the file header if not given
|
168 |
+
) -> torch.Tensor:
|
169 |
+
if USE_DUMMY_AUDIO:
|
170 |
+
wav = torch.zeros(self.n_samples, dtype=torch.float32)
|
171 |
+
return wav
|
172 |
+
try:
|
173 |
+
if origin_sample_rate is None or origin_duration is None:
|
174 |
+
# audio_info = torchaudio.info(filepath)
|
175 |
+
# origin_sample_rate = audio_info.sample_rate
|
176 |
+
# origin_duration = audio_info.num_frames / origin_sample_rate
|
177 |
+
info = ffmpeg.probe(filepath)
|
178 |
+
origin_duration = float(info['format']['duration'])
|
179 |
+
origin_sample_rate = int(info['streams'][0]['sample_rate'])
|
180 |
+
wav, *ignored = self.reader(filepath, origin_duration, origin_sample_rate)
|
181 |
+
wav = wav.squeeze_(0)
|
182 |
+
except Exception as e:
|
183 |
+
logger.error(f"Error reading {filepath}: {e}")
|
184 |
+
wav = torch.zeros(self.n_samples, dtype=torch.float32)
|
185 |
+
return wav
|
186 |
+
|
187 |
+
|
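# Minimal usage sketch for SafeAudioReader above: the file name and metadata values are
# hypothetical placeholders, and the helper is defined but never invoked here.
def _demo_safe_audio_reader():
    reader = SafeAudioReader(duration=10.0, sample_rate=48000)  # 10 s of mono audio at 48 kHz
    # With metadata supplied, ffmpeg.probe is skipped; on any read error the call
    # falls back to a silent tensor of shape (reader.n_samples,) == (480000,).
    wav = reader("example.flac", origin_sample_rate=44100, origin_duration=187.3)
    return wav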
188 |
+
class PromptTemplate:
|
189 |
+
def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
|
190 |
+
self.template_text = template_text
|
191 |
+
self.tag_map = tag_map
|
192 |
+
self.lang = lang
|
193 |
+
|
194 |
+
@property
|
195 |
+
def tags(self):
|
196 |
+
return tuple(self.tag_map.keys())
|
197 |
+
|
198 |
+
def apply(self, **kwargs):
|
199 |
+
for tag in list(kwargs.keys()):
|
200 |
+
if kwargs[tag] == '':
|
201 |
+
kwargs.pop(tag)
|
202 |
+
for tag in self.tags:
|
203 |
+
if tag in kwargs:
|
204 |
+
kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
|
205 |
+
else:
|
206 |
+
kwargs[tag] = ''
|
207 |
+
prompt = self.template_text.format(**kwargs)
|
208 |
+
|
209 |
+
return self.beautify(prompt)
|
210 |
+
|
211 |
+
def beautify(self, text):
|
212 |
+
if self.lang == 'en':
|
213 |
+
return self._beautify_en(text)
|
214 |
+
elif self.lang == 'zh':
|
215 |
+
return self._beautify_zh(text)
|
216 |
+
else:
|
217 |
+
raise ValueError(f'Unknown language {self.lang}')
|
218 |
+
|
219 |
+
@staticmethod
|
220 |
+
def _beautify_en(text):
|
221 |
+
# no continuous commas without content between them
|
222 |
+
text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
|
223 |
+
# no continuous whitespace
|
224 |
+
text = re.sub(r'\s+', ' ', text)
|
225 |
+
# the comma is NOT followed by whitespace, and should be followed by ONE whitespace
|
226 |
+
text = re.sub(r'\s+,', r',', text)
|
227 |
+
text = re.sub(r',\s+', r', ', text)
|
228 |
+
# no whitespace before the full stop
|
229 |
+
text = re.sub(r'\s+\.', r'.', text)
|
230 |
+
# strip whitespace, comma, and replace ',.'
|
231 |
+
text = text.strip(' ,')
|
232 |
+
text = text.replace(',.', '.')
|
233 |
+
return text
|
234 |
+
|
235 |
+
@staticmethod
|
236 |
+
def _beautify_zh(text):
|
237 |
+
# no continuous commas without content between them
|
238 |
+
text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
|
239 |
+
text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
|
240 |
+
# assume there should be NO whitespace in Chinese
|
241 |
+
text = re.sub(r'\s+', r'', text)
|
242 |
+
# strip whitespace, comma, and replace ',。'
|
243 |
+
text = text.strip(', 、')
|
244 |
+
text = text.replace(',。', '。')
|
245 |
+
return text
|
246 |
+
|
247 |
+
def __repr__(self):
|
248 |
+
return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
|
249 |
+
|
250 |
+
__str__ = __repr__
|
251 |
+
|
252 |
+
def parse_prompt_template(prompt_template_text, lang='en'):
|
253 |
+
span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
|
254 |
+
tag_pattern = re.compile(r'{.+?}', re.DOTALL)
|
255 |
+
|
256 |
+
template_text = prompt_template_text.strip()
|
257 |
+
span_texts = span_pattern.findall(prompt_template_text)
|
258 |
+
tag_map = {}
|
259 |
+
for span_text in span_texts:
|
260 |
+
tag = tag_pattern.findall(span_text)[0].strip('{}')
|
261 |
+
tag_map[tag] = span_text
|
262 |
+
template_text = template_text.replace(span_text, '{'+tag+'}')
|
263 |
+
|
264 |
+
return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
|
265 |
+
|
266 |
+
def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
|
267 |
+
with open(path, 'r') as f:
|
268 |
+
lines = f.readlines()
|
269 |
+
cnt = 0
|
270 |
+
pts = []
|
271 |
+
for line in lines:
|
272 |
+
pt = parse_prompt_template(line, lang=lang)
|
273 |
+
cnt += 1
|
274 |
+
if len(pt.tags) < num:
|
275 |
+
logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
|
276 |
+
pts.append(pt)
|
277 |
+
|
278 |
+
return pts
|
279 |
+
|
280 |
+
|
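# Minimal sketch of the template syntax handled above: each optional span is wrapped in
# square brackets and contains exactly one {tag}; apply() drops spans whose tag is missing.
# The template string below is a made-up example.
_pt_example = parse_prompt_template('A [{genre} ]track[ with {instrument}].', lang='en')
assert _pt_example.tags == ('genre', 'instrument')
assert _pt_example.apply(genre='jazz') == 'A jazz track.'
assert _pt_example.apply() == 'A track.'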
281 |
+
def get_base_dir_file(key: os.PathLike):
|
282 |
+
base = os.path.basename(key)
|
283 |
+
dirname = os.path.basename(os.path.dirname(key))
|
284 |
+
return os.path.join(dirname, base)
|
285 |
+
|
286 |
+
def read_jsonlike(path: os.PathLike):
|
287 |
+
#json or jsonl
|
288 |
+
if str(path).endswith(".json"):
|
289 |
+
with open(path, 'r', encoding='utf8') as f:
|
290 |
+
data = json.load(f)
|
291 |
+
return data
|
292 |
+
elif str(path).endswith(".jsonl"):
|
293 |
+
with open(path, 'r', encoding='utf8') as f:
|
294 |
+
data = [json.loads(line) for line in f.readlines()]
|
295 |
+
return data
|
296 |
+
else:
|
297 |
+
raise ValueError("Unknown file format")
|
298 |
+
|
299 |
+
dist_prob_map = {
|
300 |
+
1: (1.0,),
|
301 |
+
2: (0.5, 0.5),
|
302 |
+
3: (0.3, 0.4, 0.3),
|
303 |
+
4: (0.2, 0.3, 0.3, 0.2),
|
304 |
+
5: (0.2, 0.2, 0.3, 0.2, 0.1),
|
305 |
+
6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
|
306 |
+
7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
|
307 |
+
8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
|
308 |
+
9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
|
309 |
+
10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
|
310 |
+
}
|
311 |
+
|
312 |
+
'''
|
313 |
+
# alternative scheme that favors shorter prompts
|
314 |
+
dist_prob_map = {
|
315 |
+
1: (1.0,),
|
316 |
+
2: (0.7, 0.3),
|
317 |
+
3: (0.7, 0.2, 0.1),
|
318 |
+
4: (0.6, 0.2, 0.1, 0.1),
|
319 |
+
5: (0.6, 0.2, 0.1, 0.05, 0.05),
|
320 |
+
6: (0.6, 0.15, 0.1, 0.05, 0.05, 0.05),
|
321 |
+
7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
|
322 |
+
8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
|
323 |
+
9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
|
324 |
+
10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
|
325 |
+
}
|
326 |
+
'''
|
327 |
+
|
328 |
+
# alternative scheme that always keeps all tags
|
329 |
+
# dist_prob_map = {
|
330 |
+
# 1: (1.0,),
|
331 |
+
# 2: (0, 1.0),
|
332 |
+
# 3: (0, 0, 1.0),
|
333 |
+
# 4: (0, 0, 0, 1.0),
|
334 |
+
# 5: (0, 0, 0, 0, 1.0),
|
335 |
+
# 6: (0, 0, 0, 0, 0, 1.0),
|
336 |
+
# 7: (0, 0, 0, 0, 0, 0, 1.0),
|
337 |
+
# 8: (0, 0, 0, 0, 0, 0, 0, 1.0),
|
338 |
+
# 9: (0, 0, 0, 0, 0, 0, 0, 0, 1.0),
|
339 |
+
# 10: (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0)
|
340 |
+
# }
|
341 |
+
|
342 |
+
dist_prob_map_low = {
|
343 |
+
1: (1.0,),
|
344 |
+
2: (0.8, 0.2),
|
345 |
+
3: (0.8, 0.1, 0.1),
|
346 |
+
4: (0.7, 0.1, 0.1, 0.1),
|
347 |
+
5: (0.7, 0.1, 0.1, 0.05, 0.05),
|
348 |
+
6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
|
349 |
+
}
|
350 |
+
|
351 |
+
_bpm_range_rights = (
|
352 |
+
(40, '20-40'),
|
353 |
+
(60, '40-60'),
|
354 |
+
(66, '60-66'),
|
355 |
+
(76, '66-76'),
|
356 |
+
(108, '76-108'),
|
357 |
+
(120, '108-120'),
|
358 |
+
(168, '120-168'),
|
359 |
+
(176, '168-176'),
|
360 |
+
(200, '176-200')
|
361 |
+
)
|
362 |
+
_bpm_desc_map = {
|
363 |
+
'20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
|
364 |
+
'40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
|
365 |
+
'60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
|
366 |
+
'66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
|
367 |
+
'76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
|
368 |
+
'108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
|
369 |
+
'120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
|
370 |
+
'168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
|
371 |
+
'176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
|
372 |
+
'>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
|
373 |
+
}
|
374 |
+
_bpm_desc_map_zh = {
|
375 |
+
'20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
|
376 |
+
'40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
|
377 |
+
'60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
|
378 |
+
'66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
|
379 |
+
'76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
|
380 |
+
'108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
|
381 |
+
'120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
|
382 |
+
'168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
|
383 |
+
'176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
|
384 |
+
'>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
|
385 |
+
}
|
386 |
+
def get_bpm_range(bpm):
|
387 |
+
bpm = int(bpm)
|
388 |
+
for right, tag in _bpm_range_rights:
|
389 |
+
if bpm <= right:
|
390 |
+
return tag
|
391 |
+
return '>200'
|
392 |
+
|
393 |
+
def gen_bpm_descript(bpm, lang='en'):
|
394 |
+
bpm_range = get_bpm_range(bpm)
|
395 |
+
if lang == 'en':
|
396 |
+
return random.choice(_bpm_desc_map[bpm_range])
|
397 |
+
elif lang == 'zh':
|
398 |
+
return random.choice(_bpm_desc_map_zh[bpm_range])
|
399 |
+
else:
|
400 |
+
raise ValueError(f"Unknown language {lang}")
|
401 |
+
|
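# Quick sanity sketch for the BPM helpers above: 90 BPM falls into the '76-108' bucket,
# so the generated phrase is one of the Andante-range descriptions.
assert get_bpm_range(90) == '76-108'
assert gen_bpm_descript(90, lang='en') in _bpm_desc_map['76-108']
assert get_bpm_range(240) == '>200'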
402 |
+
def read_translate(translate: Union[Dict[str, os.PathLike], os.PathLike, None]):
|
403 |
+
if translate is None:
|
404 |
+
return None
|
405 |
+
if isinstance(translate, str):
|
406 |
+
return read_jsonlike(translate)
|
407 |
+
return {k: read_jsonlike(path) for k, path in translate.items()}
|
408 |
+
|
409 |
+
|
410 |
+
def gen_plain_prompt(key_list, sep=', '):
|
411 |
+
if len(key_list) == 0:
|
412 |
+
return 'none'
|
413 |
+
|
414 |
+
key_list = [k.strip() for k in key_list]
|
415 |
+
|
416 |
+
if len(key_list) > 10:
|
417 |
+
random.shuffle(key_list)
|
418 |
+
key_list = key_list[:10]
|
419 |
+
|
420 |
+
probs = dist_prob_map[len(key_list)]
|
421 |
+
|
422 |
+
num_tags = random.choices(range(1, len(key_list)+1), probs, k=1)[0]
|
423 |
+
|
424 |
+
random.shuffle(key_list)
|
425 |
+
tags = key_list[:num_tags]
|
426 |
+
tags_str = sep.join(tags)
|
427 |
+
return tags_str
|
428 |
+
|
429 |
+
|
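# Minimal sketch of how dist_prob_map drives plain prompts above: with three tags the
# weights (0.3, 0.4, 0.3) decide whether 1, 2 or 3 of them are kept, shuffled and joined.
# The tag names below are made up.
_plain_example = gen_plain_prompt(['rock', 'electric guitar', 'upbeat'])
assert all(t in ('rock', 'electric guitar', 'upbeat') for t in _plain_example.split(', '))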
430 |
+
class MagnaTagATuneDataset(Dataset):
|
431 |
+
def __init__(self):
|
432 |
+
pass
|
433 |
+
|
434 |
+
|
435 |
+
def tags_to_desc(tag_list, sep=',') -> str:
|
436 |
+
if not isinstance(tag_list, Sequence):
|
437 |
+
return str(tag_list)
|
438 |
+
if isinstance(tag_list, str):
|
439 |
+
return tag_list
|
440 |
+
if len(tag_list) <= 0:
|
441 |
+
return ''
|
442 |
+
elif len(tag_list) <= 5:
|
443 |
+
probs = dist_prob_map[len(tag_list)]
|
444 |
+
tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
|
445 |
+
random.shuffle(tag_list)
|
446 |
+
tag_list = tag_list[:tags_num]
|
447 |
+
return sep.join(tag_list)
|
448 |
+
else:
|
449 |
+
probs = dist_prob_map[5]
|
450 |
+
tags_num = random.choices(range(1, 6), probs)[0]
|
451 |
+
random.shuffle(tag_list)
|
452 |
+
tag_list = tag_list[:tags_num]
|
453 |
+
return sep.join(tag_list)
|
454 |
+
|
455 |
+
def get_sr_and_duration_info(item):
|
456 |
+
return item.get('sample_rate', None), item.get('duration', None)
|
457 |
+
|
458 |
+
class MtgJamendoDatasetFromJson(Dataset):
|
459 |
+
def __init__(self,
|
460 |
+
data_dir:str,
|
461 |
+
json_path:str,
|
462 |
+
duration:float=10,
|
463 |
+
sr:int = 0,
|
464 |
+
lang = 'en',
|
465 |
+
plain_rate = 0,
|
466 |
+
return_audio = True,
|
467 |
+
return_path = False,
|
468 |
+
prompt_template_path: os.PathLike = None,
|
469 |
+
tag_types = [],
|
470 |
+
translate:Optional[Dict[str, os.PathLike]] = None,
|
471 |
+
use_literal_none = True,
|
472 |
+
):
|
473 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
474 |
+
|
475 |
+
self.data_dir = data_dir
|
476 |
+
self._load_metadata_json(json_path)
|
477 |
+
self.sr = sr
|
478 |
+
self.duration = duration
|
479 |
+
self.plain_rate = plain_rate
|
480 |
+
self.return_audio = return_audio
|
481 |
+
self.return_path = return_path
|
482 |
+
self.use_literal_none = use_literal_none
|
483 |
+
self.lang = lang
|
484 |
+
|
485 |
+
self.use_dynamic_prompt = prompt_template_path is not None and plain_rate < 1.0
|
486 |
+
if self.use_dynamic_prompt:
|
487 |
+
self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
|
488 |
+
self.tag_types = tag_types
|
489 |
+
|
490 |
+
self.translate = read_translate(translate)
|
491 |
+
|
492 |
+
# these tags are considered weakly semantic; prompts consisting only of them are avoided
|
493 |
+
WEAK_TAG_LIST = ["title", "artist"]
|
494 |
+
|
495 |
+
def _load_metadata_json(self, json_path):
|
496 |
+
with open(json_path) as fp:
|
497 |
+
self.data = json.load(fp)
|
498 |
+
|
499 |
+
def convert_key_to_path(self, key):
|
500 |
+
return os.path.join(self.data_dir, get_base_dir_file(key))
|
501 |
+
|
502 |
+
def __len__(self):
|
503 |
+
return len(self.data)
|
504 |
+
|
505 |
+
def __getitem__(self, idx):
|
506 |
+
item = self.data[idx]
|
507 |
+
path = self.convert_key_to_path(item['key'])
|
508 |
+
description = self.generate_description(item)
|
509 |
+
|
510 |
+
if self.return_audio:
|
511 |
+
sr, duration = get_sr_and_duration_info(item)
|
512 |
+
audio = self.audio_reader(path, sr, duration)
|
513 |
+
else:
|
514 |
+
audio = None
|
515 |
+
|
516 |
+
if self.return_path:
|
517 |
+
return audio, description, path
|
518 |
+
return audio, description
|
519 |
+
|
520 |
+
def tags_to_desc(self, tag_list, tag_type) -> str:
|
521 |
+
if self.lang == 'en':
|
522 |
+
return tags_to_desc(tag_list)
|
523 |
+
elif self.lang == 'zh':
|
524 |
+
translator = self.translate[tag_type]
|
525 |
+
translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
|
526 |
+
return tags_to_desc(translated_tag_list, sep='、')
|
527 |
+
|
528 |
+
def generate_description(self, item):
|
529 |
+
if random.random() > self.plain_rate:
|
530 |
+
# dynamically generate prompt from given prompt template
|
531 |
+
prompt_template = random.choice(self.prompt_templates)
|
532 |
+
description = self.generate_description_dynamic(item, prompt_template)
|
533 |
+
else:
|
534 |
+
# use plain prompt, i.e. tags sequence separated by comma
|
535 |
+
description = self.generate_description_plain(item)
|
536 |
+
return description
|
537 |
+
|
538 |
+
def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
|
539 |
+
exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
|
540 |
+
exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
|
541 |
+
exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))
|
542 |
+
|
543 |
+
if len(exists_strong_tag) > 0:
|
544 |
+
probs = dist_prob_map[len(exists_strong_tag)]
|
545 |
+
tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
|
546 |
+
random.shuffle(exists_strong_tag)
|
547 |
+
tags = exists_strong_tag[:tags_num]
|
548 |
+
weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
|
549 |
+
weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
|
550 |
+
random.shuffle(exists_weak_tag)
|
551 |
+
weak_tags = exists_weak_tag[:weak_tags_num]
|
552 |
+
tags += weak_tags
|
553 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
|
554 |
+
prompt = prompt_template.apply(**tags_args)
|
555 |
+
else:
|
556 |
+
# no strong tags, use all weak tags instead
|
557 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
|
558 |
+
prompt = prompt_template.apply(**tags_args)
|
559 |
+
|
560 |
+
if self.use_literal_none and len(tags_args) == 0:
|
561 |
+
return 'none'
|
562 |
+
|
563 |
+
return prompt
|
564 |
+
|
565 |
+
def generate_description_plain(self, item):
|
566 |
+
keywords = []
|
567 |
+
for tag_t in self.tag_types:
|
568 |
+
this_key = item[tag_t]
|
569 |
+
if this_key is None:
|
570 |
+
continue
|
571 |
+
if isinstance(this_key, str):
|
572 |
+
this_key = [this_key]
|
573 |
+
if self.lang != 'en':
|
574 |
+
this_key = [self.get_translation(tag_t, k) for k in this_key]
|
575 |
+
keywords += this_key
|
576 |
+
return gen_plain_prompt(keywords, sep=self.keysep)
|
577 |
+
|
578 |
+
def get_translation(self, tag_t, k):
|
579 |
+
k = k.strip()
|
580 |
+
if k in self.translate[tag_t]:
|
581 |
+
return self.translate[tag_t][k]
|
582 |
+
else:
|
583 |
+
return k
|
584 |
+
|
585 |
+
@property
|
586 |
+
def keysep(self):
|
587 |
+
if self.lang == 'zh':
|
588 |
+
return ',' if random.random() > 0.5 else '、'
|
589 |
+
elif self.lang == 'en':
|
590 |
+
return ', '
|
591 |
+
|
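# Illustrative wiring sketch for MtgJamendoDatasetFromJson above; every path and tag name
# below is a hypothetical placeholder, and the constructor is not actually called here.
def _example_mtg_jamendo_kwargs():
    return dict(
        data_dir='data/mtg_jamendo/audio',                # hypothetical
        json_path='data/mtg_jamendo/metadata.json',       # hypothetical
        duration=10, sr=48000, lang='en', plain_rate=0.3,
        prompt_template_path='prompts/templates_en.txt',  # hypothetical
        tag_types=['genre', 'instrument', 'mood', 'title', 'artist'],  # hypothetical keys
    )
# Each __getitem__ call then returns (audio, description), or (audio, description, path)
# when return_path=True.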
592 |
+
class AudioStockDataset(Dataset):
|
593 |
+
def __init__(self,
|
594 |
+
metadata_path:str,
|
595 |
+
duration:float=10,
|
596 |
+
sr:int = 0,
|
597 |
+
plain_rate = 0,
|
598 |
+
return_path = False,
|
599 |
+
return_audio = True,
|
600 |
+
prompt_template_path: os.PathLike = None,
|
601 |
+
tag_types = [],
|
602 |
+
lang = 'en',
|
603 |
+
translate:Optional[Dict[str, os.PathLike]] = None,
|
604 |
+
use_literal_none = True,
|
605 |
+
):
|
606 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
607 |
+
|
608 |
+
self._load_metadata(metadata_path)
|
609 |
+
self.sr = sr
|
610 |
+
self.duration = duration
|
611 |
+
self.plain_rate = plain_rate
|
612 |
+
self.return_path = return_path
|
613 |
+
self.return_audio = return_audio
|
614 |
+
self.use_literal_none = use_literal_none
|
615 |
+
|
616 |
+
self.use_dynamic_prompt = prompt_template_path is not None and plain_rate < 1.0
|
617 |
+
if self.use_dynamic_prompt:
|
618 |
+
self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
|
619 |
+
self.tag_types = tag_types
|
620 |
+
|
621 |
+
self.lang = lang
|
622 |
+
self.translate = read_translate(translate)
|
623 |
+
|
624 |
+
def _load_metadata(self, metadata_path):
|
625 |
+
with open(metadata_path) as fp:
|
626 |
+
lines = fp.readlines()
|
627 |
+
self.data = []
|
628 |
+
for line in lines:
|
629 |
+
item = json.loads(line)
|
630 |
+
self.data.append(item)
|
631 |
+
self.is_info_recorded = bool('Tags' in self.data[0])
|
632 |
+
|
633 |
+
def __len__(self):
|
634 |
+
return len(self.data)
|
635 |
+
|
636 |
+
def __getitem__(self, idx):
|
637 |
+
path:str = self.data[idx]["path"]
|
638 |
+
json_path = path[:path.rfind('.')] + ".json"
|
639 |
+
if self.is_info_recorded:
|
640 |
+
item = self.data[idx]
|
641 |
+
else:
|
642 |
+
try:
|
643 |
+
with open(json_path) as fp:
|
644 |
+
item:dict = json.load(fp)
|
645 |
+
except Exception as e:
|
646 |
+
print(f"Error loading json file {json_path} :\n{e}")
|
647 |
+
item = {}
|
648 |
+
description = self.generate_description(item)
|
649 |
+
if self.return_audio:
|
650 |
+
sr, duration = get_sr_and_duration_info(item)
|
651 |
+
audio = self.audio_reader(path, sr, duration)
|
652 |
+
else:
|
653 |
+
audio = None
|
654 |
+
if self.return_path:
|
655 |
+
return audio, description, path
|
656 |
+
return audio, description
|
657 |
+
|
658 |
+
def generate_description(self, item):
|
659 |
+
if random.random() > self.plain_rate:
|
660 |
+
# dynamically generate prompt from given prompt template
|
661 |
+
prompt_template = random.choice(self.prompt_templates)
|
662 |
+
description = self.generate_description_dynamic(item, prompt_template)
|
663 |
+
else:
|
664 |
+
# use plain prompt, i.e. tags sequence separated by comma
|
665 |
+
description = self.generate_description_plain(item)
|
666 |
+
return description
|
667 |
+
|
668 |
+
def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
|
669 |
+
exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
|
670 |
+
|
671 |
+
if len(exists_tag) > 0:
|
672 |
+
probs = dist_prob_map[len(exists_tag)]
|
673 |
+
tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
|
674 |
+
random.shuffle(exists_tag)
|
675 |
+
tags = exists_tag[:tags_num]
|
676 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
|
677 |
+
tags_args = self.handle_BPM_tag(tags_args)
|
678 |
+
prompt = prompt_template.apply(**tags_args)
|
679 |
+
else:
|
680 |
+
return 'none'
|
681 |
+
|
682 |
+
if self.use_literal_none and len(tags_args) == 0:
|
683 |
+
return 'none'
|
684 |
+
|
685 |
+
return prompt
|
686 |
+
|
687 |
+
def get_translation(self, tag_t, k):
|
688 |
+
k = k.strip()
|
689 |
+
if k in self.translate[tag_t]:
|
690 |
+
return self.translate[tag_t][k]
|
691 |
+
else:
|
692 |
+
return k
|
693 |
+
|
694 |
+
def generate_description_plain(self, item):
|
695 |
+
keywords = []
|
696 |
+
for tag_t in self.tag_types:
|
697 |
+
if tag_t == 'BPMDescript':
|
698 |
+
bpm = item['BPM']
|
699 |
+
if bpm is None or bpm.strip() == '' or bpm.strip() == '0':
|
700 |
+
continue
|
701 |
+
this_key = gen_bpm_descript(bpm.strip(), lang=self.lang)
|
702 |
+
elif tag_t == 'BPM':
|
703 |
+
bpm = item['BPM']
|
704 |
+
if bpm is None or bpm.strip() == '' or bpm.strip() == '0':
|
705 |
+
continue
|
706 |
+
this_key = f"{bpm.strip()} bpm"
|
707 |
+
else:
|
708 |
+
this_key = item[tag_t]
|
709 |
+
if this_key is None:
|
710 |
+
continue
|
711 |
+
if isinstance(this_key, str):
|
712 |
+
this_key = [this_key]
|
713 |
+
if self.lang != 'en':
|
714 |
+
this_key = [self.get_translation(tag_t, k) for k in this_key]
|
715 |
+
if this_key is None:
|
716 |
+
continue
|
717 |
+
if isinstance(this_key, str):
|
718 |
+
this_key = [this_key]
|
719 |
+
keywords += this_key
|
720 |
+
return gen_plain_prompt(keywords, sep=self.keysep)
|
721 |
+
|
722 |
+
@property
|
723 |
+
def keysep(self):
|
724 |
+
if self.lang == 'zh':
|
725 |
+
return ',' if random.random() > 0.5 else '、'
|
726 |
+
elif self.lang == 'en':
|
727 |
+
return ', '
|
728 |
+
|
729 |
+
def tags_to_desc(self, tag_list, tag_type) -> str:
|
730 |
+
if self.lang == 'en':
|
731 |
+
return tags_to_desc(tag_list)
|
732 |
+
elif self.lang == 'zh':
|
733 |
+
if tag_type == 'BPM':
|
734 |
+
return tags_to_desc(tag_list, sep='、')
|
735 |
+
translator = self.translate[tag_type]
|
736 |
+
translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
|
737 |
+
return tags_to_desc(translated_tag_list, sep='、')
|
738 |
+
|
739 |
+
def handle_BPM_tag(self, tags_args):
|
740 |
+
if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
|
741 |
+
bpm = tags_args["BPM"]
|
742 |
+
del tags_args["BPM"]
|
743 |
+
tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
|
744 |
+
for tag_type in tag_types_used:
|
745 |
+
tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
|
746 |
+
return tags_args
|
747 |
+
|
748 |
+
def mp3_path_to_id(mp3_path):
|
749 |
+
return int(
|
750 |
+
mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.')]
|
751 |
+
)
|
752 |
+
|
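# Tiny check of the path-to-ID convention above: the numeric song ID is the file stem.
assert mp3_path_to_id('/some/dir/123456.mp3') == 123456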
753 |
+
class TmeDataset(Dataset):
|
754 |
+
def __init__(self,
|
755 |
+
data_index:str,
|
756 |
+
music_info:str = None,
|
757 |
+
duration:float = 10,
|
758 |
+
sr:int = 0,
|
759 |
+
plain_rate = 0,
|
760 |
+
return_path = False,
|
761 |
+
return_audio = True,
|
762 |
+
return_ID = False,
|
763 |
+
prompt_format_path: os.PathLike = None,
|
764 |
+
tag_types = ['*'],
|
765 |
+
lang = 'zh',
|
766 |
+
translate: Optional[os.PathLike] = None,
|
767 |
+
prompt_dir: os.PathLike = None, # directory of pre-generated prompts (produced with GPT)
|
768 |
+
):
|
769 |
+
if plain_rate > 0:
|
770 |
+
print("Tme Dataset do not support plain rate > 0, use plain_rate = 0 instead.")
|
771 |
+
plain_rate = 0
|
772 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
773 |
+
|
774 |
+
self.sr = sr
|
775 |
+
self.duration = duration
|
776 |
+
self.plain_rate = plain_rate
|
777 |
+
self.return_path = return_path
|
778 |
+
self.return_audio = return_audio
|
779 |
+
self.return_ID = return_ID
|
780 |
+
self.lang = lang
|
781 |
+
|
782 |
+
self.use_ready_prompt = prompt_dir is not None
|
783 |
+
|
784 |
+
data_index = read_jsonlike(data_index)
|
785 |
+
self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
|
786 |
+
self.data_ids = list(self.data_index_dict.keys())
|
787 |
+
|
788 |
+
if not self.use_ready_prompt:
|
789 |
+
# load the music metadata file
|
790 |
+
music_info = read_jsonlike(music_info)
|
791 |
+
if 'music' in music_info:
|
792 |
+
music_info = music_info['music']
|
793 |
+
self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
|
794 |
+
self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
|
795 |
+
self.data_ids = list(self.data_index_dict.keys())
|
796 |
+
|
797 |
+
with open(prompt_format_path) as fp:
|
798 |
+
self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)
|
799 |
+
|
800 |
+
# load the tag types and split them into ordinary tag_types and the key key_tag_types
|
801 |
+
if '*' in tag_types:
|
802 |
+
self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
|
803 |
+
else:
|
804 |
+
self.tag_types = tag_types
|
805 |
+
|
806 |
+
self.key_tag_types = []
|
807 |
+
if 'tag' in self.tag_types:
|
808 |
+
self.tag_types.remove('tag')
|
809 |
+
self.key_tag_types = list(self.prompt_formats['tag'].keys())
|
810 |
+
|
811 |
+
# load the translation table
|
812 |
+
if translate is not None:
|
813 |
+
self.translator = read_jsonlike(translate)
|
814 |
+
else:
|
815 |
+
data_ids_set = set(self.data_ids)
|
816 |
+
self.prompts_dict = {}
|
817 |
+
for fname in os.listdir(prompt_dir):
|
818 |
+
items = read_jsonlike(os.path.join(prompt_dir, fname))
|
819 |
+
for item in items:
|
820 |
+
if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
|
821 |
+
continue
|
822 |
+
if item['ID'] not in self.prompts_dict:
|
823 |
+
self.prompts_dict[item['ID']] = []
|
824 |
+
self.prompts_dict[item['ID']].append(item['Text'])
|
825 |
+
self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
|
826 |
+
self.data_ids = list(self.data_index_dict.keys())
|
827 |
+
|
828 |
+
def tags_to_desc(self, tag_list) -> str:
|
829 |
+
if is_bearable(tag_list, int):
|
830 |
+
return str(tag_list)
|
831 |
+
if self.lang == 'zh':
|
832 |
+
return tags_to_desc(tag_list, sep=self.sep)
|
833 |
+
else:
|
834 |
+
translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
|
835 |
+
return tags_to_desc(translated_tag_list, sep=self.sep)
|
836 |
+
|
837 |
+
def gen_desc_of_tag(self, formats, tags):
|
838 |
+
fmt = random.choice(formats)
|
839 |
+
return fmt.format(self.tags_to_desc(tags))
|
840 |
+
|
841 |
+
@staticmethod
|
842 |
+
def check_valid(value):
|
843 |
+
if isinstance(value, int) or isinstance(value, float):
|
844 |
+
return value > 0
|
845 |
+
if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
|
846 |
+
return True
|
847 |
+
return False
|
848 |
+
|
849 |
+
@staticmethod
|
850 |
+
def remove_repeat(data):
|
851 |
+
# if the album name is identical to the song name, keep only the latter
|
852 |
+
album_name = data.get('专辑名', None)
|
853 |
+
if album_name is not None and album_name == data.get('歌曲名', None):
|
854 |
+
del data['专辑名']
|
855 |
+
return data
|
856 |
+
|
857 |
+
@property
|
858 |
+
def comma(self):
|
859 |
+
if self.lang == 'zh':
|
860 |
+
return ','
|
861 |
+
elif self.lang == 'en':
|
862 |
+
return ', '
|
863 |
+
|
864 |
+
@property
|
865 |
+
def sep(self):
|
866 |
+
if self.lang == 'zh':
|
867 |
+
return '、'
|
868 |
+
elif self.lang == 'en':
|
869 |
+
return ', '
|
870 |
+
|
871 |
+
|
872 |
+
def generate_description(self, item):
|
873 |
+
if random.random() > self.plain_rate:
|
874 |
+
# dynamically generate prompt from given prompt template
|
875 |
+
description = self.generate_description_dynamic(item)
|
876 |
+
else:
|
877 |
+
# use plain prompt, i.e. tags sequence separated by comma
|
878 |
+
description = self.generate_description_plain(item)
|
879 |
+
return description
|
880 |
+
|
881 |
+
def generate_description_dynamic(self, data):
|
882 |
+
data = self.remove_repeat(data)
|
883 |
+
|
884 |
+
weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))] # weakly semantic tags; their sampling ratio is kept low
|
885 |
+
|
886 |
+
key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))] # key tags; at least one of them must appear
|
887 |
+
|
888 |
+
prompts = []
|
889 |
+
if len(weak_tags) > 0:
|
890 |
+
probs = dist_prob_map_low[len(weak_tags)]
|
891 |
+
if len(key_tags) > 0:
|
892 |
+
tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
|
893 |
+
else:
|
894 |
+
tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
|
895 |
+
random.shuffle(weak_tags)
|
896 |
+
tags = weak_tags[:tags_num]
|
897 |
+
for tag_type in tags:
|
898 |
+
tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
|
899 |
+
prompts.append(tag_desc)
|
900 |
+
|
901 |
+
if len(key_tags) > 0:
|
902 |
+
probs = dist_prob_map[len(key_tags)]
|
903 |
+
tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
|
904 |
+
random.shuffle(key_tags)
|
905 |
+
tags = key_tags[:tags_num]
|
906 |
+
for tag_type in tags:
|
907 |
+
tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
|
908 |
+
prompts.append(tag_desc)
|
909 |
+
|
910 |
+
random.shuffle(prompts)
|
911 |
+
return self.comma.join(prompts)
|
912 |
+
|
913 |
+
def generate_description_plain(self, item):
|
914 |
+
keywords = item['tag']
|
915 |
+
if self.lang != 'en':
|
916 |
+
keywords = [self.translator[k.strip()] for k in keywords]
|
917 |
+
return gen_plain_prompt(keywords, sep=self.keysep)
|
918 |
+
|
919 |
+
@property
|
920 |
+
def keysep(self):
|
921 |
+
if self.lang == 'zh':
|
922 |
+
return ',' if random.random() > 0.5 else '、'
|
923 |
+
elif self.lang == 'en':
|
924 |
+
return ', '
|
925 |
+
|
926 |
+
def is_valid_prompt_text(self, text):
|
927 |
+
for bad in ('抱歉','sorry', 'Sorry'):
|
928 |
+
if bad in text:
|
929 |
+
return False
|
930 |
+
return True
|
931 |
+
|
932 |
+
def get_ready_prompt(self, path):
|
933 |
+
sid = mp3_path_to_id(path)
|
934 |
+
return random.choice(self.prompts_dict[sid])
|
935 |
+
|
936 |
+
def __len__(self):
|
937 |
+
return len(self.data_ids)
|
938 |
+
|
939 |
+
def __getitem__(self, idx):
|
940 |
+
data_id = self.data_ids[idx]
|
941 |
+
item = self.data_index_dict[data_id]
|
942 |
+
path = item['path']
|
943 |
+
if not self.use_ready_prompt:
|
944 |
+
info = self.music_info_dict[data_id]
|
945 |
+
description = self.generate_description(info)
|
946 |
+
else:
|
947 |
+
description = self.get_ready_prompt(path)
|
948 |
+
if self.return_audio:
|
949 |
+
sr, duration = get_sr_and_duration_info(item)
|
950 |
+
audio = self.audio_reader(path, sr, duration)
|
951 |
+
else:
|
952 |
+
audio = None
|
953 |
+
if self.return_path:
|
954 |
+
if self.return_ID:
|
955 |
+
return audio, description, path, info['歌曲ID']
|
956 |
+
return audio, description, path
|
957 |
+
if self.return_ID:
|
958 |
+
return audio, description, info['歌曲ID']
|
959 |
+
return audio, description
|
960 |
+
|
961 |
+
|
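# Sketch of the tuples returned by TmeDataset.__getitem__ above, keyed by
# (return_path, return_ID). It is a plain documentation table, not used elsewhere.
# Note: return_ID relies on `info`, which is only set when use_ready_prompt is False
# (i.e. music_info was provided).
_TME_RETURN_LAYOUTS = {
    (False, False): ('audio', 'description'),
    (True, False): ('audio', 'description', 'path'),
    (False, True): ('audio', 'description', '歌曲ID'),
    (True, True): ('audio', 'description', 'path', '歌曲ID'),
}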
962 |
+
class Pond5Dataset(Dataset):
|
963 |
+
MAX_PROMPT_LEN = 200
|
964 |
+
def __init__(self,
|
965 |
+
metadata_path:str,
|
966 |
+
index_path:str,
|
967 |
+
duration:float=10,
|
968 |
+
sr:int = 0,
|
969 |
+
plain_rate = 0,
|
970 |
+
return_path = False,
|
971 |
+
return_audio = True,
|
972 |
+
lang = 'en',
|
973 |
+
translate:Optional[Dict[str, os.PathLike]] = None,
|
974 |
+
use_literal_none = True,
|
975 |
+
use_avoid_watermark_policy = None,
|
976 |
+
):
|
977 |
+
|
978 |
+
if use_avoid_watermark_policy is None:
|
979 |
+
raise ValueError("`use_avoid_watermark_policy` is an important param, you need to explicitly specify it with bool type")
|
980 |
+
self.use_avoid_watermark_policy = use_avoid_watermark_policy
|
981 |
+
self.audio_reader = SafeAudioReader(duration, sr, use_avoid_watermark_policy=use_avoid_watermark_policy)
|
982 |
+
|
983 |
+
self._load_metadata(metadata_path, index_path)
|
984 |
+
self.sr = sr
|
985 |
+
self.duration = duration
|
986 |
+
self.plain_rate = plain_rate
|
987 |
+
self.return_path = return_path
|
988 |
+
self.return_audio = return_audio
|
989 |
+
self.use_literal_none = use_literal_none
|
990 |
+
|
991 |
+
self.lang = lang
|
992 |
+
self.translate = read_translate(translate)
|
993 |
+
|
994 |
+
def _load_metadata(self, metadata_path, index_path):
|
995 |
+
data_index = read_jsonlike(index_path)
|
996 |
+
data_ids = set([item['id'] for item in data_index])
|
997 |
+
|
998 |
+
with open(metadata_path) as fp:
|
999 |
+
lines = fp.readlines()
|
1000 |
+
|
1001 |
+
append_ids = set()
|
1002 |
+
|
1003 |
+
self.data = []
|
1004 |
+
for line in lines:
|
1005 |
+
item = json.loads(line)
|
1006 |
+
if item['id'] in data_ids and item['id'] not in append_ids:
|
1007 |
+
self.data.append(item)
|
1008 |
+
append_ids.add(item['id'])
|
1009 |
+
|
1010 |
+
def __len__(self):
|
1011 |
+
return len(self.data)
|
1012 |
+
|
1013 |
+
def __getitem__(self, idx):
|
1014 |
+
item = self.data[idx]
|
1015 |
+
path:str = item["path"]
|
1016 |
+
description = self.generate_description(item)
|
1017 |
+
if self.return_audio:
|
1018 |
+
sr, duration = get_sr_and_duration_info(item)
|
1019 |
+
audio = self.audio_reader(path, sr, duration)
|
1020 |
+
else:
|
1021 |
+
audio = None
|
1022 |
+
if self.return_path:
|
1023 |
+
return audio, description, path
|
1024 |
+
return audio, description
|
1025 |
+
|
1026 |
+
@property
|
1027 |
+
def keysep(self):
|
1028 |
+
if self.lang == 'zh':
|
1029 |
+
return ',' if random.random() > 0.5 else '、'
|
1030 |
+
elif self.lang == 'en':
|
1031 |
+
return ', '
|
1032 |
+
|
1033 |
+
def generate_description(self, item):
|
1034 |
+
if random.random() > self.plain_rate:
|
1035 |
+
# dynamically generate prompt from given prompt template
|
1036 |
+
description = self.generate_description_dynamic(item)
|
1037 |
+
else:
|
1038 |
+
# use plain prompt, i.e. tags sequence separated by comma
|
1039 |
+
description = self.generate_description_plain(item)
|
1040 |
+
return description
|
1041 |
+
|
1042 |
+
def get_translation(self, k):
|
1043 |
+
k = k.strip()
|
1044 |
+
if k in self.translate:
|
1045 |
+
return self.translate[k]
|
1046 |
+
else:
|
1047 |
+
return k
|
1048 |
+
|
1049 |
+
def generate_description_plain(self, item):
|
1050 |
+
keywords = item['keywords']
|
1051 |
+
if self.lang != 'en':
|
1052 |
+
keywords = [self.get_translation(k) for k in keywords]
|
1053 |
+
return gen_plain_prompt(keywords, sep=self.keysep)
|
1054 |
+
|
1055 |
+
def generate_description_dynamic(self,item):
|
1056 |
+
desc = item.get('desc', 'none')
|
1057 |
+
if desc is None:
|
1058 |
+
desc = 'none'
|
1059 |
+
desc = desc.strip()
|
1060 |
+
if len(desc) > self.MAX_PROMPT_LEN:
|
1061 |
+
shorter_desc = desc[:self.MAX_PROMPT_LEN]
|
1062 |
+
# find last stop
|
1063 |
+
stop_idx = shorter_desc.rfind('.')
|
1064 |
+
if stop_idx == -1:
|
1065 |
+
stop_idx = shorter_desc.rfind('!')
|
1066 |
+
if stop_idx == -1:
|
1067 |
+
stop_idx = shorter_desc.rfind(',')
|
1068 |
+
if stop_idx == -1:
|
1069 |
+
stop_idx = self.MAX_PROMPT_LEN - 1
|
1070 |
+
desc = desc[:stop_idx+1]
|
1071 |
+
return desc
|
1072 |
+
|
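# Standalone restatement of the truncation rule used in generate_description_dynamic above,
# kept here as a small sketch for clarity (same priority order: '.', then '!', then ',').
def _truncate_description(desc: str, max_len: int = Pond5Dataset.MAX_PROMPT_LEN) -> str:
    if len(desc) <= max_len:
        return desc
    shorter = desc[:max_len]
    stop_idx = shorter.rfind('.')
    if stop_idx == -1:
        stop_idx = shorter.rfind('!')
    if stop_idx == -1:
        stop_idx = shorter.rfind(',')
    if stop_idx == -1:
        stop_idx = max_len - 1
    return desc[:stop_idx + 1]

assert _truncate_description('short prompt.') == 'short prompt.'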
1073 |
+
class SoundDataset(Dataset):
|
1074 |
+
def __init__(self,
|
1075 |
+
metadata_index: str,
|
1076 |
+
duration:float = 10,
|
1077 |
+
min_non_silent_duration:float = 3,
|
1078 |
+
sr:int = 0,
|
1079 |
+
return_path = False,
|
1080 |
+
return_audio = True,
|
1081 |
+
):
|
1082 |
+
self.data = read_jsonlike(metadata_index)
|
1083 |
+
self.sr = sr
|
1084 |
+
self.reader = SafeAudioReader(duration, sr)
|
1085 |
+
self.duration = duration
|
1086 |
+
self.min_non_silent_duration = min_non_silent_duration
|
1087 |
+
self.return_audio = return_audio
|
1088 |
+
self.return_path = return_path
|
1089 |
+
|
1090 |
+
def __getitem__(self, index):
|
1091 |
+
item = self.data[index]
|
1092 |
+
if self.return_audio:
|
1093 |
+
origin_duration = item['duration']
|
1094 |
+
if origin_duration < self.min_non_silent_duration:
|
1095 |
+
audio = self.read_and_repeat_and_pad(item)
|
1096 |
+
else:
|
1097 |
+
audio = self.reader(item['path'], item['sample_rate'], origin_duration)
|
1098 |
+
else:
|
1099 |
+
audio = None
|
1100 |
+
desc = item['caption']
|
1101 |
+
if self.return_path:
|
1102 |
+
return audio, desc, item['path']
|
1103 |
+
else:
|
1104 |
+
return audio, desc
|
1105 |
+
|
1106 |
+
def __len__(self):
|
1107 |
+
return len(self.data)
|
1108 |
+
|
1109 |
+
def read_and_repeat_and_pad(self, item):
|
1110 |
+
path = item['path']
|
1111 |
+
try:
|
1112 |
+
# read
|
1113 |
+
clip, sr = torchaudio.load(path)
|
1114 |
+
if len(clip.shape) > 1:
|
1115 |
+
clip = torch.mean(clip, dim=0, keepdim=True)
|
1116 |
+
clip = resample(clip, sr, self.sr)
|
1117 |
+
#repeat
|
1118 |
+
n_repeats = math.ceil(self.min_non_silent_duration/item['duration'])
|
1119 |
+
clip = torch.repeat_interleave(clip, n_repeats, dim=0).reshape(-1)
|
1120 |
+
#pad
|
1121 |
+
n_samples = int(self.duration * self.sr)
|
1122 |
+
if clip.shape[0] >= n_samples:
|
1123 |
+
audio = clip[:n_samples]
|
1124 |
+
else:
|
1125 |
+
audio = torch.zeros(int(self.duration * self.sr), dtype=clip.dtype)
|
1126 |
+
start_pos = np.random.randint(0, max(0,(n_samples - clip.shape[0])))
|
1127 |
+
audio[start_pos:start_pos+clip.shape[0]] = clip
|
1128 |
+
return audio
|
1129 |
+
|
1130 |
+
except Exception as e:
|
1131 |
+
logger.error(f"Error reading {path}: {e}")
|
1132 |
+
wav = torch.zeros(int(self.duration * self.sr), dtype=torch.float32)
|
1133 |
+
return wav
|
1134 |
+
|
1135 |
+
class CombinedDataset(Dataset):
|
1136 |
+
@beartype
|
1137 |
+
def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
|
1138 |
+
self.datasets = datasets
|
1139 |
+
self.datasets_index = []
|
1140 |
+
|
1141 |
+
for i,dataset in enumerate(datasets):
|
1142 |
+
if dataset is None:
|
1143 |
+
continue
|
1144 |
+
for dup in range(ratios[i]):
|
1145 |
+
for j in range(len(dataset)):
|
1146 |
+
self.datasets_index.append((i,j))
|
1147 |
+
|
1148 |
+
def __len__(self):
|
1149 |
+
return len(self.datasets_index)
|
1150 |
+
|
1151 |
+
def __getitem__(self, idx):
|
1152 |
+
index = self.datasets_index[idx]
|
1153 |
+
i,j = index
|
1154 |
+
return self.datasets[i][j]
|
1155 |
+
|
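# Toy sketch for CombinedDataset above: with ratios [2, 1] every index of the first dataset
# is inserted twice into datasets_index and every index of the second once, so uniform
# sampling over the combined dataset oversamples the first one 2:1.
class _ToyListDataset(Dataset):
    def __init__(self, items):
        self.items = list(items)
    def __len__(self):
        return len(self.items)
    def __getitem__(self, i):
        return self.items[i]

_combined_demo = CombinedDataset([_ToyListDataset('ab'), _ToyListDataset('xyz')], ratios=[2, 1])
assert len(_combined_demo) == 2 * 2 + 1 * 3
assert _combined_demo[0] == 'a'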
1156 |
+
class CombinedDataset_random(Dataset):
|
1157 |
+
@beartype
|
1158 |
+
def __init__(self, num_examples:int, datasets: Sequence[Dataset], ratios: Sequence[int]):
|
1159 |
+
self.datasets = datasets
|
1160 |
+
self.datasets_index = []
|
1161 |
+
|
1162 |
+
for i,dataset in enumerate(datasets):
|
1163 |
+
if dataset is None:
|
1164 |
+
continue
|
1165 |
+
for dup in range(ratios[i]):
|
1166 |
+
for j in range(len(dataset)):
|
1167 |
+
self.datasets_index.append((i,j))
|
1168 |
+
|
1169 |
+
if num_examples > 0:
|
1170 |
+
self.random_choose = True
|
1171 |
+
self.dataset_len = num_examples
|
1172 |
+
else:
|
1173 |
+
self.random_choose = False
|
1174 |
+
self.dataset_len = len(self.datasets_index)
|
1175 |
+
|
1176 |
+
def __len__(self):
|
1177 |
+
return self.dataset_len
|
1178 |
+
|
1179 |
+
def __getitem__(self, idx):
|
1180 |
+
first_try = True
|
1181 |
+
try_cnt = 0
|
1182 |
+
while True:
|
1183 |
+
try:
|
1184 |
+
if(self.random_choose or not first_try):
|
1185 |
+
index2 = []
|
1186 |
+
index2.append(np.random.randint(0,len(self.datasets)))
|
1187 |
+
index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
|
1188 |
+
else:
|
1189 |
+
index2 = self.datasets_index[idx]
|
1190 |
+
first_try = False
|
1191 |
+
out = list(self.datasets[index2[0]][index2[1]])
|
1192 |
+
return out
|
1193 |
+
except:
|
1194 |
+
print("Error loadding ", index2)
|
1195 |
+
try_cnt += 1
|
1196 |
+
if(try_cnt>10):
|
1197 |
+
raise ValueError()
|
1198 |
+
|
1199 |
+
class SoundMixedDataset(Dataset):
|
1200 |
+
@staticmethod
|
1201 |
+
def music_desc(desc):
|
1202 |
+
return f'Music:<{desc}>'
|
1203 |
+
@staticmethod
|
1204 |
+
def sound_desc(desc):
|
1205 |
+
return f'Effect:<{desc}>'
|
1206 |
+
|
1207 |
+
def __init__(self,
|
1208 |
+
music_dataset: Dataset,
|
1209 |
+
sound_dataset: Dataset,
|
1210 |
+
mixed_ratios: Tuple[float, float, float] = (0.3, 0.3, 0.4) # ratio of music-only : sound-only : music+sound mixes
|
1211 |
+
) -> None:
|
1212 |
+
self.music_dataset = music_dataset
|
1213 |
+
self.sound_dataset = sound_dataset
|
1214 |
+
music_r, sound_r, mix_r = [r/sum(mixed_ratios) for r in mixed_ratios] # normalize to proportions in [0, 1]
|
1215 |
+
# left endpoints of the three probability intervals
|
1216 |
+
self.music_anchor = 0
|
1217 |
+
self.sound_anchor = music_r
|
1218 |
+
self.mix_anchor = music_r + sound_r
|
1219 |
+
|
1220 |
+
def __len__(self):
|
1221 |
+
return len(self.music_dataset)
|
1222 |
+
|
1223 |
+
def get_random_sound_data(self):
|
1224 |
+
idx = random.randint(0, len(self.sound_dataset)-1)
|
1225 |
+
return self.sound_dataset[idx]
|
1226 |
+
|
1227 |
+
def __getitem__(self, idx):
|
1228 |
+
p = random.random()
|
1229 |
+
if p >= self.mix_anchor:
|
1230 |
+
music, m_desc = self.music_dataset[idx]
|
1231 |
+
sound, s_desc = self.get_random_sound_data()
|
1232 |
+
audio = music + sound
|
1233 |
+
if(audio.abs().max()>1.0):
|
1234 |
+
music = music / audio.abs().max() * 0.95
|
1235 |
+
audio = audio / audio.abs().max() * 0.95
|
1236 |
+
desc = self.music_desc(m_desc) + self.sound_desc(s_desc)
|
1237 |
+
return audio[None,:], music[None,:], desc
|
1238 |
+
elif p >= self.sound_anchor:
|
1239 |
+
audio, desc = self.get_random_sound_data()
|
1240 |
+
return audio[None,:], torch.zeros_like(audio[None,:]), self.sound_desc(desc)
|
1241 |
+
else:
|
1242 |
+
audio, desc = self.music_dataset[idx]
|
1243 |
+
return audio[None,:], audio[None,:], self.music_desc(desc)
|
1244 |
+
|
1245 |
+
|
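# Small sketch of how the normalized ratios partition [0, 1) in __getitem__ above: with the
# default (0.3, 0.3, 0.4), p < 0.3 yields music only, 0.3 <= p < 0.6 sound only, and
# p >= 0.6 a music+sound mix.
def _mixed_branch(p: float, music_r: float = 0.3, sound_r: float = 0.3) -> str:
    sound_anchor, mix_anchor = music_r, music_r + sound_r
    if p >= mix_anchor:
        return 'music + sound'
    elif p >= sound_anchor:
        return 'sound only'
    return 'music only'

assert _mixed_branch(0.1) == 'music only'
assert _mixed_branch(0.45) == 'sound only'
assert _mixed_branch(0.8) == 'music + sound'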
1246 |
+
class DecoTagDataset(Dataset):
|
1247 |
+
'''Wraps an ordinary dataset into one suitable for tag-decoupled learning.'''
|
1248 |
+
|
1249 |
+
TAG_TYPES = ('genre', 'mood', 'instrument')
|
1250 |
+
|
1251 |
+
def __init__(self, dataset_class: type, tag_map: Dict[str, str], *args, **kwargs):
|
1252 |
+
self.datasets = []
|
1253 |
+
for i, tag_t in enumerate(self.TAG_TYPES):
|
1254 |
+
kwargs['tag_types'] = [tag_map[tag_t]]
|
1255 |
+
kwargs['return_audio'] = (i == 0) # only the 0th dataset returns audio together with text; the others return text only
|
1256 |
+
self.datasets.append(dataset_class(*args, **kwargs))
|
1257 |
+
|
1258 |
+
def __len__(self):
|
1259 |
+
return len(self.datasets[0])
|
1260 |
+
|
1261 |
+
def __getitem__(self, idx):
|
1262 |
+
audio, text = self.datasets[0][idx]
|
1263 |
+
texts = (text, self.datasets[1][idx][1], self.datasets[2][idx][1])
|
1264 |
+
return audio, texts
|
1265 |
+
|
1266 |
+
|
1267 |
+
class DecoTagWrapper:
|
1268 |
+
'''A wrapper that makes it easy to switch tag-decoupled learning on or off.'''
|
1269 |
+
def __init__(self, dataset_class: Dataset, deco_tag_types: List[str] = list(), switch_on: bool = False):
|
1270 |
+
self.dataset_class = dataset_class
|
1271 |
+
self.tag_map = dict(zip(DecoTagDataset.TAG_TYPES, deco_tag_types))
|
1272 |
+
self.switch_on = switch_on
|
1273 |
+
|
1274 |
+
def __call__(self, *args, **kwargs):
|
1275 |
+
if self.switch_on:
|
1276 |
+
return DecoTagDataset(self.dataset_class, self.tag_map, *args, **kwargs)
|
1277 |
+
else:
|
1278 |
+
return self.dataset_class(*args, **kwargs)
|
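# Illustrative usage sketch for DecoTagWrapper above; the tag names passed in deco_tag_types
# are hypothetical and must match the metadata keys of the wrapped dataset.
_decoupled_audiostock = DecoTagWrapper(
    AudioStockDataset,
    deco_tag_types=['Genre', 'Mood', 'Instrument'],  # hypothetical metadata keys
    switch_on=True,
)
# Calling _decoupled_audiostock(metadata_path=..., duration=10, sr=48000) would then build a
# DecoTagDataset whose __getitem__ yields (audio, (genre_text, mood_text, instrument_text)).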
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_429.py
ADDED
@@ -0,0 +1,372 @@
|
1 |
+
import re
|
2 |
+
import sys
|
3 |
+
import json
|
4 |
+
from typing import List, Union
|
5 |
+
|
6 |
+
from torch.utils.data import Dataset
|
7 |
+
import torchaudio
|
8 |
+
from torchaudio.functional import resample
|
9 |
+
import torch
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
from torch.nn.utils.rnn import pad_sequence
|
13 |
+
|
14 |
+
PARAGRAPH_GAP = 6
|
15 |
+
MIN_MUSIC_LEN = 3
|
16 |
+
|
17 |
+
def check_lryics(lyric):
|
18 |
+
_FILTER_STRING = [
|
19 |
+
'作词', '作曲', '编曲', '【', '策划',
|
20 |
+
'录音', '混音', '母带', ':', '制作',
|
21 |
+
'版权', '校对', '演奏', '制作', '伴奏'
|
22 |
+
]
|
23 |
+
for item in _FILTER_STRING:
|
24 |
+
if item in lyric:
|
25 |
+
return True
|
26 |
+
|
27 |
+
return False
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
def process_lyrics(lines):
|
32 |
+
lyric_part = []
|
33 |
+
timestamp_part = []
|
34 |
+
|
35 |
+
timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')
|
36 |
+
|
37 |
+
for i, line in enumerate(lines):
|
38 |
+
|
39 |
+
# drop credit/metadata lines from the first few lines
|
40 |
+
if i<10 and check_lryics(line):
|
41 |
+
continue
|
42 |
+
|
43 |
+
# check that the line contains a valid timestamp and lyric content
|
44 |
+
if timestamp_pattern.match(line):
|
45 |
+
timestamp_end = line.rfind(']')
|
46 |
+
lyrics = line[timestamp_end + 1:].strip()
|
47 |
+
timestamps = line[:timestamp_end + 1]
|
48 |
+
|
49 |
+
if ':' in lyrics:
|
50 |
+
if len(lyrics.split(":")[0]) <=5:
|
51 |
+
lyrics = "".join(lyrics.split(":")[1:])
|
52 |
+
# if lyrics: # 确保歌词部分不是空的
|
53 |
+
# lyric_part.append(lyrics)
|
54 |
+
# timestamp_part.append(timestamps)
|
55 |
+
# print(processed_lyrics)
|
56 |
+
return timestamp_part, lyric_part
|
57 |
+
|
58 |
+
def get_timestamps(timestamp_part):
|
59 |
+
|
60 |
+
# convert to seconds
|
61 |
+
|
62 |
+
timestamps = []
|
63 |
+
|
64 |
+
for line in timestamp_part:
|
65 |
+
match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
|
66 |
+
if match:
|
67 |
+
minutes = int(match.group(1))
|
68 |
+
seconds = float(match.group(2))
|
69 |
+
millis = float(match.group(3)) if match.group(3) else 0
|
70 |
+
total_seconds = minutes * 60 + seconds + millis
|
71 |
+
timestamps.append(total_seconds)
|
72 |
+
|
73 |
+
|
74 |
+
return timestamps
|
75 |
+
|
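# Quick sanity sketch for get_timestamps above: '[01:23.45]' maps to 1*60 + 23 + 0.45
# seconds, and a tag without a fractional part maps to whole seconds.
_ts_demo = get_timestamps(['[01:23.45]', '[02:05]'])
assert abs(_ts_demo[0] - 83.45) < 1e-6
assert _ts_demo[1] == 125.0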
76 |
+
def process_lyrics_lrc(lyrics):
|
77 |
+
timestamp_part, lyric_part = process_lyrics(lyrics)
|
78 |
+
# print(timestamp_part)
|
79 |
+
# print(lyric_part)
|
80 |
+
timestamps = get_timestamps(timestamp_part)
|
81 |
+
# print(timestamps)
|
82 |
+
if len(timestamps) == 0:
|
83 |
+
# print(f'{lyric_path}')
|
84 |
+
return []
|
85 |
+
|
86 |
+
slice_start = timestamps[0]
|
87 |
+
slice_start_idx = 0
|
88 |
+
|
89 |
+
output_list = []
|
90 |
+
for i in range(1, len(timestamps)):
|
91 |
+
# split once the accumulated span exceeds 30 seconds; if the whole lyric is shorter than 30 s the sentence is dropped
|
92 |
+
if timestamps[i] - slice_start > 30:
|
93 |
+
output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
|
94 |
+
|
95 |
+
slice_start = timestamps[i]
|
96 |
+
slice_start_idx = i
|
97 |
+
|
98 |
+
return output_list
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
def process_lyrics_yrc(lyrics):
|
103 |
+
|
104 |
+
timestamps, lyric_part = extract_lrc(lyrics)
|
105 |
+
|
106 |
+
# timestamp_part, lyric_part = process_lyrics(lyrics)
|
107 |
+
# import pdb; pdb.set_trace()
|
108 |
+
# print(timestamp_part)
|
109 |
+
# print(lyric_part)
|
110 |
+
# timestamps = get_timestamps(timestamp_part)
|
111 |
+
# print(timestamps)
|
112 |
+
if len(timestamps) == 0:
|
113 |
+
# print(f'{lyric_path}')
|
114 |
+
return []
|
115 |
+
|
116 |
+
slice_start = timestamps[0]
|
117 |
+
slice_start_idx = 0
|
118 |
+
|
119 |
+
output_list = []
|
120 |
+
for i in range(1, len(timestamps)):
|
121 |
+
# split once the accumulated span exceeds 30 seconds
|
122 |
+
if timestamps[i] - slice_start > 30:
|
123 |
+
output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))
|
124 |
+
|
125 |
+
slice_start = timestamps[i]
|
126 |
+
slice_start_idx = i
|
127 |
+
# import pdb; pdb.set_trace()
|
128 |
+
return output_list
|
129 |
+
|
130 |
+
def extract_lrc(lyrics):
|
131 |
+
timestamp_part, lyric_part = [], []
|
132 |
+
|
133 |
+
for i, text in enumerate(lyrics):
|
134 |
+
# extract the content inside the square brackets
|
135 |
+
bracket_content = re.search(r'\[(.*?)\]', text).group(1)
|
136 |
+
bracket_content = bracket_content.split(',')
|
137 |
+
# extract the content inside the parentheses
|
138 |
+
parentheses_content = re.findall(r'\((.*?)\)', text)
|
139 |
+
# extract the remaining content
|
140 |
+
other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()
|
141 |
+
|
142 |
+
# TODO: how should this data be handled?
|
143 |
+
if i<10 and check_lryics(other_content):
|
144 |
+
continue
|
145 |
+
timestamp_part.append(float(bracket_content[0])/1000)
|
146 |
+
lyric_part.append(other_content)
|
147 |
+
return timestamp_part, lyric_part
|
148 |
+
|
149 |
+
|
150 |
+
|
151 |
+
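# Usage sketch (the timestamp strings below are made up for illustration):
# get_timestamps converts "[mm:ss.xxx]" tags to seconds, and the slicing loops
# above then cut a new segment whenever more than 30 seconds have elapsed
# since the current slice start.
def _example_timestamp_slicing():
    timestamp_part = ["[00:12.35]", "[00:45.10]", "[01:20.00]"]
    secs = get_timestamps(timestamp_part)
    print(secs)  # -> [12.35, 45.1, 80.0]
    # 45.10 - 12.35 > 30, so a slice boundary falls at 45.1

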
class WYYSongDataset(Dataset):
    def __init__(self,
                 metadata_path: Union[str, List[str]],
                 sr: int = 0,
                 use_lang = ['en', 'zh-cn'],
                 num_examples = -1,
                 max_dur = 20,
                 min_dur = 0,
                 add_music = False,
                 pad_to_max = True,
                 ):
        self.sr = sr
        self.use_lang = use_lang
        self.data = []
        if type(metadata_path) == str:
            metadata_path = [metadata_path]
        for _meta in metadata_path:
            self._load_metadata(_meta)
        self.max_dur = max_dur
        self.min_dur = min_dur
        self.pad_to_max = pad_to_max
        self.add_music = add_music

        # buffer
        self.lyric_buffer = {}

        if num_examples <= 0:
            self.dataset_len = len(self.data)
            self.random_slc = False
        else:
            self.dataset_len = num_examples
            self.random_slc = True

    # read the jsonl metadata file
    def _load_metadata(self, metadata_path):
        with open(metadata_path) as fp:
            lines = fp.readlines()
        for line in lines:
            item = json.loads(line)
            if '伴奏' not in item['path']:  # skip accompaniment-only tracks
                # if "lang_type" in item and item['lang_type'] == 'en':
                if "lang_type" in item:
                    self.data.append(item)

    def __len__(self):
        return self.dataset_len

    def __getitem__(self, idx):
        try_cnt = 0
        while True:
            if self.random_slc:
                idx = np.random.randint(0, len(self.data))
            yrc_lyrics = []
            lrc_lyrics = []
            try:
                info = self.data[idx]

                # audio path
                path = info["path"]
                lang_type = info["lang_type"]
                lyrics = info['lyrics']  # chinese
                # lyrics = info['lyrics_phone']

                # randomly pick one lyric paragraph
                parsed_lyrics = []
                # st_idx = np.random.randint(0, len(lyrics))
                for ly_id in range(len(lyrics)):
                    lyric = lyrics[ly_id].strip()
                    st, et, lyric = self.parse_lyric(lyric)

                    if et - st >= self.max_dur:
                        continue  # TODO: extend at both ends with [MUSIC]

                    if parsed_lyrics != []:
                        if st - parsed_lyrics[-1][1] >= PARAGRAPH_GAP:  # large gap
                            parsed_lyrics.append((parsed_lyrics[-1][1], st, '[GAP]'))
                        elif self.add_music and st - parsed_lyrics[-1][1] >= MIN_MUSIC_LEN:
                            parsed_lyrics.append((parsed_lyrics[-1][1], st, '[MUSIC]'))

                    lyric = lyric.replace("\xa0", " ")
                    lyric = " ".join(lyric.split())
                    parsed_lyrics.append((st, et, lyric))

                assert parsed_lyrics != []

                parsed_lyrics = [(0, parsed_lyrics[0][0], '[GAP]')] + parsed_lyrics

                possible_starts = [e for e, i in enumerate(parsed_lyrics) if i[2] == '[GAP]']
                st_idx = np.random.choice(possible_starts)

                paraphrase = []
                for i in parsed_lyrics[st_idx + 1:]:
                    if i[2] == '[GAP]':
                        break
                    paraphrase.append(i)

                while paraphrase[-1][1] - paraphrase[0][0] > self.max_dur:
                    if np.random.rand() > 0.2:
                        paraphrase.pop(-1)  # usually truncate from the end
                    else:
                        paraphrase.pop(0)   # occasionally truncate from the beginning

                st, et, lyric = paraphrase[0][0], paraphrase[-1][1], ', '.join([i[2] for i in paraphrase])  # [SEP]
                assert self.min_dur < et - st < self.max_dur, f"{st}-{et} {lyric}"

                if info["lang_type"] == 'en':
                    char_num = sum([len(lrc[-1].split()) for lrc in paraphrase])
                    assert 6 > char_num / (et - st) > 1
                else:
                    char_num = sum([len(lrc[-1]) for lrc in paraphrase])
                    assert 6 > char_num / (et - st) > 1

                # load the audio segment
                cur_sample_rate = torchaudio.info(path).sample_rate
                offset = int(cur_sample_rate * st)
                num_frames = int(cur_sample_rate * (et - st))
                chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
                if abs(chunk.shape[-1] - num_frames) > num_frames * 0.05:  # audio length does not match the lyrics
                    print(f"fail to load {path} from {st} to {et} !")
                    raise FileNotFoundError
                # randomly pick one channel
                if chunk.shape[0] > 1:
                    chunk = chunk[torch.randint(chunk.shape[0], size=(1,)), :].float()
                else:
                    chunk = chunk[[0], :].float()

                if cur_sample_rate != self.sr:
                    chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)

                if self.pad_to_max:
                    chunk = self.pad_2d_tensor(chunk, int(self.max_dur * self.sr), 0)

                return chunk, lyric, [st, et], path, lang_type
            except (AssertionError, FileNotFoundError, RuntimeError) as e:  # other error types propagate
                try_cnt += 1
                idx = np.random.randint(0, len(self.data))
                if try_cnt > 100:
                    raise e

    def parse_lyric(self, lyric):
        pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
        match = re.search(pattern, lyric)

        start_time = float(match.group(1))
        end_time = float(match.group(2))
        content = match.group(3)
        return start_time, end_time, content

    def pad_2d_tensor(self, x, max_len, pad_id):
        # shape of the input tensor
        batch_size, seq_len = x.size()
        max_len = max(max_len, seq_len)
        # length of padding needed
        pad_len = max_len - seq_len

        if pad_len > 0:
            # build the padding tensor
            pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
            # concatenate input and padding along the second (time) dimension
            padded_tensor = torch.cat([x, pad_tensor], dim=1)
        else:
            # no padding needed, return the input as-is
            padded_tensor = x

        return padded_tensor


def collect_data(data_list):
    audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1, 2)
    lyrics = [data[1] for data in data_list]
    st_et = [data[2] for data in data_list]
    paths = [data[3] for data in data_list]
    lang_types = [data[4] for data in data_list]
    return audios, lyrics, st_et


def build_dataset(train_jsonl_list, val_jsonl_list, min_dur=0, max_dur=20, add_music=False):
    print(min_dur, max_dur)
    print(train_jsonl_list)
    train_dataset = WYYSongDataset(
        metadata_path = train_jsonl_list,
        sr = 48000,
        use_lang = ['zh-cn', 'en'],
        num_examples = 10 * 10000,
        min_dur = min_dur,
        max_dur = max_dur,
        add_music = add_music
    )

    valid_dataset = WYYSongDataset(
        metadata_path = val_jsonl_list,
        sr = 48000,
        use_lang = ['zh-cn', 'en'],
        num_examples = 500,
        min_dur = min_dur,
        max_dur = max_dur,
        add_music = add_music
    )
    print(train_jsonl_list, "\t total_song = ", len(train_dataset.data))
    return train_dataset, valid_dataset
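

# End-to-end sketch (the jsonl paths below are hypothetical): wrap the datasets
# in a DataLoader with collect_data as the collate function, which pads the mono
# waveforms to a common length and batches (audio, lyric, [start, end]).
def _example_build_loader():
    from torch.utils.data import DataLoader
    train_set, valid_set = build_dataset(
        train_jsonl_list=["data/train_songs.jsonl"],
        val_jsonl_list=["data/valid_songs.jsonl"],
        min_dur=5, max_dur=20, add_music=False,
    )
    loader = DataLoader(train_set, batch_size=4, shuffle=True,
                        num_workers=4, collate_fn=collect_data)
    audios, lyrics, st_et = next(iter(loader))
    return audios.shape  # (batch, 1, padded_num_samples) at 48 kHz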
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined.py
ADDED
@@ -0,0 +1,830 @@
1 |
+
from torch.utils.data import Dataset
|
2 |
+
from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List
|
3 |
+
from beartype import beartype
|
4 |
+
from beartype.door import is_bearable
|
5 |
+
import random
|
6 |
+
import pandas as pd
|
7 |
+
import os
|
8 |
+
from torchaudio.functional import resample
|
9 |
+
import torch
|
10 |
+
import typing as tp
|
11 |
+
from pathlib import Path
|
12 |
+
import torchaudio as ta
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import numpy as np
|
15 |
+
import json
|
16 |
+
import yaml
|
17 |
+
import torchaudio
|
18 |
+
import math
|
19 |
+
import re
|
20 |
+
from loguru import logger
|
21 |
+
|
22 |
+
class Read_and_PadCrop_Normalized_T(torch.nn.Module):
|
23 |
+
def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
|
24 |
+
|
25 |
+
super().__init__()
|
26 |
+
|
27 |
+
self.n_samples = n_samples
|
28 |
+
self.sample_rate = sample_rate
|
29 |
+
self.randomize = randomize
|
30 |
+
|
31 |
+
def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
|
32 |
+
if(duration<(float(self.n_samples)/self.sample_rate+1)):
|
33 |
+
# print(duration,(float(self.n_samples)/self.sample_rate+1))
|
34 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
35 |
+
t_start = 0.
|
36 |
+
t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
|
37 |
+
offset = 0
|
38 |
+
# print('c1:',chunk.shape)
|
39 |
+
else:
|
40 |
+
offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
41 |
+
t_start = offset / float(cur_sample_rate) / duration
|
42 |
+
t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
|
43 |
+
chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
44 |
+
# print('offset:',offset)
|
45 |
+
# print('c0:',chunk.shape)
|
46 |
+
# Pad with silence if necessary.
|
47 |
+
if(chunk.shape[0]>1):
|
48 |
+
chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
|
49 |
+
else:
|
50 |
+
chunk = chunk[[0],:].float()
|
51 |
+
if(cur_sample_rate!=self.sample_rate):
|
52 |
+
# print('a:',cur_sample_rate,chunk.shape)
|
53 |
+
chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
|
54 |
+
# print('b:',self.sample_rate,chunk.shape)
|
55 |
+
if chunk.shape[-1] < self.n_samples:
|
56 |
+
chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
|
57 |
+
else:
|
58 |
+
chunk = chunk[:,0:self.n_samples]
|
59 |
+
seconds_start = math.floor(offset / cur_sample_rate)
|
60 |
+
seconds_total = math.floor(duration)
|
61 |
+
|
62 |
+
return (
|
63 |
+
chunk,
|
64 |
+
t_start,
|
65 |
+
t_end,
|
66 |
+
seconds_start,
|
67 |
+
seconds_total
|
68 |
+
)
|
69 |
+
|
70 |
+
|
71 |
+
USE_DUMMY_AUDIO = False  # set to True only when testing the code: no real data is read, generated silent audio is used instead
|
72 |
+
if USE_DUMMY_AUDIO:
|
73 |
+
logger.warning("USE_DUMMY_AUDIO flag is True, don't use it when train or test!")
|
74 |
+
|
75 |
+
class SafeAudioReader:
|
76 |
+
"""
|
77 |
+
This class is an adaptor to Read_and_PadCrop_Normalized_T, make it safe to read audio data.
|
78 |
+
"""
|
79 |
+
def __init__(self,
|
80 |
+
duration: float, # length (in seconds) of the returned audio
|
81 |
+
sample_rate: int, # sample rate of the returned audio; resampled if it differs from the file's actual rate
|
82 |
+
randomize: bool = True
|
83 |
+
):
|
84 |
+
self.n_samples = int(sample_rate * max(duration, 0))
|
85 |
+
self.reader = Read_and_PadCrop_Normalized_T(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
|
86 |
+
|
87 |
+
# NOTE: this is the core function -- every dataset reads audio through this call!
|
88 |
+
def __call__(self,
|
89 |
+
filepath: os.PathLike, # audio path
|
90 |
+
origin_sample_rate: Optional[int] = None, # actual sample rate read from the json metadata; probed from the file header if not given
|
91 |
+
origin_duration: float = None, # actual duration read from the json metadata; probed from the file header if not given
|
92 |
+
) -> torch.Tensor:
|
93 |
+
if USE_DUMMY_AUDIO:
|
94 |
+
wav = torch.zeros(self.n_samples, dtype=torch.float32)
|
95 |
+
return wav
|
96 |
+
try:
|
97 |
+
if origin_sample_rate is None or origin_duration is None:
|
98 |
+
audio_info = torchaudio.info(filepath)
|
99 |
+
origin_sample_rate = audio_info.sample_rate
|
100 |
+
origin_duration = audio_info.num_frames / origin_sample_rate
|
101 |
+
wav, *ignored = self.reader(filepath, origin_duration, origin_sample_rate)
|
102 |
+
except Exception as e:
|
103 |
+
logger.error(f"Error reading {filepath}: {e}")
|
104 |
+
wav = torch.zeros(self.n_samples, dtype=torch.float32)
|
105 |
+
return wav
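# Usage sketch (the file path below is hypothetical): SafeAudioReader hides the
# crop/resample details and degrades to silence if the file cannot be read.
def _example_safe_read():
    reader = SafeAudioReader(duration=10, sample_rate=48000)
    wav = reader("some_track.flac")  # header is probed when sr/duration are not given
    # torch.Size([1, 480000]) on success; the error fallback is a flat 1-D silent tensor
    return wav.shape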
|
106 |
+
|
107 |
+
|
108 |
+
class PromptTemplate:
|
109 |
+
def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
|
110 |
+
self.template_text = template_text
|
111 |
+
self.tag_map = tag_map
|
112 |
+
self.lang = lang
|
113 |
+
|
114 |
+
@property
|
115 |
+
def tags(self):
|
116 |
+
return tuple(self.tag_map.keys())
|
117 |
+
|
118 |
+
def apply(self, **kwargs):
|
119 |
+
for tag in list(kwargs.keys()):
|
120 |
+
if kwargs[tag] == '':
|
121 |
+
kwargs.pop(tag)
|
122 |
+
for tag in self.tags:
|
123 |
+
if tag in kwargs:
|
124 |
+
kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
|
125 |
+
else:
|
126 |
+
kwargs[tag] = ''
|
127 |
+
prompt = self.template_text.format(**kwargs)
|
128 |
+
|
129 |
+
return self.beautify(prompt)
|
130 |
+
|
131 |
+
def beautify(self, text):
|
132 |
+
if self.lang == 'en':
|
133 |
+
return self._beautify_en(text)
|
134 |
+
elif self.lang == 'zh':
|
135 |
+
return self._beautify_zh(text)
|
136 |
+
else:
|
137 |
+
raise ValueError(f'Unknown language {self.lang}')
|
138 |
+
|
139 |
+
@staticmethod
|
140 |
+
def _beautify_en(text):
|
141 |
+
# no continuous commas without content between them
|
142 |
+
text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
|
143 |
+
# no continuous whitespace
|
144 |
+
text = re.sub(r'\s+', ' ', text)
|
145 |
+
# the comma is NOT followed by whitespace, and should be followed by ONE whitespace
|
146 |
+
text = re.sub(r'\s+,', r',', text)
|
147 |
+
text = re.sub(r',\s+', r', ', text)
|
148 |
+
# no whitespace before the full stop
|
149 |
+
text = re.sub(r'\s+\.', r'.', text)
|
150 |
+
# strip whitespace, comma, and replace ',.'
|
151 |
+
text = text.strip(' ,')
|
152 |
+
text = text.replace(',.', '.')
|
153 |
+
return text
|
154 |
+
|
155 |
+
@staticmethod
|
156 |
+
def _beautify_zh(text):
|
157 |
+
# no continuous commas without content between them
|
158 |
+
text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
|
159 |
+
text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
|
160 |
+
# assume there should be NO whitespace in Chinese
|
161 |
+
text = re.sub(r'\s+', r'', text)
|
162 |
+
# strip whitespace, comma, and replace ',。'
|
163 |
+
text = text.strip(', 、')
|
164 |
+
text = text.replace(',。', '。')
|
165 |
+
return text
|
166 |
+
|
167 |
+
def __repr__(self):
|
168 |
+
return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
|
169 |
+
|
170 |
+
__str__ = __repr__
|
171 |
+
|
172 |
+
def parse_prompt_template(prompt_template_text, lang='en'):
|
173 |
+
span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
|
174 |
+
tag_pattern = re.compile(r'{.+?}', re.DOTALL)
|
175 |
+
|
176 |
+
template_text = prompt_template_text.strip()
|
177 |
+
span_texts = span_pattern.findall(prompt_template_text)
|
178 |
+
tag_map = {}
|
179 |
+
for span_text in span_texts:
|
180 |
+
tag = tag_pattern.findall(span_text)[0].strip('{}')
|
181 |
+
tag_map[tag] = span_text
|
182 |
+
template_text = template_text.replace(span_text, '{'+tag+'}')
|
183 |
+
|
184 |
+
return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
|
185 |
+
|
186 |
+
def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
|
187 |
+
with open(path, 'r') as f:
|
188 |
+
lines = f.readlines()
|
189 |
+
cnt = 0
|
190 |
+
pts = []
|
191 |
+
for line in lines:
|
192 |
+
pt = parse_prompt_template(line, lang=lang)
|
193 |
+
cnt += 1
|
194 |
+
if len(pt.tags) < num:
|
195 |
+
logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
|
196 |
+
pts.append(pt)
|
197 |
+
|
198 |
+
return pts
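# Illustration of the template grammar with a toy template (made up here; the
# real templates come from the file given as prompt_template_path). Bracketed
# spans are dropped when their tag is missing, then the text is "beautified".
def _example_prompt_template():
    pt = parse_prompt_template('[A {Genre} track][, with a {Mood} mood].', lang='en')
    assert pt.tags == ('Genre', 'Mood')
    assert pt.apply(Genre='jazz') == 'A jazz track.'
    assert pt.apply(Genre='jazz', Mood='calm') == 'A jazz track, with a calm mood.'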
|
199 |
+
|
200 |
+
|
201 |
+
def get_base_dir_file(key: os.PathLike):
|
202 |
+
base = os.path.basename(key)
|
203 |
+
dirname = os.path.basename(os.path.dirname(key))
|
204 |
+
return os.path.join(dirname, base)
|
205 |
+
|
206 |
+
def read_jsonlike(path: os.PathLike):
|
207 |
+
#json or jsonl
|
208 |
+
if str(path).endswith(".json"):
|
209 |
+
with open(path, 'r', encoding='utf8') as f:
|
210 |
+
data = json.load(f)
|
211 |
+
return data
|
212 |
+
elif str(path).endswith(".jsonl"):
|
213 |
+
with open(path, 'r', encoding='utf8') as f:
|
214 |
+
data = [json.loads(line) for line in f.readlines()]
|
215 |
+
return data
|
216 |
+
else:
|
217 |
+
raise ValueError("Unknown file format")
|
218 |
+
|
219 |
+
dist_prob_map = {
|
220 |
+
1: (1.0,),
|
221 |
+
2: (0.5, 0.5),
|
222 |
+
3: (0.3, 0.4, 0.3),
|
223 |
+
4: (0.2, 0.3, 0.3, 0.2),
|
224 |
+
5: (0.2, 0.2, 0.3, 0.2, 0.1),
|
225 |
+
6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
|
226 |
+
7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
|
227 |
+
8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
|
228 |
+
9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
|
229 |
+
10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
|
230 |
+
}
|
231 |
+
|
232 |
+
dist_prob_map_low = {
|
233 |
+
1: (1.0,),
|
234 |
+
2: (0.8, 0.2),
|
235 |
+
3: (0.8, 0.1, 0.1),
|
236 |
+
4: (0.7, 0.1, 0.1, 0.1),
|
237 |
+
5: (0.7, 0.1, 0.1, 0.05, 0.05),
|
238 |
+
6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
|
239 |
+
}
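# How these probability maps are used below: given N candidate tags,
# dist_prob_map[N] weights how many of them (1..N) are kept for the prompt,
# while dist_prob_map_low[N] is biased toward keeping fewer (weak) tags.
def _example_tag_count(n_tags=4):
    k = random.choices(range(1, n_tags + 1), dist_prob_map[n_tags])[0]          # 1..4, mid-heavy
    k_low = random.choices(range(1, n_tags + 1), dist_prob_map_low[n_tags])[0]  # usually 1
    return k, k_low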
|
240 |
+
|
241 |
+
_bpm_range_rights = (
|
242 |
+
(40, '20-40'),
|
243 |
+
(60, '40-60'),
|
244 |
+
(66, '60-66'),
|
245 |
+
(76, '66-76'),
|
246 |
+
(108, '76-108'),
|
247 |
+
(120, '108-120'),
|
248 |
+
(168, '120-168'),
|
249 |
+
(176, '168-176'),
|
250 |
+
(200, '176-200')
|
251 |
+
)
|
252 |
+
_bpm_desc_map = {
|
253 |
+
'20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
|
254 |
+
'40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
|
255 |
+
'60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
|
256 |
+
'66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
|
257 |
+
'76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
|
258 |
+
'108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
|
259 |
+
'120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
|
260 |
+
'168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
|
261 |
+
'176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
|
262 |
+
'>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
|
263 |
+
}
|
264 |
+
_bpm_desc_map_zh = {
|
265 |
+
'20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
|
266 |
+
'40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
|
267 |
+
'60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
|
268 |
+
'66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
|
269 |
+
'76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
|
270 |
+
'108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
|
271 |
+
'120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
|
272 |
+
'168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
|
273 |
+
'176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
|
274 |
+
'>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
|
275 |
+
}
|
276 |
+
def get_bpm_range(bpm):
|
277 |
+
bpm = int(bpm)
|
278 |
+
for right, tag in _bpm_range_rights:
|
279 |
+
if bpm <= right:
|
280 |
+
return tag
|
281 |
+
return '>200'
|
282 |
+
|
283 |
+
def gen_bpm_descript(bpm, lang='en'):
|
284 |
+
bpm_range = get_bpm_range(bpm)
|
285 |
+
if lang == 'en':
|
286 |
+
return random.choice(_bpm_desc_map[bpm_range])
|
287 |
+
elif lang == 'zh':
|
288 |
+
return random.choice(_bpm_desc_map_zh[bpm_range])
|
289 |
+
else:
|
290 |
+
raise ValueError(f"Unknown language {lang}")
|
291 |
+
|
292 |
+
def read_translate(translate: Optional[Dict[str, os.PathLike]]):
|
293 |
+
if translate is None:
|
294 |
+
return None
|
295 |
+
return {k: read_jsonlike(path) for k, path in translate.items()}
|
296 |
+
|
297 |
+
|
298 |
+
class MagnaTagATuneDataset(Dataset):
|
299 |
+
def __init__(self):
|
300 |
+
pass
|
301 |
+
|
302 |
+
|
303 |
+
def tags_to_desc(tag_list, sep=',') -> str:
|
304 |
+
if not isinstance(tag_list, Sequence):
|
305 |
+
return str(tag_list)
|
306 |
+
if isinstance(tag_list, str):
|
307 |
+
return tag_list
|
308 |
+
if len(tag_list) <= 0:
|
309 |
+
return ''
|
310 |
+
elif len(tag_list) <= 5:
|
311 |
+
probs = dist_prob_map[len(tag_list)]
|
312 |
+
tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
|
313 |
+
random.shuffle(tag_list)
|
314 |
+
tag_list = tag_list[:tags_num]
|
315 |
+
return sep.join(tag_list)
|
316 |
+
else:
|
317 |
+
probs = dist_prob_map[5]
|
318 |
+
tags_num = random.choices(range(1, 6), probs)[0]
|
319 |
+
random.shuffle(tag_list)
|
320 |
+
tag_list = tag_list[:tags_num]
|
321 |
+
return sep.join(tag_list)
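# Behaviour sketch of tags_to_desc: lists are shuffled and subsampled to at most
# five tags (counts weighted by dist_prob_map); strings and scalars pass through.
def _example_tags_to_desc():
    print(tags_to_desc(['piano', 'ambient', 'calm']))  # e.g. 'calm, piano' (count and order vary)
    print(tags_to_desc('piano'))                        # 'piano'
    print(tags_to_desc(120))                            # '120'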
|
322 |
+
|
323 |
+
def get_sr_and_duration_info(item):
|
324 |
+
return item.get('sample_rate', None), item.get('duration', None)
|
325 |
+
|
326 |
+
class MtgJamendoDatasetFromJson(Dataset):
|
327 |
+
def __init__(self,
|
328 |
+
data_dir:str,
|
329 |
+
json_path:str,
|
330 |
+
duration:float=10,
|
331 |
+
sr:int = 0,
|
332 |
+
*,
|
333 |
+
lang = 'en',
|
334 |
+
return_path = False,
|
335 |
+
prompt_template_path: os.PathLike = None,
|
336 |
+
tag_types = [],
|
337 |
+
translate:Optional[Dict[str, os.PathLike]] = None,
|
338 |
+
):
|
339 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
340 |
+
|
341 |
+
self.data_dir = data_dir
|
342 |
+
self._load_metadata_json(json_path)
|
343 |
+
self.sr = sr
|
344 |
+
self.duration = duration
|
345 |
+
self.return_path = return_path
|
346 |
+
self.lang = lang
|
347 |
+
|
348 |
+
self.use_dynamic_prompt = prompt_template_path is not None
|
349 |
+
if self.use_dynamic_prompt:
|
350 |
+
self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
|
351 |
+
self.tag_types = tag_types
|
352 |
+
|
353 |
+
self.translate = read_translate(translate)
|
354 |
+
if not self.use_dynamic_prompt and self.lang != 'en':
|
355 |
+
raise NotImplementedError
|
356 |
+
|
357 |
+
# these tags are considered weakly semantic; prompts containing only these tags are avoided
|
358 |
+
WEAK_TAG_LIST = ["title", "artist"]
|
359 |
+
|
360 |
+
def _load_metadata_json(self, json_path):
|
361 |
+
with open(json_path) as fp:
|
362 |
+
self.data = json.load(fp)
|
363 |
+
|
364 |
+
def convert_key_to_path(self, key):
|
365 |
+
return os.path.join(self.data_dir, get_base_dir_file(key))
|
366 |
+
|
367 |
+
def __len__(self):
|
368 |
+
return len(self.data)
|
369 |
+
|
370 |
+
def __getitem__(self, idx):
|
371 |
+
item = self.data[idx]
|
372 |
+
path = self.convert_key_to_path(item['key'])
|
373 |
+
description = self.generate_description(item)
|
374 |
+
|
375 |
+
sr, duration = get_sr_and_duration_info(item)
|
376 |
+
audio = self.audio_reader(path, sr, duration)
|
377 |
+
|
378 |
+
if self.return_path:
|
379 |
+
return audio, description, path
|
380 |
+
return audio, description
|
381 |
+
|
382 |
+
def tags_to_desc(self, tag_list, tag_type) -> str:
|
383 |
+
if self.lang == 'en':
|
384 |
+
return tags_to_desc(tag_list)
|
385 |
+
elif self.lang == 'zh':
|
386 |
+
translator = self.translate[tag_type]
|
387 |
+
translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
|
388 |
+
return tags_to_desc(translated_tag_list, sep='、')
|
389 |
+
|
390 |
+
def generate_description(self, item):
|
391 |
+
if self.use_dynamic_prompt:
|
392 |
+
# dynamically generate prompt from given prompt template
|
393 |
+
prompt_template = random.choice(self.prompt_templates)
|
394 |
+
description = self.generate_description_dynamic(item, prompt_template)
|
395 |
+
|
396 |
+
else:
|
397 |
+
# use ordinary static prompt instead
|
398 |
+
description = self.generate_description_ordinary(item)
|
399 |
+
return description
|
400 |
+
|
401 |
+
def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
|
402 |
+
exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
|
403 |
+
exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
|
404 |
+
exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))
|
405 |
+
|
406 |
+
if len(exists_strong_tag) > 0:
|
407 |
+
probs = dist_prob_map[len(exists_strong_tag)]
|
408 |
+
tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
|
409 |
+
random.shuffle(exists_strong_tag)
|
410 |
+
tags = exists_strong_tag[:tags_num]
|
411 |
+
weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
|
412 |
+
weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
|
413 |
+
random.shuffle(exists_weak_tag)
|
414 |
+
weak_tags = exists_weak_tag[:weak_tags_num]
|
415 |
+
tags += weak_tags
|
416 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
|
417 |
+
prompt = prompt_template.apply(**tags_args)
|
418 |
+
else:
|
419 |
+
# no strong tags, use all weak tags instead
|
420 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
|
421 |
+
prompt = prompt_template.apply(**tags_args)
|
422 |
+
|
423 |
+
return prompt
|
424 |
+
|
425 |
+
def generate_description_ordinary(self, data, thresh = 0.3):
|
426 |
+
# Initialize the description with title and artist
|
427 |
+
description = f'"{data["title"]+" is " if random.random() > thresh else ""}"a piece of music by {data["artist"]}'
|
428 |
+
|
429 |
+
# Add genre if available
|
430 |
+
if data["genre"] and random.random() > thresh:
|
431 |
+
genres = ', '.join(data["genre"])
|
432 |
+
description += f', belonging to the {genres} genres'
|
433 |
+
|
434 |
+
# Add moods if available
|
435 |
+
if data["moods"] and random.random() > thresh:
|
436 |
+
moods = ', '.join(data["moods"])
|
437 |
+
description += f'. This track conveys a {moods} mood'
|
438 |
+
|
439 |
+
# Add instruments if available
|
440 |
+
if data["instrument"] and random.random() > thresh:
|
441 |
+
instruments = ', '.join(data["instrument"])
|
442 |
+
description += f', and primarily features the following instruments: {instruments}'
|
443 |
+
|
444 |
+
# Add a period to end the description
|
445 |
+
description += '.'
|
446 |
+
|
447 |
+
return description
|
448 |
+
|
449 |
+
class AudioStockDataset(Dataset):
|
450 |
+
def __init__(self,
|
451 |
+
metadata_path:str,
|
452 |
+
duration:float=10,
|
453 |
+
sr:int = 0,
|
454 |
+
return_path = False,
|
455 |
+
return_audio = True,
|
456 |
+
prompt_template_path: os.PathLike = None,
|
457 |
+
tag_types = [],
|
458 |
+
lang = 'en',
|
459 |
+
translate:Optional[Dict[str, os.PathLike]] = None
|
460 |
+
):
|
461 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
462 |
+
|
463 |
+
self._load_metadata(metadata_path)
|
464 |
+
self.sr = sr
|
465 |
+
self.duration = duration
|
466 |
+
self.return_path = return_path
|
467 |
+
self.return_audio = return_audio
|
468 |
+
|
469 |
+
self.use_dynamic_prompt = prompt_template_path is not None
|
470 |
+
if self.use_dynamic_prompt:
|
471 |
+
self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
|
472 |
+
self.tag_types = tag_types
|
473 |
+
|
474 |
+
self.lang = lang
|
475 |
+
self.translate = read_translate(translate)
|
476 |
+
|
477 |
+
def _load_metadata(self, metadata_path):
|
478 |
+
with open(metadata_path) as fp:
|
479 |
+
lines = fp.readlines()
|
480 |
+
self.data = []
|
481 |
+
for line in lines:
|
482 |
+
item = json.loads(line)
|
483 |
+
self.data.append(item)
|
484 |
+
self.is_info_recorded = bool('Tags' in self.data[0])
|
485 |
+
|
486 |
+
def __len__(self):
|
487 |
+
return len(self.data)
|
488 |
+
|
489 |
+
def __getitem__(self, idx):
|
490 |
+
path:str = self.data[idx]["path"]
|
491 |
+
json_path = path[:path.rfind('.')] + ".json"
|
492 |
+
if self.is_info_recorded:
|
493 |
+
item = self.data[idx]
|
494 |
+
else:
|
495 |
+
try:
|
496 |
+
with open(json_path) as fp:
|
497 |
+
item:dict = json.load(fp)
|
498 |
+
except Exception as e:
|
499 |
+
print(f"Error loading json file {json_path} :\n{e}")
|
500 |
+
item = {}
|
501 |
+
description = self.generate_description(item)
|
502 |
+
if self.return_audio:
|
503 |
+
sr, duration = get_sr_and_duration_info(item)
|
504 |
+
audio = self.audio_reader(path, sr, duration)
|
505 |
+
else:
|
506 |
+
audio = None
|
507 |
+
if self.return_path:
|
508 |
+
return audio, description, path
|
509 |
+
return audio, description
|
510 |
+
|
511 |
+
def generate_description(self, item):
|
512 |
+
if self.use_dynamic_prompt:
|
513 |
+
# dynamically generate prompt from given prompt template
|
514 |
+
prompt_template = random.choice(self.prompt_templates)
|
515 |
+
description = self.generate_description_dynamic(item, prompt_template)
|
516 |
+
else:
|
517 |
+
# use ordinary static prompt instead
|
518 |
+
description = self.generate_description_ordinary(item)
|
519 |
+
return description
|
520 |
+
|
521 |
+
def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
|
522 |
+
exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
|
523 |
+
|
524 |
+
if len(exists_tag) > 0:
|
525 |
+
probs = dist_prob_map[len(exists_tag)]
|
526 |
+
tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
|
527 |
+
random.shuffle(exists_tag)
|
528 |
+
tags = exists_tag[:tags_num]
|
529 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
|
530 |
+
tags_args = self.handle_BPM_tag(tags_args)
|
531 |
+
prompt = prompt_template.apply(**tags_args)
|
532 |
+
else:
|
533 |
+
# no strong tags, use all weak tags instead
|
534 |
+
prompt = prompt_template.apply()
|
535 |
+
|
536 |
+
return prompt
|
537 |
+
|
538 |
+
def tags_to_desc(self, tag_list, tag_type) -> str:
|
539 |
+
if self.lang == 'en':
|
540 |
+
return tags_to_desc(tag_list)
|
541 |
+
elif self.lang == 'zh':
|
542 |
+
if tag_type == 'BPM':
|
543 |
+
return tags_to_desc(tag_list, sep='、')
|
544 |
+
translator = self.translate[tag_type]
|
545 |
+
translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
|
546 |
+
return tags_to_desc(translated_tag_list, sep='、')
|
547 |
+
|
548 |
+
def handle_BPM_tag(self, tags_args):
|
549 |
+
if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
|
550 |
+
bpm = tags_args["BPM"]
|
551 |
+
del tags_args["BPM"]
|
552 |
+
tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
|
553 |
+
for tag_type in tag_types_used:
|
554 |
+
tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
|
555 |
+
return tags_args
|
556 |
+
|
557 |
+
def generate_description_ordinary(self, data, thresh = 0.3):
|
558 |
+
if self.lang != 'en':
|
559 |
+
raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
|
560 |
+
description = f'a piece of music by {data["Artist"]}'
|
561 |
+
|
562 |
+
# Add genre if available
|
563 |
+
if data["Genre"] and random.random() > thresh:
|
564 |
+
genres = ', '.join(data["Genre"])
|
565 |
+
description += f', belonging to the {genres} genres'
|
566 |
+
|
567 |
+
# Add tags if available
|
568 |
+
if data["Tags"] and random.random() > thresh:
|
569 |
+
tags = ', '.join(data["Tags"])
|
570 |
+
description += f'. This track contains the tags: {tags}'
|
571 |
+
|
572 |
+
# Add moods if available
|
573 |
+
if data["Mood"] and random.random() > thresh:
|
574 |
+
moods = ', '.join(data["Mood"])
|
575 |
+
description += f'. This track conveys a {moods} mood.'
|
576 |
+
|
577 |
+
# Add instruments if available
|
578 |
+
if data["Instrument"] and random.random() > thresh:
|
579 |
+
instruments = ', '.join(data["Instrument"])
|
580 |
+
description += f', and primarily features the following instruments: {instruments}'
|
581 |
+
|
582 |
+
# Add a period to end the description
|
583 |
+
description += '.'
|
584 |
+
|
585 |
+
return description
|
586 |
+
|
587 |
+
def mp3_path_to_id(mp3_path):
|
588 |
+
return int(
|
589 |
+
mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.mp3')]
|
590 |
+
)
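# Small sanity check for the id parsing above (the path is hypothetical):
def _example_mp3_id():
    assert mp3_path_to_id('/data/tme/12345.mp3') == 12345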
|
591 |
+
|
592 |
+
class TmeDataset(Dataset):
|
593 |
+
def __init__(self,
|
594 |
+
data_index:str,
|
595 |
+
music_info:str = None,
|
596 |
+
duration:float = 10,
|
597 |
+
sr:int = 0,
|
598 |
+
return_path = False,
|
599 |
+
return_audio = True,
|
600 |
+
prompt_format_path: os.PathLike = None,
|
601 |
+
tag_types = ['*'],
|
602 |
+
lang = 'zh',
|
603 |
+
translate: Optional[os.PathLike] = None,
|
604 |
+
prompt_dir: os.PathLike = None,
|
605 |
+
):
|
606 |
+
self.audio_reader = SafeAudioReader(duration, sr)
|
607 |
+
|
608 |
+
self.sr = sr
|
609 |
+
self.duration = duration
|
610 |
+
self.return_path = return_path
|
611 |
+
self.return_audio = return_audio
|
612 |
+
self.lang = lang
|
613 |
+
|
614 |
+
self.use_ready_prompt = prompt_dir is not None
|
615 |
+
|
616 |
+
data_index = read_jsonlike(data_index)
|
617 |
+
self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
|
618 |
+
self.data_ids = list(self.data_index_dict.keys())
|
619 |
+
|
620 |
+
if not self.use_ready_prompt:
|
621 |
+
# read the music metadata file
|
622 |
+
music_info = read_jsonlike(music_info)
|
623 |
+
if 'music' in music_info:
|
624 |
+
music_info = music_info['music']
|
625 |
+
self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
|
626 |
+
self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
|
627 |
+
self.data_ids = list(self.data_index_dict.keys())
|
628 |
+
|
629 |
+
with open(prompt_format_path) as fp:
|
630 |
+
self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)
|
631 |
+
|
632 |
+
# load tag types and split them into ordinary tag_types and key_tag_types
|
633 |
+
if '*' in tag_types:
|
634 |
+
self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
|
635 |
+
else:
|
636 |
+
self.tag_types = tag_types
|
637 |
+
|
638 |
+
self.key_tag_types = []
|
639 |
+
if 'tag' in self.tag_types:
|
640 |
+
self.tag_types.remove('tag')
|
641 |
+
self.key_tag_types = list(self.prompt_formats['tag'].keys())
|
642 |
+
|
643 |
+
# load the translation table
|
644 |
+
if translate is not None:
|
645 |
+
self.translator = read_jsonlike(translate)
|
646 |
+
else:
|
647 |
+
data_ids_set = set(self.data_ids)
|
648 |
+
self.prompts_dict = {}
|
649 |
+
for fname in os.listdir(prompt_dir):
|
650 |
+
items = read_jsonlike(os.path.join(prompt_dir, fname))
|
651 |
+
for item in items:
|
652 |
+
if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
|
653 |
+
continue
|
654 |
+
if item['ID'] not in self.prompts_dict:
|
655 |
+
self.prompts_dict[item['ID']] = []
|
656 |
+
self.prompts_dict[item['ID']].append(item['Text'])
|
657 |
+
self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
|
658 |
+
self.data_ids = list(self.data_index_dict.keys())
|
659 |
+
|
660 |
+
def tags_to_desc(self, tag_list) -> str:
|
661 |
+
if is_bearable(tag_list, int):
|
662 |
+
return str(tag_list)
|
663 |
+
if self.lang == 'zh':
|
664 |
+
return tags_to_desc(tag_list, sep=self.sep)
|
665 |
+
else:
|
666 |
+
translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
|
667 |
+
return tags_to_desc(translated_tag_list, sep=self.sep)
|
668 |
+
|
669 |
+
def gen_desc_of_tag(self, formats, tags):
|
670 |
+
fmt = random.choice(formats)
|
671 |
+
return fmt.format(self.tags_to_desc(tags))
|
672 |
+
|
673 |
+
@staticmethod
|
674 |
+
def check_valid(value):
|
675 |
+
if isinstance(value, int) or isinstance(value, float):
|
676 |
+
return value > 0
|
677 |
+
if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
|
678 |
+
return True
|
679 |
+
return False
|
680 |
+
|
681 |
+
@staticmethod
|
682 |
+
def remove_repeat(data):
|
683 |
+
# if the album name equals the song name, keep only the latter
|
684 |
+
album_name = data.get('专辑名', None)
|
685 |
+
if album_name is not None and album_name == data.get('歌曲名', None):
|
686 |
+
del data['专辑名']
|
687 |
+
return data
|
688 |
+
|
689 |
+
@property
|
690 |
+
def comma(self):
|
691 |
+
if self.lang == 'zh':
|
692 |
+
return ','
|
693 |
+
elif self.lang == 'en':
|
694 |
+
return ', '
|
695 |
+
|
696 |
+
@property
|
697 |
+
def sep(self):
|
698 |
+
if self.lang == 'zh':
|
699 |
+
return '、'
|
700 |
+
elif self.lang == 'en':
|
701 |
+
return ', '
|
702 |
+
|
703 |
+
def generate_description(self, data):
|
704 |
+
data = self.remove_repeat(data)
|
705 |
+
weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))] # weakly semantic tags; their sampling ratio is lowered
|
706 |
+
|
707 |
+
key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))] # key tags; at least one of them must appear
|
708 |
+
|
709 |
+
prompts = []
|
710 |
+
if len(weak_tags) > 0:
|
711 |
+
probs = dist_prob_map_low[len(weak_tags)]
|
712 |
+
if len(key_tags) > 0:
|
713 |
+
tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
|
714 |
+
else:
|
715 |
+
tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
|
716 |
+
random.shuffle(weak_tags)
|
717 |
+
tags = weak_tags[:tags_num]
|
718 |
+
for tag_type in tags:
|
719 |
+
tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
|
720 |
+
prompts.append(tag_desc)
|
721 |
+
|
722 |
+
if len(key_tags) > 0:
|
723 |
+
probs = dist_prob_map[len(key_tags)]
|
724 |
+
tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
|
725 |
+
random.shuffle(key_tags)
|
726 |
+
tags = key_tags[:tags_num]
|
727 |
+
for tag_type in tags:
|
728 |
+
tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
|
729 |
+
prompts.append(tag_desc)
|
730 |
+
|
731 |
+
random.shuffle(prompts)
|
732 |
+
return self.comma.join(prompts)
|
733 |
+
|
734 |
+
def is_valid_prompt_text(self, text):
|
735 |
+
for bad in ('抱歉','sorry', 'Sorry'):
|
736 |
+
if bad in text:
|
737 |
+
return False
|
738 |
+
return True
|
739 |
+
|
740 |
+
def get_ready_prompt(self, path):
|
741 |
+
sid = mp3_path_to_id(path)
|
742 |
+
return random.choice(self.prompts_dict[sid])
|
743 |
+
|
744 |
+
def __len__(self):
|
745 |
+
return len(self.data_ids)
|
746 |
+
|
747 |
+
def __getitem__(self, idx):
|
748 |
+
data_id = self.data_ids[idx]
|
749 |
+
item = self.data_index_dict[data_id]
|
750 |
+
path = item['path']
|
751 |
+
if not self.use_ready_prompt:
|
752 |
+
info = self.music_info_dict[data_id]
|
753 |
+
description = self.generate_description(info)
|
754 |
+
else:
|
755 |
+
description = self.get_ready_prompt(path)
|
756 |
+
if self.return_audio:
|
757 |
+
sr, duration = get_sr_and_duration_info(item)
|
758 |
+
audio = self.audio_reader(path, sr, duration)
|
759 |
+
else:
|
760 |
+
audio = None
|
761 |
+
if self.return_path:
|
762 |
+
return audio, description, path
|
763 |
+
return audio, description
|
764 |
+
|
765 |
+
class CombinedDataset(Dataset):
|
766 |
+
@beartype
|
767 |
+
def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
|
768 |
+
self.datasets = datasets
|
769 |
+
self.datasets_index = []
|
770 |
+
|
771 |
+
for i,dataset in enumerate(datasets):
|
772 |
+
if dataset is None:
|
773 |
+
continue
|
774 |
+
for dup in range(ratios[i]):
|
775 |
+
for j in range(len(dataset)):
|
776 |
+
self.datasets_index.append((i,j))
|
777 |
+
|
778 |
+
def __len__(self):
|
779 |
+
return len(self.datasets_index)
|
780 |
+
|
781 |
+
def __getitem__(self, idx):
|
782 |
+
index = self.datasets_index[idx]
|
783 |
+
i,j = index
|
784 |
+
return self.datasets[i][j]
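# Illustrative sketch of mixing two already-constructed datasets with integer
# ratios (the variable names below are hypothetical); ratios=[1, 3] repeats the
# second dataset's indices three times, oversampling it relative to the first.
def _example_combine(audiostock_ds, tme_ds):
    combined = CombinedDataset(datasets=[audiostock_ds, tme_ds], ratios=[1, 3])
    # len(combined) == len(audiostock_ds) + 3 * len(tme_ds)
    return combined[0]  # (audio, description), plus the path if return_path=True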
|
785 |
+
|
786 |
+
class CombinedDataset_random(Dataset):
|
787 |
+
@beartype
|
788 |
+
def __init__(self,
|
789 |
+
num_examples:int,
|
790 |
+
datasets: Sequence[Dataset], ratios: Sequence[int]
|
791 |
+
):
|
792 |
+
self.datasets = datasets
|
793 |
+
self.datasets_index = []
|
794 |
+
|
795 |
+
for i,dataset in enumerate(datasets):
|
796 |
+
if dataset is None:
|
797 |
+
continue
|
798 |
+
for dup in range(ratios[i]):
|
799 |
+
for j in range(len(dataset)):
|
800 |
+
self.datasets_index.append((i,j))
|
801 |
+
if num_examples > 0:
|
802 |
+
self.random_choose = True
|
803 |
+
self.dataset_len = num_examples
|
804 |
+
else:
|
805 |
+
self.random_choose = False
|
806 |
+
self.dataset_len = len(self.datasets_index)
|
807 |
+
|
808 |
+
def __len__(self):
|
809 |
+
return self.dataset_len
|
810 |
+
|
811 |
+
def __getitem__(self, idx):
|
812 |
+
first_try = True
|
813 |
+
try_cnt = 0
|
814 |
+
while True:
|
815 |
+
try:
|
816 |
+
if(self.random_choose or not first_try):
|
817 |
+
index2 = []
|
818 |
+
index2.append(np.random.randint(0,len(self.datasets)))
|
819 |
+
index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
|
820 |
+
else:
|
821 |
+
index2 = self.datasets_index[idx]
|
822 |
+
first_try = False
|
823 |
+
out = self.datasets[index2[0]][index2[1]]
|
824 |
+
if(len(out[0].shape)==1):out[0]=out[0][None,:]
|
825 |
+
return out
|
826 |
+
except:
|
827 |
+
print("Error loadding ", index2)
|
828 |
+
try_cnt += 1
|
829 |
+
if(try_cnt>10):
|
830 |
+
raise ValueError()
|
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_combined_withset.py
ADDED
@@ -0,0 +1,994 @@
1 |
+
from torch.utils.data import Dataset
|
2 |
+
from beartype.typing import Sequence, Callable, Optional, Dict, Tuple, List
|
3 |
+
from beartype import beartype
|
4 |
+
from beartype.door import is_bearable
|
5 |
+
import random
|
6 |
+
import pandas as pd
|
7 |
+
import os
|
8 |
+
from torchaudio.functional import resample
|
9 |
+
import torch
|
10 |
+
import typing as tp
|
11 |
+
from pathlib import Path
|
12 |
+
import torchaudio as ta
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import numpy as np
|
15 |
+
import json
|
16 |
+
import yaml
|
17 |
+
import torchaudio
|
18 |
+
import math
|
19 |
+
import re
|
20 |
+
from loguru import logger
|
21 |
+
|
22 |
+
def gen_plain_prompt(key_list, sep=', '):
|
23 |
+
if len(key_list) == 0:
|
24 |
+
return 'none'
|
25 |
+
|
26 |
+
key_list = [k.strip() for k in key_list]
|
27 |
+
|
28 |
+
if len(key_list) > 10:
|
29 |
+
random.shuffle(key_list)
|
30 |
+
key_list = key_list[:10]
|
31 |
+
|
32 |
+
probs = dist_prob_map[len(key_list)]
|
33 |
+
|
34 |
+
num_tags = random.choices(range(1, len(key_list)+1), probs, k=1)[0]
|
35 |
+
|
36 |
+
random.shuffle(key_list)
|
37 |
+
tags = key_list[:num_tags]
|
38 |
+
tags_str = sep.join(tags)
|
39 |
+
return tags_str
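# Quick illustration: gen_plain_prompt keeps a random subset of up to 10 tags
# (sized via dist_prob_map) and joins them with the separator.
def _example_plain_prompt():
    print(gen_plain_prompt(['piano', 'jazz', 'night drive']))  # e.g. 'jazz, piano'
    print(gen_plain_prompt([]))                                # 'none'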
|
40 |
+
|
41 |
+
class Read_and_PadCrop_Normalized_T(torch.nn.Module):
|
42 |
+
|
43 |
+
def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
|
44 |
+
|
45 |
+
super().__init__()
|
46 |
+
|
47 |
+
self.n_samples = n_samples
|
48 |
+
self.sample_rate = sample_rate
|
49 |
+
self.randomize = randomize
|
50 |
+
self.prob = {"is_start":0.2, "is_end":0.9}
|
51 |
+
self.shift_secs = 5
|
52 |
+
|
53 |
+
def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
|
54 |
+
if(duration<(float(self.n_samples)/self.sample_rate+1)):
|
55 |
+
raise ValueError(duration,float(self.n_samples),self.sample_rate)
|
56 |
+
chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
|
57 |
+
t_start = 0.
|
58 |
+
t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
|
59 |
+
offset = 0
|
60 |
+
is_start = True
|
61 |
+
is_end = True
|
62 |
+
else:
|
63 |
+
prob = random.uniform(0,1)
|
64 |
+
if(prob<self.prob['is_start']):
|
65 |
+
is_start = True
|
66 |
+
is_end = False
|
67 |
+
offset = 0
|
68 |
+
elif(prob>self.prob['is_end']):
|
69 |
+
is_start = False
|
70 |
+
is_end = True
|
71 |
+
offset = int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate)
|
72 |
+
else:
|
73 |
+
is_start = False
|
74 |
+
is_end = False
|
75 |
+
offset = np.random.randint(self.shift_secs*cur_sample_rate, \
|
76 |
+
int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate)-self.shift_secs*cur_sample_rate)
|
77 |
+
t_start = offset / float(cur_sample_rate) / duration
|
78 |
+
t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
|
79 |
+
chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
80 |
+
if(chunk.shape[0]>1):
|
81 |
+
chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
|
82 |
+
else:
|
83 |
+
chunk = chunk[[0],:].float()
|
84 |
+
if(cur_sample_rate!=self.sample_rate):
|
85 |
+
# print('a:',cur_sample_rate,chunk.shape)
|
86 |
+
chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
|
87 |
+
# print('b:',self.sample_rate,chunk.shape)
|
88 |
+
if chunk.shape[-1] != self.n_samples:
|
89 |
+
raise ValueError(chunk.shape, self.n_samples, offset, int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
|
90 |
+
# if chunk.shape[-1] < self.n_samples:
|
91 |
+
# chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
|
92 |
+
# else:
|
93 |
+
# chunk = chunk[:,0:self.n_samples]
|
94 |
+
seconds_start = math.floor(offset / cur_sample_rate)
|
95 |
+
seconds_total = math.floor(duration)
|
96 |
+
|
97 |
+
# # In this dataset, we do not introduce zeros
|
98 |
+
# if(is_start):
|
99 |
+
# chunk = torch.cat([torch.zeros(1, self.shift_secs*self.sample_rate), chunk],1)[:,0:self.n_samples]
|
100 |
+
# elif(is_end):
|
101 |
+
# chunk = torch.cat([chunk, torch.zeros(1, self.shift_secs*self.sample_rate)],1)[:,self.shift_secs*self.sample_rate:]
|
102 |
+
|
103 |
+
return (
|
104 |
+
chunk,
|
105 |
+
t_start,
|
106 |
+
t_end,
|
107 |
+
seconds_start,
|
108 |
+
seconds_total,
|
109 |
+
is_start,
|
110 |
+
is_end,
|
111 |
+
)
|
112 |
+
|
113 |
+
|
114 |
+
USE_DUMMY_AUDIO = False  # set to True only when testing the code: no real data is read, generated silent audio is used instead
|
115 |
+
if USE_DUMMY_AUDIO:
|
116 |
+
logger.warning("USE_DUMMY_AUDIO flag is True, don't use it when train or test!")
|
117 |
+
|
118 |
+
class SafeAudioReader:
|
119 |
+
"""
|
120 |
+
This class is an adaptor to Read_and_PadCrop_Normalized_T, make it safe to read audio data.
|
121 |
+
"""
|
122 |
+
def __init__(self,
|
123 |
+
duration: float, # length (in seconds) of the returned audio
|
124 |
+
sample_rate: int, # sample rate of the returned audio; resampled if it differs from the file's actual rate
|
125 |
+
randomize: bool = True
|
126 |
+
):
|
127 |
+
self.n_samples = int(sample_rate * max(duration, 0))
|
128 |
+
self.reader = Read_and_PadCrop_Normalized_T(n_samples=self.n_samples, sample_rate=sample_rate, randomize=randomize)
|
129 |
+
|
130 |
+
# NOTE: this is the core function -- every dataset reads audio through this call!
|
131 |
+
def __call__(self,
|
132 |
+
filepath: os.PathLike, # audio path
|
133 |
+
origin_sample_rate: Optional[int] = None, # actual sample rate read from the json metadata; probed from the file header if not given
|
134 |
+
origin_duration: float = None, # actual duration read from the json metadata; probed from the file header if not given
|
135 |
+
) -> torch.Tensor:
|
136 |
+
if USE_DUMMY_AUDIO:
|
137 |
+
wav = torch.zeros(self.n_samples, dtype=torch.float32)
|
138 |
+
return wav
|
139 |
+
try:
|
140 |
+
# if origin_sample_rate is None or origin_duration is None:
|
141 |
+
# audio_info = torchaudio.info(filepath)
|
142 |
+
# origin_sample_rate = audio_info.sample_rate
|
143 |
+
# origin_duration = audio_info.num_frames / origin_sample_rate
|
144 |
+
audio_info = torchaudio.info(filepath)
|
145 |
+
origin_sample_rate = audio_info.sample_rate
|
146 |
+
origin_duration = audio_info.num_frames / origin_sample_rate
|
147 |
+
wav, *ignored, is_start, is_end = self.reader(filepath, origin_duration, origin_sample_rate)
|
148 |
+
except Exception as e:
|
149 |
+
logger.error(f"Error reading {filepath}: {e}")
|
150 |
+
raise FileNotFoundError(filepath)
|
151 |
+
return wav, is_start, is_end
|
152 |
+
|
153 |
+
|
154 |
+
class PromptTemplate:
    def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
        self.template_text = template_text
        self.tag_map = tag_map
        self.lang = lang

    @property
    def tags(self):
        return tuple(self.tag_map.keys())

    def apply(self, **kwargs):
        for tag in list(kwargs.keys()):
            if kwargs[tag] == '':
                kwargs.pop(tag)
        for tag in self.tags:
            if tag in kwargs:
                kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
            else:
                kwargs[tag] = ''
        prompt = self.template_text.format(**kwargs)

        return self.beautify(prompt)

    def beautify(self, text):
        if self.lang == 'en':
            return self._beautify_en(text)
        elif self.lang == 'zh':
            return self._beautify_zh(text)
        else:
            raise ValueError(f'Unknown language {self.lang}')

    @staticmethod
    def _beautify_en(text):
        # no continuous commas without content between them
        text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
        # no continuous whitespace
        text = re.sub(r'\s+', ' ', text)
        # the comma is NOT followed by whitespace, and should be followed by ONE whitespace
        text = re.sub(r'\s+,', r',', text)
        text = re.sub(r',\s+', r', ', text)
        # no whitespace before the full stop
        text = re.sub(r'\s+\.', r'.', text)
        # strip whitespace, comma, and replace ',.'
        text = text.strip(' ,')
        text = text.replace(',.', '.')
        return text

    @staticmethod
    def _beautify_zh(text):
        # no continuous commas without content between them
        text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
        text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
        # assume there should be NO whitespace in Chinese
        text = re.sub(r'\s+', r'', text)
        # strip whitespace, comma, and replace ',。'
        text = text.strip(', 、')
        text = text.replace(',。', '。')
        return text

    def __repr__(self):
        return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'

    __str__ = __repr__

def parse_prompt_template(prompt_template_text, lang='en'):
    span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
    tag_pattern = re.compile(r'{.+?}', re.DOTALL)

    template_text = prompt_template_text.strip()
    span_texts = span_pattern.findall(prompt_template_text)
    tag_map = {}
    for span_text in span_texts:
        tag = tag_pattern.findall(span_text)[0].strip('{}')
        tag_map[tag] = span_text
        template_text = template_text.replace(span_text, '{'+tag+'}')

    return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)

def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
    with open(path, 'r') as f:
        lines = f.readlines()
    cnt = 0
    pts = []
    for line in lines:
        pt = parse_prompt_template(line, lang=lang)
        cnt += 1
        if len(pt.tags) < num:
            logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
        pts.append(pt)

    return pts

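# Illustrative sketch (not part of the original file; the template string is made up).
# Bracketed spans become optional slots: empty tags are dropped and the result is beautified.
# pt = parse_prompt_template('a [{genre} track][ with a {mood} mood].')
# pt.tags                                # ('genre', 'mood')
# pt.apply(genre='jazz', mood='calm')    # -> 'a jazz track with a calm mood.'
# pt.apply(genre='jazz', mood='')        # -> 'a jazz track.'
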
def get_base_dir_file(key: os.PathLike):
    base = os.path.basename(key)
    dirname = os.path.basename(os.path.dirname(key))
    return os.path.join(dirname, base)

def read_jsonlike(path: os.PathLike):
    #json or jsonl
    if str(path).endswith(".json"):
        with open(path, 'r', encoding='utf8') as f:
            data = json.load(f)
        return data
    elif str(path).endswith(".jsonl"):
        with open(path, 'r', encoding='utf8') as f:
            data = [json.loads(line) for line in f.readlines()]
        return data
    else:
        raise ValueError("Unknown file format")

dist_prob_map = {
    1: (1.0,),
    2: (0.5, 0.5),
    3: (0.3, 0.4, 0.3),
    4: (0.2, 0.3, 0.3, 0.2),
    5: (0.2, 0.2, 0.3, 0.2, 0.1),
    6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
    7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
    8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
    9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
    10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
}

dist_prob_map_low = {
    1: (1.0,),
    2: (0.8, 0.2),
    3: (0.8, 0.1, 0.1),
    4: (0.7, 0.1, 0.1, 0.1),
    5: (0.7, 0.1, 0.1, 0.05, 0.05),
    6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
}

_bpm_range_rights = (
    (40, '20-40'),
    (60, '40-60'),
    (66, '60-66'),
    (76, '66-76'),
    (108, '76-108'),
    (120, '108-120'),
    (168, '120-168'),
    (176, '168-176'),
    (200, '176-200')
)
_bpm_desc_map = {
    '20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
    '40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
    '60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
    '66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
    '76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
    '108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
    '120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
    '168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
    '176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
    '>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
}
_bpm_desc_map_zh = {
    '20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
    '40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
    '60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
    '66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
    '76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
    '108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
    '120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
    '168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
    '176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
    '>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
}
def get_bpm_range(bpm):
    bpm = int(bpm)
    for right, tag in _bpm_range_rights:
        if bpm <= right:
            return tag
    return '>200'

def gen_bpm_descript(bpm, lang='en'):
    bpm_range = get_bpm_range(bpm)
    if lang == 'en':
        return random.choice(_bpm_desc_map[bpm_range])
    elif lang == 'zh':
        return random.choice(_bpm_desc_map_zh[bpm_range])
    else:
        raise ValueError(f"Unknown language {lang}")

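# Illustrative sketch (not part of the original file): a raw BPM value is bucketed by
# get_bpm_range and then verbalized with a random phrase from that bucket.
# get_bpm_range(128)                 # -> '120-168'
# gen_bpm_descript(128, lang='en')   # -> e.g. 'brisk pace' or 'Allegro'
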
def read_translate(translate: Optional[Dict[str, os.PathLike]]):
    if translate is None:
        return None
    if isinstance(translate, str):
        return read_jsonlike(translate)
    return {k: read_jsonlike(path) for k, path in translate.items()}


class MagnaTagATuneDataset(Dataset):
    def __init__(self):
        pass


def tags_to_desc(tag_list, sep=',') -> str:
    if not isinstance(tag_list, Sequence):
        return str(tag_list)
    if isinstance(tag_list, str):
        return tag_list
    if len(tag_list) <= 0:
        return ''
    elif len(tag_list) <= 5:
        probs = dist_prob_map[len(tag_list)]
        tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
        random.shuffle(tag_list)
        tag_list = tag_list[:tags_num]
        return sep.join(tag_list)
    else:
        probs = dist_prob_map[5]
        tags_num = random.choices(range(1, 6), probs)[0]
        random.shuffle(tag_list)
        tag_list = tag_list[:tags_num]
        return sep.join(tag_list)

def get_sr_and_duration_info(item):
    return item.get('sample_rate', None), item.get('duration', None)

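# Illustrative sketch (not part of the original file): the module-level tags_to_desc above keeps
# a random subset of the tags (subset size drawn from dist_prob_map), so repeated calls differ.
# tags_to_desc(['rock', 'guitar', 'energetic'], sep=', ')   # -> e.g. 'guitar, rock'
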
class MtgJamendoDatasetFromJson(Dataset):
    def __init__(self,
                 data_dir:str,
                 json_path:str,
                 duration:float=10,
                 sr:int = 0,
                 *,
                 lang = 'en',
                 return_path = False,
                 prompt_template_path: os.PathLike = None,
                 tag_types = [],
                 translate:Optional[Dict[str, os.PathLike]] = None,
                 ):
        self.audio_reader = SafeAudioReader(duration, sr)

        self.data_dir = data_dir
        self._load_metadata_json(json_path)
        self.sr = sr
        self.duration = duration
        self.return_path = return_path
        self.lang = lang

        self.use_dynamic_prompt = prompt_template_path is not None
        if self.use_dynamic_prompt:
            self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types))
        self.tag_types = tag_types

        self.translate = read_translate(translate)
        if not self.use_dynamic_prompt and self.lang != 'en':
            raise NotImplementedError

    # These tags are considered weakly semantic; prompts that contain only these tags are avoided
    WEAK_TAG_LIST = ["title", "artist"]

    def _load_metadata_json(self, json_path):
        with open(json_path) as fp:
            self.data = json.load(fp)

    def convert_key_to_path(self, key):
        return os.path.join(self.data_dir, get_base_dir_file(key))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        path = self.convert_key_to_path(item['key'])
        description = self.generate_description(item)

        sr, duration = get_sr_and_duration_info(item)
        audio, is_start, is_end = self.audio_reader(path, sr, duration)

        if self.return_path:
            return audio, description, path
        return audio, description, is_start, is_end

    def tags_to_desc(self, tag_list, tag_type) -> str:
        if self.lang == 'en':
            return tags_to_desc(tag_list)
        elif self.lang == 'zh':
            translator = self.translate[tag_type]
            translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
            return tags_to_desc(translated_tag_list, sep='、')

    def generate_description(self, item):
        if self.use_dynamic_prompt:
            # dynamically generate prompt from given prompt template
            prompt_template = random.choice(self.prompt_templates)
            description = self.generate_description_dynamic(item, prompt_template)

        else:
            # use ordinary static prompt instead
            description = self.generate_description_ordinary(item)
        return description

    def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
        exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
        exists_weak_tag = list(filter(lambda t: t in self.WEAK_TAG_LIST, exists_tag))
        exists_strong_tag = list(filter(lambda t: t not in self.WEAK_TAG_LIST, exists_tag))

        if len(exists_strong_tag) > 0:
            probs = dist_prob_map[len(exists_strong_tag)]
            tags_num = random.choices(range(1, len(exists_strong_tag)+1), probs)[0]
            random.shuffle(exists_strong_tag)
            tags = exists_strong_tag[:tags_num]
            weak_probs = dist_prob_map_low[len(exists_weak_tag) + 1]
            weak_tags_num = random.choices(range(0, len(exists_weak_tag) + 1), weak_probs)[0]
            random.shuffle(exists_weak_tag)
            weak_tags = exists_weak_tag[:weak_tags_num]
            tags += weak_tags
            tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
            prompt = prompt_template.apply(**tags_args)
        else:
            # no strong tags, use all weak tags instead
            tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in exists_weak_tag}
            prompt = prompt_template.apply(**tags_args)

        return prompt

    def generate_description_ordinary(self, data, thresh = 0.3):
        # Initialize the description with title and artist
        description = f'"{data["title"]+" is " if random.random() > thresh else ""}"a piece of music by {data["artist"]}'

        # Add genre if available
        if data["genre"] and random.random() > thresh:
            genres = ', '.join(data["genre"])
            description += f', belonging to the {genres} genres'

        # Add moods if available
        if data["moods"] and random.random() > thresh:
            moods = ', '.join(data["moods"])
            description += f'. This track conveys a {moods} mood'

        # Add instruments if available
        if data["instrument"] and random.random() > thresh:
            instruments = ', '.join(data["instrument"])
            description += f', and primarily features the following instruments: {instruments}'

        # Add a period to end the description
        description += '.'

        return description

class AudioStockDataset(Dataset):
    def __init__(self,
                 metadata_path:str,
                 duration:float=10,
                 sr:int = 0,
                 return_path = False,
                 return_audio = True,
                 prompt_template_path: os.PathLike = None,
                 tag_types = [],
                 lang = 'en',
                 translate:Optional[Dict[str, os.PathLike]] = None
                 ):
        self.audio_reader = SafeAudioReader(duration, sr)

        self.duration = duration
        self._load_metadata(metadata_path)
        self.sr = sr
        self.return_path = return_path
        self.return_audio = return_audio

        self.use_dynamic_prompt = prompt_template_path is not None
        if self.use_dynamic_prompt:
            self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
        self.tag_types = tag_types

        self.lang = lang
        self.translate = read_translate(translate)

    def _load_metadata(self, metadata_path):
        with open(metadata_path) as fp:
            lines = fp.readlines()
        self.data = []
        for line in lines:
            item = json.loads(line)
            if(item['duration']>self.duration+10):
                self.data.append(item)
        self.is_info_recorded = bool('Tags' in self.data[0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        path:str = self.data[idx]["path"]
        json_path = path[:path.rfind('.')] + ".json"
        if self.is_info_recorded:
            item = self.data[idx]
        else:
            try:
                with open(json_path) as fp:
                    item:dict = json.load(fp)
            except Exception as e:
                print(f"Error loading json file {json_path} :\n{e}")
                item = {}
        description = self.generate_description(item)
        if self.return_audio:
            sr, duration = get_sr_and_duration_info(item)
            audio, is_start, is_end = self.audio_reader(path, sr, duration)
        else:
            audio = None
        if self.return_path:
            return audio, description, path, is_start, is_end
        else:
            return audio, description, is_start, is_end

    def generate_description(self, item):
        if self.use_dynamic_prompt:
            # dynamically generate prompt from given prompt template
            prompt_template = random.choice(self.prompt_templates)
            description = self.generate_description_dynamic(item, prompt_template)
        else:
            # use ordinary static prompt instead
            description = self.generate_description_ordinary(item)
        return description

    def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
        exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]

        if len(exists_tag) > 0:
            probs = dist_prob_map[len(exists_tag)]
            tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
            random.shuffle(exists_tag)
            tags = exists_tag[:tags_num]
            tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
            tags_args = self.handle_BPM_tag(tags_args)
            prompt = prompt_template.apply(**tags_args)
        else:
            # no strong tags, use all weak tags instead
            prompt = prompt_template.apply()

        return prompt

    def tags_to_desc(self, tag_list, tag_type) -> str:
        if self.lang == 'en':
            return tags_to_desc(tag_list)
        elif self.lang == 'zh':
            if tag_type == 'BPM':
                return tags_to_desc(tag_list, sep='、')
            translator = self.translate[tag_type]
            translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
            return tags_to_desc(translated_tag_list, sep='、')

    def handle_BPM_tag(self, tags_args):
        if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
            bpm = tags_args["BPM"]
            del tags_args["BPM"]
            tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
            for tag_type in tag_types_used:
                tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
        return tags_args

    def generate_description_ordinary(self, data, thresh = 0.3):
        if self.lang != 'en':
            raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
        description = f'a piece of music by {data["Artist"]}'

        # Add genre if available
        if data["Genre"] and random.random() > thresh:
            genres = ', '.join(data["Genre"])
            description += f', belonging to the {genres} genres'

        # Add tags if available
        if data["Tags"] and random.random() > thresh:
            tags = ', '.join(data["Tags"])
            description += f'. This track contains the tags:{tags}'

        # Add moods if available
        if data["Mood"] and random.random() > thresh:
            moods = ', '.join(data["Mood"])
            description += f'. This track conveys a {moods} mood.'

        # Add instruments if available
        if data["Instrument"] and random.random() > thresh:
            instruments = ', '.join(data["Instrument"])
            description += f'. and primarily features the following instruments: {instruments}'

        # Add a period to end the description
        description += '.'

        return description

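# Illustrative usage sketch (not part of the original file; the paths and tag names below are
# hypothetical placeholders). With return_path=False the dataset yields
# (audio, description, is_start, is_end), which the default collate can batch directly.
# from torch.utils.data import DataLoader
# ds = AudioStockDataset(metadata_path='audiostock_meta.jsonl', duration=10, sr=48000,
#                        prompt_template_path='prompt_templates_en.txt',
#                        tag_types=['Genre', 'Mood', 'Instrument', 'BPM', 'BPMDescript'])
# loader = DataLoader(ds, batch_size=4, shuffle=True)
# audio, description, is_start, is_end = next(iter(loader))
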
def mp3_path_to_id(mp3_path):
    return int(
        mp3_path[mp3_path.rindex('/') + 1 : mp3_path.rindex('.mp3')]
    )

class TmeDataset(Dataset):
    def __init__(self,
                 data_index:str,
                 music_info:str = None,
                 duration:float = 10,
                 sr:int = 0,
                 return_path = False,
                 return_audio = True,
                 prompt_format_path: os.PathLike = None,
                 tag_types = ['*'],
                 lang = 'zh',
                 translate: Optional[os.PathLike] = None,
                 prompt_dir: os.PathLike = None,
                 ):
        self.audio_reader = SafeAudioReader(duration, sr)

        self.sr = sr
        self.duration = duration
        self.return_path = return_path
        self.return_audio = return_audio
        self.lang = lang

        self.use_ready_prompt = prompt_dir is not None

        data_index = read_jsonlike(data_index)
        data_index = [d for d in data_index if d['duration']>self.duration+10]
        self.data_index_dict = {mp3_path_to_id(d['path']) : d for d in data_index}
        self.data_ids = list(self.data_index_dict.keys())

        if not self.use_ready_prompt:
            # read the music info file
            music_info = read_jsonlike(music_info)
            if 'music' in music_info:
                music_info = music_info['music']
            self.music_info_dict = {d["歌曲ID"]:d for d in music_info}
            self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.music_info_dict}
            self.data_ids = list(self.data_index_dict.keys())

            with open(prompt_format_path) as fp:
                self.prompt_formats = yaml.load(fp, Loader=yaml.FullLoader)

            # load the tag types and split them into ordinary tag_types and key key_tag_types
            if '*' in tag_types:
                self.tag_types = ['歌曲名', 'bpm', '专辑名', '歌手名', '作曲', 'tag']
            else:
                self.tag_types = tag_types

            self.key_tag_types = []
            if 'tag' in self.tag_types:
                self.tag_types.remove('tag')
                self.key_tag_types = list(self.prompt_formats['tag'].keys())

            # load the translation table
            if translate is not None:
                self.translator = read_jsonlike(translate)
        else:
            data_ids_set = set(self.data_ids)
            self.prompts_dict = {}
            for fname in os.listdir(prompt_dir):
                items = read_jsonlike(os.path.join(prompt_dir, fname))
                for item in items:
                    if item['ID'] not in data_ids_set or not self.is_valid_prompt_text(item['Text']):
                        continue
                    if item['ID'] not in self.prompts_dict:
                        self.prompts_dict[item['ID']] = []
                    self.prompts_dict[item['ID']].append(item['Text'])
            self.data_index_dict = {k:v for k,v in self.data_index_dict.items() if k in self.prompts_dict}
            self.data_ids = list(self.data_index_dict.keys())

    def tags_to_desc(self, tag_list) -> str:
        if is_bearable(tag_list, int):
            return str(tag_list)
        if self.lang == 'zh':
            return tags_to_desc(tag_list, sep=self.sep)
        else:
            translated_tag_list = [self.translator[tag] for tag in tag_list if tag in self.translator ]
            return tags_to_desc(translated_tag_list, sep=self.sep)

    def gen_desc_of_tag(self, formats, tags):
        fmt = random.choice(formats)
        return fmt.format(self.tags_to_desc(tags))

    @staticmethod
    def check_valid(value):
        if isinstance(value, int) or isinstance(value, float):
            return value > 0
        if (value is not None) and (not isinstance(value, Sequence) or len(value) > 0):
            return True
        return False

    @staticmethod
    def remove_repeat(data):
        # if the album name is identical to the song title, keep only the latter
        album_name = data.get('专辑名', None)
        if album_name is not None and album_name == data.get('歌曲名', None):
            del data['专辑名']
        return data

    @property
    def comma(self):
        if self.lang == 'zh':
            return ','
        elif self.lang == 'en':
            return ', '

    @property
    def sep(self):
        if self.lang == 'zh':
            return '、'
        elif self.lang == 'en':
            return ', '

    def generate_description(self, data):
        data = self.remove_repeat(data)
        weak_tags = [key for key in data if (key in self.tag_types and self.check_valid(data[key]))]  # weakly semantic tags; their share in prompts is kept low

        key_tags = [key for key in data['tag'] if (key in self.key_tag_types and self.check_valid(data['tag'][key]))]  # key tags; at least one of them must appear

        prompts = []
        if len(weak_tags) > 0:
            probs = dist_prob_map_low[len(weak_tags)]
            if len(key_tags) > 0:
                tags_num = random.choices(range(0, len(weak_tags)), probs)[0]
            else:
                tags_num = random.choices(range(1, len(weak_tags) + 1), probs)[0]
            random.shuffle(weak_tags)
            tags = weak_tags[:tags_num]
            for tag_type in tags:
                tag_desc = self.gen_desc_of_tag(self.prompt_formats[tag_type], int(data[tag_type]) if tag_type == 'bpm' else data[tag_type])
                prompts.append(tag_desc)

        if len(key_tags) > 0:
            probs = dist_prob_map[len(key_tags)]
            tags_num = random.choices(range(1, len(key_tags) + 1), probs)[0]
            random.shuffle(key_tags)
            tags = key_tags[:tags_num]
            for tag_type in tags:
                tag_desc = self.gen_desc_of_tag(self.prompt_formats['tag'][tag_type], data['tag'][tag_type])
                prompts.append(tag_desc)

        random.shuffle(prompts)
        return self.comma.join(prompts)

    def is_valid_prompt_text(self, text):
        for bad in ('抱歉','sorry', 'Sorry'):
            if bad in text:
                return False
        return True

    def get_ready_prompt(self, path):
        sid = mp3_path_to_id(path)
        return random.choice(self.prompts_dict[sid])

    def __len__(self):
        return len(self.data_ids)

    def __getitem__(self, idx):
        data_id = self.data_ids[idx]
        item = self.data_index_dict[data_id]
        path = item['path']
        if not self.use_ready_prompt:
            info = self.music_info_dict[data_id]
            description = self.generate_description(info)
        else:
            description = self.get_ready_prompt(path)
        if self.return_audio:
            sr, duration = get_sr_and_duration_info(item)
            audio, is_start, is_end = self.audio_reader(path, sr, duration)
        else:
            audio = None
        if self.return_path:
            return audio, description, path, is_start, is_end
        else:
            return audio, description, is_start, is_end

class Pond5Dataset(Dataset):
    MAX_PROMPT_LEN = 200
    def __init__(self,
                 metadata_path:str,
                 index_path:str,
                 duration:float=10,
                 sr:int = 0,
                 plain_rate = 0,
                 return_path = False,
                 return_audio = True,
                 lang = 'en',
                 translate:Optional[Dict[str, os.PathLike]] = None,
                 use_literal_none = True,
                 use_avoid_watermark_policy = None,
                 ):

        if use_avoid_watermark_policy is None:
            raise ValueError("`use_avoid_watermark_policy` is an important param, you need to explicitly specify it with bool type")
        self.use_avoid_watermark_policy = use_avoid_watermark_policy
        assert self.use_avoid_watermark_policy is False
        self.audio_reader = SafeAudioReader(duration, sr)

        self.duration = duration
        self._load_metadata(metadata_path, index_path)
        self.sr = sr
        self.plain_rate = plain_rate
        self.return_path = return_path
        self.return_audio = return_audio
        self.use_literal_none = use_literal_none

        self.lang = lang
        self.translate = read_translate(translate)

    def _load_metadata(self, metadata_path, index_path):
        data_index = read_jsonlike(index_path)
        data_ids = set([item['id'] for item in data_index])

        with open(metadata_path) as fp:
            lines = fp.readlines()

        append_ids = set()

        self.data = []
        for line in lines:
            item = json.loads(line)
            if item['id'] in data_ids and item['id'] not in append_ids and item["details"]["duration"] is not None and item["details"]["duration"]>self.duration+10:
                self.data.append(item)
                append_ids.add(item['id'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        path:str = item["path"]
        description = self.generate_description(item)
        if self.return_audio:
            sr, duration = get_sr_and_duration_info(item)
            audio, is_start, is_end = self.audio_reader(path, sr, duration)
        else:
            audio = None
        if self.return_path:
            return audio, description, path
        return audio, description, is_start, is_end

    @property
    def keysep(self):
        if self.lang == 'zh':
            return ',' if random.random() > 0.5 else '、'
        elif self.lang == 'en':
            return ', '

    def generate_description(self, item):
        if random.random() > self.plain_rate:
            # dynamically generate prompt from given prompt template
            description = self.generate_description_dynamic(item)
        else:
            # use plain prompt, i.e. tags sequence separated by comma
            description = self.generate_description_plain(item)
        return description

    def get_translation(self, k):
        k = k.strip()
        if k in self.translate:
            return self.translate[k]
        else:
            return k

    def generate_description_plain(self, item):
        keywords = item['keywords']
        if self.lang != 'en':
            keywords = [self.get_translation(k) for k in keywords]
        return gen_plain_prompt(keywords, sep=self.keysep)

    def generate_description_dynamic(self,item):
        desc = item.get('desc', 'none')
        if desc is None:
            desc = 'none'
        desc = desc.strip()
        if len(desc) > self.MAX_PROMPT_LEN:
            shorter_desc = desc[:self.MAX_PROMPT_LEN]
            # find last stop
            stop_idx = shorter_desc.rfind('.')
            if stop_idx == -1:
                stop_idx = shorter_desc.rfind('!')
            if stop_idx == -1:
                stop_idx = shorter_desc.rfind(',')
            if stop_idx == -1:
                stop_idx = self.MAX_PROMPT_LEN - 1
            desc = desc[:stop_idx+1]
        return desc

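# Illustrative sketch (not part of the original file; the text is made up): the truncation rule in
# generate_description_dynamic keeps at most MAX_PROMPT_LEN characters and then cuts back to the
# last '.', '!' or ',' inside that window.
# long_desc = 'A calm piano piece. ' * 20            # roughly 400 characters
# window = long_desc[:Pond5Dataset.MAX_PROMPT_LEN]   # 200-character window
# cut = window.rfind('.')                            # prompt is truncated to long_desc[:cut+1]
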
class CombinedDataset(Dataset):
    @beartype
    def __init__(self, datasets: Sequence[Dataset], ratios: Sequence[int]):
        self.datasets = datasets
        self.datasets_index = []

        for i,dataset in enumerate(datasets):
            if dataset is None:
                continue
            for dup in range(ratios[i]):
                for j in range(len(dataset)):
                    self.datasets_index.append((i,j))

    def __len__(self):
        return len(self.datasets_index)

    def __getitem__(self, idx):
        index = self.datasets_index[idx]
        i,j = index
        return self.datasets[i][j]

class CombinedDataset_random(Dataset):
    @beartype
    def __init__(self,
                 num_examples:int,
                 datasets: Sequence[Dataset], ratios: Sequence[int]
                 ):
        self.datasets = datasets
        self.datasets_index = []

        for i,dataset in enumerate(datasets):
            if dataset is None:
                continue
            for dup in range(ratios[i]):
                for j in range(len(dataset)):
                    self.datasets_index.append((i,j))
        if num_examples > 0:
            self.random_choose = True
            self.dataset_len = num_examples
        else:
            self.random_choose = False
            self.dataset_len = len(self.datasets_index)

    def __len__(self):
        return self.dataset_len

    def __getitem__(self, idx):
        first_try = True
        try_cnt = 0
        while True:
            try:
                if(self.random_choose or not first_try):
                    index2 = []
                    index2.append(np.random.randint(0,len(self.datasets)))
                    index2.append(np.random.randint(0,len(self.datasets[index2[-1]])))
                else:
                    index2 = self.datasets_index[idx]
                first_try = False
                out = self.datasets[index2[0]][index2[1]]
                if(len(out[0].shape)==1):out[0]=out[0][None,:]
                return out
            except:
                print("Error loadding ", index2)
                try_cnt += 1
                if(try_cnt>10):
                    raise FileNotFoundError()
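# Illustrative usage sketch (not part of the original file; the dataset variables and their
# constructor arguments are assumed to exist). Integer ratios oversample one corpus relative
# to another when datasets are combined.
# combined = CombinedDataset(datasets=[audiostock_ds, pond5_ds], ratios=[2, 1])
# len(combined)   # == 2 * len(audiostock_ds) + 1 * len(pond5_ds)
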
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song.py
ADDED
@@ -0,0 +1,313 @@
import re
import sys
import json

from torch.utils.data import Dataset
import torchaudio
from torchaudio.functional import resample
import torch
import numpy as np

from torch.nn.utils.rnn import pad_sequence


def check_lryics(lyric):
    _FILTER_STRING = [
        '作词', '作曲', '编曲', '【', '策划',
        '录音', '混音', '母带', ':', '制作',
        '版权', '校对', '演奏', '制作', '伴奏'
    ]
    for item in _FILTER_STRING:
        if item in lyric:
            return True

    return False


def process_lyrics(lines):
    lyric_part = []
    timestamp_part = []

    timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')

    for i, line in enumerate(lines):

        # drop credit-style metadata from the first few lines
        if i<10 and check_lryics(line):
            continue

        # check that the line contains a valid timestamp and lyric content
        if timestamp_pattern.match(line):
            timestamp_end = line.rfind(']')
            lyrics = line[timestamp_end + 1:].strip()
            timestamps = line[:timestamp_end + 1]

            if ':' in lyrics:
                if len(lyrics.split(":")[0]) <=5:
                    lyrics = "".join(lyrics.split(":")[1:])
            # if lyrics:  # make sure the lyric part is not empty
            #     lyric_part.append(lyrics)
            #     timestamp_part.append(timestamps)
    # print(processed_lyrics)
    return timestamp_part, lyric_part

def get_timestamps(timestamp_part):

    # convert to seconds

    timestamps = []

    for line in timestamp_part:
        match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
        if match:
            minutes = int(match.group(1))
            seconds = float(match.group(2))
            millis = float(match.group(3)) if match.group(3) else 0
            total_seconds = minutes * 60 + seconds + millis
            timestamps.append(total_seconds)


    return timestamps

def process_lyrics_lrc(lyrics):
    timestamp_part, lyric_part = process_lyrics(lyrics)
    # print(timestamp_part)
    # print(lyric_part)
    timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # start a new slice once more than 30 s have accumulated; if the whole song is shorter than 30 s the sentence is dropped
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i

    return output_list



def process_lyrics_yrc(lyrics):

    timestamps, lyric_part = extract_lrc(lyrics)

    # timestamp_part, lyric_part = process_lyrics(lyrics)
    # import pdb; pdb.set_trace()
    # print(timestamp_part)
    # print(lyric_part)
    # timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # start a new slice once more than 30 s have accumulated
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i
    # import pdb; pdb.set_trace()
    return output_list

def extract_lrc(lyrics):
    timestamp_part, lyric_part = [], []

    for i, text in enumerate(lyrics):
        # content inside square brackets
        bracket_content = re.search(r'\[(.*?)\]', text).group(1)
        bracket_content = bracket_content.split(',')
        # content inside parentheses
        parentheses_content = re.findall(r'\((.*?)\)', text)
        # everything else
        other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()

        # how should this data be handled?
        # import pdb; pdb.set_trace()
        if i<10 and check_lryics(other_content):
            continue

        # import pdb; pdb.set_trace()
        timestamp_part.append(float(bracket_content[0])/1000)
        lyric_part.append(other_content)
    # import pdb; pdb.set_trace()
    return timestamp_part, lyric_part


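# Illustrative sketch (not part of the original file; the lines are made up). YRC-style entries
# look like '[start_ms,duration_ms]text'; process_lyrics_yrc starts a new slice once more than
# 30 s of timestamps have accumulated and tags it '[start_s:end_s]', which
# WYYSongDataset.parse_lyric below splits back into (start, end, text).
# yrc_lines = ['[1000,2000]Intro', '[35000,3000]First verse', '[70000,2500]Second verse']
# process_lyrics_yrc(yrc_lines)   # -> ['[1.0:35.0]Intro', '[35.0:70.0]First verse']
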
class WYYSongDataset(Dataset):
    def __init__(self,
                 metadata_path:str,
                 sr:int = 0,
                 use_lang = ['en', 'zh-cn'],
                 num_examples = -1,
                 ):

        self.sr = sr
        self.use_lang = use_lang
        self._load_metadata(metadata_path)

        # buffer
        self.lyric_buffer = {}

        if(num_examples<=0):
            self.dataset_len = len(self.data)
            self.random_slc = False
        else:
            self.dataset_len = num_examples
            self.random_slc = True

    # read the jsonl file
    def _load_metadata(self, metadata_path):
        with open(metadata_path) as fp:
            lines = fp.readlines()
        self.data = []
        for line in lines:
            item = json.loads(line)
            # if item['lrc-lyric'] is not None and item['yrc-lyric'] is not None:
            if 'lyrics' in item and 'lang_info' in item:
                if len(item['lyrics']) > 0:
                    for lang in self.use_lang:
                        if lang in item['lang_info'] and item['lang_info'][lang]['proportion'] > 0.8 and item['lang_info'][lang]['probability'] > 0.9:
                            # if '伴奏' not in item['path'] and "cloud" in item['path']:
                            if '伴奏' not in item['path']:
                                self.data.append(item)


    def __len__(self):
        return self.dataset_len


    def __getitem__(self, idx):
        try_cnt = 0
        while True:
            if(self.random_slc):
                idx = np.random.randint(0, len(self.data))
            yrc_lyrics = []
            lrc_lyrics = []
            try:
                info = self.data[idx]

                # audio path
                path:str = info["path"]

                # read the lyric sections
                if 'lyrics' not in info:
                    if idx not in self.lyric_buffer:
                        # character-level aligned lyrics
                        if info['yrc-lyric'] is not None:
                            with open(info['yrc-lyric']) as f_in:
                                yrc_lyric = json.load(f_in)
                            yrc_lyrics = process_lyrics_yrc(yrc_lyric['lyrics'][:-1])

                        # sentence-level aligned lyrics
                        if info['lrc-lyric'] is not None:
                            with open(info['lrc-lyric']) as f_in:
                                lrc_lyric = json.load(f_in)
                            lrc_lyrics = process_lyrics_lrc(lrc_lyric['lyrics'][:-1])

                        # prefer character-level aligned lyrics
                        if len(yrc_lyrics) > 0:
                            lyrics = yrc_lyrics
                        else:
                            lyrics = lrc_lyrics
                        self.lyric_buffer[idx] = lyrics

                    # TODO filter each lyric section by length, dropping songs that are too long or too short
                    else:
                        lyrics = self.lyric_buffer[idx]
                else:
                    lyrics = info['lyrics']

                # randomly pick one lyric section
                ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
                # ly_id = 0

                lyric = lyrics[ly_id]



                st, et, lyric = self.parse_lyric(lyric)

                assert et - st < 40

                # text filtering

                lyric = re.sub(r'【.*?】', '', lyric)
                if 'zh-cn' in info['lang_info'] and info['lang_info']['zh-cn']['proportion'] > 0.8:
                    assert 200 > len(lyric.replace(" ", "")) > 30
                    if ':' in lyrics:
                        if len(lyrics.split(":")[0]) <=5:
                            lyrics = "".join(lyrics.split(":")[1:])

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0]) <=5:
                            lyrics = "".join(lyrics.split(":")[1:])

                if 'en' in info['lang_info'] and info['lang_info']['en']['proportion'] > 0.8:
                    assert 200 > len(lyric.split()) > 20

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0].split()) <=3:
                            lyrics = "".join(lyrics.split(":")[1:])

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0].split()) <=3:
                            lyrics = "".join(lyrics.split(":")[1:])



                # read the audio file
                cur_sample_rate = torchaudio.info(path).sample_rate
                offset = int(cur_sample_rate*st)
                num_frames = int(cur_sample_rate * (et -st))
                chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)

                # randomly pick one channel
                if(chunk.shape[0]>1):
                    chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
                else:
                    chunk = chunk[[0],:].float()

                if(cur_sample_rate!=self.sr):
                    # print('a:',cur_sample_rate,chunk.shape)
                    chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)

                return chunk, lyric, [st, et], path
            except:
                print("Error loadding ", info["path"])
                try_cnt += 1
                idx = np.random.randint(0, len(self.data))
                if(try_cnt>10):
                    raise FileNotFoundError()

    def parse_lyric(self, lyric):
        pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
        match = re.search(pattern, lyric)

        start_time = float(match.group(1))
        end_time = float(match.group(2))
        content = match.group(3)
        return start_time, end_time, content

def collect_song(data_list):
    audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
    lyrics = [data[1] for data in data_list]
    st_et = [data[2] for data in data_list]
    paths = [data[3] for data in data_list]
    return audios, lyrics, st_et
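# Illustrative usage sketch (not part of the original file; the metadata path is hypothetical).
# collect_song pads variable-length chunks so a DataLoader can batch them.
# from torch.utils.data import DataLoader
# ds = WYYSongDataset(metadata_path='songs_with_lyrics.jsonl', sr=24000, num_examples=1000)
# loader = DataLoader(ds, batch_size=4, collate_fn=collect_song)
# audios, lyrics, st_et = next(iter(loader))   # audios: (batch, 1, max_len), zero-padded
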
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_20s.py
ADDED
@@ -0,0 +1,313 @@
import re
import sys
import json

from torch.utils.data import Dataset
import torchaudio
from torchaudio.functional import resample
import torch
import numpy as np

from torch.nn.utils.rnn import pad_sequence


def check_lryics(lyric):
    _FILTER_STRING = [
        '作词', '作曲', '编曲', '【', '策划',
        '录音', '混音', '母带', ':', '制作',
        '版权', '校对', '演奏', '制作', '伴奏'
    ]
    for item in _FILTER_STRING:
        if item in lyric:
            return True

    return False


def process_lyrics(lines):
    lyric_part = []
    timestamp_part = []

    timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')

    for i, line in enumerate(lines):

        # drop credit-style metadata from the first few lines
        if i<10 and check_lryics(line):
            continue

        # check that the line contains a valid timestamp and lyric content
        if timestamp_pattern.match(line):
            timestamp_end = line.rfind(']')
            lyrics = line[timestamp_end + 1:].strip()
            timestamps = line[:timestamp_end + 1]

            if ':' in lyrics:
                if len(lyrics.split(":")[0]) <=5:
                    lyrics = "".join(lyrics.split(":")[1:])
            # if lyrics:  # make sure the lyric part is not empty
            #     lyric_part.append(lyrics)
            #     timestamp_part.append(timestamps)
    # print(processed_lyrics)
    return timestamp_part, lyric_part

def get_timestamps(timestamp_part):

    # convert to seconds

    timestamps = []

    for line in timestamp_part:
        match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
        if match:
            minutes = int(match.group(1))
            seconds = float(match.group(2))
            millis = float(match.group(3)) if match.group(3) else 0
            total_seconds = minutes * 60 + seconds + millis
            timestamps.append(total_seconds)


    return timestamps

def process_lyrics_lrc(lyrics):
    timestamp_part, lyric_part = process_lyrics(lyrics)
    # print(timestamp_part)
    # print(lyric_part)
    timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # start a new slice once more than 30 s have accumulated; if the whole song is shorter than 30 s the sentence is dropped
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i

    return output_list



def process_lyrics_yrc(lyrics):

    timestamps, lyric_part = extract_lrc(lyrics)

    # timestamp_part, lyric_part = process_lyrics(lyrics)
    # import pdb; pdb.set_trace()
    # print(timestamp_part)
    # print(lyric_part)
    # timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # start a new slice once more than 30 s have accumulated
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i
    # import pdb; pdb.set_trace()
    return output_list

def extract_lrc(lyrics):
    timestamp_part, lyric_part = [], []

    for i, text in enumerate(lyrics):
        # content inside square brackets
        bracket_content = re.search(r'\[(.*?)\]', text).group(1)
        bracket_content = bracket_content.split(',')
        # content inside parentheses
        parentheses_content = re.findall(r'\((.*?)\)', text)
        # everything else
        other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()

        # how should this data be handled?
        # import pdb; pdb.set_trace()
        if i<10 and check_lryics(other_content):
            continue

        # import pdb; pdb.set_trace()
        timestamp_part.append(float(bracket_content[0])/1000)
        lyric_part.append(other_content)
    # import pdb; pdb.set_trace()
    return timestamp_part, lyric_part



class WYYSongDataset(Dataset):
    def __init__(self,
                 metadata_path:str,
                 sr:int = 0,
                 use_lang = ['en', 'zh-cn'],
                 num_examples = -1,
                 ):

        self.sr = sr
        self.use_lang = use_lang
        self._load_metadata(metadata_path)

        # buffer
        self.lyric_buffer = {}

        if(num_examples<=0):
            self.dataset_len = len(self.data)
            self.random_slc = False
        else:
            self.dataset_len = num_examples
            self.random_slc = True

    # read the jsonl file
    def _load_metadata(self, metadata_path):
        with open(metadata_path) as fp:
            lines = fp.readlines()
        self.data = []
        for line in lines:
            item = json.loads(line)
            # if item['lrc-lyric'] is not None and item['yrc-lyric'] is not None:
            if 'lyrics' in item and 'lang_info' in item:
                if len(item['lyrics']) > 0:
                    for lang in self.use_lang:
                        if lang in item['lang_info'] and item['lang_info'][lang]['proportion'] > 0.8 and item['lang_info'][lang]['probability'] > 0.9:
                            # if '伴奏' not in item['path'] and "cloud" in item['path']:
                            if '伴奏' not in item['path']:
                                self.data.append(item)


    def __len__(self):
        return self.dataset_len


    def __getitem__(self, idx):
        try_cnt = 0
        while True:
            if(self.random_slc):
                idx = np.random.randint(0, len(self.data))
            yrc_lyrics = []
            lrc_lyrics = []
            try:
                info = self.data[idx]

                # audio path
                path:str = info["path"]

                # read the lyric sections
                if 'lyrics' not in info:
                    if idx not in self.lyric_buffer:
                        # character-level aligned lyrics
                        if info['yrc-lyric'] is not None:
                            with open(info['yrc-lyric']) as f_in:
                                yrc_lyric = json.load(f_in)
                            yrc_lyrics = process_lyrics_yrc(yrc_lyric['lyrics'][:-1])

                        # sentence-level aligned lyrics
                        if info['lrc-lyric'] is not None:
                            with open(info['lrc-lyric']) as f_in:
                                lrc_lyric = json.load(f_in)
                            lrc_lyrics = process_lyrics_lrc(lrc_lyric['lyrics'][:-1])

                        # prefer character-level aligned lyrics
                        if len(yrc_lyrics) > 0:
                            lyrics = yrc_lyrics
                        else:
                            lyrics = lrc_lyrics
                        self.lyric_buffer[idx] = lyrics

                    # TODO filter each lyric section by length, dropping songs that are too long or too short
                    else:
                        lyrics = self.lyric_buffer[idx]
                else:
                    lyrics = info['lyrics']

                # randomly pick one lyric section
                ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
                # ly_id = 0

                lyric = lyrics[ly_id]



                st, et, lyric = self.parse_lyric(lyric)

                assert et - st < 20

                # text filtering

                lyric = re.sub(r'【.*?】', '', lyric)
                if 'zh-cn' in info['lang_info'] and info['lang_info']['zh-cn']['proportion'] > 0.8:
                    assert 100 > len(lyric.replace(" ", "")) > 5
                    if ':' in lyrics:
                        if len(lyrics.split(":")[0]) <=5:
                            lyrics = "".join(lyrics.split(":")[1:])

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0]) <=5:
                            lyrics = "".join(lyrics.split(":")[1:])

                if 'en' in info['lang_info'] and info['lang_info']['en']['proportion'] > 0.8:
                    assert 100 > len(lyric.split()) > 5

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0].split()) <=3:
                            lyrics = "".join(lyrics.split(":")[1:])

                    if ':' in lyrics:
                        if len(lyrics.split(":")[0].split()) <=3:
                            lyrics = "".join(lyrics.split(":")[1:])



                # read the audio file
                cur_sample_rate = torchaudio.info(path).sample_rate
                offset = int(cur_sample_rate*st)
                num_frames = int(cur_sample_rate * (et -st))
                chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)

                # randomly pick one channel
                if(chunk.shape[0]>1):
                    chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
                else:
                    chunk = chunk[[0],:].float()

                if(cur_sample_rate!=self.sr):
                    # print('a:',cur_sample_rate,chunk.shape)
                    chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)

                return chunk, lyric, [st, et], path
            except:
                print("Error loadding ", info["path"])
                try_cnt += 1
                idx = np.random.randint(0, len(self.data))
                if(try_cnt>10):
                    raise FileNotFoundError()

    def parse_lyric(self, lyric):
        pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
        match = re.search(pattern, lyric)

        start_time = float(match.group(1))
        end_time = float(match.group(2))
        content = match.group(3)
        return start_time, end_time, content

def collect_song(data_list):
    audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1,2)
    lyrics = [data[1] for data in data_list]
    st_et = [data[2] for data in data_list]
    paths = [data[3] for data in data_list]
    return audios, lyrics, st_et
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_song_new_429.py
ADDED
@@ -0,0 +1,313 @@
import re
import sys
import json

from torch.utils.data import Dataset
import torchaudio
from torchaudio.functional import resample
import torch
import numpy as np

from torch.nn.utils.rnn import pad_sequence


def check_lryics(lyric):
    _FILTER_STRING = [
        '作词', '作曲', '编曲', '【', '策划',
        '录音', '混音', '母带', ':', '制作',
        '版权', '校对', '演奏', '制作', '伴奏'
    ]
    for item in _FILTER_STRING:
        if item in lyric:
            return True

    return False


def process_lyrics(lines):
    lyric_part = []
    timestamp_part = []

    timestamp_pattern = re.compile(r'\[\d+:\d+(\.\d+)?\]')

    for i, line in enumerate(lines):

        # Drop credit/metadata lines (lyricist, mixing, etc.) among the first few lines
        if i < 10 and check_lryics(line):
            continue

        # Check that the line contains a valid timestamp plus lyric content
        if timestamp_pattern.match(line):
            timestamp_end = line.rfind(']')
            lyrics = line[timestamp_end + 1:].strip()
            timestamps = line[:timestamp_end + 1]

            if ':' in lyrics:
                if len(lyrics.split(":")[0]) <= 5:
                    lyrics = "".join(lyrics.split(":")[1:])
            # NOTE: with the two appends below commented out, this function returns empty lists.
            # if lyrics:  # make sure the lyric part is not empty
            #     lyric_part.append(lyrics)
            #     timestamp_part.append(timestamps)
    # print(processed_lyrics)
    return timestamp_part, lyric_part

def get_timestamps(timestamp_part):

    # Convert each "[mm:ss.xxx]" timestamp to seconds

    timestamps = []

    for line in timestamp_part:
        match = re.match(r'\[(\d+):(\d+)(\.\d+)?\]', line)
        if match:
            minutes = int(match.group(1))
            seconds = float(match.group(2))
            millis = float(match.group(3)) if match.group(3) else 0
            total_seconds = minutes * 60 + seconds + millis
            timestamps.append(total_seconds)

    return timestamps

def process_lyrics_lrc(lyrics):
    timestamp_part, lyric_part = process_lyrics(lyrics)
    # print(timestamp_part)
    # print(lyric_part)
    timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # Cut a new slice once the accumulated duration exceeds 30 s;
        # if the whole lyric is shorter than 30 s, everything is dropped.
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i

    return output_list


def process_lyrics_yrc(lyrics):

    timestamps, lyric_part = extract_lrc(lyrics)

    # timestamp_part, lyric_part = process_lyrics(lyrics)
    # import pdb; pdb.set_trace()
    # print(timestamp_part)
    # print(lyric_part)
    # timestamps = get_timestamps(timestamp_part)
    # print(timestamps)
    if len(timestamps) == 0:
        # print(f'{lyric_path}')
        return []

    slice_start = timestamps[0]
    slice_start_idx = 0

    output_list = []
    for i in range(1, len(timestamps)):
        # Cut a new slice once the accumulated duration exceeds 30 s
        if timestamps[i] - slice_start > 30:
            output_list.append(f'[{str(slice_start)}:{str(timestamps[i])}]' + ", ".join(lyric_part[slice_start_idx:i]))

            slice_start = timestamps[i]
            slice_start_idx = i
    # import pdb; pdb.set_trace()
    return output_list

def extract_lrc(lyrics):
    timestamp_part, lyric_part = [], []

    for i, text in enumerate(lyrics):
        # Content inside the square brackets (timestamp in milliseconds)
        bracket_content = re.search(r'\[(.*?)\]', text).group(1)
        bracket_content = bracket_content.split(',')
        # Content inside the parentheses
        parentheses_content = re.findall(r'\((.*?)\)', text)
        # Everything else (the lyric text itself)
        other_content = re.sub(r'\[(.*?)\]|\((.*?)\)', '', text).strip()

        # Skip credit/metadata lines among the first few lines
        if i < 10 and check_lryics(other_content):
            continue
        timestamp_part.append(float(bracket_content[0]) / 1000)
        lyric_part.append(other_content)
    return timestamp_part, lyric_part


class WYYSongDataset(Dataset):
    def __init__(self,
                 metadata_path: str,
                 sr: int = 0,
                 use_lang = ['en', 'zh-cn'],
                 num_examples = -1,
                 max_dur = 20,
                 pad_to_max = True,
                 ):

        self.sr = sr
        self.use_lang = use_lang
        self._load_metadata(metadata_path)
        self.max_dur = max_dur
        self.pad_to_max = pad_to_max

        # buffer
        self.lyric_buffer = {}

        if num_examples <= 0:
            self.dataset_len = len(self.data)
            self.random_slc = False
        else:
            self.dataset_len = num_examples
            self.random_slc = True

    # Read the jsonl metadata file
    def _load_metadata(self, metadata_path):
        with open(metadata_path) as fp:
            lines = fp.readlines()
            self.data = []
            for line in lines:
                item = json.loads(line)
                if '伴奏' not in item['path']:
                    # if "lang_type" in item and item['lang_type'] == 'en':
                    if "lang_type" in item:
                        self.data.append(item)

    def __len__(self):
        return self.dataset_len

    def __getitem__(self, idx):
        try_cnt = 0
        while True:
            if self.random_slc:
                idx = np.random.randint(0, len(self.data))
            yrc_lyrics = []
            lrc_lyrics = []
            try:
                info = self.data[idx]

                # audio path
                path = info["path"]
                lang_type = info["lang_type"]
                if info["lang_type"] == 'en':
                    lyrics = info['lyrics']
                else:
                    lyrics = info['lyrics_phone']

                # Randomly pick one lyric segment
                ly_id = torch.randint(low=1, high=len(lyrics), size=(1,))[0].item()
                lyric = lyrics[ly_id].strip()

                st, et, lyric = self.parse_lyric(lyric)
                lyric = lyric.replace("\xa0", " ")

                lyric = " ".join(lyric.split())

                assert et - st < self.max_dur

                # Keep only segments with a plausible words-per-second rate
                if info["lang_type"] == 'en':
                    # print(len(lyric.split())/(et-st))
                    assert 6 > len(lyric.split()) / (et - st) > 1
                else:
                    # print(len(lyric.split())/(et-st))
                    lyric = lyric.replace("-", "")
                    assert 6 > len(lyric.split()) / (et - st) > 1

                # Read the audio segment
                cur_sample_rate = torchaudio.info(path).sample_rate
                offset = int(cur_sample_rate * st)
                num_frames = int(cur_sample_rate * (et - st))
                chunk, _ = torchaudio.load(path, frame_offset=offset, num_frames=num_frames)
                # chunk = torch.zeros(1, 48000*15)

                # Randomly pick one channel
                if chunk.shape[0] > 1:
                    chunk = chunk[torch.randint(chunk.shape[0], size=(1,)), :].float()
                else:
                    chunk = chunk[[0], :].float()

                if cur_sample_rate != self.sr:
                    # print('a:', cur_sample_rate, chunk.shape)
                    chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sr)

                if self.pad_to_max:
                    chunk = self.pad_2d_tensor(chunk, int(self.max_dur * self.sr), 0)

                return chunk, lyric, et - st, path, lang_type
            except Exception:
                # print("Error loading ", info["path"])
                try_cnt += 1
                idx = np.random.randint(0, len(self.data))
                if try_cnt > 20:
                    raise FileNotFoundError()

    def parse_lyric(self, lyric):
        pattern = r'\[(\d+\.\d+):(\d+\.\d+)\](.*)'
        match = re.search(pattern, lyric)

        start_time = float(match.group(1))
        end_time = float(match.group(2))
        content = match.group(3)
        return start_time, end_time, content

    def pad_2d_tensor(self, x, max_len, pad_id):
        # Shape of the input tensor
        batch_size, seq_len = x.size()
        max_len = max(max_len, seq_len)
        # Amount of padding required
        pad_len = max_len - seq_len

        if pad_len > 0:
            # Build the padding tensor
            pad_tensor = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)

            # Concatenate input and padding along the second (time) dimension
            padded_tensor = torch.cat([x, pad_tensor], dim=1)
        else:
            # Nothing to pad, return the input unchanged
            padded_tensor = x

        return padded_tensor

def collect_data(data_list):
    audios = pad_sequence([data[0].t() for data in data_list], batch_first=True, padding_value=0).transpose(1, 2)
    lyrics = [data[1] for data in data_list]
    st_et = [data[2] for data in data_list]
    paths = [data[3] for data in data_list]
    lang_types = [data[4] for data in data_list]
    return audios, lyrics, st_et, lang_types
    # return audios, lyrics, st_et


def build_dataset():
    train_dataset = WYYSongDataset(
        metadata_path = "train.jsonl",
        sr = 48000,
        use_lang = ['zh-cn', 'en'],
        num_examples = 10*10000
    )

    valid_dataset = WYYSongDataset(
        metadata_path = "valid.jsonl",
        sr = 48000,
        use_lang = ['zh-cn', 'en'],
        num_examples = 500
    )

    return train_dataset, valid_dataset
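A short sketch of how this dataset is typically consumed. The jsonl record layout is an assumption inferred from the keys read in __getitem__ above, and the padding helper below re-implements pad_2d_tensor standalone so the snippet runs without audio files.

import torch

# Assumed shape of one metadata record (inferred from the keys used above, not real data):
# {"path": "/data/songs/0001.flac", "lang_type": "en",
#  "lyrics": ["[12.5:28.0]first segment ...", "[28.0:45.5]second segment ..."]}

def pad_2d_tensor(x, max_len, pad_id):
    # Standalone copy of WYYSongDataset.pad_2d_tensor, for illustration only.
    batch_size, seq_len = x.size()
    max_len = max(max_len, seq_len)
    pad_len = max_len - seq_len
    if pad_len > 0:
        pad = torch.full((batch_size, pad_len), pad_id, dtype=x.dtype, device=x.device)
        return torch.cat([x, pad], dim=1)
    return x

chunk = torch.randn(1, 48000 * 12)             # 12 s of mono audio at 48 kHz
padded = pad_2d_tensor(chunk, 48000 * 20, 0)   # what pad_to_max does with max_dur=20, sr=48000
print(padded.shape)                            # torch.Size([1, 960000])

# Batching (assuming the module is importable):
# loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, collate_fn=collect_data)
# audios, lyrics, durations, lang_types = next(iter(loader))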
codeclm/tokenizer/Flow1dVAE/libs/datasets/dataset_stock.py
ADDED
@@ -0,0 +1,461 @@
1 |
+
from torch.utils.data import Dataset
|
2 |
+
from beartype.typing import Sequence, Callable, Optional, Dict, List
|
3 |
+
from beartype.door import is_bearable
|
4 |
+
import random
|
5 |
+
import os
|
6 |
+
from torchaudio.functional import resample
|
7 |
+
import torch
|
8 |
+
import typing as tp
|
9 |
+
from pathlib import Path
|
10 |
+
import torchaudio as ta
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import soundfile
|
13 |
+
import numpy as np
|
14 |
+
import json
|
15 |
+
import yaml
|
16 |
+
import random
|
17 |
+
import librosa
|
18 |
+
from loguru import logger
|
19 |
+
import re
|
20 |
+
|
21 |
+
|
22 |
+
def _av_read(filepath, seek_time=0, duration=None):
|
23 |
+
if duration is not None:
|
24 |
+
sr = librosa.get_samplerate(filepath)
|
25 |
+
offset = seek_time
|
26 |
+
num_samples = int(duration * sr)
|
27 |
+
wav, _ = librosa.load(filepath, sr=sr, offset=offset, duration=duration)
|
28 |
+
else:
|
29 |
+
wav, sr = librosa.load(filepath, sr=None, offset=seek_time)
|
30 |
+
|
31 |
+
return wav, sr
|
32 |
+
|
33 |
+
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
|
34 |
+
duration: float = -1., pad: bool = True) -> tp.Tuple[torch.Tensor, int]:
|
35 |
+
"""Read audio by picking the most appropriate backend tool based on the audio format.
|
36 |
+
|
37 |
+
Args:
|
38 |
+
filepath (str or Path): Path to audio file to read.
|
39 |
+
seek_time (float): Time at which to start reading in the file.
|
40 |
+
duration (float): Duration to read from the file. If set to -1, the whole file is read.
|
41 |
+
pad (bool): Pad output audio if not reaching expected duration.
|
42 |
+
Returns:
|
43 |
+
tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
|
44 |
+
"""
|
45 |
+
fp = Path(filepath)
|
46 |
+
if fp.suffix in ['.flac', '.ogg']: # TODO: check if we can safely use av_read for .ogg
|
47 |
+
# There is some bug with ffmpeg and reading flac
|
48 |
+
info = soundfile.info(filepath)
|
49 |
+
frames = -1 if duration <= 0 else int(duration * info.samplerate)
|
50 |
+
frame_offset = int(seek_time * info.samplerate)
|
51 |
+
wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
|
52 |
+
assert info.samplerate == sr, f"Mismatch of sample rates {info.samplerate} {sr}"
|
53 |
+
wav = torch.from_numpy(wav).t().contiguous()
|
54 |
+
if len(wav.shape) == 1:
|
55 |
+
wav = torch.unsqueeze(wav, 0)
|
56 |
+
elif (
|
57 |
+
fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
|
58 |
+
and duration <= 0 and seek_time == 0
|
59 |
+
):
|
60 |
+
# Torchaudio is faster if we load an entire file at once.
|
61 |
+
wav, sr = librosa.load(fp, sr=None, mono=True)
|
62 |
+
else:
|
63 |
+
wav, sr = _av_read(filepath, seek_time, duration)
|
64 |
+
if pad and duration > 0:
|
65 |
+
expected_frames = int(duration * sr)
|
66 |
+
wav = F.pad(torch.tensor(wav), (0, expected_frames - wav.shape[-1]))
|
67 |
+
if not isinstance(wav, torch.Tensor):
|
68 |
+
wav = torch.tensor(wav)
|
69 |
+
return wav, sr
|
70 |
+
|
71 |
+
def random_seek_read(filepath, duration):
|
72 |
+
if duration > 0:
|
73 |
+
total_duration = librosa.get_duration(path=filepath)
|
74 |
+
acceptable_start = max(0, total_duration - duration)
|
75 |
+
wav, sr = audio_read(filepath, random.uniform(0, acceptable_start), duration, pad=True)
|
76 |
+
else:
|
77 |
+
wav, sr = audio_read(filepath, 0, -1, pad=False)
|
78 |
+
return wav, sr
|
79 |
+
|
80 |
+
def safe_random_seek_read(filepath, duration, sample_rate):
|
81 |
+
try:
|
82 |
+
wav, sr = random_seek_read(filepath, duration)
|
83 |
+
if sr != sample_rate:
|
84 |
+
wav = resample(wav, sr, sample_rate)
|
85 |
+
sr = sample_rate
|
86 |
+
except Exception as e:
|
87 |
+
logger.error(f"Error reading {filepath}: {e}")
|
88 |
+
sr = sample_rate
|
89 |
+
wav = torch.zeros(sr * max(duration, 0), dtype=torch.float32)
|
90 |
+
return wav, sr
|
91 |
+
|
92 |
+
def read_jsonlike(path: os.PathLike):
|
93 |
+
#json or jsonl
|
94 |
+
if str(path).endswith(".json"):
|
95 |
+
with open(path, 'r', encoding='utf8') as f:
|
96 |
+
data = json.load(f)
|
97 |
+
return data
|
98 |
+
elif str(path).endswith(".jsonl"):
|
99 |
+
with open(path, 'r', encoding='utf8') as f:
|
100 |
+
data = [json.loads(line) for line in f.readlines()]
|
101 |
+
return data
|
102 |
+
else:
|
103 |
+
raise ValueError("Unknown file format")
|
104 |
+
|
105 |
+
dist_prob_map = {
|
106 |
+
1: (1.0,),
|
107 |
+
2: (0.5, 0.5),
|
108 |
+
3: (0.3, 0.4, 0.3),
|
109 |
+
4: (0.2, 0.3, 0.3, 0.2),
|
110 |
+
5: (0.2, 0.2, 0.3, 0.2, 0.1),
|
111 |
+
6: (0.1, 0.15, 0.2, 0.2, 0.2, 0.15),
|
112 |
+
7: (0.05, 0.1, 0.1, 0.2, 0.25, 0.2, 0.1),
|
113 |
+
8: (0.03, 0.05, 0.1, 0.15, 0.25, 0.2, 0.1, 0.12),
|
114 |
+
9: (0.02, 0.1, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.08),
|
115 |
+
10: (0.01, 0.1, 0.1, 0.15, 0.2, 0.15, 0.1, 0.05, 0.05, 0.09)
|
116 |
+
}
|
117 |
+
|
118 |
+
dist_prob_map_low = {
|
119 |
+
1: (1.0,),
|
120 |
+
2: (0.8, 0.2),
|
121 |
+
3: (0.8, 0.1, 0.1),
|
122 |
+
4: (0.7, 0.1, 0.1, 0.1),
|
123 |
+
5: (0.7, 0.1, 0.1, 0.05, 0.05),
|
124 |
+
6: (0.7, 0.1, 0.05, 0.05, 0.05, 0.05),
|
125 |
+
}
|
126 |
+
|
127 |
+
|
128 |
+
_bpm_range_rights = (
|
129 |
+
(40, '20-40'),
|
130 |
+
(60, '40-60'),
|
131 |
+
(66, '60-66'),
|
132 |
+
(76, '66-76'),
|
133 |
+
(108, '76-108'),
|
134 |
+
(120, '108-120'),
|
135 |
+
(168, '120-168'),
|
136 |
+
(176, '168-176'),
|
137 |
+
(200, '176-200')
|
138 |
+
)
|
139 |
+
_bpm_desc_map = {
|
140 |
+
'20-40': ("glacial pace", "extremely slow tempo", "crawl-like speed", "snail's pace", "almost motionless rhythm", "Larghissimo"),
|
141 |
+
'40-60': ("broad and slow", "spacious tempo", "unhurried pace", "calm rhythm", "relaxed speed", "Largo"),
|
142 |
+
'60-66': ("gentle tempo", "leisurely pace", "easy-going rhythm", "unrushed speed", "smooth and slow", 'Larghetto'),
|
143 |
+
'66-76': ("slow and steady", "deliberate tempo", "unhurried pace", "relaxed rhythm", "easy speed", 'Adagio'),
|
144 |
+
'76-108': ("walking pace", "moderate tempo", "steady rhythm", "balanced speed", "easy-flowing tempo", "Andante"),
|
145 |
+
'108-120': ("medium pace", "comfortable tempo", "even rhythm", "measured speed", "controlled tempo", 'Moderato'),
|
146 |
+
'120-168': ("quick and lively", "brisk pace", "energetic tempo", "upbeat rhythm", "spirited speed", 'Allegro'),
|
147 |
+
'168-176': ("lively and fast", "bright tempo", "sprightly pace", "vibrant rhythm", "animated speed", 'Vivace'),
|
148 |
+
'176-200': ("very fast tempo", "rapid pace", "high-speed rhythm", "hurried speed", "accelerated tempo", 'Presto'),
|
149 |
+
'>200': ("extremely fast", "breakneck speed", "blazing tempo", "lightning-fast rhythm", "supercharged pace", 'Prestissimo')
|
150 |
+
}
|
151 |
+
_bpm_desc_map_zh = {
|
152 |
+
'20-40': ("极度缓慢", "极慢的节奏", "悠长的旋律", "迟缓的节奏", "几乎静止的节奏", "甚缓"),
|
153 |
+
'40-60': ("宽广而缓慢", "宽敞的节奏", "从容不迫的速度", "平静的节奏", "轻松的速度", "广板"),
|
154 |
+
'60-66': ("柔和的节奏", "悠闲的速度", "轻松的节奏", "不慌不忙的速度", "平滑而缓慢", '小广板'),
|
155 |
+
'66-76': ("缓慢而稳定", "沉稳的旋律", "从容不迫的速度", "轻松的节奏", "轻松的速度", '慢板'),
|
156 |
+
'76-108': ("步行速度", "适中的节奏", "稳定的节奏", "平衡的速度", "流畅的节奏", "行板"),
|
157 |
+
'108-120': ("中等速度", "舒适的节奏", "均匀的节奏", "有节制的速度", "稳定的氛围", '中板'),
|
158 |
+
'120-168': ("快速而生动", "轻快的速度", "充满活力的节奏", "欢快的节奏", "富有精神的速度", '快板'),
|
159 |
+
'168-176': ("生动而快速", "明快的节奏", "活泼的速度", "充满活力的节奏", "生气勃勃的速度", '活泼的'),
|
160 |
+
'176-200': ("非常快的节奏", "快速的速度", "高速的节奏", "匆忙的速度", "加速的节奏", '急板'),
|
161 |
+
'>200': ("极快的速度", "极速旋律", "炽热的节奏", "闪电般的节奏", "疾驰的速度", '最急板')
|
162 |
+
}
|
163 |
+
def get_bpm_range(bpm):
|
164 |
+
bpm = int(bpm)
|
165 |
+
for right, tag in _bpm_range_rights:
|
166 |
+
if bpm <= right:
|
167 |
+
return tag
|
168 |
+
return '>200'
|
169 |
+
|
170 |
+
def gen_bpm_descript(bpm, lang='en'):
|
171 |
+
bpm_range = get_bpm_range(bpm)
|
172 |
+
if lang == 'en':
|
173 |
+
return random.choice(_bpm_desc_map[bpm_range])
|
174 |
+
elif lang == 'zh':
|
175 |
+
return random.choice(_bpm_desc_map_zh[bpm_range])
|
176 |
+
else:
|
177 |
+
raise ValueError(f"Unknown language {lang}")
|
178 |
+
|
179 |
+
def read_translate(translate: Optional[Dict[str, os.PathLike]]):
|
180 |
+
if translate is None:
|
181 |
+
return None
|
182 |
+
return {k: read_jsonlike(path) for k, path in translate.items()}
|
183 |
+
|
184 |
+
|
185 |
+
def tags_to_desc(tag_list, sep=',') -> str:
|
186 |
+
if not isinstance(tag_list, Sequence):
|
187 |
+
return str(tag_list)
|
188 |
+
if isinstance(tag_list, str):
|
189 |
+
return tag_list
|
190 |
+
if len(tag_list) <= 0:
|
191 |
+
return ''
|
192 |
+
elif len(tag_list) <= 5:
|
193 |
+
probs = dist_prob_map[len(tag_list)]
|
194 |
+
tags_num = random.choices(range(1, len(tag_list)+1), probs)[0]
|
195 |
+
random.shuffle(tag_list)
|
196 |
+
tag_list = tag_list[:tags_num]
|
197 |
+
return sep.join(tag_list)
|
198 |
+
else:
|
199 |
+
probs = dist_prob_map[5]
|
200 |
+
tags_num = random.choices(range(1, 6), probs)[0]
|
201 |
+
random.shuffle(tag_list)
|
202 |
+
tag_list = tag_list[:tags_num]
|
203 |
+
return sep.join(tag_list)
|
204 |
+
|
205 |
+
|
206 |
+
class PromptTemplate:
|
207 |
+
def __init__(self, template_text: str, tag_map: Dict[str, str], lang:str ='en'):
|
208 |
+
self.template_text = template_text
|
209 |
+
self.tag_map = tag_map
|
210 |
+
self.lang = lang
|
211 |
+
|
212 |
+
@property
|
213 |
+
def tags(self):
|
214 |
+
return tuple(self.tag_map.keys())
|
215 |
+
|
216 |
+
def apply(self, **kwargs):
|
217 |
+
for tag in list(kwargs.keys()):
|
218 |
+
if kwargs[tag] == '':
|
219 |
+
kwargs.pop(tag)
|
220 |
+
for tag in self.tags:
|
221 |
+
if tag in kwargs:
|
222 |
+
kwargs[tag] = self.tag_map[tag].format(**{tag: kwargs[tag]}).strip('[]')
|
223 |
+
else:
|
224 |
+
kwargs[tag] = ''
|
225 |
+
prompt = self.template_text.format(**kwargs)
|
226 |
+
|
227 |
+
return self.beautify(prompt)
|
228 |
+
|
229 |
+
def beautify(self, text):
|
230 |
+
if self.lang == 'en':
|
231 |
+
return self._beautify_en(text)
|
232 |
+
elif self.lang == 'zh':
|
233 |
+
return self._beautify_zh(text)
|
234 |
+
else:
|
235 |
+
raise ValueError(f'Unknown language {self.lang}')
|
236 |
+
|
237 |
+
@staticmethod
|
238 |
+
def _beautify_en(text):
|
239 |
+
# no continuous commas without content between them
|
240 |
+
text = re.sub(r'[,\s]*,[,\s]*', r', ', text)
|
241 |
+
# no continuous whitespace
|
242 |
+
text = re.sub(r'\s+', ' ', text)
|
243 |
+
# the comma is NOT followed by whitespace, and should be followed by ONE whitespace
|
244 |
+
text = re.sub(r'\s+,', r',', text)
|
245 |
+
text = re.sub(r',\s+', r', ', text)
|
246 |
+
# no whitespace before the full stop
|
247 |
+
text = re.sub(r'\s+\.', r'.', text)
|
248 |
+
# strip whitespace, comma, and replace ',.'
|
249 |
+
text = text.strip(' ,')
|
250 |
+
text = text.replace(',.', '.')
|
251 |
+
return text
|
252 |
+
|
253 |
+
@staticmethod
|
254 |
+
def _beautify_zh(text):
|
255 |
+
# no continuous commas without content between them
|
256 |
+
text = re.sub(r'[,、\s]*,[,、\s]*', r',', text)
|
257 |
+
text = re.sub(r'[,、\s]*、[,、\s]*', r'、', text)
|
258 |
+
# assume there should be NO whitespace in Chinese
|
259 |
+
text = re.sub(r'\s+', r'', text)
|
260 |
+
# strip whitespace, comma, and replace ',。'
|
261 |
+
text = text.strip(', 、')
|
262 |
+
text = text.replace(',。', '。')
|
263 |
+
return text
|
264 |
+
|
265 |
+
def __repr__(self):
|
266 |
+
return f'PromptTemplate({self.template_text!r}, {self.tag_map!r})'
|
267 |
+
|
268 |
+
__str__ = __repr__
|
269 |
+
|
270 |
+
def parse_prompt_template(prompt_template_text, lang='en'):
|
271 |
+
span_pattern = re.compile(r'\[.*?{.+?}.*?\]', re.DOTALL)
|
272 |
+
tag_pattern = re.compile(r'{.+?}', re.DOTALL)
|
273 |
+
|
274 |
+
template_text = prompt_template_text.strip()
|
275 |
+
span_texts = span_pattern.findall(prompt_template_text)
|
276 |
+
tag_map = {}
|
277 |
+
for span_text in span_texts:
|
278 |
+
tag = tag_pattern.findall(span_text)[0].strip('{}')
|
279 |
+
tag_map[tag] = span_text
|
280 |
+
template_text = template_text.replace(span_text, '{'+tag+'}')
|
281 |
+
|
282 |
+
return PromptTemplate(template_text=template_text, tag_map=tag_map, lang=lang)
|
283 |
+
|
284 |
+
def load_prompt_templates(path, num = 5, lang='en') -> List[PromptTemplate]:
|
285 |
+
with open(path, 'r') as f:
|
286 |
+
lines = f.readlines()
|
287 |
+
cnt = 0
|
288 |
+
pts = []
|
289 |
+
for line in lines:
|
290 |
+
pt = parse_prompt_template(line, lang=lang)
|
291 |
+
cnt += 1
|
292 |
+
if len(pt.tags) < num:
|
293 |
+
logger.error(f'Not enough tags on {path} in line {cnt}: {pt.tags}')
|
294 |
+
pts.append(pt)
|
295 |
+
|
296 |
+
return pts
|
297 |
+
|
298 |
+
|
299 |
+
class AudioStockDataset(Dataset):
|
300 |
+
def __init__(self,
|
301 |
+
num_examples:int,
|
302 |
+
metadata_path:str,
|
303 |
+
duration:float=60,
|
304 |
+
sr:int = 0,
|
305 |
+
return_path = False,
|
306 |
+
return_audio = True,
|
307 |
+
prompt_template_path: os.PathLike = None,
|
308 |
+
tag_types = [],
|
309 |
+
lang = 'en',
|
310 |
+
translate:Optional[Dict[str, os.PathLike]] = None
|
311 |
+
):
|
312 |
+
self.duration = duration
|
313 |
+
self.MAX_DURATION = 360
|
314 |
+
self._load_metadata(metadata_path)
|
315 |
+
if num_examples > 0:
|
316 |
+
self.random_choose = True
|
317 |
+
self.dataset_len = num_examples
|
318 |
+
else:
|
319 |
+
self.random_choose = False
|
320 |
+
self.dataset_len = len(self.data)
|
321 |
+
self.sr = sr
|
322 |
+
self.return_path = return_path
|
323 |
+
self.return_audio = return_audio
|
324 |
+
|
325 |
+
self.use_dynamic_prompt = prompt_template_path is not None
|
326 |
+
if self.use_dynamic_prompt:
|
327 |
+
self.prompt_templates = load_prompt_templates(prompt_template_path, num = len(tag_types), lang = lang)
|
328 |
+
self.tag_types = tag_types
|
329 |
+
|
330 |
+
self.lang = lang
|
331 |
+
self.translate = read_translate(translate)
|
332 |
+
|
333 |
+
def _load_metadata(self, metadata_path):
|
334 |
+
total_len = 0; valid_len = 0
|
335 |
+
with open(metadata_path) as fp:
|
336 |
+
lines = fp.readlines()
|
337 |
+
self.data = []
|
338 |
+
for line in lines:
|
339 |
+
item = json.loads(line)
|
340 |
+
total_len += 1
|
341 |
+
if(item['duration']>self.duration and item['duration']<self.MAX_DURATION):
|
342 |
+
valid_len += 1
|
343 |
+
self.data.append(item)
|
344 |
+
print("Filter data from {} to {}".format(total_len, valid_len))
|
345 |
+
self.is_info_recorded = bool('Tags' in self.data[0])
|
346 |
+
|
347 |
+
def __len__(self):
|
348 |
+
return self.dataset_len
|
349 |
+
|
350 |
+
def __getitem__(self, idx):
|
351 |
+
first_try = True
|
352 |
+
try_cnt = 0
|
353 |
+
while True:
|
354 |
+
try:
|
355 |
+
if(self.random_choose or not first_try):
|
356 |
+
index2 = np.random.randint(0,len(self.data))
|
357 |
+
else:
|
358 |
+
index2 = idx
|
359 |
+
first_try = False
|
360 |
+
return self.getitem_main(index2)
|
361 |
+
except:
|
362 |
+
print("Error loading ", self.data[idx]["path"])
|
363 |
+
try_cnt += 1
|
364 |
+
if(try_cnt>10):
|
365 |
+
raise ValueError()
|
366 |
+
|
367 |
+
def getitem_main(self, idx):
|
368 |
+
path:str = self.data[idx]["path"]
|
369 |
+
json_path = path[:path.rfind('.')] + ".json"
|
370 |
+
if self.is_info_recorded:
|
371 |
+
item = self.data[idx]
|
372 |
+
else:
|
373 |
+
with open(json_path) as fp:
|
374 |
+
item:dict = json.load(fp)
|
375 |
+
description = self.generate_description(item)
|
376 |
+
if self.return_audio:
|
377 |
+
audio, sr = safe_random_seek_read(path, duration=self.duration, sample_rate=self.sr)
|
378 |
+
else:
|
379 |
+
audio = None
|
380 |
+
if self.return_path:
|
381 |
+
return audio, description, path
|
382 |
+
return audio, description
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
def generate_description(self, item):
|
387 |
+
if self.use_dynamic_prompt:
|
388 |
+
# dynamically generate prompt from given prompt template
|
389 |
+
prompt_template = random.choice(self.prompt_templates)
|
390 |
+
description = self.generate_description_dynamic(item, prompt_template)
|
391 |
+
else:
|
392 |
+
# use ordinary static prompt instead
|
393 |
+
description = self.generate_description_ordinary(item)
|
394 |
+
return description
|
395 |
+
|
396 |
+
def generate_description_dynamic(self, data, prompt_template: PromptTemplate):
|
397 |
+
exists_tag = [key for key in data if (key in self.tag_types) and (data[key] is not None) and (len(data[key]) > 0)]
|
398 |
+
|
399 |
+
if len(exists_tag) > 0:
|
400 |
+
probs = dist_prob_map[len(exists_tag)]
|
401 |
+
tags_num = random.choices(range(1, len(exists_tag)+1), probs)[0]
|
402 |
+
random.shuffle(exists_tag)
|
403 |
+
tags = exists_tag[:tags_num]
|
404 |
+
tags_args = {tag: self.tags_to_desc(data[tag], tag) for tag in tags}
|
405 |
+
tags_args = self.handle_BPM_tag(tags_args)
|
406 |
+
prompt = prompt_template.apply(**tags_args)
|
407 |
+
else:
|
408 |
+
# no strong tags, use all weak tags instead
|
409 |
+
prompt = prompt_template.apply()
|
410 |
+
|
411 |
+
return prompt
|
412 |
+
|
413 |
+
def tags_to_desc(self, tag_list, tag_type) -> str:
|
414 |
+
if self.lang == 'en':
|
415 |
+
return tags_to_desc(tag_list)
|
416 |
+
elif self.lang == 'zh':
|
417 |
+
if tag_type == 'BPM':
|
418 |
+
return tags_to_desc(tag_list, sep='、')
|
419 |
+
translator = self.translate[tag_type]
|
420 |
+
translated_tag_list = [translator[tag] for tag in tag_list if tag in translator ]
|
421 |
+
return tags_to_desc(translated_tag_list, sep='、')
|
422 |
+
|
423 |
+
def handle_BPM_tag(self, tags_args):
|
424 |
+
if "BPM" in tags_args and 'BPMDescript' in self.tag_types:
|
425 |
+
bpm = tags_args["BPM"]
|
426 |
+
del tags_args["BPM"]
|
427 |
+
tag_types_used = random.choice((('BPM',), ('BPMDescript',), ('BPM', 'BPMDescript')))
|
428 |
+
for tag_type in tag_types_used:
|
429 |
+
tags_args[tag_type] = bpm if tag_type == 'BPM' else gen_bpm_descript(bpm, lang=self.lang)
|
430 |
+
return tags_args
|
431 |
+
|
432 |
+
def generate_description_ordinary(self, data, thresh = 0.3):
|
433 |
+
if self.lang != 'en':
|
434 |
+
raise ValueError(f'Language {self.lang} is not supported for ordinary description generation')
|
435 |
+
description = f'a piece of music by {data["Artist"]}'
|
436 |
+
|
437 |
+
# Add genre if available
|
438 |
+
if data["Genre"] and random.random() > thresh:
|
439 |
+
genres = ', '.join(data["Genre"])
|
440 |
+
description += f', belonging to the {genres} genres'
|
441 |
+
|
442 |
+
# Add moods if available
|
443 |
+
if data["Tags"] and random.random() > thresh:
|
444 |
+
tags = ', '.join(data["Tags"])
|
445 |
+
description += f'. This track contains the tags: {tags}'
|
446 |
+
|
447 |
+
# Add moods if available
|
448 |
+
if data["Mood"] and random.random() > thresh:
|
449 |
+
moods = ', '.join(data["Mood"])
|
450 |
+
description += f'. This track conveys a {moods} mood.'
|
451 |
+
|
452 |
+
# Add instruments if available
|
453 |
+
if data["Instrument"] and random.random() > thresh:
|
454 |
+
instruments = ', '.join(data["Instrument"])
|
455 |
+
description += f', and primarily features the following instruments: {instruments}'
|
456 |
+
|
457 |
+
# Add a period to end the description
|
458 |
+
description += '.'
|
459 |
+
|
460 |
+
return description
|
461 |
+
|
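A usage sketch for the prompt-template machinery defined above. The template string here is an illustrative assumption, not one of the repository's actual templates, and the snippet assumes parse_prompt_template from this file is in scope (the exact import path depends on packaging).

# from <this module> import parse_prompt_template   # import path is an assumption

line = "a piece of music[ in the {Genre} style][, played on {Instrument}][, with a {Mood} mood]."
pt = parse_prompt_template(line, lang='en')
print(pt.tags)   # ('Genre', 'Instrument', 'Mood')

# Tags passed as '' (or omitted) are dropped from the rendered prompt,
# and beautify() cleans up the leftover punctuation and spacing.
print(pt.apply(Genre="jazz", Instrument="piano, double bass", Mood=""))
# -> a piece of music in the jazz style, played on piano, double bass.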
codeclm/tokenizer/Flow1dVAE/libs/fsq/fsq.py
ADDED
@@ -0,0 +1,236 @@
1 |
+
"""
|
2 |
+
Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
|
3 |
+
Code adapted from Jax version in Appendix A.1
|
4 |
+
"""
|
5 |
+
|
6 |
+
from __future__ import annotations
|
7 |
+
from functools import wraps, partial
|
8 |
+
from contextlib import nullcontext
|
9 |
+
from typing import List, Tuple
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
from torch.nn import Module
|
14 |
+
from torch import Tensor, int32
|
15 |
+
from torch.amp import autocast
|
16 |
+
|
17 |
+
from einops import rearrange, pack, unpack
|
18 |
+
|
19 |
+
# helper functions
|
20 |
+
|
21 |
+
def exists(v):
|
22 |
+
return v is not None
|
23 |
+
|
24 |
+
def default(*args):
|
25 |
+
for arg in args:
|
26 |
+
if exists(arg):
|
27 |
+
return arg
|
28 |
+
return None
|
29 |
+
|
30 |
+
def maybe(fn):
|
31 |
+
@wraps(fn)
|
32 |
+
def inner(x, *args, **kwargs):
|
33 |
+
if not exists(x):
|
34 |
+
return x
|
35 |
+
return fn(x, *args, **kwargs)
|
36 |
+
return inner
|
37 |
+
|
38 |
+
def pack_one(t, pattern):
|
39 |
+
return pack([t], pattern)
|
40 |
+
|
41 |
+
def unpack_one(t, ps, pattern):
|
42 |
+
return unpack(t, ps, pattern)[0]
|
43 |
+
|
44 |
+
# tensor helpers
|
45 |
+
|
46 |
+
def round_ste(z: Tensor) -> Tensor:
|
47 |
+
"""Round with straight through gradients."""
|
48 |
+
zhat = z.round()
|
49 |
+
return z + (zhat - z).detach()
|
50 |
+
|
51 |
+
# main class
|
52 |
+
|
53 |
+
class FSQ(Module):
|
54 |
+
def __init__(
|
55 |
+
self,
|
56 |
+
levels: List[int],
|
57 |
+
dim: int | None = None,
|
58 |
+
num_codebooks = 1,
|
59 |
+
keep_num_codebooks_dim: bool | None = None,
|
60 |
+
scale: float | None = None,
|
61 |
+
allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
|
62 |
+
channel_first: bool = False,
|
63 |
+
projection_has_bias: bool = True,
|
64 |
+
return_indices = True,
|
65 |
+
force_quantization_f32 = True
|
66 |
+
):
|
67 |
+
super().__init__()
|
68 |
+
_levels = torch.tensor(levels, dtype=int32)
|
69 |
+
self.register_buffer("_levels", _levels, persistent = False)
|
70 |
+
|
71 |
+
_basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
|
72 |
+
self.register_buffer("_basis", _basis, persistent = False)
|
73 |
+
|
74 |
+
self.scale = scale
|
75 |
+
|
76 |
+
codebook_dim = len(levels)
|
77 |
+
self.codebook_dim = codebook_dim
|
78 |
+
|
79 |
+
effective_codebook_dim = codebook_dim * num_codebooks
|
80 |
+
self.num_codebooks = num_codebooks
|
81 |
+
self.effective_codebook_dim = effective_codebook_dim
|
82 |
+
|
83 |
+
keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
|
84 |
+
assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
|
85 |
+
self.keep_num_codebooks_dim = keep_num_codebooks_dim
|
86 |
+
|
87 |
+
self.dim = default(dim, len(_levels) * num_codebooks)
|
88 |
+
|
89 |
+
self.channel_first = channel_first
|
90 |
+
|
91 |
+
has_projections = self.dim != effective_codebook_dim
|
92 |
+
self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = projection_has_bias) if has_projections else nn.Identity()
|
93 |
+
self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = projection_has_bias) if has_projections else nn.Identity()
|
94 |
+
|
95 |
+
self.has_projections = has_projections
|
96 |
+
|
97 |
+
self.return_indices = return_indices
|
98 |
+
if return_indices:
|
99 |
+
self.codebook_size = self._levels.prod().item()
|
100 |
+
implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
|
101 |
+
self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)
|
102 |
+
|
103 |
+
self.allowed_dtypes = allowed_dtypes
|
104 |
+
self.force_quantization_f32 = force_quantization_f32
|
105 |
+
|
106 |
+
def bound(self, z, eps: float = 1e-3):
|
107 |
+
""" Bound `z`, an array of shape (..., d). """
|
108 |
+
half_l = (self._levels - 1) * (1 + eps) / 2
|
109 |
+
offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
|
110 |
+
shift = (offset / half_l).atanh()
|
111 |
+
return (z + shift).tanh() * half_l - offset
|
112 |
+
|
113 |
+
def quantize(self, z):
|
114 |
+
""" Quantizes z, returns quantized zhat, same shape as z. """
|
115 |
+
quantized = round_ste(self.bound(z))
|
116 |
+
half_width = self._levels // 2 # Renormalize to [-1, 1].
|
117 |
+
return quantized / half_width
|
118 |
+
|
119 |
+
def _scale_and_shift(self, zhat_normalized):
|
120 |
+
half_width = self._levels // 2
|
121 |
+
return (zhat_normalized * half_width) + half_width
|
122 |
+
|
123 |
+
def _scale_and_shift_inverse(self, zhat):
|
124 |
+
half_width = self._levels // 2
|
125 |
+
return (zhat - half_width) / half_width
|
126 |
+
|
127 |
+
def _indices_to_codes(self, indices):
|
128 |
+
level_indices = self.indices_to_level_indices(indices)
|
129 |
+
codes = self._scale_and_shift_inverse(level_indices)
|
130 |
+
return codes
|
131 |
+
|
132 |
+
def codes_to_indices(self, zhat):
|
133 |
+
""" Converts a `code` to an index in the codebook. """
|
134 |
+
assert zhat.shape[-1] == self.codebook_dim
|
135 |
+
zhat = self._scale_and_shift(zhat)
|
136 |
+
return (zhat * self._basis).sum(dim=-1).to(int32)
|
137 |
+
|
138 |
+
def indices_to_level_indices(self, indices):
|
139 |
+
""" Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
|
140 |
+
indices = rearrange(indices, '... -> ... 1')
|
141 |
+
codes_non_centered = (indices // self._basis) % self._levels
|
142 |
+
return codes_non_centered
|
143 |
+
|
144 |
+
def indices_to_codes(self, indices):
|
145 |
+
""" Inverse of `codes_to_indices`. """
|
146 |
+
assert exists(indices)
|
147 |
+
|
148 |
+
is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
|
149 |
+
|
150 |
+
codes = self._indices_to_codes(indices)
|
151 |
+
|
152 |
+
if self.keep_num_codebooks_dim:
|
153 |
+
codes = rearrange(codes, '... c d -> ... (c d)')
|
154 |
+
|
155 |
+
codes = self.project_out(codes)
|
156 |
+
|
157 |
+
if is_img_or_video or self.channel_first:
|
158 |
+
codes = rearrange(codes, 'b ... d -> b d ...')
|
159 |
+
|
160 |
+
return codes
|
161 |
+
|
162 |
+
def forward(self, z):
|
163 |
+
"""
|
164 |
+
einstein notation
|
165 |
+
b - batch
|
166 |
+
n - sequence (or flattened spatial dimensions)
|
167 |
+
d - feature dimension
|
168 |
+
c - number of codebook dim
|
169 |
+
"""
|
170 |
+
|
171 |
+
is_img_or_video = z.ndim >= 4
|
172 |
+
need_move_channel_last = is_img_or_video or self.channel_first
|
173 |
+
|
174 |
+
# standardize image or video into (batch, seq, dimension)
|
175 |
+
|
176 |
+
if need_move_channel_last:
|
177 |
+
z = rearrange(z, 'b d ... -> b ... d')
|
178 |
+
z, ps = pack_one(z, 'b * d')
|
179 |
+
|
180 |
+
assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'
|
181 |
+
|
182 |
+
z = self.project_in(z)
|
183 |
+
|
184 |
+
z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
|
185 |
+
|
186 |
+
# whether to force quantization step to be full precision or not
|
187 |
+
|
188 |
+
force_f32 = self.force_quantization_f32
|
189 |
+
quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext
|
190 |
+
|
191 |
+
with quantization_context():
|
192 |
+
orig_dtype = z.dtype
|
193 |
+
|
194 |
+
if force_f32 and orig_dtype not in self.allowed_dtypes:
|
195 |
+
z = z.float()
|
196 |
+
|
197 |
+
codes = self.quantize(z)
|
198 |
+
|
199 |
+
# returning indices could be optional
|
200 |
+
|
201 |
+
indices = None
|
202 |
+
|
203 |
+
if self.return_indices:
|
204 |
+
indices = self.codes_to_indices(codes)
|
205 |
+
|
206 |
+
codes = rearrange(codes, 'b n c d -> b n (c d)')
|
207 |
+
|
208 |
+
codes = codes.type(orig_dtype)
|
209 |
+
|
210 |
+
# project out
|
211 |
+
|
212 |
+
out = self.project_out(codes)
|
213 |
+
|
214 |
+
# reconstitute image or video dimensions
|
215 |
+
|
216 |
+
if need_move_channel_last:
|
217 |
+
out = unpack_one(out, ps, 'b * d')
|
218 |
+
out = rearrange(out, 'b ... d -> b d ...')
|
219 |
+
|
220 |
+
indices = maybe(unpack_one)(indices, ps, 'b * c')
|
221 |
+
|
222 |
+
if not self.keep_num_codebooks_dim and self.return_indices:
|
223 |
+
indices = maybe(rearrange)(indices, '... 1 -> ...')
|
224 |
+
|
225 |
+
# return quantized output and indices
|
226 |
+
|
227 |
+
return out, indices
|
228 |
+
|
229 |
+
|
230 |
+
if __name__ == '__main__':
|
231 |
+
# test
|
232 |
+
fsq = FSQ([4, 4, 4],dim=1024)
|
233 |
+
z = torch.randn(2, 3, 1024)
|
234 |
+
out, indices = fsq(z)
|
235 |
+
print(out.shape, indices.shape)
|
236 |
+
# print(out, indices)
|
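A small worked example of the mixed-radix packing done by codes_to_indices above. levels=[8, 5, 5, 5] is an illustrative choice (codebook size 8*5*5*5 = 1000), and the snippet assumes the FSQ class from this file is in scope.

import torch

fsq = FSQ(levels=[8, 5, 5, 5])                 # codebook_size = 8 * 5 * 5 * 5 = 1000
digits = torch.tensor([3, 1, 4, 2])            # one level index per code dimension
half_width = torch.tensor([8, 5, 5, 5]) // 2
codes = (digits - half_width) / half_width     # the normalized code grid FSQ works on, in [-1, 1]

index = fsq.codes_to_indices(codes)
# _basis = [1, 8, 40, 200], so index = 3*1 + 1*8 + 4*40 + 2*200 = 571
print(index.item())                            # 571
print(fsq.indices_to_level_indices(index))     # tensor([3, 1, 4, 2], dtype=torch.int32)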
codeclm/tokenizer/Flow1dVAE/libs/rvq/core_vq.py
ADDED
@@ -0,0 +1,366 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
#
|
7 |
+
# This implementation is inspired from
|
8 |
+
# https://github.com/lucidrains/vector-quantize-pytorch
|
9 |
+
# which is released under MIT License. Hereafter, the original license:
|
10 |
+
# MIT License
|
11 |
+
#
|
12 |
+
# Copyright (c) 2020 Phil Wang
|
13 |
+
#
|
14 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
15 |
+
# of this software and associated documentation files (the "Software"), to deal
|
16 |
+
# in the Software without restriction, including without limitation the rights
|
17 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
18 |
+
# copies of the Software, and to permit persons to whom the Software is
|
19 |
+
# furnished to do so, subject to the following conditions:
|
20 |
+
#
|
21 |
+
# The above copyright notice and this permission notice shall be included in all
|
22 |
+
# copies or substantial portions of the Software.
|
23 |
+
#
|
24 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
25 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
26 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
27 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
28 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
29 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
30 |
+
# SOFTWARE.
|
31 |
+
|
32 |
+
"""Core vector quantization implementation."""
|
33 |
+
|
34 |
+
import typing as tp
|
35 |
+
|
36 |
+
from einops import rearrange, repeat
|
37 |
+
import torch
|
38 |
+
from torch import nn
|
39 |
+
import torch.nn.functional as F
|
40 |
+
|
41 |
+
# from .. import distrib
|
42 |
+
|
43 |
+
|
44 |
+
def default(val: tp.Any, d: tp.Any) -> tp.Any:
|
45 |
+
return val if val is not None else d
|
46 |
+
|
47 |
+
|
48 |
+
def ema_inplace(moving_avg, new, decay: float):
|
49 |
+
moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
|
50 |
+
|
51 |
+
|
52 |
+
def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
|
53 |
+
return (x + epsilon) / (x.sum() + n_categories * epsilon)
|
54 |
+
|
55 |
+
|
56 |
+
def uniform_init(*shape: int):
|
57 |
+
t = torch.empty(shape)
|
58 |
+
nn.init.kaiming_uniform_(t)
|
59 |
+
return t
|
60 |
+
|
61 |
+
|
62 |
+
def sample_vectors(samples, num: int):
|
63 |
+
num_samples, device = samples.shape[0], samples.device
|
64 |
+
|
65 |
+
if num_samples >= num:
|
66 |
+
indices = torch.randperm(num_samples, device=device)[:num]
|
67 |
+
else:
|
68 |
+
indices = torch.randint(0, num_samples, (num,), device=device)
|
69 |
+
|
70 |
+
return samples[indices]
|
71 |
+
|
72 |
+
|
73 |
+
def kmeans(samples, num_clusters: int, num_iters: int = 10):
|
74 |
+
dim, dtype = samples.shape[-1], samples.dtype
|
75 |
+
|
76 |
+
means = sample_vectors(samples, num_clusters)
|
77 |
+
|
78 |
+
for _ in range(num_iters):
|
79 |
+
diffs = rearrange(samples, "n d -> n () d") - rearrange(
|
80 |
+
means, "c d -> () c d"
|
81 |
+
)
|
82 |
+
dists = -(diffs ** 2).sum(dim=-1)
|
83 |
+
|
84 |
+
buckets = dists.max(dim=-1).indices
|
85 |
+
bins = torch.bincount(buckets, minlength=num_clusters)
|
86 |
+
zero_mask = bins == 0
|
87 |
+
bins_min_clamped = bins.masked_fill(zero_mask, 1)
|
88 |
+
|
89 |
+
new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
|
90 |
+
new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
|
91 |
+
new_means = new_means / bins_min_clamped[..., None]
|
92 |
+
|
93 |
+
means = torch.where(zero_mask[..., None], means, new_means)
|
94 |
+
|
95 |
+
return means, bins
|
96 |
+
|
97 |
+
|
98 |
+
class EuclideanCodebook(nn.Module):
|
99 |
+
"""Codebook with Euclidean distance.
|
100 |
+
Args:
|
101 |
+
dim (int): Dimension.
|
102 |
+
codebook_size (int): Codebook size.
|
103 |
+
kmeans_init (bool): Whether to use k-means to initialize the codebooks.
|
104 |
+
If set to true, run the k-means algorithm on the first training batch and use
|
105 |
+
the learned centroids as initialization.
|
106 |
+
kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
|
107 |
+
decay (float): Decay for exponential moving average over the codebooks.
|
108 |
+
epsilon (float): Epsilon value for numerical stability.
|
109 |
+
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
110 |
+
that have an exponential moving average cluster size less than the specified threshold with
|
111 |
+
randomly selected vector from the current batch.
|
112 |
+
"""
|
113 |
+
def __init__(
|
114 |
+
self,
|
115 |
+
dim: int,
|
116 |
+
codebook_size: int,
|
117 |
+
kmeans_init: int = False,
|
118 |
+
kmeans_iters: int = 10,
|
119 |
+
decay: float = 0.99,
|
120 |
+
epsilon: float = 1e-5,
|
121 |
+
threshold_ema_dead_code: int = 2,
|
122 |
+
):
|
123 |
+
super().__init__()
|
124 |
+
self.decay = decay
|
125 |
+
init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
|
126 |
+
embed = init_fn(codebook_size, dim)
|
127 |
+
|
128 |
+
self.codebook_size = codebook_size
|
129 |
+
|
130 |
+
self.kmeans_iters = kmeans_iters
|
131 |
+
self.epsilon = epsilon
|
132 |
+
self.threshold_ema_dead_code = threshold_ema_dead_code
|
133 |
+
|
134 |
+
self.register_buffer("inited", torch.Tensor([not kmeans_init]))
|
135 |
+
self.register_buffer("cluster_size", torch.zeros(codebook_size))
|
136 |
+
self.register_buffer("embed", embed)
|
137 |
+
self.register_buffer("embed_avg", embed.clone())
|
138 |
+
|
139 |
+
self.runed_steps = 0
|
140 |
+
self.stop_steps = 50_000
|
141 |
+
|
142 |
+
@torch.jit.ignore
|
143 |
+
def init_embed_(self, data):
|
144 |
+
if self.inited:
|
145 |
+
return
|
146 |
+
|
147 |
+
embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
|
148 |
+
self.embed.data.copy_(embed)
|
149 |
+
self.embed_avg.data.copy_(embed.clone())
|
150 |
+
self.cluster_size.data.copy_(cluster_size)
|
151 |
+
self.inited.data.copy_(torch.Tensor([True]))
|
152 |
+
# Make sure all buffers across workers are in sync after initialization
|
153 |
+
distrib.broadcast_tensors(self.buffers())  # NOTE: requires the "from .. import distrib" import commented out at the top of this file
|
154 |
+
|
155 |
+
def replace_(self, samples, mask):
|
156 |
+
modified_codebook = torch.where(
|
157 |
+
mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
|
158 |
+
)
|
159 |
+
self.embed.data.copy_(modified_codebook)
|
160 |
+
|
161 |
+
def expire_codes_(self, batch_samples):
|
162 |
+
if self.threshold_ema_dead_code == 0:
|
163 |
+
return
|
164 |
+
|
165 |
+
expired_codes = self.cluster_size < self.threshold_ema_dead_code
|
166 |
+
if not torch.any(expired_codes):
|
167 |
+
return
|
168 |
+
|
169 |
+
batch_samples = rearrange(batch_samples, "... d -> (...) d")
|
170 |
+
self.replace_(batch_samples, mask=expired_codes)
|
171 |
+
# distrib.broadcast_tensors(self.buffers())
|
172 |
+
|
173 |
+
def preprocess(self, x):
|
174 |
+
x = rearrange(x, "... d -> (...) d")
|
175 |
+
return x
|
176 |
+
|
177 |
+
def quantize(self, x):
|
178 |
+
embed = self.embed.t()
|
179 |
+
dist = -(
|
180 |
+
x.pow(2).sum(1, keepdim=True)
|
181 |
+
- 2 * x @ embed
|
182 |
+
+ embed.pow(2).sum(0, keepdim=True)
|
183 |
+
)
|
184 |
+
embed_ind = dist.max(dim=-1).indices
|
185 |
+
return embed_ind
|
186 |
+
|
187 |
+
def postprocess_emb(self, embed_ind, shape):
|
188 |
+
return embed_ind.view(*shape[:-1])
|
189 |
+
|
190 |
+
def dequantize(self, embed_ind):
|
191 |
+
quantize = F.embedding(embed_ind, self.embed)
|
192 |
+
return quantize
|
193 |
+
|
194 |
+
def encode(self, x):
|
195 |
+
shape = x.shape
|
196 |
+
# pre-process
|
197 |
+
x = self.preprocess(x)
|
198 |
+
# quantize
|
199 |
+
embed_ind = self.quantize(x)
|
200 |
+
# post-process
|
201 |
+
embed_ind = self.postprocess_emb(embed_ind, shape)
|
202 |
+
return embed_ind
|
203 |
+
|
204 |
+
def decode(self, embed_ind):
|
205 |
+
quantize = self.dequantize(embed_ind)
|
206 |
+
return quantize
|
207 |
+
|
208 |
+
def forward(self, x):
|
209 |
+
shape, dtype = x.shape, x.dtype
|
210 |
+
x = self.preprocess(x)
|
211 |
+
# self.init_embed_(x)
|
212 |
+
|
213 |
+
embed_ind = self.quantize(x)
|
214 |
+
embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
|
215 |
+
embed_ind = self.postprocess_emb(embed_ind, shape)
|
216 |
+
quantize = self.dequantize(embed_ind)
|
217 |
+
self.runed_steps += 1
|
218 |
+
|
219 |
+
if self.training and self.runed_steps < self.stop_steps:
|
220 |
+
# We do the expiry of code at that point as buffers are in sync
|
221 |
+
# and all the workers will take the same decision.
|
222 |
+
self.expire_codes_(x)
|
223 |
+
ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
|
224 |
+
embed_sum = x.t() @ embed_onehot
|
225 |
+
ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
|
226 |
+
cluster_size = (
|
227 |
+
laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
|
228 |
+
* self.cluster_size.sum()
|
229 |
+
)
|
230 |
+
embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
|
231 |
+
self.embed.data.copy_(embed_normalized)
|
232 |
+
|
233 |
+
return quantize, embed_ind
|
234 |
+
|
235 |
+
|
236 |
+
class VectorQuantization(nn.Module):
|
237 |
+
"""Vector quantization implementation.
|
238 |
+
Currently supports only euclidean distance.
|
239 |
+
Args:
|
240 |
+
dim (int): Dimension
|
241 |
+
codebook_size (int): Codebook size
|
242 |
+
codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
|
243 |
+
decay (float): Decay for exponential moving average over the codebooks.
|
244 |
+
epsilon (float): Epsilon value for numerical stability.
|
245 |
+
kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
|
246 |
+
kmeans_iters (int): Number of iterations used for kmeans initialization.
|
247 |
+
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
|
248 |
+
that have an exponential moving average cluster size less than the specified threshold with
|
249 |
+
randomly selected vector from the current batch.
|
250 |
+
commitment_weight (float): Weight for commitment loss.
|
251 |
+
"""
|
252 |
+
def __init__(
|
253 |
+
self,
|
254 |
+
dim: int,
|
255 |
+
codebook_size: int,
|
256 |
+
codebook_dim: tp.Optional[int] = None,
|
257 |
+
decay: float = 0.99,
|
258 |
+
epsilon: float = 1e-5,
|
259 |
+
kmeans_init: bool = True,
|
260 |
+
kmeans_iters: int = 50,
|
261 |
+
threshold_ema_dead_code: int = 2,
|
262 |
+
commitment_weight: float = 1.,
|
263 |
+
):
|
264 |
+
super().__init__()
|
265 |
+
_codebook_dim: int = default(codebook_dim, dim)
|
266 |
+
|
267 |
+
requires_projection = _codebook_dim != dim
|
268 |
+
self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
|
269 |
+
self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
|
270 |
+
|
271 |
+
self.epsilon = epsilon
|
272 |
+
self.commitment_weight = commitment_weight
|
273 |
+
|
274 |
+
self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
|
275 |
+
kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
|
276 |
+
decay=decay, epsilon=epsilon,
|
277 |
+
threshold_ema_dead_code=threshold_ema_dead_code)
|
278 |
+
self.codebook_size = codebook_size
|
279 |
+
|
280 |
+
@property
|
281 |
+
def codebook(self):
|
282 |
+
return self._codebook.embed
|
283 |
+
|
284 |
+
def encode(self, x):
|
285 |
+
x = rearrange(x, "b d n -> b n d")
|
286 |
+
x = self.project_in(x)
|
287 |
+
embed_in = self._codebook.encode(x)
|
288 |
+
return embed_in
|
289 |
+
|
290 |
+
def decode(self, embed_ind):
|
291 |
+
quantize = self._codebook.decode(embed_ind)
|
292 |
+
quantize = self.project_out(quantize)
|
293 |
+
quantize = rearrange(quantize, "b n d -> b d n")
|
294 |
+
return quantize
|
295 |
+
|
296 |
+
def forward(self, x, do_debug=False):
|
297 |
+
device = x.device
|
298 |
+
x = rearrange(x, "b d n -> b n d")
|
299 |
+
x = self.project_in(x)
|
300 |
+
|
301 |
+
quantize, embed_ind = self._codebook(x)
|
302 |
+
|
303 |
+
if self.training:
|
304 |
+
quantize = x + (quantize - x).detach()
|
305 |
+
|
306 |
+
loss = torch.tensor([0.0], device=device, requires_grad=self.training)
|
307 |
+
|
308 |
+
if self.training:
|
309 |
+
if self.commitment_weight > 0:
|
310 |
+
commit_loss = F.mse_loss(quantize.detach(), x)
|
311 |
+
loss = loss + commit_loss * self.commitment_weight
|
312 |
+
quantize = self.project_out(quantize)
|
313 |
+
quantize = rearrange(quantize, "b n d -> b d n")
|
314 |
+
return quantize, embed_ind, loss
|
315 |
+
|
316 |
+
|
317 |
+
class ResidualVectorQuantization(nn.Module):
|
318 |
+
"""Residual vector quantization implementation.
|
319 |
+
Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
|
320 |
+
"""
|
321 |
+
def __init__(self, *, num_quantizers, **kwargs):
|
322 |
+
super().__init__()
|
323 |
+
self.layers = nn.ModuleList(
|
324 |
+
[VectorQuantization(**kwargs) for _ in range(num_quantizers)]
|
325 |
+
)
|
326 |
+
|
327 |
+
def forward(self, x, n_q: tp.Optional[int] = None):
|
328 |
+
quantized_out = 0.0
|
329 |
+
residual = x
|
330 |
+
|
331 |
+
all_losses = []
|
332 |
+
all_indices = []
|
333 |
+
|
334 |
+
n_q = n_q or len(self.layers)
|
335 |
+
|
336 |
+
for layerinx, layer in enumerate(self.layers[:n_q]):
|
337 |
+
print("Layer {} Used ratio {:.1f}".format(layerinx, (layer._codebook.cluster_size > 1.0).sum() / layer._codebook.cluster_size.shape[0] * 100.))
|
338 |
+
quantized, indices, loss = layer(residual)
|
339 |
+
residual = residual - quantized
|
340 |
+
quantized_out = quantized_out + quantized
|
341 |
+
|
342 |
+
all_indices.append(indices)
|
343 |
+
all_losses.append(loss)
|
344 |
+
|
345 |
+
out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
|
346 |
+
return quantized_out, out_indices, out_losses
|
347 |
+
|
348 |
+
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
|
349 |
+
residual = x
|
350 |
+
all_indices = []
|
351 |
+
n_q = n_q or len(self.layers)
|
352 |
+
for layer in self.layers[:n_q]:
|
353 |
+
indices = layer.encode(residual)
|
354 |
+
quantized = layer.decode(indices)
|
355 |
+
residual = residual - quantized
|
356 |
+
all_indices.append(indices)
|
357 |
+
out_indices = torch.stack(all_indices)
|
358 |
+
return out_indices
|
359 |
+
|
360 |
+
def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
|
361 |
+
quantized_out = torch.tensor(0.0, device=q_indices.device)
|
362 |
+
for i, indices in enumerate(q_indices):
|
363 |
+
layer = self.layers[i]
|
364 |
+
quantized = layer.decode(indices)
|
365 |
+
quantized_out = quantized_out + quantized
|
366 |
+
return quantized_out
|
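A shape-only sketch of the residual-VQ encode/decode roundtrip implemented above. The constructor arguments are illustrative, kmeans_init=False keeps the example self-contained with randomly initialized codebooks (the k-means path references the commented-out distrib import), and the import path is an assumption.

import torch
# from <this module> import ResidualVectorQuantization   # import path is an assumption

rvq = ResidualVectorQuantization(num_quantizers=4, dim=128,
                                 codebook_size=1024, kmeans_init=False)
x = torch.randn(2, 128, 50)          # (batch, feature dim, time steps)
codes = rvq.encode(x)                # (n_q, batch, time) integer codebook indices
recon = rvq.decode(codes)            # (batch, feature dim, time): sum of per-layer lookups
print(codes.shape, recon.shape)      # torch.Size([4, 2, 50]) torch.Size([2, 128, 50])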
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize.py
ADDED
@@ -0,0 +1,268 @@
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)
        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i])
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
                when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
            dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
            n_dropout = int(z.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            if self.training is False and i >= n_quantizers:
                break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        for n in range(encodings.shape[1]):
            print("Layer {}, Ratio of unused vector : {:.1f}".format(n,
                (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
            ))

        return z_q, codes, latents, commitment_loss, codebook_loss

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)


if __name__ == "__main__":
    rvq = ResidualVectorQuantize(quantizer_dropout=True)
    x = torch.randn(16, 512, 80)
    # forward returns a tuple: (z_q, codes, latents, commitment_loss, codebook_loss)
    z_q, codes, latents, commitment_loss, codebook_loss = rvq(x)
    print(latents.shape)
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize2.py
ADDED
@@ -0,0 +1,290 @@
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if(self.training):
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # random replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size,:].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
                when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
            dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
            n_dropout = int(z.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            if self.training is False and i >= n_quantizers:
                break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        for n in range(encodings.shape[1]):
            print("Layer {}, Ratio of unused vector : {:.1f}".format(n,
                (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
            ))

        return z_q, codes, latents, commitment_loss, codebook_loss

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)


if __name__ == "__main__":
    rvq = ResidualVectorQuantize(quantizer_dropout=True)
    x = torch.randn(16, 512, 80)
    # forward returns a tuple: (z_q, codes, latents, commitment_loss, codebook_loss)
    z_q, codes, latents, commitment_loss, codebook_loss = rvq(x)
    print(latents.shape)
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3.py
ADDED
@@ -0,0 +1,299 @@
# compared with `descript_quantize2`, we use rvq & random_dropout
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if(self.training):
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # random replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size,:].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
                when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
            dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
            n_dropout = int(z.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)
        else:
            n_quantizers = torch.ones((z.shape[0],)) * n_quantizers + 1
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            # if self.training is False and i >= n_quantizers:
            #     break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        # for n in range(encodings.shape[1]):
        #     print("Layer {}, Ratio of unused vector : {:.1f}".format(n,
        #         (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
        #     ))

        return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)


if __name__ == "__main__":
    rvq = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 1024, codebook_dim = 32, quantizer_dropout = 0.0)
    x = torch.randn(16, 1024, 80)
    quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = rvq(x)
    print(quantized_prompt_embeds.shape)
    print(codes.shape)
    # w/o reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0
    # w/ reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
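A hedged round-trip sketch for the 4-codebook configuration shown in the `__main__` block above; the module name `descript_quantize3` is an assumption. This version of `forward` returns a 6-tuple, with the last element reporting the index of the last active quantizer layer.

import torch
from descript_quantize3 import ResidualVectorQuantize

rvq = ResidualVectorQuantize(input_dim=1024, n_codebooks=4, codebook_size=1024, codebook_dim=32)
rvq.eval()
x = torch.randn(2, 1024, 80)
_, codes, _, _, _, rvq_usage = rvq(x)      # codes: B x 4 x T integer token ids
z_hat, z_p, _ = rvq.from_codes(codes)      # continuous reconstruction from token ids alone
print(codes.shape, z_hat.shape, rvq_usage) # (2, 4, 80), (2, 1024, 80), last-layer index per item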
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer.py
ADDED
@@ -0,0 +1,303 @@
# compared with `descript_quantize2`, we use rvq & random_dropout
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm
import random

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if(self.training):
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # random replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size,:].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
                when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            random_num = random.random()
            if random_num<0.6:
                n_quantizers = torch.ones((z.shape[0],)) * 1
            elif random_num<0.8:
                n_quantizers = torch.ones((z.shape[0],)) * 2
            else:
                n_quantizers = torch.ones((z.shape[0],)) * 4
            n_quantizers = n_quantizers.to(z.device)
        else:
            n_quantizers = torch.ones((z.shape[0],)) * n_quantizers
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            # if self.training is False and i >= n_quantizers:
            #     break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        for n in range(encodings.shape[1]):
            print("Layer {}, Ratio of unused vector : {:.1f}".format(n,
                (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
            ))

        return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)


if __name__ == "__main__":
    rvq = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 1024, codebook_dim = 32, quantizer_dropout = 0.0)
    x = torch.randn(16, 1024, 80)
    quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = rvq(x)
    print(quantized_prompt_embeds.shape)
    print(codes.shape)
    # w/o reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0
    # w/ reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
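The `_4layer` variant above replaces quantizer dropout with a fixed schedule: for each training batch it keeps 1, 2, or 4 codebooks with probabilities 0.6, 0.2, and 0.2. A standalone sketch of just that sampling schedule follows; the helper name is hypothetical and only illustrates the branch logic used in `forward`.

import random
from collections import Counter

def sample_n_quantizers() -> int:
    # mirrors the random_num thresholds in the training branch above
    r = random.random()
    return 1 if r < 0.6 else (2 if r < 0.8 else 4)

counts = Counter(sample_n_quantizers() for _ in range(10000))
print({k: round(v / 10000, 2) for k, v in sorted(counts.items())})  # roughly {1: 0.6, 2: 0.2, 4: 0.2}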
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer_freezelayer1.py
ADDED
@@ -0,0 +1,301 @@
# compared with `descript_quantize2`, we use rvq & random_dropout
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm
import random

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if(self.training):
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # random replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size,:].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
                when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            random_num = random.random()
            if random_num<0.6:
                n_quantizers = torch.ones((z.shape[0],)) * 2
            else:
                n_quantizers = torch.ones((z.shape[0],)) * 4
            n_quantizers = n_quantizers.to(z.device)
        else:
            n_quantizers = torch.ones((z.shape[0],)) * n_quantizers
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            # if self.training is False and i >= n_quantizers:
            #     break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
|
213 |
+
z_q = z_q + z_q_i * mask[:, None, None]
|
214 |
+
residual = residual - z_q_i
|
215 |
+
|
216 |
+
# Sum losses
|
217 |
+
commitment_loss += (commitment_loss_i * mask).mean()
|
218 |
+
codebook_loss += (codebook_loss_i * mask).mean()
|
219 |
+
|
220 |
+
codebook_indices.append(indices_i)
|
221 |
+
latents.append(z_e_i)
|
222 |
+
|
223 |
+
codes = torch.stack(codebook_indices, dim=1)
|
224 |
+
latents = torch.cat(latents, dim=1)
|
225 |
+
|
226 |
+
encodings = F.one_hot(codes, self.codebook_size).float() # B N T 1024
|
227 |
+
# for n in range(encodings.shape[1]):
|
228 |
+
# print("Lyaer {}, Ratio of unused vector : {:.1f}".format(n,
|
229 |
+
# (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
|
230 |
+
# ))
|
231 |
+
|
232 |
+
return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1
|
233 |
+
|
234 |
+
def from_codes(self, codes: torch.Tensor):
|
235 |
+
"""Given the quantized codes, reconstruct the continuous representation
|
236 |
+
Parameters
|
237 |
+
----------
|
238 |
+
codes : Tensor[B x N x T]
|
239 |
+
Quantized discrete representation of input
|
240 |
+
Returns
|
241 |
+
-------
|
242 |
+
Tensor[B x D x T]
|
243 |
+
Quantized continuous representation of input
|
244 |
+
"""
|
245 |
+
z_q = 0.0
|
246 |
+
z_p = []
|
247 |
+
n_codebooks = codes.shape[1]
|
248 |
+
for i in range(n_codebooks):
|
249 |
+
z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
|
250 |
+
z_p.append(z_p_i)
|
251 |
+
|
252 |
+
z_q_i = self.quantizers[i].out_proj(z_p_i)
|
253 |
+
z_q = z_q + z_q_i
|
254 |
+
return z_q, torch.cat(z_p, dim=1), codes
|
255 |
+
|
256 |
+
def from_latents(self, latents: torch.Tensor):
|
257 |
+
"""Given the unquantized latents, reconstruct the
|
258 |
+
continuous representation after quantization.
|
259 |
+
|
260 |
+
Parameters
|
261 |
+
----------
|
262 |
+
latents : Tensor[B x N x T]
|
263 |
+
Continuous representation of input after projection
|
264 |
+
|
265 |
+
Returns
|
266 |
+
-------
|
267 |
+
Tensor[B x D x T]
|
268 |
+
Quantized representation of full-projected space
|
269 |
+
Tensor[B x D x T]
|
270 |
+
Quantized representation of latent space
|
271 |
+
"""
|
272 |
+
z_q = 0
|
273 |
+
z_p = []
|
274 |
+
codes = []
|
275 |
+
dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
|
276 |
+
|
277 |
+
n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
|
278 |
+
0
|
279 |
+
]
|
280 |
+
for i in range(n_codebooks):
|
281 |
+
j, k = dims[i], dims[i + 1]
|
282 |
+
z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
|
283 |
+
z_p.append(z_p_i)
|
284 |
+
codes.append(codes_i)
|
285 |
+
|
286 |
+
z_q_i = self.quantizers[i].out_proj(z_p_i)
|
287 |
+
z_q = z_q + z_q_i
|
288 |
+
|
289 |
+
return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
|
290 |
+
|
291 |
+
|
292 |
+
if __name__ == "__main__":
|
293 |
+
rvq = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 1024, codebook_dim = 32, quantizer_dropout = 0.0)
|
294 |
+
x = torch.randn(16, 1024, 80)
|
295 |
+
quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = rvq(x)
|
296 |
+
print(quantized_prompt_embeds.shape)
|
297 |
+
print(codes.shape)
|
298 |
+
# w/o reconstruction
|
299 |
+
loss = commitment_loss * 0.25 + codebook_loss * 1.0
|
300 |
+
# w/ reconstruction
|
301 |
+
loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
|
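For quick sanity checking, here is a minimal usage sketch (not part of the committed file) showing how the indices returned by `forward` can be decoded back to embeddings with `from_codes`. It assumes the classes above are in scope, e.g. when run from inside this module; the commented import path is only illustrative.

import torch

# Illustrative import; adjust the path to wherever this module lives in the repo.
# from codeclm.tokenizer.Flow1dVAE.libs.rvq.descript_quantize3 import ResidualVectorQuantize

rvq = ResidualVectorQuantize(input_dim=1024, n_codebooks=4, codebook_size=1024, codebook_dim=32)
rvq.eval()  # disable the training-only quantizer dropout and stale-code replacement

with torch.no_grad():
    x = torch.randn(2, 1024, 80)                      # (B, D, T) encoder features
    z_q, codes, latents, *_ = rvq(x, n_quantizers=4)  # codes: (B, N, T) integer indices
    z_dec, z_p, _ = rvq.from_codes(codes)             # embedding lookup + out_proj per codebook, then sum

# Up to floating-point error, decoding the codes reproduces the quantized output.
print((z_q - z_dec).abs().max())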
codeclm/tokenizer/Flow1dVAE/libs/rvq/descript_quantize3_4layer_return_layer.py
ADDED
@@ -0,0 +1,305 @@
# compared with `descript_quantize2`, we use rvq & random_dropout
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm
import random

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))

class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantizes the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[1]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries
        Tensor[1]
            Codebook loss to update the codebook
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN): project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if self.training:
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # randomly replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input] * (self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size, :].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices


class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An End-to-End Neural Audio Codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: in training mode this argument is ignored and a random
            number of quantizers is used.
        Returns
        -------
        A tuple with the following entries:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0
        layer = self.n_codebooks
        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            random_num = random.random()
            if random_num < 0.6:
                n_quantizers = torch.ones((z.shape[0],)) * 1
            elif random_num < 0.8:
                n_quantizers = torch.ones((z.shape[0],)) * 2
                layer = 2
            else:
                n_quantizers = torch.ones((z.shape[0],)) * 4
                layer = 4
            n_quantizers = n_quantizers.to(z.device)
        else:
            n_quantizers = torch.ones((z.shape[0],)) * n_quantizers
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            # if self.training is False and i >= n_quantizers:
            #     break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        for n in range(encodings.shape[1]):
            print("Layer {}, Ratio of unused vectors : {:.1f}".format(n,
                (encodings[:, n, :, :].sum(0).sum(0) < 1.0).sum() / torch.numel(encodings[:, n, :, :].sum(0).sum(0) < 1.0) * 100.
            ))

        return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1, layer

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)


if __name__ == "__main__":
    rvq = ResidualVectorQuantize(input_dim=1024, n_codebooks=4, codebook_size=1024, codebook_dim=32, quantizer_dropout=0.0)
    x = torch.randn(16, 1024, 80)
    # this variant's forward returns an extra `layer` value, so unpack seven items
    quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage, layer = rvq(x)
    print(quantized_prompt_embeds.shape)
    print(codes.shape)
    # w/o reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0
    # w/ reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
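As a usage note (illustrative only; it assumes the classes defined in this file are in scope), the interface change in this `_return_layer` variant is the extra `layer` value returned by `ResidualVectorQuantize.forward`, which reports how many codebooks were sampled for the batch during training. A minimal training-mode sketch:

import torch

rvq = ResidualVectorQuantize(input_dim=1024, n_codebooks=4, codebook_size=1024, codebook_dim=32)
rvq.train()  # in training mode a random codebook depth (1, 2 or 4) is sampled per forward pass

x = torch.randn(8, 1024, 80)  # (B, D, T)
z_q, codes, latents, commitment_loss, codebook_loss, rvq_usage, layer = rvq(x)

print(codes.shape)  # torch.Size([8, 4, 80]): indices from every codebook; dropped ones are only masked in z_q
print(layer)        # 2 or 4 when those depths were sampled; otherwise it stays at n_codebooks
loss = commitment_loss * 0.25 + codebook_loss * 1.0  # same weighting as the __main__ example above
loss.backward()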