shenyunhang committed
Commit 52e4f53 · 0 Parent(s)
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitignore +164 -0
  2. LICENSE +180 -0
  3. README.md +278 -0
  4. app.py +378 -0
  5. configs/sts_finetune_stage1.yaml +273 -0
  6. configs/sts_finetune_stage2.yaml +273 -0
  7. evaluation/compute-acc-of-contain.py +85 -0
  8. evaluation/compute-cer.py +559 -0
  9. evaluation/compute-wer.py +553 -0
  10. evaluation/evaluate_asr.py +379 -0
  11. evaluation/evaluate_libritts.py +384 -0
  12. evaluation/evaluate_seedtts.py +394 -0
  13. evaluation/evaluate_sqa.py +451 -0
  14. evaluation/get_chat_template.py +59 -0
  15. requirements.txt +1 -0
  16. requirements_ds_gpu.txt +44 -0
  17. scripts/deepspeed/ds_config_zero1.json +61 -0
  18. scripts/deepspeed/ds_config_zero2.json +61 -0
  19. scripts/deepspeed/ds_config_zero2_no_optimizer.json +52 -0
  20. scripts/deepspeed/ds_config_zero2_offload.json +61 -0
  21. scripts/deepspeed/ds_config_zero3.json +63 -0
  22. scripts/deepspeed/ds_config_zero3_offload.json +75 -0
  23. scripts/deepspeed/evaluate_sts.sh +348 -0
  24. scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage1.sh +137 -0
  25. scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh +137 -0
  26. scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp1_stage1.sh +137 -0
  27. scripts/deepspeed/sts_qwen25/finetune_glm4voice_stage1.sh +136 -0
  28. scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage1.sh +138 -0
  29. scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage2.sh +138 -0
  30. scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh +138 -0
  31. scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh +137 -0
  32. scripts/set_env_ds_gpu.sh +53 -0
  33. setup.py +12 -0
  34. third_party/GLM-4-Voice/.gitignore +4 -0
  35. third_party/GLM-4-Voice/.gitmodules +3 -0
  36. third_party/GLM-4-Voice/LICENSE +201 -0
  37. third_party/GLM-4-Voice/README.md +159 -0
  38. third_party/GLM-4-Voice/README_en.md +148 -0
  39. third_party/GLM-4-Voice/audio_process.py +93 -0
  40. third_party/GLM-4-Voice/cosyvoice/__init__.py +0 -0
  41. third_party/GLM-4-Voice/cosyvoice/bin/inference.py +114 -0
  42. third_party/GLM-4-Voice/cosyvoice/bin/train.py +140 -0
  43. third_party/GLM-4-Voice/cosyvoice/cli/__init__.py +0 -0
  44. third_party/GLM-4-Voice/cosyvoice/cli/cosyvoice.py +83 -0
  45. third_party/GLM-4-Voice/cosyvoice/cli/frontend.py +168 -0
  46. third_party/GLM-4-Voice/cosyvoice/cli/model.py +95 -0
  47. third_party/GLM-4-Voice/cosyvoice/dataset/__init__.py +0 -0
  48. third_party/GLM-4-Voice/cosyvoice/dataset/dataset.py +160 -0
  49. third_party/GLM-4-Voice/cosyvoice/dataset/processor.py +965 -0
  50. third_party/GLM-4-Voice/cosyvoice/flow/decoder.py +222 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+ #
+ *.sw*
LICENSE ADDED
@@ -0,0 +1,180 @@
+ Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. The below software and/or models in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) THL A29 Limited.
+
+ License Terms of the VITA1.5:
+ --------------------------------------------------------------------
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ - You agree to use the VITA1.5 only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances.
+
+ - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ For avoidance of doubts, "Software" means the VITA1.5 model inference-enabling code, and weights made available under this license.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+ Other dependencies and licenses:
+
+
+ Open Source Model Licensed under the Apache License Version 2.0:
+ The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"), as model weights provided for the VITA1.5 Project hereunder is fine-tuned with the assistance of below model.
+
+ All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
+ --------------------------------------------------------------------
+ 1. Qwen2-7B-Instruct
+ Copyright 2024 Alibaba Cloud
+
+ Terms of the Apache License Version 2.0:
+ --------------------------------------------------------------------
+ Apache License
+
+ Version 2.0, January 2004
+
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
+
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
+
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
+
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+
+ Open Source Model/Software Licensed under the Apache License Version 2.0:
+ The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
+ --------------------------------------------------------------------
+ 1. ModelLink
+ Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
+
+ A copy of the Apache License Version 2.0 is included in this file.
+
+
+ Open Source Model/Software Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
+ --------------------------------------------------------------------
+ 1. opencv
+ Copyright (C) 2000-2022, Intel Corporation, all rights reserved.
+ Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+ Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
+ Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+ Copyright (C) 2015-2023, OpenCV Foundation, all rights reserved.
+ Copyright (C) 2008-2016, Itseez Inc., all rights reserved.
+ Copyright (C) 2019-2023, Xperience AI, all rights reserved.
+ Copyright (C) 2019-2022, Shenzhen Institute of Artificial Intelligence and Robotics for Society, all rights reserved.
+ Copyright (C) 2022-2023, Southern University of Science And Technology, all rights reserved.
+
+ A copy of the Apache 2.0 is included in this file.
+
+ For the license of other third party components, please refer to the following URL:
+ https://github.com/opencv/opencv/tree/4.10.0/3rdparty
+
+
+ Open Source Model/Software Licensed under the BSD 3-Clause License:
+ --------------------------------------------------------------------
+ 1. flask
+ Copyright 2010 Pallets
+
+ 2. flask-restful
+ Copyright (c) 2013, Twilio, Inc.
+ All rights reserved.
+
+
+ Terms of the BSD 3-Clause License:
+ --------------------------------------------------------------------
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+ Open Source Model/Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
+ --------------------------------------------------------------------
+ 1. Megatron-LM
+ Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+
+ A copy of the BSD 3-Clause is included in this file.
+
+ For the license of other third party components, please refer to the following URL:
+ https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE
+
+
+ Open Source Model/Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
+ --------------------------------------------------------------------
+ 1. MindSpeed
+ Copyright (c) 2024, Bytedance Inc.
+ Copyright (c) 2023, Huawei Technologies Co., Ltd
+ Copyright (c) 2022, NVIDIA CORPORATION.
+ All rights reserved.
+
+
+ A copy of the BSD 3-Clause is included in this file.
+
+ For the license of other third party components, please refer to the following URL:
+ https://gitee.com/ascend/MindSpeed/blob/master/LICENSE
+
+
+ Open Source Model/Software Licensed under the MIT License:
+ --------------------------------------------------------------------
+ 1. natsort
+ Copyright (c) 2012-2023 Seth M. Morton
+
+
+ Terms of the MIT License:
+ --------------------------------------------------------------------
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,278 @@
+ # VITA-Audio: Fast Interleaved Audio-Text Token Generation for Efficient Large Speech-Language Model
+
+ <p align="center">
+ <img src="asset/VITA_audio_logos.png" width="50%" height="50%">
+ </p>
+
+ <p align="center">
+ <a href="https://arxiv.org/abs/2502.05177" target="_blank"><img src="https://img.shields.io/badge/VITA%20Audio-Report-b5212f.svg?logo=arxiv" /></a>
+ <a href="https://huggingface.co/collections/VITA-MLLM/vita-audio-680f036c174441e7cdf02575" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-ffc107?color=ffc107&logoColor=white" /></a>
+ </p>
+
+
+ ## :fire: News
+
+
+
+ * **`2025.05.06`** 🌟 We are proud to launch VITA-Audio, an end-to-end large speech model with fast audio-text token generation.
+
+
+ ## 📄 Contents <!-- omit in toc -->
+
+
+ - [Highlights](#-highlights)
+ - [Exhibition](#-exhibition)
+ - [Models](#-models)
+ - [Experimental Results](#-experimental-results)
+ - [Training](#-training)
+ - [Inference](#-inference)
+ - [Evaluation](#-evaluation)
+
+
+ ## ✨ Highlights
+
+ - **Low Latency**. VITA-Audio is the first end-to-end speech model capable of generating audio during the initial forward pass. By utilizing a set of 32 prefill tokens, VITA-Audio reduces the time required to generate the first audio token chunk from 217 ms to just 47 ms.
+ - **Fast Inference**. VITA-Audio achieves an inference speedup of 3-5x at the 7B parameter scale.
+ - **Open Source**. VITA-Audio is trained on **open-source data** only, consisting of 200k hours of publicly available audio.
+ - **Strong Performance**. VITA-Audio achieves competitive results on ASR, TTS, and SQA benchmarks among cutting-edge models under 7B parameters.
+
+
+
+ ## 📌 Exhibition
+
+ ### Inference Acceleration
+ Model inference speed under different inference modes.
+
+ <p align="center">
+ <img src="./asset/qa_speed.gif" alt="demogif" width="48%" style="display: inline-block; margin-right: 2%;">
+ <img src="./asset/tts_speed.gif" alt="second_gif" width="48%" style="display: inline-block;">
+ </p>
+
+ ### Time to Generate the First Audio Segment in Streaming Inference
+ <div align="center">
+ <img width="400" alt="first audio generate time" src="https://github.com/user-attachments/assets/165f943e-ac53-443f-abba-e5eb1e0c0f40" />
+ </div>
+
+
+ ### Generated Audio Case
+
+
+
+ > 打南边来了个哑巴,腰里别了个喇叭;打北边来了个喇嘛,手里提了个獭犸。
+ > 提着獭犸的喇嘛要拿獭犸换别着喇叭的哑巴的喇叭;别着喇叭的哑巴不愿拿喇叭换提着獭玛的喇嘛的獭犸。
+ > 不知是别着喇叭的哑巴打了提着獭玛的喇嘛一喇叭;还是提着獭玛的喇嘛打了别着喇叭的哑巴一獭玛。
+ > 喇嘛回家炖獭犸;哑巴嘀嘀哒哒吹喇叭。
+
+ https://github.com/user-attachments/assets/38da791f-5d72-4d9c-a9b2-cec97c2f2b2b
+
+
+ ---
+
+ > To be or not to be--to live intensely and richly,
+ > merely to exist, that depends on ourselves. Let widen and intensify our relations.
+ > While we live, let live!
+
+ https://github.com/user-attachments/assets/fd478065-4041-4eb8-b331-0c03b304d853
+
+
+ ---
+
+ > The hair has been so little, don't think about it, go to bed early, for your hair. Good night!
+
+ https://github.com/user-attachments/assets/4cfe4742-e237-42bd-9f17-7935b2285799
+
+
+ ---
+ > 两个黄鹂鸣翠柳,
+ > 一行白鹭上青天。
+ > 窗含西岭千秋雪,
+ > 门泊东吴万里船。
+
+ https://github.com/user-attachments/assets/382620ee-bb2a-488e-9e00-71afd2342b56
+
+
+ ---
+ ## 🔔 Models
+
+ | Model | LLM Size | Huggingface Weights |
+ |-------------------------|----------|---------------------------------------------------------------|
+ | VITA-Audio-Boost | 7B | https://huggingface.co/VITA-MLLM/VITA-Audio-Boost |
+ | VITA-Audio-Balance | 7B | https://huggingface.co/VITA-MLLM/VITA-Audio-Balance |
+ | VITA-Audio-Plus-Vanilla | 7B | https://huggingface.co/VITA-MLLM/VITA-Audio-Plus-Vanilla |
+
+
+
+ ## 📈 Experimental Results
+ - **Comparison of Spoken Question Answering**.
+
+ ![Clipboard_Screenshot_1746531780](https://github.com/user-attachments/assets/3adcad15-0333-4b92-bfdf-b753b330a3e2)
+
+
+ - **Comparison of Text to Speech**.
+
+ ![image](https://github.com/user-attachments/assets/09cf8fd3-d7a5-4b77-be49-5a0ace308f3f)
+
+
+ - **Comparison of Automatic Speech Recognition**.
+
+ ![Clipboard_Screenshot_1746532039](https://github.com/user-attachments/assets/d950cae0-c065-4da9-b37a-a471d28158a0)
+
+ ![Clipboard_Screenshot_1746532022](https://github.com/user-attachments/assets/929f45cd-693a-4ff6-af73-ceec6e875706)
+
+
+
+ - **Effectiveness of Inference Acceleration**.
+
+
+ ![Clipboard_Screenshot_1746532167](https://github.com/user-attachments/assets/ad8b9e90-cd3c-4968-8653-998811a50006)
+
+ ![Image](https://github.com/user-attachments/assets/4aa5db8c-362d-4152-8090-92292b9a84c0)
+
+
+
+ ## 📔 Requirements and Installation
+
+ ### Prepare Environment
+ ```
+ docker pull shenyunhang/pytorch:24.11-py3_2024-1224
+ ```
+
+ ### Get the Code
+ ```
+ git clone https://github.com/VITA-MLLM/VITA-Audio.git
+ cd VITA-Audio
+ pip install -r requirements_ds_gpu.txt
+ pip install -e .
+ ```
+
+ ### Prepare Pre-trained Weight
+
+ #### LLM
+
+ - Download the LLM from https://huggingface.co/Qwen/Qwen2.5-7B-Instruct.
+ - Put it into '../models/Qwen/Qwen2.5-7B-Instruct/'
+
+ #### Audio Encoder and Audio Decoder
+
+ - Download the Audio Encoder from https://huggingface.co/THUDM/glm-4-voice-tokenizer.
+ - Put it into '../models/THUDM/glm-4-voice-tokenizer'
+
+ - Download the Audio Decoder from https://huggingface.co/THUDM/glm-4-voice-decoder.
+ - Put it into '../models/THUDM/glm-4-voice-decoder'
+
+
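A minimal sketch of fetching the same checkpoints with `huggingface_hub` (assumed to be installed; `snapshot_download` is its standard API), using the target directories listed above:

```python
# Hedged sketch: download the three checkpoints into the directories the README expects.
from huggingface_hub import snapshot_download

for repo_id, local_dir in [
    ("Qwen/Qwen2.5-7B-Instruct", "../models/Qwen/Qwen2.5-7B-Instruct/"),
    ("THUDM/glm-4-voice-tokenizer", "../models/THUDM/glm-4-voice-tokenizer"),
    ("THUDM/glm-4-voice-decoder", "../models/THUDM/glm-4-voice-decoder"),
]:
    snapshot_download(repo_id=repo_id, local_dir=local_dir)
```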
+ ### Data Format
+ #### **Speech QA Interleaved Data Format**
+
+ > This format shows how text and audio sequences are interleaved in a structured JSON conversation between a user and an assistant.
+
+ ```jsonc
+ {
+     "messages": [
+         {
+             "role": "user",
+             "content": "<|begin_of_audio|> audio_sequence <|end_of_audio|>"
+         },
+         {
+             "role": "assistant",
+             "content": "text_sequence_1 <|begin_of_audio|> audio_sequence_1 <|end_of_audio|> text_sequence_2 <|begin_of_audio|> audio_sequence_2 <|end_of_audio|>"
+         }
+     ]
+ }
+ ```
+
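A minimal sketch of writing one record in this format to a JSONL file; the placeholders (`audio_sequence`, `text_sequence_*`) stand in for real token sequences, and the `datasets/jsonl/...` layout assumed here mirrors the paths referenced by the training configs in this commit:

```python
# Hedged sketch: append one interleaved-format conversation to a JSONL training file.
import json

record = {
    "messages": [
        {"role": "user", "content": "<|begin_of_audio|> audio_sequence <|end_of_audio|>"},
        {
            "role": "assistant",
            "content": "text_sequence_1 <|begin_of_audio|> audio_sequence_1 <|end_of_audio|> "
                       "text_sequence_2 <|begin_of_audio|> audio_sequence_2 <|end_of_audio|>",
        },
    ]
}

# One JSON object per line, matching the datasets/jsonl/<name>/<split>.jsonl convention.
with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```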
+ ## 🎲 Training
+
+
+ The following tutorial will take `VITA-Audio-Boost` as an example.
+
+ - To train `VITA-Audio-Balance` and other variants, you should modify the `text-audio-interval-ratio`.
+
+ VITA-Audio-Boost:
+ ```
+ --text-audio-interval-ratio 1 10 4 10 \
+ ```
+
+ VITA-Audio-Balance:
+ ```
+ --text-audio-interval-ratio 1 4 3 8 4 10 \
+ ```
+
+ - To train `VITA-Audio-Plus-*`, you should use a script like `scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice...`
+
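The exact semantics of `--text-audio-interval-ratio` live in the training scripts; the sketch below is only one plausible reading (an assumption, not the project's code), where the list alternates text/audio chunk sizes and the final pair repeats for the rest of the response:

```python
# Hedged sketch: expand a text-audio-interval-ratio list into (text_chunk, audio_chunk) pairs.
def interleave_schedule(ratio, num_chunks=6):
    # Read the flat list as consecutive (text, audio) pairs.
    pairs = [(ratio[i], ratio[i + 1]) for i in range(0, len(ratio), 2)]
    # After the listed pairs are used up, keep repeating the last one.
    return [pairs[min(k, len(pairs) - 1)] for k in range(num_chunks)]

print(interleave_schedule([1, 10, 4, 10]))        # VITA-Audio-Boost
print(interleave_schedule([1, 4, 3, 8, 4, 10]))   # VITA-Audio-Balance
```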
+ ### Stage-1 (Audio-Text Alignment)
+
+ ```
+ bash scripts/deepspeed/sts_qwen25/finetune_glm4voice_stage1.sh 8192 `date +'%Y%m%d_%H%M%S'`
+ ```
+
+ The above script may need some adjustments.
+
+ - Set `ROOT_PATH` to your code root folder.
+ - Set `LOCAL_ROOT_PATH` to a temporary code root folder.
+ - Modify other variables as needed for your environment.
+
+ ### Stage-2 (Single MCTP Module Training)
+
+ ```
+ bash scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp1_stage1.sh 8192 `date +'%Y%m%d_%H%M%S'`
+ ```
+
+ The above script may need some adjustments.
+
+ - Set `ROOT_PATH` to your code root folder.
+ - Set `LOCAL_ROOT_PATH` to a temporary code root folder.
+ - Set `MODEL_NAME_OR_PATH` to the path of the model trained in Stage 1.
+ - Modify other variables as needed for your environment.
+
+ ### Stage-3 (Multiple MCTP Modules Training)
+
+ ```
+ bash scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage1.sh 8192 `date +'%Y%m%d_%H%M%S'`
+ ```
+
+ The above script may need some adjustments.
+
+ - Set `ROOT_PATH` to your code root folder.
+ - Set `LOCAL_ROOT_PATH` to a temporary code root folder.
+ - Set `MODEL_NAME_OR_PATH` to the path of the model trained in Stage 2.
+ - Modify other variables as needed for your environment.
+
+ ### Stage-4 (Supervised Fine-tuning)
+
+ ```
+ bash scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh 2048 `date +'%Y%m%d_%H%M%S'`
+ ```
+
+ The above script may need some adjustments.
+
+ - Set `ROOT_PATH` to your code root folder.
+ - Set `LOCAL_ROOT_PATH` to a temporary code root folder.
+ - Set `MODEL_NAME_OR_PATH` to the path of the model trained in Stage 3.
+ - Modify other variables as needed for your environment.
+
+
+
+ ## 📐 Inference
+
+ Here we implement a simple script for inference.
+
+ It includes examples of speech-to-speech, ASR, and TTS tasks, as well as inference speed testing.
+
+ ```
+ python tools/inference_sts.py
+ ```
+
+ - Set `model_name_or_path` to VITA-Audio weights.
+ - Set `audio_tokenizer_path` to the path of the audio encoder.
+ - Set `flow_path` to the path of the audio decoder.
+
+
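For reference, a minimal sketch of what such an inference run looks like, distilled from `app.py` in this same commit; `tools/inference_sts.py` may differ in details, the local paths follow the weight-preparation section, and the generation-config settings used in `app.py` are omitted here for brevity:

```python
# Hedged sketch based on app.py: load the model plus GLM-4-Voice codecs and run a TTS query.
import sys
import torch

sys.path.append("third_party/GLM-4-Voice/")
sys.path.append("third_party/GLM-4-Voice/cosyvoice/")
sys.path.append("third_party/GLM-4-Voice/third_party/Matcha-TTS/")

from transformers import AutoModelForCausalLM, AutoTokenizer
from vita_audio.tokenizer import get_audio_tokenizer
from evaluation.get_chat_template import qwen2_chat_template as chat_template

model_name_or_path = "VITA-MLLM/VITA-Audio-Plus-Vanilla"        # VITA-Audio weights
audio_tokenizer_path = "../models/THUDM/glm-4-voice-tokenizer"  # audio encoder
flow_path = "../models/THUDM/glm-4-voice-decoder"               # audio decoder

audio_tokenizer = get_audio_tokenizer(
    audio_tokenizer_path, "sensevoice_glm4voice", flow_path=flow_path, rank=0
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, trust_remote_code=True, chat_template=chat_template
)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, trust_remote_code=True,
    torch_dtype=torch.bfloat16, device_map="cuda:0",
).eval()

# TTS example: ask the model to speak a sentence, then keep only the audio-codec tokens,
# splitting the output exactly as app.py does.
messages = [{"role": "user", "content": "Convert the text to speech.\nHello, VITA-Audio."}]
input_ids = torch.tensor(
    [tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)]
).to("cuda")
outputs = model.generate(input_ids)

audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
audio_tokens = [t - audio_offset for t in outputs[0][input_ids.shape[1]:] if t >= audio_offset]
waveform = audio_tokenizer.decode(audio_tokens)  # 22050 Hz speech, per app.py
```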
+ ## 🔎 Evaluation
+
+ Evaluate SQA, ASR, and TTS benchmarks:
+ ```
+ bash scripts/deepspeed/evaluate_sts.sh
+ ```
+
+
app.py ADDED
@@ -0,0 +1,378 @@
+ import torch
+ import os
+ import numpy as np
+ import copy
+ import gradio as gr
+ import sys
+ from vita_audio.tokenizer import get_audio_tokenizer
+ from vita_audio.data.processor.audio_processor import add_audio_input_contiguous
+
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoConfig
+ from transformers.generation import GenerationConfig
+
+
+
+ PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+
+
+ import math
+ from numba import jit
+
+ @jit
+ def float_to_int16(audio: np.ndarray) -> np.ndarray:
+     am = int(math.ceil(float(np.abs(audio).max())) * 32768)
+     am = 32767 * 32768 // am
+     return np.multiply(audio, am).astype(np.int16)
+
+
+ def is_wav(file_path):
+     wav_extensions = {'.wav'}
+     _, ext = os.path.splitext(file_path)
+     return ext.lower() in wav_extensions
+
+
+
+ def _parse_text(text):
+     lines = text.split("\n")
+     lines = [line for line in lines if line != ""]
+     count = 0
+
+     for i, line in enumerate(lines):
+         if "```" in line:
+             count += 1
+             items = line.split("`")
+             if count % 2 == 1:
+                 lines[i] = f'<pre><code class="language-{items[-1]}">'
+             else:
+                 lines[i] = "<br></code></pre>"
+         else:
+             if i > 0 and count % 2 == 1:
+                 line = line.replace("`", r"\`")
+                 line = line.replace("<", "&lt;")
+                 line = line.replace(">", "&gt;")
+                 line = line.replace(" ", "&nbsp;")
+                 line = line.replace("*", "&ast;")
+                 line = line.replace("_", "&lowbar;")
+                 line = line.replace("-", "&#45;")
+                 line = line.replace(".", "&#46;")
+                 line = line.replace("!", "&#33;")
+                 line = line.replace("(", "&#40;")
+                 line = line.replace(")", "&#41;")
+                 line = line.replace("$", "&#36;")
+             lines[i] = "<br>" + line
+
+     return "".join(lines)
+
+
+
+ def _launch_demo(model, tokenizer, audio_tokenizer):
+     def predict(_chatbot, task_history, task):
+         chat_query = task_history[-1][0]
+         print(task_history)
+
+         messages = []
+
+         audio_path_list = []
+         # Build the chat messages for the selected task (Spoken QA / TTS / ASR),
+         # collecting any input wav paths for the audio tokenizer.
+         if task == 'Spoken QA':
+             messages = [
+                 {
+                     "role": "system",
+                     #"content": "Your Name: Luke\nYour Gender: male\n\nRespond in a text-audio interleaved manner.",
+                     # "content": "Your Name: Lucy\nYour Gender: female\nRespond in a text-audio interleaved manner.",
+                     "content": "Your Name: Omni\nYour Gender: female\nRespond in a text-audio interleaved manner.",
+                 },
+             ]
+             for i, (q, a) in enumerate(task_history):
+
+                 if isinstance(q, (tuple, list)) and is_wav(q[0]):
+                     audio_path_list.append(q[0])
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": f"\n<|audio|>",
+                         },
+                     ]
+                 else:
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": q,
+                         },
+                     ]
+                 if a != None:
+                     messages = messages + [
+                         {
+                             "role": "assistant",
+                             "content": a,
+                         },
+                     ]
+             model.generation_config.do_sample = False
+
+         elif task == 'TTS':
+             for i, (q, a) in enumerate(task_history):
+
+                 if isinstance(q, (tuple, list)) and is_wav(q[0]):
+                     audio_path_list.append(q[0])
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": f"\n<|audio|>",
+                         },
+                     ]
+                 else:
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": f'Convert the text to speech.\n{q}',
+                         },
+                     ]
+                 if a != None:
+                     messages = messages + [
+                         {
+                             "role": "assistant",
+                             "content": a,
+                         },
+                     ]
+             model.generation_config.do_sample = True
+         elif task == 'ASR':
+             for i, (q, a) in enumerate(task_history):
+
+                 if isinstance(q, (tuple, list)) and is_wav(q[0]):
+                     audio_path_list.append(q[0])
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": f"Convert the speech to text.\n<|audio|>",
+                         },
+                     ]
+                 else:
+                     messages = messages + [
+                         {
+                             "role": "user",
+                             "content": f"{q}",
+                         },
+                     ]
+                 if a != None:
+                     messages = messages + [
+                         {
+                             "role": "assistant",
+                             "content": a,
+                         },
+                     ]
+             model.generation_config.do_sample = False
+
+
+
+         add_generation_prompt = True
+         input_ids = tokenizer.apply_chat_template(
+             messages,
+             tokenize=True,
+             add_generation_prompt=add_generation_prompt,
+             # return_tensors="pt",
+         )
+
+         input_ids, audios, audio_indices = add_audio_input_contiguous(
+             input_ids, audio_path_list, tokenizer, audio_tokenizer
+         )
+
+         input_ids = torch.tensor([input_ids], dtype=torch.long).to("cuda")
+
+         # print("input", tokenizer.decode(input_ids[0], skip_special_tokens=False), flush=True)
+
+         if audio_path_list == []:
+             audios = None
+             audio_indices = None
+
+         outputs = model.generate(
+             input_ids,
+             audios=audios,
+             audio_indices=audio_indices,
+         )
+
+         output = tokenizer.decode(outputs[0], skip_special_tokens=False)
+         # print(f"{output=}", flush=True)
+
+         audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
+         begin_of_audio = tokenizer.convert_tokens_to_ids("<|begin_of_audio|>")
+         end_of_audio = tokenizer.convert_tokens_to_ids("<|end_of_audio|>")
+         im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+         response = outputs[0][len(input_ids[0]):]
+
+         # Split the generated ids into audio-codec tokens (>= audio_offset) and plain text tokens.
+         audio_tokens = []
+         text_tokens = []
+         for token_id in response:
+             if token_id >= audio_offset:
+                 audio_tokens.append(token_id - audio_offset)
+             elif (token_id.item() != begin_of_audio) and (token_id.item() != end_of_audio) and (token_id.item() != im_end):
+                 text_tokens.append(token_id)
+
+         # Decode audio tokens to a waveform and package it for gr.Audio at 22050 Hz.
+         if len(audio_tokens) > 0:
+             tts_speech = audio_tokenizer.decode(audio_tokens)
+             audio_np = float_to_int16(tts_speech.cpu().numpy())
+             tts_speech = (22050, audio_np)
+         else:
+             tts_speech = None
+
+         # import pdb;pdb.set_trace()
+         history_response = tokenizer.decode(text_tokens)
+         task_history[-1] = (chat_query, history_response)
+
+         _chatbot[-1] = (chat_query, history_response)
+         # print("query",chat_query)
+         # print("task_history",task_history)
+         # print(_chatbot)
+         # print("answer: ",outputs)
+         return _chatbot, tts_speech
+
+
+     def add_text(history, task_history, text):
+         task_text = text
+         # import pdb;pdb.set_trace()
+         if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
+             task_text = text[:-1]
+         history = history + [(_parse_text(text), None)]
+         task_history = task_history + [(task_text, None)]
+         return history, task_history, ""
+
+
+     def add_audio(history, task_history, file):
+         print(file)
+         if file is None:
+             return history, task_history
+         history = history + [((file,), None)]
+         task_history = task_history + [((file,), None)]
+         return history, task_history
+
+
+
+
+     def reset_user_input():
+         # import pdb;pdb.set_trace()
+         return gr.update(value="")
+
+     def reset_state(task_history):
+         task_history.clear()
+         return []
+
+
+
+     with gr.Blocks(title="VITA-Audio-Plus-Vanilla") as demo:
+         gr.Markdown("""<center><font size=8>VITA-Audio-Plus-Vanilla</center>""")
+         gr.Markdown("""<center><font size=4>The deployment of the VITA-Audio-Plus-Vanilla model employs a non-streaming deployment approach. The currently deployed model is VITA-Audio-Plus-Vanilla. For the ASR and TTS tasks, only single-turn dialogues are supported. In the Spoken QA task, generated text is used as dialogue history to reduce the context length.</center>""")
+         chatbot = gr.Chatbot(label='VITA-Audio-Plus-Vanilla', elem_classes="control-height", height=500)
+         query = gr.Textbox(lines=2, label='Text Input')
+         task_history = gr.State([])
+         with gr.Row():
+             add_text_button = gr.Button("Submit Text (提交文本)")
+             add_audio_button = gr.Button("Submit Audio (提交音频)")
+             empty_bin = gr.Button("🧹 Clear History (清除历史)")
+             task = gr.Radio(
+                 choices=["ASR", "TTS", "Spoken QA"], label="TASK", value='Spoken QA'
+             )
+
+         with gr.Row(scale=1):
+
+             record_btn = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎤 Record or Upload Audio (录音或上传音频)", show_download_button=True, waveform_options=gr.WaveformOptions(sample_rate=16000))
+             audio_output = gr.Audio(label="Play", streaming=True,
+                                     autoplay=True, show_download_button=True)
+
+
+
+         add_text_button.click(add_text, [chatbot, task_history, query], [chatbot, task_history], show_progress=True).then(
+             reset_user_input, [], [query]
+         ).then(
+             predict, [chatbot, task_history, task], [chatbot, audio_output], show_progress=True
+         )
+
+
+         empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
+
+
+         add_audio_button.click(add_audio, [chatbot, task_history, record_btn], [chatbot, task_history], show_progress=True).then(
+             predict, [chatbot, task_history, task], [chatbot, audio_output], show_progress=True
+         )
+
+
+     server_port = 18806
+     demo.launch(
+         share=False,
+         debug=True,
+         server_name="0.0.0.0",
+         server_port=server_port,
+         show_api=False,
+         show_error=False,
+
+     )
+
+ def main():
+
+     model_name_or_path = "VITA-MLLM/VITA-Audio-Plus-Vanilla"
+
+     device_map = "cuda:0"
+
+     sys.path.append("third_party/GLM-4-Voice/")
+     sys.path.append("third_party/GLM-4-Voice/cosyvoice/")
+     sys.path.append("third_party/GLM-4-Voice/third_party/Matcha-TTS/")
+
+     from huggingface_hub import snapshot_download
+     audio_tokenizer_path = snapshot_download(repo_id="THUDM/glm-4-voice-tokenizer")
+     flow_path = snapshot_download(repo_id="THUDM/glm-4-voice-decoder")
+
+     audio_tokenizer_rank = 0
+     audio_tokenizer_type = "sensevoice_glm4voice"
+
+     torch_dtype = torch.bfloat16
+     audio_tokenizer = get_audio_tokenizer(
+         audio_tokenizer_path, audio_tokenizer_type, flow_path=flow_path, rank=audio_tokenizer_rank
+     )
+     from evaluation.get_chat_template import qwen2_chat_template as chat_template
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name_or_path,
+         trust_remote_code=True,
+         chat_template=chat_template,
+     )
+     # print(f"{tokenizer=}")
+     # print(f"{tokenizer.get_chat_template()=}")
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name_or_path,
+         trust_remote_code=True,
+         device_map=device_map,
+         torch_dtype=torch_dtype,
+         attn_implementation="flash_attention_2",
+     ).eval()
+
+     # print(f"{model.config.model_type=}")
+
+     model.generation_config = GenerationConfig.from_pretrained(
+         model_name_or_path, trust_remote_code=True
+     )
+
+     model.generation_config.max_new_tokens = 4096
+     model.generation_config.chat_format = "chatml"
+     model.generation_config.max_window_size = 8192
+     model.generation_config.use_cache = True
+     model.generation_config.do_sample = True
+     model.generation_config.temperature = 1.0
+     model.generation_config.top_k = 50
+     model.generation_config.top_p = 1.0
+     model.generation_config.num_beams = 1
+     model.generation_config.pad_token_id = tokenizer.pad_token_id
+     model.generation_config.mtp_inference_mode = [8192, 10]
+
+
+     _launch_demo(model, tokenizer, audio_tokenizer)
+
+
+
+
+ if __name__ == '__main__':
+
+     main()
configs/sts_finetune_stage1.yaml ADDED
@@ -0,0 +1,273 @@
1
+
2
+ xlsx_sample_num: 5
3
+
4
+ dataset:
5
+
6
+ wenet-e2e/wenetspeech:
7
+ ratio: 1.0
8
+ data_paths:
9
+ - datasets/jsonl/wenet-e2e/wenetspeech/L_fixed.jsonl
10
+ - datasets/jsonl/wenet-e2e/wenetspeech/DEV_fixed.jsonl
11
+
12
+ Wenetspeech4TTS/Wenetspeech4TTS:
13
+ ratio: 1.0
14
+ data_paths:
15
+ - datasets/jsonl/Wenetspeech4TTS/WenetSpeech4TTS/Basic.jsonl
16
+
17
+ fixie-ai/librispeech_asr:
18
+ ratio: 1.0
19
+ data_paths:
20
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.100.clean.jsonl
21
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.360.clean.jsonl
22
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.500.other.jsonl
23
+
24
+ mythicinfinity/libritts:
25
+ ratio: 1.0
26
+ data_paths:
27
+ - datasets/jsonl/mythicinfinity/libritts/train.clean.100.jsonl
28
+ - datasets/jsonl/mythicinfinity/libritts/train.clean.360.jsonl
29
+ - datasets/jsonl/mythicinfinity/libritts/train.other.500.jsonl
30
+ - datasets/jsonl/mythicinfinity/libritts_r/train.clean.100.jsonl
31
+ - datasets/jsonl/mythicinfinity/libritts_r/train.clean.360.jsonl
32
+ - datasets/jsonl/mythicinfinity/libritts_r/train.other.500.jsonl
33
+
34
+ parler-tts/mls_eng:
35
+ ratio: 1.0
36
+ data_paths:
37
+ #- datasets/jsonl/parler-tts/mls_eng_10k/train.jsonl
38
+ - datasets/jsonl/parler-tts/mls_eng/train.jsonl
39
+
40
+ mozilla-foundation/common_voice_17_0:
41
+ ratio: 1.0
42
+ data_paths:
43
+ - datasets/jsonl/mozilla-foundation/common_voice_17_0/en/train.jsonl
44
+ - datasets/jsonl/mozilla-foundation/common_voice_17_0/zh-CN/train.jsonl
45
+
46
+ MushanW/GLOBE_V2:
47
+ ratio: 1.0
48
+ data_paths:
49
+ - datasets/jsonl/MushanW/GLOBE_V2/train.jsonl
50
+
51
+ amphion/Emilia-Dataset:
52
+ ratio: 0.5
53
+ data_paths:
54
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000000_B000100.jsonl
55
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000100_B000200.jsonl
56
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000200_B000300.jsonl
57
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000300_B000400.jsonl
58
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000400_B000500.jsonl
59
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000500_B000600.jsonl
60
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000600_B000700.jsonl
61
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000700_B000800.jsonl
62
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000800_B000900.jsonl
63
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000900_B001000.jsonl
64
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000000_B000100.jsonl
65
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000100_B000200.jsonl
66
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000200_B000300.jsonl
67
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000300_B000400.jsonl
68
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000400_B000500.jsonl
69
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000500_B000600.jsonl
70
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000600_B000700.jsonl
71
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000700_B000800.jsonl
72
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000800_B000900.jsonl
73
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000900_B001000.jsonl
74
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001000_B001100.jsonl
75
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001100_B001200.jsonl
76
+
77
+ amphion/Emilia-Dataset/speaker_prompt:
78
+ ratio: 0.5
79
+ data_paths:
80
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000000_B000100_speak_prompt.jsonl
81
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000100_B000200_speak_prompt.jsonl
82
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000200_B000300_speak_prompt.jsonl
83
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000300_B000400_speak_prompt.jsonl
84
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000400_B000500_speak_prompt.jsonl
85
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000500_B000600_speak_prompt.jsonl
86
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000600_B000700_speak_prompt.jsonl
87
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000700_B000800_speak_prompt.jsonl
88
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000800_B000900_speak_prompt.jsonl
89
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000900_B001000_speak_prompt.jsonl
90
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000000_B000100_speak_prompt.jsonl
91
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000100_B000200_speak_prompt.jsonl
92
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000200_B000300_speak_prompt.jsonl
93
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000300_B000400_speak_prompt.jsonl
94
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000400_B000500_speak_prompt.jsonl
95
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000500_B000600_speak_prompt.jsonl
96
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000600_B000700_speak_prompt.jsonl
97
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000700_B000800_speak_prompt.jsonl
98
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000800_B000900_speak_prompt.jsonl
99
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000900_B001000_speak_prompt.jsonl
100
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001000_B001100_speak_prompt.jsonl
101
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001100_B001200_speak_prompt.jsonl
102
+
103
+ openslr:
104
+ ratio: 1.0
105
+ data_paths:
106
+ - datasets/jsonl/openslr/SLR68/train.jsonl
107
+ - datasets/jsonl/openslr/SLR68/dev.jsonl
108
+
109
+ speechcolab/gigaspeech:
110
+ ratio: 1.0
111
+ data_paths:
112
+ - datasets/jsonl/speechcolab/gigaspeech/xl.jsonl
113
+ - datasets/jsonl/speechcolab/gigaspeech/dev.jsonl
114
+
115
+ MLCommons/peoples_speech:
116
+ ratio: 1.0
117
+ data_paths:
118
+ - datasets/jsonl/MLCommons/peoples_speech/clean.jsonl
119
+ - datasets/jsonl/MLCommons/peoples_speech/clean_sa.jsonl
120
+ - datasets/jsonl/MLCommons/peoples_speech/dirty.jsonl
121
+ - datasets/jsonl/MLCommons/peoples_speech/dirty_sa.jsonl
122
+ - datasets/jsonl/MLCommons/peoples_speech/validation.jsonl
123
+
124
+ facebook/voxpopuli:
125
+ ratio: 1.0
126
+ data_paths:
127
+ - datasets/jsonl/facebook/voxpopuli/en_train.jsonl
128
+ - datasets/jsonl/facebook/voxpopuli/en_accented_test.jsonl
129
+
130
+ shenyunhang:
131
+ ratio: 1.0
132
+ data_paths:
133
+ - datasets/jsonl/shenyunhang/AISHELL-1/train.jsonl
134
+ - datasets/jsonl/shenyunhang/AISHELL-1/dev.jsonl
135
+ - datasets/jsonl/shenyunhang/AISHELL-2/data.jsonl
136
+ - datasets/jsonl/shenyunhang/AISHELL-3/data.jsonl
137
+ - datasets/jsonl/shenyunhang/AISHELL-4/data.jsonl
138
+
139
+ gpt-omni/VoiceAssistant-400K:
140
+ ratio: 0.0
141
+ data_paths:
142
+ - datasets/jsonl/gpt-omni/VoiceAssistant-400K/data.jsonl
143
+
144
+ VITA-MLLM/AudioQA-1M:
145
+ ratio: 0.0
146
+ data_paths:
147
+ - datasets/jsonl/VITA-MLLM/AudioQA-1M/data.jsonl
148
+
149
+ BAAI/Infinity-Instruct:
150
+ ratio: 1.0
151
+ data_paths:
152
+ #- datasets/jsonl/BAAI/Infinity-Instruct/3M.jsonl
153
+ #- datasets/jsonl/BAAI/Infinity-Instruct/7M.jsonl
154
+ #- datasets/jsonl/BAAI/Infinity-Instruct/7M_domains.jsonl
155
+ - datasets/jsonl/BAAI/Infinity-Instruct/0625.jsonl
156
+ #- datasets/jsonl/BAAI/Infinity-Instruct/Gen.jsonl
157
+
158
+ OpenHermes:
159
+ ratio: 1.0
160
+ data_paths:
161
+ - datasets/jsonl/teknium/OpenHermes-2.5/openhermes2_5.jsonl
162
+
163
+ lima:
164
+ ratio: 1.0
165
+ data_paths:
166
+ - datasets/jsonl/GAIR/lima/train.jsonl
167
+
168
+ databricks-dolly-15k:
169
+ ratio: 1.0
170
+ data_paths:
171
+ - datasets/jsonl/databricks/databricks-dolly-15k/databricks-dolly-15k.jsonl
172
+
173
+ MetaMathQA:
174
+ ratio: 1.0
175
+ data_paths:
176
+ - datasets/jsonl/meta-math/MetaMathQA/MetaMathQA-395K.jsonl
177
+
178
+ MathInstruct:
179
+ ratio: 1.0
180
+ data_paths:
181
+ - datasets/jsonl/TIGER-Lab/MathInstruct/MathInstruct.jsonl
182
+
183
+ orca-math-word-problems-200k:
184
+ ratio: 1.0
185
+ data_paths:
186
+ - datasets/jsonl/microsoft/orca-math-word-problems-200k/data.jsonl
187
+
188
+ atlas-math-sets:
189
+ ratio: 1.0
190
+ num: 100000
191
+ data_paths:
192
+ - datasets/jsonl/AtlasUnified/atlas-math-sets/train.jsonl
193
+
194
+ goat:
195
+ ratio: 1.0
196
+ num: 30000
197
+ data_paths:
198
+ - datasets/jsonl/tiedong/goat/dataset.jsonl
199
+
200
+ camel-ai:
201
+ ratio: 1.0
202
+ data_paths:
203
+ - datasets/jsonl/camel-ai/math/math.jsonl
204
+
205
+ Long-Instruction-with-Paraphrasing:
206
+ ratio: 1.0
207
+ data_paths:
208
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/booksum_en.jsonl
209
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/multi_doc_qa_en_paraphrasing.jsonl
210
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/sharegpt_en.jsonl
211
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/short_instruction_from_alpaca_en.jsonl
212
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/single_doc_qa_en_paraphrasing.jsonl
213
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/translation_en2zh.jsonl
214
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/booksum_zh.jsonl
215
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/multi_doc_qa_zh_paraphrasing.jsonl
216
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/sharegpt_zh.jsonl
217
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/short_instruction_from_llama_chinese.jsonl
218
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/single_doc_qa_zh_paraphrasing.jsonl
219
+
220
+ Long:
221
+ ratio: 1.0
222
+ data_paths:
223
+ - datasets/jsonl/akoksal/LongForm/data.jsonl
224
+ - datasets/jsonl/THUDM/LongAlign-10k/long.jsonl
225
+ - datasets/jsonl/THUDM/LongCite-45k/long.jsonl
226
+ - datasets/jsonl/THUDM/LongWriter-6k/long.jsonl
227
+ - datasets/jsonl/YeungNLP/LongQLoRA-Dataset/LongQLoRA-SFT-Data-39k.jsonl
228
+ - datasets/jsonl/Yukang/LongAlpaca-12k/LongAlpaca-12k.jsonl
229
+ - datasets/jsonl/togethercomputer/Long-Data-Collections/natural_questions_10_200_docs.jsonl
230
+ - datasets/jsonl/togethercomputer/Long-Data-Collections/booksum.jsonl
231
+ - datasets/jsonl/KnutJaegersberg/longinstruct/longinstruct.jsonl
232
+
233
+ open-thoughts/OpenThoughts2-1M:
234
+ ratio: 0.0
235
+ num: 200000
236
+ data_paths:
237
+ - datasets/jsonl/open-thoughts/OpenThoughts2-1M/data.jsonl
238
+
239
+ nvidia/Llama-Nemotron-Post-Training-Dataset:
240
+ ratio: 0.0
241
+ num: 200000
242
+ data_paths:
243
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_chat.jsonl
244
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_code.jsonl
245
+ #- datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_math.jsonl
246
+ #- datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_safety.jsonl
247
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_science.jsonl
248
+
249
+ glaiveai/reasoning-v1-20m:
250
+ ratio: 0.0
251
+ num: 200000
252
+ data_paths:
253
+ - datasets/jsonl/glaiveai/reasoning-v1-20m/data.jsonl
254
+
255
+ nvidia/OpenCodeReasoning:
256
+ ratio: 0.0
257
+ num: 200000
258
+ data_paths:
259
+ - datasets/jsonl/nvidia/OpenCodeReasoning/split_0.jsonl
260
+
261
+ Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT:
262
+ ratio: 0.0
263
+ num: 200000
264
+ data_paths:
265
+ - datasets/jsonl/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT/data.jsonl
266
+
267
+ open-r1/OpenR1-Math-220k:
268
+ ratio: 0.0
269
+ num: 200000
270
+ data_paths:
271
+ #- datasets/jsonl/open-r1/OpenR1-Math-220k/default.jsonl
272
+ - datasets/jsonl/open-r1/OpenR1-Math-220k/all.jsonl
273
+ #- datasets/jsonl/open-r1/OpenR1-Math-220k/extended.jsonl
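Each dataset entry in the two finetune configs pairs a sampling `ratio` (and an optional `num` cap) with a list of JSONL shards; entries with `ratio: 0.0` are effectively disabled. The sketch below shows one plausible way such a section could be consumed. It is not the repository's actual loader: the function name, the use of PyYAML, and the exact semantics of `ratio`/`num` are assumptions.

import json
import random

import yaml  # assumed dependency; the real training code may differ


def load_weighted_datasets(config_path):
    # Read the YAML config and build a flat sample pool, weighting each
    # dataset by its `ratio` and optionally capping it with `num`.
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    pool = []
    for name, spec in cfg["dataset"].items():
        ratio = spec.get("ratio", 1.0)
        if ratio <= 0:
            continue  # ratio 0.0 -> dataset is switched off
        records = []
        for path in spec["data_paths"]:
            with open(path, encoding="utf-8") as fp:
                records.extend(json.loads(line) for line in fp if line.strip())
        if "num" in spec:
            records = records[: spec["num"]]  # assumed: `num` caps the pool size
        # ratio < 1 subsamples; ratio > 1 (as in stage 2) repeats samples.
        keep = int(len(records) * ratio)
        repeats, remainder = divmod(keep, max(len(records), 1))
        pool.extend(records * repeats + random.sample(records, remainder))
    return pool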
configs/sts_finetune_stage2.yaml ADDED
@@ -0,0 +1,273 @@
1
+
2
+ xlsx_sample_num: 5
3
+
4
+ dataset:
5
+
6
+ wenet-e2e/wenetspeech:
7
+ ratio: 0.05
8
+ data_paths:
9
+ - datasets/jsonl/wenet-e2e/wenetspeech/L_fixed.jsonl
10
+ - datasets/jsonl/wenet-e2e/wenetspeech/DEV_fixed.jsonl
11
+
12
+ Wenetspeech4TTS/Wenetspeech4TTS:
13
+ ratio: 0.05
14
+ data_paths:
15
+ - datasets/jsonl/Wenetspeech4TTS/WenetSpeech4TTS/Basic.jsonl
16
+
17
+ fixie-ai/librispeech_asr:
18
+ ratio: 0.05
19
+ data_paths:
20
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.100.clean.jsonl
21
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.360.clean.jsonl
22
+ - datasets/jsonl/fixie-ai/librispeech_asr/train.500.other.jsonl
23
+
24
+ mythicinfinity/libritts:
25
+ ratio: 0.05
26
+ data_paths:
27
+ - datasets/jsonl/mythicinfinity/libritts/train.clean.100.jsonl
28
+ - datasets/jsonl/mythicinfinity/libritts/train.clean.360.jsonl
29
+ - datasets/jsonl/mythicinfinity/libritts/train.other.500.jsonl
30
+ - datasets/jsonl/mythicinfinity/libritts_r/train.clean.100.jsonl
31
+ - datasets/jsonl/mythicinfinity/libritts_r/train.clean.360.jsonl
32
+ - datasets/jsonl/mythicinfinity/libritts_r/train.other.500.jsonl
33
+
34
+ parler-tts/mls_eng:
35
+ ratio: 0.05
36
+ data_paths:
37
+ #- datasets/jsonl/parler-tts/mls_eng_10k/train.jsonl
38
+ - datasets/jsonl/parler-tts/mls_eng/train.jsonl
39
+
40
+ mozilla-foundation/common_voice_17_0:
41
+ ratio: 0.05
42
+ data_paths:
43
+ - datasets/jsonl/mozilla-foundation/common_voice_17_0/en/train.jsonl
44
+ - datasets/jsonl/mozilla-foundation/common_voice_17_0/zh-CN/train.jsonl
45
+
46
+ MushanW/GLOBE_V2:
47
+ ratio: 0.05
48
+ data_paths:
49
+ - datasets/jsonl/MushanW/GLOBE_V2/train.jsonl
50
+
51
+ amphion/Emilia-Dataset:
52
+ ratio: 0.025
53
+ data_paths:
54
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000000_B000100.jsonl
55
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000100_B000200.jsonl
56
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000200_B000300.jsonl
57
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000300_B000400.jsonl
58
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000400_B000500.jsonl
59
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000500_B000600.jsonl
60
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000600_B000700.jsonl
61
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000700_B000800.jsonl
62
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000800_B000900.jsonl
63
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000900_B001000.jsonl
64
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000000_B000100.jsonl
65
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000100_B000200.jsonl
66
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000200_B000300.jsonl
67
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000300_B000400.jsonl
68
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000400_B000500.jsonl
69
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000500_B000600.jsonl
70
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000600_B000700.jsonl
71
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000700_B000800.jsonl
72
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000800_B000900.jsonl
73
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000900_B001000.jsonl
74
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001000_B001100.jsonl
75
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001100_B001200.jsonl
76
+
77
+ amphion/Emilia-Dataset/speaker_prompt:
78
+ ratio: 0.025
79
+ data_paths:
80
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000000_B000100_speak_prompt.jsonl
81
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000100_B000200_speak_prompt.jsonl
82
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000200_B000300_speak_prompt.jsonl
83
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000300_B000400_speak_prompt.jsonl
84
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000400_B000500_speak_prompt.jsonl
85
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000500_B000600_speak_prompt.jsonl
86
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000600_B000700_speak_prompt.jsonl
87
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000700_B000800_speak_prompt.jsonl
88
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000800_B000900_speak_prompt.jsonl
89
+ - datasets/jsonl/amphion/Emilia-Dataset/ZH_B000900_B001000_speak_prompt.jsonl
90
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000000_B000100_speak_prompt.jsonl
91
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000100_B000200_speak_prompt.jsonl
92
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000200_B000300_speak_prompt.jsonl
93
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000300_B000400_speak_prompt.jsonl
94
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000400_B000500_speak_prompt.jsonl
95
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000500_B000600_speak_prompt.jsonl
96
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000600_B000700_speak_prompt.jsonl
97
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000700_B000800_speak_prompt.jsonl
98
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000800_B000900_speak_prompt.jsonl
99
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B000900_B001000_speak_prompt.jsonl
100
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001000_B001100_speak_prompt.jsonl
101
+ - datasets/jsonl/amphion/Emilia-Dataset/EN_B001100_B001200_speak_prompt.jsonl
102
+
103
+ openslr:
104
+ ratio: 0.05
105
+ data_paths:
106
+ - datasets/jsonl/openslr/SLR68/train.jsonl
107
+ - datasets/jsonl/openslr/SLR68/dev.jsonl
108
+
109
+ speechcolab/gigaspeech:
110
+ ratio: 0.05
111
+ data_paths:
112
+ - datasets/jsonl/speechcolab/gigaspeech/xl.jsonl
113
+ - datasets/jsonl/speechcolab/gigaspeech/dev.jsonl
114
+
115
+ MLCommons/peoples_speech:
116
+ ratio: 0.05
117
+ data_paths:
118
+ - datasets/jsonl/MLCommons/peoples_speech/clean.jsonl
119
+ - datasets/jsonl/MLCommons/peoples_speech/clean_sa.jsonl
120
+ - datasets/jsonl/MLCommons/peoples_speech/dirty.jsonl
121
+ - datasets/jsonl/MLCommons/peoples_speech/dirty_sa.jsonl
122
+ - datasets/jsonl/MLCommons/peoples_speech/validation.jsonl
123
+
124
+ facebook/voxpopuli:
125
+ ratio: 0.05
126
+ data_paths:
127
+ - datasets/jsonl/facebook/voxpopuli/en_train.jsonl
128
+ - datasets/jsonl/facebook/voxpopuli/en_accented_test.jsonl
129
+
130
+ shenyunhang:
131
+ ratio: 0.05
132
+ data_paths:
133
+ - datasets/jsonl/shenyunhang/AISHELL-1/train.jsonl
134
+ - datasets/jsonl/shenyunhang/AISHELL-1/dev.jsonl
135
+ - datasets/jsonl/shenyunhang/AISHELL-2/data.jsonl
136
+ - datasets/jsonl/shenyunhang/AISHELL-3/data.jsonl
137
+ - datasets/jsonl/shenyunhang/AISHELL-4/data.jsonl
138
+
139
+ gpt-omni/VoiceAssistant-400K:
140
+ ratio: 2.0
141
+ data_paths:
142
+ - datasets/jsonl/gpt-omni/VoiceAssistant-400K/data.jsonl
143
+
144
+ VITA-MLLM/AudioQA-1M:
145
+ ratio: 2.0
146
+ data_paths:
147
+ - datasets/jsonl/VITA-MLLM/AudioQA-1M/data.jsonl
148
+
149
+ BAAI/Infinity-Instruct:
150
+ ratio: 0.05
151
+ data_paths:
152
+ #- datasets/jsonl/BAAI/Infinity-Instruct/3M.jsonl
153
+ #- datasets/jsonl/BAAI/Infinity-Instruct/7M.jsonl
154
+ #- datasets/jsonl/BAAI/Infinity-Instruct/7M_domains.jsonl
155
+ - datasets/jsonl/BAAI/Infinity-Instruct/0625.jsonl
156
+ #- datasets/jsonl/BAAI/Infinity-Instruct/Gen.jsonl
157
+
158
+ OpenHermes:
159
+ ratio: 0.05
160
+ data_paths:
161
+ - datasets/jsonl/teknium/OpenHermes-2.5/openhermes2_5.jsonl
162
+
163
+ lima:
164
+ ratio: 0.05
165
+ data_paths:
166
+ - datasets/jsonl/GAIR/lima/train.jsonl
167
+
168
+ databricks-dolly-15k:
169
+ ratio: 0.05
170
+ data_paths:
171
+ - datasets/jsonl/databricks/databricks-dolly-15k/databricks-dolly-15k.jsonl
172
+
173
+ MetaMathQA:
174
+ ratio: 0.05
175
+ data_paths:
176
+ - datasets/jsonl/meta-math/MetaMathQA/MetaMathQA-395K.jsonl
177
+
178
+ MathInstruct:
179
+ ratio: 0.05
180
+ data_paths:
181
+ - datasets/jsonl/TIGER-Lab/MathInstruct/MathInstruct.jsonl
182
+
183
+ orca-math-word-problems-200k:
184
+ ratio: 0.05
185
+ data_paths:
186
+ - datasets/jsonl/microsoft/orca-math-word-problems-200k/data.jsonl
187
+
188
+ atlas-math-sets:
189
+ ratio: 0.05
190
+ num: 100000
191
+ data_paths:
192
+ - datasets/jsonl/AtlasUnified/atlas-math-sets/train.jsonl
193
+
194
+ goat:
195
+ ratio: 0.05
196
+ num: 30000
197
+ data_paths:
198
+ - datasets/jsonl/tiedong/goat/dataset.jsonl
199
+
200
+ camel-ai:
201
+ ratio: 0.05
202
+ data_paths:
203
+ - datasets/jsonl/camel-ai/math/math.jsonl
204
+
205
+ Long-Instruction-with-Paraphrasing:
206
+ ratio: 0.05
207
+ data_paths:
208
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/booksum_en.jsonl
209
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/multi_doc_qa_en_paraphrasing.jsonl
210
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/sharegpt_en.jsonl
211
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/short_instruction_from_alpaca_en.jsonl
212
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/single_doc_qa_en_paraphrasing.jsonl
213
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/translation_en2zh.jsonl
214
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/booksum_zh.jsonl
215
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/multi_doc_qa_zh_paraphrasing.jsonl
216
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/sharegpt_zh.jsonl
217
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/short_instruction_from_llama_chinese.jsonl
218
+ - datasets/jsonl/yuyijiong/Long-Instruction-with-Paraphrasing/single_doc_qa_zh_paraphrasing.jsonl
219
+
220
+ Long:
221
+ ratio: 0.05
222
+ data_paths:
223
+ - datasets/jsonl/akoksal/LongForm/data.jsonl
224
+ - datasets/jsonl/THUDM/LongAlign-10k/long.jsonl
225
+ - datasets/jsonl/THUDM/LongCite-45k/long.jsonl
226
+ - datasets/jsonl/THUDM/LongWriter-6k/long.jsonl
227
+ - datasets/jsonl/YeungNLP/LongQLoRA-Dataset/LongQLoRA-SFT-Data-39k.jsonl
228
+ - datasets/jsonl/Yukang/LongAlpaca-12k/LongAlpaca-12k.jsonl
229
+ - datasets/jsonl/togethercomputer/Long-Data-Collections/natural_questions_10_200_docs.jsonl
230
+ - datasets/jsonl/togethercomputer/Long-Data-Collections/booksum.jsonl
231
+ - datasets/jsonl/KnutJaegersberg/longinstruct/longinstruct.jsonl
232
+
233
+ open-thoughts/OpenThoughts2-1M:
234
+ ratio: 0.0
235
+ num: 10000
236
+ data_paths:
237
+ - datasets/jsonl/open-thoughts/OpenThoughts2-1M/data.jsonl
238
+
239
+ nvidia/Llama-Nemotron-Post-Training-Dataset:
240
+ ratio: 0.0
241
+ num: 10000
242
+ data_paths:
243
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_chat.jsonl
244
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_code.jsonl
245
+ #- datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_math.jsonl
246
+ #- datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_safety.jsonl
247
+ - datasets/jsonl/nvidia/Llama-Nemotron-Post-Training-Dataset/SFT_science.jsonl
248
+
249
+ glaiveai/reasoning-v1-20m:
250
+ ratio: 0.0
251
+ num: 10000
252
+ data_paths:
253
+ - datasets/jsonl/glaiveai/reasoning-v1-20m/data.jsonl
254
+
255
+ nvidia/OpenCodeReasoning:
256
+ ratio: 0.0
257
+ num: 10000
258
+ data_paths:
259
+ - datasets/jsonl/nvidia/OpenCodeReasoning/split_0.jsonl
260
+
261
+ Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT:
262
+ ratio: 0.0
263
+ num: 10000
264
+ data_paths:
265
+ - datasets/jsonl/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT/data.jsonl
266
+
267
+ open-r1/OpenR1-Math-220k:
268
+ ratio: 0.0
269
+ num: 10000
270
+ data_paths:
271
+ #- datasets/jsonl/open-r1/OpenR1-Math-220k/default.jsonl
272
+ - datasets/jsonl/open-r1/OpenR1-Math-220k/all.jsonl
273
+ #- datasets/jsonl/open-r1/OpenR1-Math-220k/extended.jsonl
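Relative to stage 1, this stage-2 config downweights the ASR/TTS corpora (ratios of 0.05 or 0.025 instead of 1.0) and oversamples the spoken-QA sets gpt-omni/VoiceAssistant-400K and VITA-MLLM/AudioQA-1M (ratio 2.0 instead of 0.0), shifting the mixture toward speech-to-speech dialogue data. Because both configs reference many JSONL shards, a quick pre-flight check that every listed path exists can catch typos before a long run; the helper below is a hypothetical sketch, not part of the repository.

import os

import yaml  # assumed dependency


def check_data_paths(config_path):
    # Report any JSONL shard referenced by the config that is missing on disk.
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    missing = []
    for name, spec in cfg.get("dataset", {}).items():
        for path in spec.get("data_paths", []):
            if not os.path.exists(path):
                missing.append((name, path))

    for name, path in missing:
        print(f"[{name}] missing shard: {path}")
    return len(missing) == 0


# Example:
# check_data_paths("configs/sts_finetune_stage2.yaml")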
evaluation/compute-acc-of-contain.py ADDED
@@ -0,0 +1,85 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import json
5
+ import re
6
+ import string
7
+ import sys
8
+ import unicodedata
9
+
10
+ from word2number import w2n
11
+
12
+ # def is_list_in_string(text, candidate):
13
+ # return any([all(xx in text for xx in x.split(" ")) if isinstance(x, str) else all([xx in text for xx in x]) for x in candidate])
14
+
15
+
16
+ def is_string_in_string(text, candidate):
17
+ return all(x in text for x in candidate.split(" "))
18
+
19
+
20
+ def is_list_in_string(text, candidate):
21
+ return any(
22
+ [
23
+ is_string_in_string(text, x) if isinstance(x, str) else is_list_in_string(text, x)
24
+ for x in candidate
25
+ ]
26
+ )
27
+
28
+
29
+ def clean_punctuation(value):
30
+ punctuation = string.punctuation
31
+ punctuation = punctuation.replace("'", "")
32
+ value = re.sub(f"[{punctuation}]", " ", value)
33
+ return value
34
+
35
+
36
+ if __name__ == "__main__":
37
+
38
+ pred_gt_json_file = sys.argv[1]
39
+
40
+ with open(pred_gt_json_file, "r") as f:
41
+ pred_gt = json.load(f)
42
+
43
+ acc = 0
44
+ for line in pred_gt:
45
+
46
+ pred = line[0]
47
+ gt = line[1]
48
+
49
+ # pred = clean_punctuation(pred)
50
+ pred = pred.lower()
51
+
52
+ if isinstance(gt, list):
53
+ pass
54
+ else:
55
+ gt = [
56
+ gt,
57
+ ]
58
+ gt = [clean_punctuation(x) for x in gt]
59
+ gt = [x.lower().strip() for x in gt]
60
+
61
+ try:
62
+ gt_number = [str(w2n.word_to_num(x.lower())) for x in gt]
63
+ except Exception:
64
+ gt_number = gt
65
+ pass
66
+
67
+ if is_list_in_string(pred, gt):
68
+ acc += 1
69
+ elif is_list_in_string(pred, gt_number):
70
+ acc += 1
71
+ else:
72
+ print("======================================================")
73
+ print(f"{line[0]=}")
74
+ print(f"{line[1]=}")
75
+
76
+ print("======================================================")
77
+ print(f"{acc=}")
78
+ print(f"{len(pred_gt)=}")
79
+ print("======================================================")
80
+
81
+ acc = acc / len(pred_gt) * 100
82
+
83
+ print("======================================================")
84
+ print(f"{acc=}")
85
+ print("======================================================")
evaluation/compute-cer.py ADDED
@@ -0,0 +1,559 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import sys
5
+ import unicodedata
6
+ import codecs
7
+
8
+ remove_tag = True
9
+ spacelist = [' ', '\t', '\r', '\n']
10
+ puncts = [
11
+ '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
12
+ '《', '》'
13
+ ]
14
+
15
+
16
+ def characterize(string):
17
+ res = []
18
+ i = 0
19
+ while i < len(string):
20
+ char = string[i]
21
+ if char in puncts:
22
+ i += 1
23
+ continue
24
+ cat1 = unicodedata.category(char)
25
+ # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
26
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
27
+ i += 1
28
+ continue
29
+ if cat1 == 'Lo': # letter-other
30
+ res.append(char)
31
+ i += 1
32
+ else:
33
+ # some input looks like: <unk><noise>, we want to separate it to two words.
34
+ sep = ' '
35
+ if char == '<':
36
+ sep = '>'
37
+ j = i + 1
38
+ while j < len(string):
39
+ c = string[j]
40
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
41
+ break
42
+ j += 1
43
+ if j < len(string) and string[j] == '>':
44
+ j += 1
45
+ res.append(string[i:j])
46
+ i = j
47
+ return res
48
+
49
+
50
+ def stripoff_tags(x):
51
+ if not x:
52
+ return ''
53
+ chars = []
54
+ i = 0
55
+ T = len(x)
56
+ while i < T:
57
+ if x[i] == '<':
58
+ while i < T and x[i] != '>':
59
+ i += 1
60
+ i += 1
61
+ else:
62
+ chars.append(x[i])
63
+ i += 1
64
+ return ''.join(chars)
65
+
66
+
67
+ def normalize(sentence, ignore_words, cs, split=None):
68
+ """ sentence, ignore_words are both in unicode
69
+ """
70
+ new_sentence = []
71
+ for token in sentence:
72
+ x = token
73
+ if not cs:
74
+ x = x.upper()
75
+ if x in ignore_words:
76
+ continue
77
+ if remove_tag:
78
+ x = stripoff_tags(x)
79
+ if not x:
80
+ continue
81
+ if split and x in split:
82
+ new_sentence += split[x]
83
+ elif x.isalnum():
84
+ for k in x:
85
+ new_sentence.append(k)
86
+ else:
87
+ new_sentence.append(x)
88
+ return new_sentence
89
+
90
+
91
+ class Calculator:
92
+
93
+ def __init__(self):
94
+ self.data = {}
95
+ self.space = []
96
+ self.cost = {}
97
+ self.cost['cor'] = 0
98
+ self.cost['sub'] = 1
99
+ self.cost['del'] = 1
100
+ self.cost['ins'] = 1
101
+
102
+ def calculate(self, lab, rec):
103
+ # Initialization
104
+ lab.insert(0, '')
105
+ rec.insert(0, '')
106
+ while len(self.space) < len(lab):
107
+ self.space.append([])
108
+ for row in self.space:
109
+ for element in row:
110
+ element['dist'] = 0
111
+ element['error'] = 'non'
112
+ while len(row) < len(rec):
113
+ row.append({'dist': 0, 'error': 'non'})
114
+ for i in range(len(lab)):
115
+ self.space[i][0]['dist'] = i
116
+ self.space[i][0]['error'] = 'del'
117
+ for j in range(len(rec)):
118
+ self.space[0][j]['dist'] = j
119
+ self.space[0][j]['error'] = 'ins'
120
+ self.space[0][0]['error'] = 'non'
121
+ for token in lab:
122
+ if token not in self.data and len(token) > 0:
123
+ self.data[token] = {
124
+ 'all': 0,
125
+ 'cor': 0,
126
+ 'sub': 0,
127
+ 'ins': 0,
128
+ 'del': 0
129
+ }
130
+ for token in rec:
131
+ if token not in self.data and len(token) > 0:
132
+ self.data[token] = {
133
+ 'all': 0,
134
+ 'cor': 0,
135
+ 'sub': 0,
136
+ 'ins': 0,
137
+ 'del': 0
138
+ }
139
+ # Computing edit distance
140
+ for i, lab_token in enumerate(lab):
141
+ for j, rec_token in enumerate(rec):
142
+ if i == 0 or j == 0:
143
+ continue
144
+ min_dist = sys.maxsize
145
+ min_error = 'none'
146
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
147
+ error = 'del'
148
+ if dist < min_dist:
149
+ min_dist = dist
150
+ min_error = error
151
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
152
+ error = 'ins'
153
+ if dist < min_dist:
154
+ min_dist = dist
155
+ min_error = error
156
+ if lab_token == rec_token:
157
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
158
+ error = 'cor'
159
+ else:
160
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
161
+ error = 'sub'
162
+ if dist < min_dist:
163
+ min_dist = dist
164
+ min_error = error
165
+ self.space[i][j]['dist'] = min_dist
166
+ self.space[i][j]['error'] = min_error
167
+ # Tracing back
168
+ result = {
169
+ 'lab': [],
170
+ 'rec': [],
171
+ 'all': 0,
172
+ 'cor': 0,
173
+ 'sub': 0,
174
+ 'ins': 0,
175
+ 'del': 0
176
+ }
177
+ i = len(lab) - 1
178
+ j = len(rec) - 1
179
+ while True:
180
+ if self.space[i][j]['error'] == 'cor': # correct
181
+ if len(lab[i]) > 0:
182
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
183
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
184
+ result['all'] = result['all'] + 1
185
+ result['cor'] = result['cor'] + 1
186
+ result['lab'].insert(0, lab[i])
187
+ result['rec'].insert(0, rec[j])
188
+ i = i - 1
189
+ j = j - 1
190
+ elif self.space[i][j]['error'] == 'sub': # substitution
191
+ if len(lab[i]) > 0:
192
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
193
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
194
+ result['all'] = result['all'] + 1
195
+ result['sub'] = result['sub'] + 1
196
+ result['lab'].insert(0, lab[i])
197
+ result['rec'].insert(0, rec[j])
198
+ i = i - 1
199
+ j = j - 1
200
+ elif self.space[i][j]['error'] == 'del': # deletion
201
+ if len(lab[i]) > 0:
202
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
203
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
204
+ result['all'] = result['all'] + 1
205
+ result['del'] = result['del'] + 1
206
+ result['lab'].insert(0, lab[i])
207
+ result['rec'].insert(0, "")
208
+ i = i - 1
209
+ elif self.space[i][j]['error'] == 'ins': # insertion
210
+ if len(rec[j]) > 0:
211
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
212
+ result['ins'] = result['ins'] + 1
213
+ result['lab'].insert(0, "")
214
+ result['rec'].insert(0, rec[j])
215
+ j = j - 1
216
+ elif self.space[i][j]['error'] == 'non': # starting point
217
+ break
218
+ else: # shouldn't reach here
219
+ print('this should not happen , i={i} , j={j} , \
220
+ error={error}'.format(i=i,
221
+ j=j,
222
+ error=self.space[i][j]['error']))
223
+ return result
224
+
225
+ def overall(self):
226
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
227
+ for token in self.data:
228
+ result['all'] = result['all'] + self.data[token]['all']
229
+ result['cor'] = result['cor'] + self.data[token]['cor']
230
+ result['sub'] = result['sub'] + self.data[token]['sub']
231
+ result['ins'] = result['ins'] + self.data[token]['ins']
232
+ result['del'] = result['del'] + self.data[token]['del']
233
+ return result
234
+
235
+ def cluster(self, data):
236
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
237
+ for token in data:
238
+ if token in self.data:
239
+ result['all'] = result['all'] + self.data[token]['all']
240
+ result['cor'] = result['cor'] + self.data[token]['cor']
241
+ result['sub'] = result['sub'] + self.data[token]['sub']
242
+ result['ins'] = result['ins'] + self.data[token]['ins']
243
+ result['del'] = result['del'] + self.data[token]['del']
244
+ return result
245
+
246
+ def keys(self):
247
+ return list(self.data.keys())
248
+
249
+
250
+ def width(string):
251
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
252
+
253
+
254
+ def default_cluster(word):
255
+ unicode_names = [unicodedata.name(char) for char in word]
256
+ for i in reversed(range(len(unicode_names))):
257
+ if unicode_names[i].startswith('DIGIT'): # 1
258
+ unicode_names[i] = 'Number' # 'DIGIT'
259
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH')
260
+ or unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
261
+ # 明 / 郎
262
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
263
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER')
264
+ or unicode_names[i].startswith('LATIN SMALL LETTER')):
265
+ # A / a
266
+ unicode_names[i] = 'English' # 'LATIN LETTER'
267
+ elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
268
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
269
+ elif (unicode_names[i].startswith('AMPERSAND')
270
+ or unicode_names[i].startswith('APOSTROPHE')
271
+ or unicode_names[i].startswith('COMMERCIAL AT')
272
+ or unicode_names[i].startswith('DEGREE CELSIUS')
273
+ or unicode_names[i].startswith('EQUALS SIGN')
274
+ or unicode_names[i].startswith('FULL STOP')
275
+ or unicode_names[i].startswith('HYPHEN-MINUS')
276
+ or unicode_names[i].startswith('LOW LINE')
277
+ or unicode_names[i].startswith('NUMBER SIGN')
278
+ or unicode_names[i].startswith('PLUS SIGN')
279
+ or unicode_names[i].startswith('SEMICOLON')):
280
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
281
+ del unicode_names[i]
282
+ else:
283
+ return 'Other'
284
+ if len(unicode_names) == 0:
285
+ return 'Other'
286
+ if len(unicode_names) == 1:
287
+ return unicode_names[0]
288
+ for i in range(len(unicode_names) - 1):
289
+ if unicode_names[i] != unicode_names[i + 1]:
290
+ return 'Other'
291
+ return unicode_names[0]
292
+
293
+
294
+ def usage():
295
+ print("compute-wer.py : compute word error rate (WER) \
296
+ and align recognition results and references.")
297
+ print(" usage : python compute-wer.py [--cs={0,1}] \
298
+ [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \
299
+ [--padding-symbol={space,underline}] test.ref test.hyp > test.cer")
300
+
301
+
302
+ if __name__ == '__main__':
303
+ if len(sys.argv) == 1:
304
+ usage()
305
+ sys.exit(0)
306
+ calculator = Calculator()
307
+ cluster_file = ''
308
+ ignore_words = set()
309
+ tochar = False
310
+ verbose = 1
311
+ padding_symbol = ' '
312
+ case_sensitive = False
313
+ max_words_per_line = sys.maxsize
314
+ split = None
315
+ while len(sys.argv) > 3:
316
+ a = '--maxw='
317
+ if sys.argv[1].startswith(a):
318
+ b = sys.argv[1][len(a):]
319
+ del sys.argv[1]
320
+ max_words_per_line = int(b)
321
+ continue
322
+ a = '--rt='
323
+ if sys.argv[1].startswith(a):
324
+ b = sys.argv[1][len(a):].lower()
325
+ del sys.argv[1]
326
+ remove_tag = (b == 'true') or (b != '0')
327
+ continue
328
+ a = '--cs='
329
+ if sys.argv[1].startswith(a):
330
+ b = sys.argv[1][len(a):].lower()
331
+ del sys.argv[1]
332
+ case_sensitive = (b == 'true') or (b != '0')
333
+ continue
334
+ a = '--cluster='
335
+ if sys.argv[1].startswith(a):
336
+ cluster_file = sys.argv[1][len(a):]
337
+ del sys.argv[1]
338
+ continue
339
+ a = '--splitfile='
340
+ if sys.argv[1].startswith(a):
341
+ split_file = sys.argv[1][len(a):]
342
+ del sys.argv[1]
343
+ split = dict()
344
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
345
+ for line in fh: # line in unicode
346
+ words = line.strip().split()
347
+ if len(words) >= 2:
348
+ split[words[0]] = words[1:]
349
+ continue
350
+ a = '--ig='
351
+ if sys.argv[1].startswith(a):
352
+ ignore_file = sys.argv[1][len(a):]
353
+ del sys.argv[1]
354
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
355
+ for line in fh: # line in unicode
356
+ line = line.strip()
357
+ if len(line) > 0:
358
+ ignore_words.add(line)
359
+ continue
360
+ a = '--char='
361
+ if sys.argv[1].startswith(a):
362
+ b = sys.argv[1][len(a):].lower()
363
+ del sys.argv[1]
364
+ tochar = (b == 'true') or (b != '0')
365
+ continue
366
+ a = '--v='
367
+ if sys.argv[1].startswith(a):
368
+ b = sys.argv[1][len(a):].lower()
369
+ del sys.argv[1]
370
+ verbose = 0
371
+ try:
372
+ verbose = int(b)
373
+ except Exception:
374
+ if b == 'true' or b != '0':
375
+ verbose = 1
376
+ continue
377
+ a = '--padding-symbol='
378
+ if sys.argv[1].startswith(a):
379
+ b = sys.argv[1][len(a):].lower()
380
+ del sys.argv[1]
381
+ if b == 'space':
382
+ padding_symbol = ' '
383
+ elif b == 'underline':
384
+ padding_symbol = '_'
385
+ continue
386
+ if True or sys.argv[1].startswith('-'):
387
+ # ignore invalid switch
388
+ del sys.argv[1]
389
+ continue
390
+
391
+ if not case_sensitive:
392
+ ig = set([w.upper() for w in ignore_words])
393
+ ignore_words = ig
394
+
395
+ default_clusters = {}
396
+ default_words = {}
397
+
398
+ ref_file = sys.argv[1]
399
+ hyp_file = sys.argv[2]
400
+ rec_set = {}
401
+ if split and not case_sensitive:
402
+ newsplit = dict()
403
+ for w in split:
404
+ words = split[w]
405
+ for i in range(len(words)):
406
+ words[i] = words[i].upper()
407
+ newsplit[w.upper()] = words
408
+ split = newsplit
409
+
410
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
411
+ for line in fh:
412
+ if tochar:
413
+ array = characterize(line)
414
+ else:
415
+ array = line.strip().split()
416
+ if len(array) == 0:
417
+ continue
418
+ fid = array[0]
419
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
420
+ split)
421
+
422
+ # compute error rate on the interaction of reference file and hyp file
423
+ for line in open(ref_file, 'r', encoding='utf-8'):
424
+ if tochar:
425
+ array = characterize(line)
426
+ else:
427
+ array = line.rstrip('\n').split()
428
+ if len(array) == 0:
429
+ continue
430
+ fid = array[0]
431
+ if fid not in rec_set:
432
+ continue
433
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
434
+ rec = rec_set[fid]
435
+ if verbose:
436
+ print('\nutt: %s' % fid)
437
+
438
+ for word in rec + lab:
439
+ if word not in default_words:
440
+ default_cluster_name = default_cluster(word)
441
+ if default_cluster_name not in default_clusters:
442
+ default_clusters[default_cluster_name] = {}
443
+ if word not in default_clusters[default_cluster_name]:
444
+ default_clusters[default_cluster_name][word] = 1
445
+ default_words[word] = default_cluster_name
446
+
447
+ result = calculator.calculate(lab, rec)
448
+ if verbose:
449
+ if result['all'] != 0:
450
+ wer = float(result['ins'] + result['sub'] +
451
+ result['del']) * 100.0 / result['all']
452
+ else:
453
+ wer = 0.0
454
+ print('WER: %4.2f %%' % wer, end=' ')
455
+ print('N=%d C=%d S=%d D=%d I=%d' %
456
+ (result['all'], result['cor'], result['sub'], result['del'],
457
+ result['ins']))
458
+ space = {}
459
+ space['lab'] = []
460
+ space['rec'] = []
461
+ for idx in range(len(result['lab'])):
462
+ len_lab = width(result['lab'][idx])
463
+ len_rec = width(result['rec'][idx])
464
+ length = max(len_lab, len_rec)
465
+ space['lab'].append(length - len_lab)
466
+ space['rec'].append(length - len_rec)
467
+ upper_lab = len(result['lab'])
468
+ upper_rec = len(result['rec'])
469
+ lab1, rec1 = 0, 0
470
+ while lab1 < upper_lab or rec1 < upper_rec:
471
+ if verbose > 1:
472
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
473
+ else:
474
+ print('lab:', end=' ')
475
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
476
+ for idx in range(lab1, lab2):
477
+ token = result['lab'][idx]
478
+ print('{token}'.format(token=token), end='')
479
+ for n in range(space['lab'][idx]):
480
+ print(padding_symbol, end='')
481
+ print(' ', end='')
482
+ print()
483
+ if verbose > 1:
484
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
485
+ else:
486
+ print('rec:', end=' ')
487
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
488
+ for idx in range(rec1, rec2):
489
+ token = result['rec'][idx]
490
+ print('{token}'.format(token=token), end='')
491
+ for n in range(space['rec'][idx]):
492
+ print(padding_symbol, end='')
493
+ print(' ', end='')
494
+ print('\n', end='\n')
495
+ lab1 = lab2
496
+ rec1 = rec2
497
+
498
+ if verbose:
499
+ print('==================================================='
500
+ '========================')
501
+ print()
502
+
503
+ result = calculator.overall()
504
+ if result['all'] != 0:
505
+ wer = float(result['ins'] + result['sub'] +
506
+ result['del']) * 100.0 / result['all']
507
+ else:
508
+ wer = 0.0
509
+ print('Overall -> %4.2f %%' % wer, end=' ')
510
+ print('N=%d C=%d S=%d D=%d I=%d' %
511
+ (result['all'], result['cor'], result['sub'], result['del'],
512
+ result['ins']))
513
+ if not verbose:
514
+ print()
515
+
516
+ if verbose:
517
+ for cluster_id in default_clusters:
518
+ result = calculator.cluster(k
519
+ for k in default_clusters[cluster_id])
520
+ if result['all'] != 0:
521
+ wer = float(result['ins'] + result['sub'] +
522
+ result['del']) * 100.0 / result['all']
523
+ else:
524
+ wer = 0.0
525
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
526
+ print('N=%d C=%d S=%d D=%d I=%d' %
527
+ (result['all'], result['cor'], result['sub'], result['del'],
528
+ result['ins']))
529
+ if len(cluster_file) > 0: # compute separated WERs for word clusters
530
+ cluster_id = ''
531
+ cluster = []
532
+ for line in open(cluster_file, 'r', encoding='utf-8'):
533
+ for token in line.rstrip('\n').split():
534
+ # end of cluster reached, like </Keyword>
535
+ if token[0:2] == '</' and token[len(token) - 1] == '>' and \
536
+ token.lstrip('</').rstrip('>') == cluster_id :
537
+ result = calculator.cluster(cluster)
538
+ if result['all'] != 0:
539
+ wer = float(result['ins'] + result['sub'] +
540
+ result['del']) * 100.0 / result['all']
541
+ else:
542
+ wer = 0.0
543
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
544
+ print('N=%d C=%d S=%d D=%d I=%d' %
545
+ (result['all'], result['cor'], result['sub'],
546
+ result['del'], result['ins']))
547
+ cluster_id = ''
548
+ cluster = []
549
+ # begin of cluster reached, like <Keyword>
550
+ elif (token[0] == '<' and token[len(token) - 1] == '>'
551
+ and cluster_id == ''):
552
+ cluster_id = token.lstrip('<').rstrip('>')
553
+ cluster = []
554
+ # general terms, like WEATHER / CAR / ...
555
+ else:
556
+ cluster.append(token)
557
+ print()
558
+ print('======================================='
559
+ '====================================')
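The Calculator class above performs a standard dynamic-programming alignment with unit costs for substitutions, deletions, and insertions, and the reported rate is (S + D + I) / N x 100 with N the number of reference characters. A hand-worked example of that formula (strings invented for illustration):

# ref: 今天天气不错  (6 characters)
# hyp: 今天气很不错  (one optimal alignment: delete the second 天, insert 很)
N, S, D, I = 6, 0, 1, 1
cer = (S + D + I) * 100.0 / N
print(f"CER = {cer:.2f}%")  # CER = 33.33%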
evaluation/compute-wer.py ADDED
@@ -0,0 +1,553 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re, sys, unicodedata
5
+ import codecs
6
+
7
+ remove_tag = True
8
+ spacelist = [' ', '\t', '\r', '\n']
9
+ puncts = [
10
+ '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
11
+ '《', '》'
12
+ ]
13
+
14
+
15
+ def characterize(string):
16
+ res = []
17
+ i = 0
18
+ while i < len(string):
19
+ char = string[i]
20
+ if char in puncts:
21
+ i += 1
22
+ continue
23
+ cat1 = unicodedata.category(char)
24
+ #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
25
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
26
+ i += 1
27
+ continue
28
+ if cat1 == 'Lo': # letter-other
29
+ res.append(char)
30
+ i += 1
31
+ else:
32
+ # some input looks like: <unk><noise>, we want to separate it to two words.
33
+ sep = ' '
34
+ if char == '<': sep = '>'
35
+ j = i + 1
36
+ while j < len(string):
37
+ c = string[j]
38
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
39
+ break
40
+ j += 1
41
+ if j < len(string) and string[j] == '>':
42
+ j += 1
43
+ res.append(string[i:j])
44
+ i = j
45
+ return res
46
+
47
+
48
+ def stripoff_tags(x):
49
+ if not x: return ''
50
+ chars = []
51
+ i = 0
52
+ T = len(x)
53
+ while i < T:
54
+ if x[i] == '<':
55
+ while i < T and x[i] != '>':
56
+ i += 1
57
+ i += 1
58
+ else:
59
+ chars.append(x[i])
60
+ i += 1
61
+ return ''.join(chars)
62
+
63
+
64
+ def normalize(sentence, ignore_words, cs, split=None):
65
+ """ sentence, ignore_words are both in unicode
66
+ """
67
+ new_sentence = []
68
+ for token in sentence:
69
+ x = token
70
+ if not cs:
71
+ x = x.upper()
72
+ if x in ignore_words:
73
+ continue
74
+ if remove_tag:
75
+ x = stripoff_tags(x)
76
+ if not x:
77
+ continue
78
+ if split and x in split:
79
+ new_sentence += split[x]
80
+ else:
81
+ new_sentence.append(x)
82
+ return new_sentence
83
+
84
+
85
+ class Calculator:
86
+
87
+ def __init__(self):
88
+ self.data = {}
89
+ self.space = []
90
+ self.cost = {}
91
+ self.cost['cor'] = 0
92
+ self.cost['sub'] = 1
93
+ self.cost['del'] = 1
94
+ self.cost['ins'] = 1
95
+
96
+ def calculate(self, lab, rec):
97
+ # Initialization
98
+ lab.insert(0, '')
99
+ rec.insert(0, '')
100
+ while len(self.space) < len(lab):
101
+ self.space.append([])
102
+ for row in self.space:
103
+ for element in row:
104
+ element['dist'] = 0
105
+ element['error'] = 'non'
106
+ while len(row) < len(rec):
107
+ row.append({'dist': 0, 'error': 'non'})
108
+ for i in range(len(lab)):
109
+ self.space[i][0]['dist'] = i
110
+ self.space[i][0]['error'] = 'del'
111
+ for j in range(len(rec)):
112
+ self.space[0][j]['dist'] = j
113
+ self.space[0][j]['error'] = 'ins'
114
+ self.space[0][0]['error'] = 'non'
115
+ for token in lab:
116
+ if token not in self.data and len(token) > 0:
117
+ self.data[token] = {
118
+ 'all': 0,
119
+ 'cor': 0,
120
+ 'sub': 0,
121
+ 'ins': 0,
122
+ 'del': 0
123
+ }
124
+ for token in rec:
125
+ if token not in self.data and len(token) > 0:
126
+ self.data[token] = {
127
+ 'all': 0,
128
+ 'cor': 0,
129
+ 'sub': 0,
130
+ 'ins': 0,
131
+ 'del': 0
132
+ }
133
+ # Computing edit distance
134
+ for i, lab_token in enumerate(lab):
135
+ for j, rec_token in enumerate(rec):
136
+ if i == 0 or j == 0:
137
+ continue
138
+ min_dist = sys.maxsize
139
+ min_error = 'none'
140
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
141
+ error = 'del'
142
+ if dist < min_dist:
143
+ min_dist = dist
144
+ min_error = error
145
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
146
+ error = 'ins'
147
+ if dist < min_dist:
148
+ min_dist = dist
149
+ min_error = error
150
+ if lab_token == rec_token:
151
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
152
+ error = 'cor'
153
+ else:
154
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
155
+ error = 'sub'
156
+ if dist < min_dist:
157
+ min_dist = dist
158
+ min_error = error
159
+ self.space[i][j]['dist'] = min_dist
160
+ self.space[i][j]['error'] = min_error
161
+ # Tracing back
162
+ result = {
163
+ 'lab': [],
164
+ 'rec': [],
165
+ 'all': 0,
166
+ 'cor': 0,
167
+ 'sub': 0,
168
+ 'ins': 0,
169
+ 'del': 0
170
+ }
171
+ i = len(lab) - 1
172
+ j = len(rec) - 1
173
+ while True:
174
+ if self.space[i][j]['error'] == 'cor': # correct
175
+ if len(lab[i]) > 0:
176
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
177
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
178
+ result['all'] = result['all'] + 1
179
+ result['cor'] = result['cor'] + 1
180
+ result['lab'].insert(0, lab[i])
181
+ result['rec'].insert(0, rec[j])
182
+ i = i - 1
183
+ j = j - 1
184
+ elif self.space[i][j]['error'] == 'sub': # substitution
185
+ if len(lab[i]) > 0:
186
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
187
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
188
+ result['all'] = result['all'] + 1
189
+ result['sub'] = result['sub'] + 1
190
+ result['lab'].insert(0, lab[i])
191
+ result['rec'].insert(0, rec[j])
192
+ i = i - 1
193
+ j = j - 1
194
+ elif self.space[i][j]['error'] == 'del': # deletion
195
+ if len(lab[i]) > 0:
196
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
197
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
198
+ result['all'] = result['all'] + 1
199
+ result['del'] = result['del'] + 1
200
+ result['lab'].insert(0, lab[i])
201
+ result['rec'].insert(0, "")
202
+ i = i - 1
203
+ elif self.space[i][j]['error'] == 'ins': # insertion
204
+ if len(rec[j]) > 0:
205
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
206
+ result['ins'] = result['ins'] + 1
207
+ result['lab'].insert(0, "")
208
+ result['rec'].insert(0, rec[j])
209
+ j = j - 1
210
+ elif self.space[i][j]['error'] == 'non': # starting point
211
+ break
212
+ else: # shouldn't reach here
213
+ print(
214
+ 'this should not happen , i = {i} , j = {j} , error = {error}'
215
+ .format(i=i, j=j, error=self.space[i][j]['error']))
216
+ return result
217
+
218
+ def overall(self):
219
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
220
+ for token in self.data:
221
+ result['all'] = result['all'] + self.data[token]['all']
222
+ result['cor'] = result['cor'] + self.data[token]['cor']
223
+ result['sub'] = result['sub'] + self.data[token]['sub']
224
+ result['ins'] = result['ins'] + self.data[token]['ins']
225
+ result['del'] = result['del'] + self.data[token]['del']
226
+ return result
227
+
228
+ def cluster(self, data):
229
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
230
+ for token in data:
231
+ if token in self.data:
232
+ result['all'] = result['all'] + self.data[token]['all']
233
+ result['cor'] = result['cor'] + self.data[token]['cor']
234
+ result['sub'] = result['sub'] + self.data[token]['sub']
235
+ result['ins'] = result['ins'] + self.data[token]['ins']
236
+ result['del'] = result['del'] + self.data[token]['del']
237
+ return result
238
+
239
+ def keys(self):
240
+ return list(self.data.keys())
241
+
242
+
243
+ def width(string):
244
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
245
+
246
+
247
+ def default_cluster(word):
248
+ unicode_names = [unicodedata.name(char) for char in word]
249
+ for i in reversed(range(len(unicode_names))):
250
+ if unicode_names[i].startswith('DIGIT'): # 1
251
+ unicode_names[i] = 'Number' # 'DIGIT'
252
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH')
253
+ or unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
254
+ # 明 / 郎
255
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
256
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER')
257
+ or unicode_names[i].startswith('LATIN SMALL LETTER')):
258
+ # A / a
259
+ unicode_names[i] = 'English' # 'LATIN LETTER'
260
+ elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
261
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
262
+ elif (unicode_names[i].startswith('AMPERSAND')
263
+ or unicode_names[i].startswith('APOSTROPHE')
264
+ or unicode_names[i].startswith('COMMERCIAL AT')
265
+ or unicode_names[i].startswith('DEGREE CELSIUS')
266
+ or unicode_names[i].startswith('EQUALS SIGN')
267
+ or unicode_names[i].startswith('FULL STOP')
268
+ or unicode_names[i].startswith('HYPHEN-MINUS')
269
+ or unicode_names[i].startswith('LOW LINE')
270
+ or unicode_names[i].startswith('NUMBER SIGN')
271
+ or unicode_names[i].startswith('PLUS SIGN')
272
+ or unicode_names[i].startswith('SEMICOLON')):
273
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
274
+ del unicode_names[i]
275
+ else:
276
+ return 'Other'
277
+ if len(unicode_names) == 0:
278
+ return 'Other'
279
+ if len(unicode_names) == 1:
280
+ return unicode_names[0]
281
+ for i in range(len(unicode_names) - 1):
282
+ if unicode_names[i] != unicode_names[i + 1]:
283
+ return 'Other'
284
+ return unicode_names[0]
285
+
286
+
287
+ def usage():
288
+ print(
289
+ "compute-wer.py : compute word error rate (WER) and align recognition results and references."
290
+ )
291
+ print(
292
+ " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
293
+ )
294
+
295
+
296
+ if __name__ == '__main__':
297
+ if len(sys.argv) == 1:
298
+ usage()
299
+ sys.exit(0)
300
+ calculator = Calculator()
301
+ cluster_file = ''
302
+ ignore_words = set()
303
+ tochar = False
304
+ verbose = 1
305
+ padding_symbol = ' '
306
+ case_sensitive = False
307
+ max_words_per_line = sys.maxsize
308
+ split = None
309
+ while len(sys.argv) > 3:
310
+ a = '--maxw='
311
+ if sys.argv[1].startswith(a):
312
+ b = sys.argv[1][len(a):]
313
+ del sys.argv[1]
314
+ max_words_per_line = int(b)
315
+ continue
316
+ a = '--rt='
317
+ if sys.argv[1].startswith(a):
318
+ b = sys.argv[1][len(a):].lower()
319
+ del sys.argv[1]
320
+ remove_tag = (b == 'true') or (b != '0')
321
+ continue
322
+ a = '--cs='
323
+ if sys.argv[1].startswith(a):
324
+ b = sys.argv[1][len(a):].lower()
325
+ del sys.argv[1]
326
+ case_sensitive = (b == 'true') or (b != '0')
327
+ continue
328
+ a = '--cluster='
329
+ if sys.argv[1].startswith(a):
330
+ cluster_file = sys.argv[1][len(a):]
331
+ del sys.argv[1]
332
+ continue
333
+ a = '--splitfile='
334
+ if sys.argv[1].startswith(a):
335
+ split_file = sys.argv[1][len(a):]
336
+ del sys.argv[1]
337
+ split = dict()
338
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
339
+ for line in fh: # line in unicode
340
+ words = line.strip().split()
341
+ if len(words) >= 2:
342
+ split[words[0]] = words[1:]
343
+ continue
344
+ a = '--ig='
345
+ if sys.argv[1].startswith(a):
346
+ ignore_file = sys.argv[1][len(a):]
347
+ del sys.argv[1]
348
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
349
+ for line in fh: # line in unicode
350
+ line = line.strip()
351
+ if len(line) > 0:
352
+ ignore_words.add(line)
353
+ continue
354
+ a = '--char='
355
+ if sys.argv[1].startswith(a):
356
+ b = sys.argv[1][len(a):].lower()
357
+ del sys.argv[1]
358
+ tochar = (b == 'true') or (b != '0')
359
+ continue
360
+ a = '--v='
361
+ if sys.argv[1].startswith(a):
362
+ b = sys.argv[1][len(a):].lower()
363
+ del sys.argv[1]
364
+ verbose = 0
365
+ try:
366
+ verbose = int(b)
367
+ except Exception:
368
+ if b == 'true' or b != '0':
369
+ verbose = 1
370
+ continue
371
+ a = '--padding-symbol='
372
+ if sys.argv[1].startswith(a):
373
+ b = sys.argv[1][len(a):].lower()
374
+ del sys.argv[1]
375
+ if b == 'space':
376
+ padding_symbol = ' '
377
+ elif b == 'underline':
378
+ padding_symbol = '_'
379
+ continue
380
+ if True or sys.argv[1].startswith('-'):
381
+ #ignore invalid switch
382
+ del sys.argv[1]
383
+ continue
384
+
385
+ if not case_sensitive:
386
+ ig = set([w.upper() for w in ignore_words])
387
+ ignore_words = ig
388
+
389
+ default_clusters = {}
390
+ default_words = {}
391
+
392
+ ref_file = sys.argv[1]
393
+ hyp_file = sys.argv[2]
394
+ rec_set = {}
395
+ if split and not case_sensitive:
396
+ newsplit = dict()
397
+ for w in split:
398
+ words = split[w]
399
+ for i in range(len(words)):
400
+ words[i] = words[i].upper()
401
+ newsplit[w.upper()] = words
402
+ split = newsplit
403
+
404
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
405
+ for line in fh:
406
+ if tochar:
407
+ array = characterize(line)
408
+ else:
409
+ array = line.strip().split()
410
+ if len(array) == 0: continue
411
+ fid = array[0]
412
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
413
+ split)
414
+
415
+ # compute error rate on the interaction of reference file and hyp file
416
+ for line in open(ref_file, 'r', encoding='utf-8'):
417
+ if tochar:
418
+ array = characterize(line)
419
+ else:
420
+ array = line.rstrip('\n').split()
421
+ if len(array) == 0: continue
422
+ fid = array[0]
423
+ if fid not in rec_set:
424
+ continue
425
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
426
+ rec = rec_set[fid]
427
+ if verbose:
428
+ print('\nutt: %s' % fid)
429
+
430
+ for word in rec + lab:
431
+ if word not in default_words:
432
+ default_cluster_name = default_cluster(word)
433
+ if default_cluster_name not in default_clusters:
434
+ default_clusters[default_cluster_name] = {}
435
+ if word not in default_clusters[default_cluster_name]:
436
+ default_clusters[default_cluster_name][word] = 1
437
+ default_words[word] = default_cluster_name
438
+
439
+ result = calculator.calculate(lab, rec)
440
+ if verbose:
441
+ if result['all'] != 0:
442
+ wer = float(result['ins'] + result['sub'] +
443
+ result['del']) * 100.0 / result['all']
444
+ else:
445
+ wer = 0.0
446
+ print('WER: %4.2f %%' % wer, end=' ')
447
+ print('N=%d C=%d S=%d D=%d I=%d' %
448
+ (result['all'], result['cor'], result['sub'], result['del'],
449
+ result['ins']))
450
+ space = {}
451
+ space['lab'] = []
452
+ space['rec'] = []
453
+ for idx in range(len(result['lab'])):
454
+ len_lab = width(result['lab'][idx])
455
+ len_rec = width(result['rec'][idx])
456
+ length = max(len_lab, len_rec)
457
+ space['lab'].append(length - len_lab)
458
+ space['rec'].append(length - len_rec)
459
+ upper_lab = len(result['lab'])
460
+ upper_rec = len(result['rec'])
461
+ lab1, rec1 = 0, 0
462
+ while lab1 < upper_lab or rec1 < upper_rec:
463
+ if verbose > 1:
464
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
465
+ else:
466
+ print('lab:', end=' ')
467
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
468
+ for idx in range(lab1, lab2):
469
+ token = result['lab'][idx]
470
+ print('{token}'.format(token=token), end='')
471
+ for n in range(space['lab'][idx]):
472
+ print(padding_symbol, end='')
473
+ print(' ', end='')
474
+ print()
475
+ if verbose > 1:
476
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
477
+ else:
478
+ print('rec:', end=' ')
479
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
480
+ for idx in range(rec1, rec2):
481
+ token = result['rec'][idx]
482
+ print('{token}'.format(token=token), end='')
483
+ for n in range(space['rec'][idx]):
484
+ print(padding_symbol, end='')
485
+ print(' ', end='')
486
+ print('\n', end='\n')
487
+ lab1 = lab2
488
+ rec1 = rec2
489
+
490
+ if verbose:
491
+ print(
492
+ '==========================================================================='
493
+ )
494
+ print()
495
+
496
+ result = calculator.overall()
497
+ if result['all'] != 0:
498
+ wer = float(result['ins'] + result['sub'] +
499
+ result['del']) * 100.0 / result['all']
500
+ else:
501
+ wer = 0.0
502
+ print('Overall -> %4.2f %%' % wer, end=' ')
503
+ print('N=%d C=%d S=%d D=%d I=%d' %
504
+ (result['all'], result['cor'], result['sub'], result['del'],
505
+ result['ins']))
506
+ if not verbose:
507
+ print()
508
+
509
+ if verbose:
510
+ for cluster_id in default_clusters:
511
+ result = calculator.cluster(
512
+ [k for k in default_clusters[cluster_id]])
513
+ if result['all'] != 0:
514
+ wer = float(result['ins'] + result['sub'] +
515
+ result['del']) * 100.0 / result['all']
516
+ else:
517
+ wer = 0.0
518
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
519
+ print('N=%d C=%d S=%d D=%d I=%d' %
520
+ (result['all'], result['cor'], result['sub'], result['del'],
521
+ result['ins']))
522
+ if len(cluster_file) > 0: # compute separated WERs for word clusters
523
+ cluster_id = ''
524
+ cluster = []
525
+ for line in open(cluster_file, 'r', encoding='utf-8'):
526
+ for token in line.rstrip('\n').split():
527
+ # end of cluster reached, like </Keyword>
528
+ if token[0:2] == '</' and token[len(token)-1] == '>' and \
529
+ token.lstrip('</').rstrip('>') == cluster_id :
530
+ result = calculator.cluster(cluster)
531
+ if result['all'] != 0:
532
+ wer = float(result['ins'] + result['sub'] +
533
+ result['del']) * 100.0 / result['all']
534
+ else:
535
+ wer = 0.0
536
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
537
+ print('N=%d C=%d S=%d D=%d I=%d' %
538
+ (result['all'], result['cor'], result['sub'],
539
+ result['del'], result['ins']))
540
+ cluster_id = ''
541
+ cluster = []
542
+ # begin of cluster reached, like <Keyword>
543
+ elif token[0] == '<' and token[len(token)-1] == '>' and \
544
+ cluster_id == '' :
545
+ cluster_id = token.lstrip('<').rstrip('>')
546
+ cluster = []
547
+ # general terms, like WEATHER / CAR / ...
548
+ else:
549
+ cluster.append(token)
550
+ print()
551
+ print(
552
+ '==========================================================================='
553
+ )
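Both error-rate scripts read Kaldi-style transcript files in which each line starts with an utterance id followed by the tokens, and only utterance ids present in both the reference and the hypothesis file are scored. A hypothetical helper for producing such files (names and contents are illustrative):

def write_trn(path, utterances):
    # utterances: dict mapping utterance id -> transcript string
    with open(path, "w", encoding="utf-8") as f:
        for utt_id, text in utterances.items():
            f.write(f"{utt_id} {text}\n")


write_trn("test.ref", {"utt1": "good morning every one"})
write_trn("test.hyp", {"utt1": "good morning everyone"})
# then, e.g.: python evaluation/compute-wer.py --v=1 test.ref test.hyp > test.wer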
evaluation/evaluate_asr.py ADDED
@@ -0,0 +1,379 @@
1
+ import argparse
2
+ import itertools
3
+ import json
4
+ import os
5
+ import random
6
+ import sys
7
+ import uuid
8
+ from datetime import timedelta
9
+ from functools import partial
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import tqdm
14
+ from datasets import load_dataset
15
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
16
+ from transformers.generation import GenerationConfig
17
+
18
+ import torchaudio
19
+ from vita_audio.data.processor.audio_processor import add_audio_input_contiguous
20
+ from vita_audio.tokenizer import get_audio_tokenizer
21
+
22
+
23
+ def collate_fn(batches):
24
+ input_ids = [sample["input_ids"] for sample in batches]
25
+ audios = [sample["audios"] for sample in batches]
26
+ audio_indices = [sample["audio_indices"] for sample in batches]
27
+
28
+ refs = [sample["ref"] for sample in batches]
29
+
30
+ return input_ids, audios, audio_indices, refs
31
+
32
+
33
+ class ASRDataset(torch.utils.data.Dataset):
34
+ def __init__(
35
+ self,
36
+ json_path,
37
+ tokenizer,
38
+ audio_tokenizer,
39
+ default_system_message=None,
40
+ add_generation_prompt=True,
41
+ ):
42
+ data = load_dataset("json", data_files=json_path, keep_in_memory=False)
43
+ self.data = data["train"]
44
+
45
+ self.tokenizer = tokenizer
46
+ self.add_generation_prompt = add_generation_prompt
47
+
48
+ self.audio_tokenizer = audio_tokenizer
49
+ self.default_system_message = default_system_message
50
+
51
+ def __len__(self):
52
+ return len(self.data)
53
+
54
+ def __getitem__(self, idx):
55
+ sample = self.data[idx]
56
+ # print(f"sample {sample}")
57
+
58
+ audio_path = sample["audios"][0]
59
+
60
+ if self.audio_tokenizer.apply_to_role("user", is_discrete=True):
61
+ # discrete codec
62
+ audio_tokens = self.audio_tokenizer.encode(audio_path)
63
+ audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
64
+ else:
65
+ audio_tokens = None
66
+
67
+ messages = []
68
+ if len(sample["messages"]) == 2:
69
+ assert len(sample["messages"]) == 2
70
+ assert sample["messages"][0]["role"] == "user"
71
+ assert sample["messages"][1]["role"] == "assistant"
72
+
73
+ if self.default_system_message is not None:
74
+ messages = self.default_system_message + messages
75
+
76
+ elif len(sample["messages"]) == 3:
77
+ assert len(sample["messages"]) == 3
78
+ assert sample["messages"][0]["role"] == "system"
79
+ assert sample["messages"][1]["role"] == "user"
80
+ assert sample["messages"][2]["role"] == "assistant"
81
+
82
+ else:
83
+ raise NotImplementedError
84
+
85
+ # print(sample)
86
+ for conv in sample["messages"][:-1]:
87
+ new_conv = {}
88
+ new_conv["role"] = conv["role"]
89
+
90
+ content = conv["content"]
91
+
92
+ if audio_tokens is not None:
93
+ content = content.replace(
94
+ "<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
95
+ )
96
+
97
+ new_conv["content"] = content
98
+ messages.append(new_conv)
99
+
100
+ input_ids = self.tokenizer.apply_chat_template(
101
+ messages,
102
+ tokenize=True,
103
+ add_generation_prompt=self.add_generation_prompt,
104
+ # return_tensors="pt",
105
+ )
106
+
107
+ ref = sample["messages"][-1]["content"]
108
+
109
+ if self.audio_tokenizer.apply_to_role("user", is_contiguous=True):
110
+ # contiguous codec
111
+ input_ids, audios, audio_indices = add_audio_input_contiguous(
112
+ input_ids, [audio_path], self.tokenizer, self.audio_tokenizer
113
+ )
114
+ else:
115
+ audios = None
116
+ audio_indices = None
117
+
118
+ input_ids = torch.tensor([input_ids], dtype=torch.long)
119
+ return {
120
+ "input_ids": input_ids,
121
+ "audios": audios,
122
+ "audio_indices": audio_indices,
123
+ "ref": ref,
124
+ }
125
+
126
+
127
+ class InferenceSampler(torch.utils.data.sampler.Sampler):
128
+ def __init__(self, size):
129
+ self._size = int(size)
130
+ assert size > 0
131
+ self._rank = torch.distributed.get_rank()
132
+ self._world_size = torch.distributed.get_world_size()
133
+ self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
134
+
135
+ @staticmethod
136
+ def _get_local_indices(total_size, world_size, rank):
137
+ shard_size = total_size // world_size
138
+ left = total_size % world_size
139
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
140
+
141
+ begin = sum(shard_sizes[:rank])
142
+ end = min(sum(shard_sizes[: rank + 1]), total_size)
143
+ return range(begin, end)
144
+
145
+ def __iter__(self):
146
+ yield from self._local_indices
147
+
148
+ def __len__(self):
149
+ return len(self._local_indices)
150
+
151
+
152
+ def inference(model, tokenizer, audio_tokenizer, dataloader, output_dir):
153
+
154
+ audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
155
+
156
+ outputs = []
157
+
158
+ for _, (batched_input_ids, batched_audios, batched_audio_indices, batched_ref) in enumerate(
159
+ tqdm.tqdm(dataloader)
160
+ ):
161
+
162
+ for input_ids, audios, audio_indices, ref in zip(
163
+ batched_input_ids, batched_audios, batched_audio_indices, batched_ref
164
+ ):
165
+ kwargs = {
166
+ # "temperature": 0.2,
167
+ # "top_p": 0.8,
168
+ # "do_sample": False,
169
+ # "temperature": 1.0,
170
+ "max_new_tokens": max([len(x) for x in batched_ref]) + 10,
171
+ "min_new_tokens": 1,
172
+ }
173
+ if audios is not None:
174
+ kwargs["audios"] = audios
175
+ kwargs["audio_indices"] = audio_indices
176
+
177
+ responses = model.generate(
178
+ input_ids=input_ids.cuda(),
179
+ **kwargs,
180
+ )
181
+
182
+ response = responses[0][len(input_ids[0]) :]
183
+
184
+ text_tokens = []
185
+ audio_tokens = []
186
+ for token_id in response:
187
+ if token_id >= audio_offset:
188
+ audio_tokens.append(token_id - audio_offset)
189
+ else:
190
+ text_tokens.append(token_id)
191
+
192
+ hyp = tokenizer.decode(text_tokens, skip_special_tokens=True)
193
+
194
+ outputs.append((hyp, ref))
195
+
196
+ print("")
197
+ print("=" * 100)
198
+ print(f"{hyp=}")
199
+ print(f"{ref=}")
200
+
201
+ return outputs
202
+
203
+
204
+ if __name__ == "__main__":
205
+ parser = argparse.ArgumentParser(
206
+ description="",
207
+ formatter_class=argparse.RawDescriptionHelpFormatter,
208
+ )
209
+
210
+ parser.add_argument("--model_name_or_path", type=str, required=True, help="model_name_or_path")
211
+ parser.add_argument(
212
+ "--audio_tokenizer_path", type=str, required=True, help="audio_tokenizer_path"
213
+ )
214
+ parser.add_argument(
215
+ "--audio_tokenizer_type", type=str, required=True, help="audio_tokenizer_type"
216
+ )
217
+ parser.add_argument("--flow_path", type=str, required=True, help="flow_path")
218
+
219
+ parser.add_argument("--json_path", type=str, required=True, help="json_path")
220
+ parser.add_argument("--output_dir", type=str, required=True, help="output_dir")
221
+
222
+ parser.add_argument("--batch_size", type=int, default=1)
223
+ parser.add_argument("--num_workers", type=int, default=0)
224
+
225
+ args = parser.parse_args()
226
+
227
+ print(f"{args=}")
228
+
229
+ torch.distributed.init_process_group(
230
+ backend="nccl",
231
+ world_size=int(os.getenv("WORLD_SIZE", "1")),
232
+ rank=int(os.getenv("RANK", "0")),
233
+ timeout=timedelta(seconds=7200),
234
+ )
235
+
236
+ torch.cuda.set_device(int(os.getenv("LOCAL_RANK", 0)))
237
+
238
+ random.seed(42)
239
+ torch.manual_seed(42)
240
+
241
+ config = AutoConfig.from_pretrained(
242
+ args.model_name_or_path,
243
+ trust_remote_code=True,
244
+ )
245
+
246
+ # ================================================================
247
+ if "glm" in config.model_type.lower():
248
+ from get_chat_template import glm4_chat_template as chat_template
249
+
250
+ add_generation_prompt = True
251
+
252
+ default_system_message = [
253
+ {
254
+ "role": "system",
255
+ "content": "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens.",
256
+ }
257
+ ]
258
+
259
+ if "qwen2" in config.model_type.lower():
260
+ from get_chat_template import qwen2_chat_template as chat_template
261
+
262
+ add_generation_prompt = True
263
+
264
+ default_system_message = []
265
+
266
+ if "hunyuan" in config.model_type.lower():
267
+ from get_chat_template import hunyuan_chat_template as chat_template
268
+
269
+ add_generation_prompt = False
270
+
271
+ default_system_message = [
272
+ {
273
+ "role": "system",
274
+ "content": "You are a helpful AI assistant.",
275
+ }
276
+ ]
277
+
278
+ # ================================================================
279
+ print("Loading model")
280
+ # device_map = "auto"
281
+ device_map = "cuda"
282
+ # torch_dtype=torch.float16
283
+ torch_dtype = torch.bfloat16
284
+
285
+ rank = torch.distributed.get_rank()
286
+
287
+ audio_tokenizer = get_audio_tokenizer(
288
+ args.audio_tokenizer_path, args.audio_tokenizer_type, flow_path=args.flow_path, rank=rank
289
+ )
290
+
291
+ tokenizer = AutoTokenizer.from_pretrained(
292
+ args.model_name_or_path,
293
+ trust_remote_code=True,
294
+ chat_template=chat_template,
295
+ )
296
+ # print("tokenizer", tokenizer)
297
+
298
+ model = AutoModelForCausalLM.from_pretrained(
299
+ args.model_name_or_path,
300
+ trust_remote_code=True,
301
+ device_map=device_map,
302
+ torch_dtype=torch_dtype,
303
+ attn_implementation="flash_attention_2",
304
+ ).eval()
305
+ # print("model", model)
306
+
307
+ model.generation_config = GenerationConfig.from_pretrained(
308
+ args.model_name_or_path, trust_remote_code=True
309
+ )
310
+
311
+ model.generation_config.max_new_tokens = 4096
312
+ model.generation_config.chat_format = "chatml"
313
+ model.generation_config.max_window_size = 8192
314
+ model.generation_config.use_cache = True
315
+ model.generation_config.do_sample = False
316
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
317
+ if model.config.model_type == "hunyuan":
318
+ model.generation_config.eos_token_id = tokenizer.eos_id
319
+
320
+ # ================================================================
321
+ print("Loading data")
322
+ dataset = ASRDataset(
323
+ json_path=args.json_path,
324
+ tokenizer=tokenizer,
325
+ audio_tokenizer=audio_tokenizer,
326
+ default_system_message=default_system_message,
327
+ add_generation_prompt=add_generation_prompt,
328
+ )
329
+
330
+ dataloader = torch.utils.data.DataLoader(
331
+ dataset=dataset,
332
+ sampler=InferenceSampler(len(dataset)),
333
+ batch_size=args.batch_size,
334
+ num_workers=args.num_workers,
335
+ pin_memory=True,
336
+ drop_last=False,
337
+ collate_fn=partial(
338
+ collate_fn,
339
+ ),
340
+ )
341
+
342
+ # ================================================================
343
+ outputs = inference(model, tokenizer, audio_tokenizer, dataloader, args.output_dir)
344
+
345
+ torch.distributed.barrier()
346
+
347
+ world_size = torch.distributed.get_world_size()
348
+ merged_outputs = [None for _ in range(world_size)]
349
+ torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
350
+
351
+ merged_outputs = [json.loads(_) for _ in merged_outputs]
352
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
353
+
354
+ if torch.distributed.get_rank() == 0:
355
+ # json_name = Path("_".join(os.path.normpath(args.json_path).split(os.sep)[-2:])).stem
356
+ json_name = Path(os.path.normpath(args.json_path).split(os.sep)[-1]).stem
357
+ hyp_path = os.path.join(args.output_dir, f"{json_name}_hyp.txt")
358
+ ref_path = os.path.join(args.output_dir, f"{json_name}_ref.txt")
359
+
360
+ os.makedirs(os.path.dirname(ref_path), exist_ok=True)
361
+ os.makedirs(os.path.dirname(hyp_path), exist_ok=True)
362
+
363
+ hyp_file = open(hyp_path, "w")
364
+ ref_file = open(ref_path, "w")
365
+
366
+ for sample_idx, (hyp, ref) in enumerate(merged_outputs):
367
+ hyp_file.write(f"{sample_idx} {hyp}" + "\n")
368
+ ref_file.write(f"{sample_idx} {ref}" + "\n")
369
+
370
+ hyp_file.close()
371
+ ref_file.close()
372
+
373
+ hyp_ref_path = os.path.join(args.output_dir, f"{json_name}_hyp_ref.json")
374
+ hyp_ref_file = open(hyp_ref_path, "w")
375
+ json.dump(merged_outputs, hyp_ref_file, indent=4)
376
+ hyp_ref_file.close()
377
+
378
+ torch.distributed.barrier()
379
+ print("Done.")
evaluation/evaluate_libritts.py ADDED
@@ -0,0 +1,384 @@
1
+ import argparse
2
+ import itertools
3
+ import json
4
+ import os
5
+ import random
6
+ import re
7
+ import sys
8
+ import uuid
9
+ from datetime import timedelta
10
+ from functools import partial
11
+ from pathlib import Path
12
+
13
+ import torch
14
+ import tqdm
15
+ from datasets import load_dataset
16
+ from tn.english.normalizer import Normalizer as EnNormalizer
17
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
18
+ from transformers.generation import GenerationConfig
19
+
20
+ import torchaudio
21
+ from vita_audio.tokenizer import get_audio_tokenizer
22
+
23
+
24
+ def collate_fn(batches):
25
+ input_ids = [sample["input_ids"] for sample in batches]
26
+
27
+ refs = [sample["ref"] for sample in batches]
28
+ filenames = [sample["filename"] for sample in batches]
29
+
30
+ return input_ids, refs, filenames
31
+
32
+
33
+ class TTSDataset(torch.utils.data.Dataset):
34
+ def __init__(self, json_path, tokenizer, audio_tokenizer, default_system_message=None, add_generation_prompt=True):
35
+ data = load_dataset("json", data_files=json_path, keep_in_memory=False)
36
+ self.data = data["train"]
37
+
38
+ self.tokenizer = tokenizer
39
+ self.audio_tokenizer = audio_tokenizer
40
+ self.default_system_message = default_system_message
41
+ self.add_generation_prompt = add_generation_prompt
42
+
43
+ def __len__(self):
44
+ return len(self.data)
45
+
46
+ def __getitem__(self, idx):
47
+ sample = self.data[idx]
48
+
49
+ messages = []
50
+
51
+ if self.default_system_message is not None:
52
+ messages = self.default_system_message + messages
53
+
54
+ role = "user"
55
+ content = sample["messages"][0]["content"]
56
+ messages.append(
57
+ {
58
+ "role": role,
59
+ "content": content,
60
+ }
61
+ )
62
+
63
+ input_ids = self.tokenizer.apply_chat_template(
64
+ messages,
65
+ tokenize=True,
66
+ add_generation_prompt=self.add_generation_prompt,
67
+ return_tensors="pt",
68
+ )
69
+
70
+ ref = sample["messages"][0]["content"]
71
+ ref = ref.replace("Convert the text to speech.\n", "")
72
+ ref = ref.strip()
73
+
74
+ filepath = sample["audios"][0]
75
+ filename = os.path.basename(filepath)
76
+
77
+ return {
78
+ "input_ids": input_ids,
79
+ "ref": ref,
80
+ "filename": filename,
81
+ }
82
+
83
+
84
+ class InferenceSampler(torch.utils.data.sampler.Sampler):
85
+ def __init__(self, size):
86
+ self._size = int(size)
87
+ assert size > 0
88
+ self._rank = torch.distributed.get_rank()
89
+ self._world_size = torch.distributed.get_world_size()
90
+ self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
91
+
92
+ @staticmethod
93
+ def _get_local_indices(total_size, world_size, rank):
94
+ shard_size = total_size // world_size
95
+ left = total_size % world_size
96
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
97
+
98
+ begin = sum(shard_sizes[:rank])
99
+ end = min(sum(shard_sizes[: rank + 1]), total_size)
100
+ return range(begin, end)
101
+
102
+ def __iter__(self):
103
+ yield from self._local_indices
104
+
105
+ def __len__(self):
106
+ return len(self._local_indices)
107
+
108
+
109
+ def inference(model, tokenizer, audio_tokenizer, dataloader, output_dir, asr_model):
110
+
111
+ audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
112
+ en_tn_model = EnNormalizer(overwrite_cache=True)
113
+
114
+ outputs = []
115
+
116
+ for _, (
117
+ batched_input_ids,
118
+ batched_ref,
119
+ batched_filename,
120
+ ) in enumerate(tqdm.tqdm(dataloader)):
121
+ for input_ids, ref, filename in zip(
122
+ batched_input_ids, batched_ref, batched_filename
123
+ ):
124
+
125
+ responses = model.generate(
126
+ input_ids=input_ids.cuda(),
127
+ # temperature=0.2,
128
+ # top_p=0.8,
129
+ # do_sample=False,
130
+ # temperature=1.0,
131
+ max_new_tokens=1024,
132
+ min_new_tokens=1,
133
+ )
134
+
135
+ response = responses[0][len(input_ids[0]) :]
136
+
137
+ text_tokens = []
138
+ audio_tokens = []
139
+ for token_id in response:
140
+ if token_id >= audio_offset:
141
+ audio_tokens.append(token_id - audio_offset)
142
+ else:
143
+ text_tokens.append(token_id)
144
+
145
+ if len(audio_tokens) == 0:
146
+ continue
147
+
148
+ tts_speech = audio_tokenizer.decode(audio_tokens)
149
+
150
+ wav_dir = os.path.join(output_dir, "audio")
151
+ wav_path = os.path.join(wav_dir, filename + ".wav")
152
+ os.makedirs(os.path.dirname(wav_path), exist_ok=True)
153
+ torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
154
+
155
+ hyp = asr_model(wav_path, return_timestamps=True)["text"].strip()
156
+
157
+ hyp = en_tn_model.normalize(hyp)
158
+ ref = en_tn_model.normalize(ref)
159
+
160
+ hyp = re.sub(r"\W+", " ", hyp)
161
+ ref = re.sub(r"\W+", " ", ref)
162
+
163
+ outputs.append((hyp, ref))
164
+
165
+ print("")
166
+ print("=" * 100)
167
+ # print(f"{len(input_id)=}")
168
+ # print(f"{len(response)=}")
169
+ print(f"{tokenizer.decode(response, skip_special_tokens=False)}")
170
+ print(f"{filename=}")
171
+
172
+ return outputs
173
+
174
+
175
+ def load_asr_model():
176
+ import torch
177
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
178
+
179
+ rank = torch.distributed.get_rank()
180
+ device = f"cuda:{rank}"
181
+ torch_dtype = torch.float16
182
+
183
+ model_id = "/data/models/openai/whisper-large-v3"
184
+
185
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
186
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
187
+ )
188
+ model.to(device)
189
+
190
+ processor = AutoProcessor.from_pretrained(model_id)
191
+
192
+ pipe = pipeline(
193
+ "automatic-speech-recognition",
194
+ model=model,
195
+ tokenizer=processor.tokenizer,
196
+ feature_extractor=processor.feature_extractor,
197
+ torch_dtype=torch_dtype,
198
+ device=device,
199
+ )
200
+
201
+ return pipe
202
+
203
+
204
+ if __name__ == "__main__":
205
+ parser = argparse.ArgumentParser(
206
+ description="",
207
+ formatter_class=argparse.RawDescriptionHelpFormatter,
208
+ )
209
+
210
+ parser.add_argument("--model_name_or_path", type=str, required=True, help="model_name_or_path")
211
+ parser.add_argument(
212
+ "--audio_tokenizer_path", type=str, required=True, help="audio_tokenizer_path"
213
+ )
214
+ parser.add_argument(
215
+ "--audio_tokenizer_type", type=str, required=True, help="audio_tokenizer_type"
216
+ )
217
+ parser.add_argument("--flow_path", type=str, required=True, help="flow_path")
218
+
219
+ parser.add_argument("--json_path", type=str, required=True, help="json_path")
220
+ parser.add_argument("--output_dir", type=str, required=True, help="output_dir")
221
+
222
+ parser.add_argument("--batch_size", type=int, default=1)
223
+ parser.add_argument("--num_workers", type=int, default=0)
224
+
225
+ parser.add_argument("--speaker_prompt", action=argparse.BooleanOptionalAction, default=False)
226
+
227
+ args = parser.parse_args()
228
+
229
+ print(f"{args=}")
230
+
231
+ torch.distributed.init_process_group(
232
+ backend="nccl",
233
+ world_size=int(os.getenv("WORLD_SIZE", "1")),
234
+ rank=int(os.getenv("RANK", "0")),
235
+ timeout=timedelta(seconds=7200),
236
+ )
237
+
238
+ torch.cuda.set_device(int(os.getenv("LOCAL_RANK", 0)))
239
+
240
+ random.seed(42)
241
+ torch.manual_seed(42)
242
+
243
+ config = AutoConfig.from_pretrained(
244
+ args.model_name_or_path,
245
+ trust_remote_code=True,
246
+ )
247
+
248
+ # ================================================================
249
+ if "glm" in config.model_type.lower():
250
+ from get_chat_template import glm4_chat_template as chat_template
251
+
252
+ add_generation_prompt = True
253
+
254
+ default_system_message = [
255
+ {
256
+ "role": "system",
257
+ "content": "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens.",
258
+ }
259
+ ]
260
+
261
+ if "qwen2" in config.model_type.lower():
262
+ from get_chat_template import qwen2_chat_template as chat_template
263
+
264
+ add_generation_prompt = True
265
+
266
+ default_system_message = []
267
+
268
+ if "hunyuan" in config.model_type.lower():
269
+ from get_chat_template import hunyuan_chat_template as chat_template
270
+
271
+ add_generation_prompt = False
272
+
273
+ default_system_message = [
274
+ {
275
+ "role": "system",
276
+ "content": "You are a helpful AI assistant.",
277
+ }
278
+ ]
279
+
280
+ # ================================================================
281
+ print("Loading model")
282
+ device = "cuda"
283
+ # device_map = "auto"
284
+ device_map = "cuda"
285
+ # torch_dtype=torch.float16
286
+ torch_dtype = torch.bfloat16
287
+
288
+ rank = torch.distributed.get_rank()
289
+
290
+ audio_tokenizer = get_audio_tokenizer(
291
+ args.audio_tokenizer_path, args.audio_tokenizer_type, flow_path=args.flow_path, rank=rank
292
+ )
293
+
294
+ tokenizer = AutoTokenizer.from_pretrained(
295
+ args.model_name_or_path,
296
+ trust_remote_code=True,
297
+ chat_template=chat_template,
298
+ )
299
+ # print("tokenizer", tokenizer)
300
+
301
+ model = AutoModelForCausalLM.from_pretrained(
302
+ args.model_name_or_path,
303
+ trust_remote_code=True,
304
+ device_map=device_map,
305
+ torch_dtype=torch_dtype,
306
+ attn_implementation="flash_attention_2",
307
+ ).eval()
308
+ # print("model", model)
309
+
310
+ model.generation_config = GenerationConfig.from_pretrained(
311
+ args.model_name_or_path, trust_remote_code=True
312
+ )
313
+
314
+ model.generation_config.max_new_tokens = 4096
315
+ model.generation_config.chat_format = "chatml"
316
+ model.generation_config.max_window_size = 8192
317
+ model.generation_config.use_cache = True
318
+ model.generation_config.do_sample = True
319
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
320
+ if model.config.model_type == "hunyuan":
321
+ model.generation_config.eos_token_id = tokenizer.eos_id
322
+
323
+ asr_model = load_asr_model()
324
+
325
+ # ================================================================
326
+ print("Loading data")
327
+ dataset = TTSDataset(
328
+ json_path=args.json_path,
329
+ tokenizer=tokenizer,
330
+ audio_tokenizer=audio_tokenizer,
331
+ default_system_message=default_system_message,
332
+ add_generation_prompt=add_generation_prompt,
333
+ )
334
+
335
+ dataloader = torch.utils.data.DataLoader(
336
+ dataset=dataset,
337
+ sampler=InferenceSampler(len(dataset)),
338
+ batch_size=args.batch_size,
339
+ num_workers=args.num_workers,
340
+ pin_memory=True,
341
+ drop_last=False,
342
+ collate_fn=partial(
343
+ collate_fn,
344
+ ),
345
+ )
346
+
347
+ # ================================================================
348
+ outputs = inference(model, tokenizer, audio_tokenizer, dataloader, args.output_dir, asr_model)
349
+
350
+ torch.distributed.barrier()
351
+
352
+ world_size = torch.distributed.get_world_size()
353
+ merged_outputs = [None for _ in range(world_size)]
354
+ torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
355
+
356
+ merged_outputs = [json.loads(_) for _ in merged_outputs]
357
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
358
+
359
+ if torch.distributed.get_rank() == 0:
360
+ # json_name = Path("_".join(os.path.normpath(args.json_path).split(os.sep)[-2:])).stem
361
+ json_name = Path(os.path.normpath(args.json_path).split(os.sep)[-1]).stem
362
+ hyp_path = os.path.join(args.output_dir, f"{json_name}_hyp.txt")
363
+ ref_path = os.path.join(args.output_dir, f"{json_name}_ref.txt")
364
+
365
+ os.makedirs(os.path.dirname(ref_path), exist_ok=True)
366
+ os.makedirs(os.path.dirname(hyp_path), exist_ok=True)
367
+
368
+ hyp_file = open(hyp_path, "w")
369
+ ref_file = open(ref_path, "w")
370
+
371
+ for sample_idx, (hyp, ref) in enumerate(merged_outputs):
372
+ hyp_file.write(f"{sample_idx} {hyp}" + "\n")
373
+ ref_file.write(f"{sample_idx} {ref}" + "\n")
374
+
375
+ hyp_file.close()
376
+ ref_file.close()
377
+
378
+ hyp_ref_path = os.path.join(args.output_dir, f"{json_name}_hyp_ref.json")
379
+ hyp_ref_file = open(hyp_ref_path, "w")
380
+ json.dump(merged_outputs, hyp_ref_file, indent=4)
381
+ hyp_ref_file.close()
382
+
383
+ torch.distributed.barrier()
384
+ print("Done.")
evaluation/evaluate_seedtts.py ADDED
@@ -0,0 +1,394 @@
1
+ import argparse
2
+ import itertools
3
+ import json
4
+ import os
5
+ import random
6
+ import sys
7
+ import uuid
8
+ from datetime import timedelta
9
+ from functools import partial
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import tqdm
14
+ from datasets import load_dataset
15
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
16
+ from transformers.generation import GenerationConfig
17
+
18
+ import torchaudio
19
+ from vita_audio.tokenizer import get_audio_tokenizer
20
+
21
+
22
+ def collate_fn(batches):
23
+ input_ids = [sample["input_ids"] for sample in batches]
24
+
25
+ refs = [sample["ref"] for sample in batches]
26
+ filenames = [sample["filename"] for sample in batches]
27
+ prompt_audio_path = [sample["prompt_audio_path"] for sample in batches]
28
+
29
+ return input_ids, refs, filenames, prompt_audio_path
30
+
31
+
32
+ class SeedTTSDataset(torch.utils.data.Dataset):
33
+ def __init__(
34
+ self,
35
+ data_path,
36
+ tokenizer,
37
+ audio_tokenizer,
38
+ default_system_message=None,
39
+ speaker_prompt=False,
40
+ add_generation_prompt=True,
41
+ ):
42
+ self.data = []
43
+
44
+ meta_path = os.path.join(data_path, f"seedtts_testset/zh/meta.lst")
45
+ with open(meta_path, "r") as f:
46
+ lines = f.readlines()
47
+
48
+ for line in lines:
49
+ line = line.strip().split("|")
50
+ filename = line[0]
51
+ prompt_text = line[1]
52
+ prompt_audio = line[2]
53
+ text = line[3]
54
+ self.data.append(["zh", filename, prompt_text, prompt_audio, text])
55
+
56
+ meta_path = os.path.join(data_path, f"seedtts_testset/zh/hardcase.lst")
57
+ with open(meta_path, "r") as f:
58
+ lines = f.readlines()
59
+
60
+ for line in lines:
61
+ line = line.strip().split("|")
62
+ filename = line[0]
63
+ prompt_text = line[1]
64
+ prompt_audio = line[2]
65
+ text = line[3]
66
+ self.data.append(["hardcase", filename, prompt_text, prompt_audio, text])
67
+
68
+ meta_path = os.path.join(data_path, f"seedtts_testset/en/meta.lst")
69
+ with open(meta_path, "r") as f:
70
+ lines = f.readlines()
71
+
72
+ for line in lines:
73
+ line = line.strip().split("|")
74
+ filename = line[0]
75
+ prompt_text = line[1]
76
+ prompt_audio = line[2]
77
+ text = line[3]
78
+ self.data.append(["en", filename, prompt_text, prompt_audio, text])
79
+
80
+ self.tokenizer = tokenizer
81
+ self.audio_tokenizer = audio_tokenizer
82
+ self.default_system_message = default_system_message
83
+ self.add_generation_prompt = add_generation_prompt
84
+
85
+ self.data_path = data_path
86
+ self.speaker_prompt = speaker_prompt
87
+
88
+ def __len__(self):
89
+ return len(self.data)
90
+
91
+ def __getitem__(self, idx):
92
+ sample = self.data[idx]
93
+
94
+ split, filename, prompt_text, prompt_audio, text = sample
95
+
96
+ messages = []
97
+
98
+ if self.default_system_message is not None:
99
+ messages = self.default_system_message + messages
100
+
101
+ if self.speaker_prompt:
102
+ if split == "hardcase":
103
+ prompt_audio_path = os.path.join(
104
+ self.data_path, "seedtts_testset", "zh", prompt_audio
105
+ )
106
+ else:
107
+ prompt_audio_path = os.path.join(
108
+ self.data_path, "seedtts_testset", split, prompt_audio
109
+ )
110
+
111
+ if self.audio_tokenizer.apply_to_role("system", is_discrete=True):
112
+ # discrete codec
113
+ prompt_audio_tokens = self.audio_tokenizer.encode(prompt_audio_path)
114
+ prompt_audio_tokens = "".join(f"<|audio_{i}|>" for i in prompt_audio_tokens)
115
+
116
+ prompt_text = f"Speaker Metadata:\nAudio: <|begin_of_audio|>{prompt_audio_tokens}<|end_of_audio|>\n"
117
+
118
+ if len(messages) > 0 and messages[0]["role"] == "system":
119
+ messages[0]["content"] += prompt_text
120
+
121
+ else:
122
+ messages.append(
123
+ {
124
+ "role": "system",
125
+ "content": prompt_text,
126
+ }
127
+ )
128
+ else:
129
+ prompt_audio_path = None
130
+
131
+ role = "user"
132
+ content = "Convert the text to speech.\n" + text
133
+ messages.append(
134
+ {
135
+ "role": role,
136
+ "content": content,
137
+ }
138
+ )
139
+
140
+ input_ids = self.tokenizer.apply_chat_template(
141
+ messages,
142
+ tokenize=True,
143
+ add_generation_prompt=self.add_generation_prompt,
144
+ return_tensors="pt",
145
+ )
146
+
147
+ ref = text
148
+
149
+ return {
150
+ "input_ids": input_ids,
151
+ "ref": ref,
152
+ "filename": split + "/" + filename,
153
+ "prompt_audio_path": prompt_audio_path,
154
+ }
155
+
156
+
157
+ class InferenceSampler(torch.utils.data.sampler.Sampler):
158
+ def __init__(self, size):
159
+ self._size = int(size)
160
+ assert size > 0
161
+ self._rank = torch.distributed.get_rank()
162
+ self._world_size = torch.distributed.get_world_size()
163
+ self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
164
+
165
+ @staticmethod
166
+ def _get_local_indices(total_size, world_size, rank):
167
+ shard_size = total_size // world_size
168
+ left = total_size % world_size
169
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
170
+
171
+ begin = sum(shard_sizes[:rank])
172
+ end = min(sum(shard_sizes[: rank + 1]), total_size)
173
+ return range(begin, end)
174
+
175
+ def __iter__(self):
176
+ yield from self._local_indices
177
+
178
+ def __len__(self):
179
+ return len(self._local_indices)
180
+
181
+
182
+ def inference(model, tokenizer, audio_tokenizer, dataloader, output_dir):
183
+
184
+ audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
185
+
186
+ outputs = []
187
+
188
+ for _, (
189
+ batched_input_ids,
190
+ batched_ref,
191
+ batched_filename,
192
+ batched_prompt_audio_path,
193
+ ) in enumerate(tqdm.tqdm(dataloader)):
194
+
195
+ for input_ids, ref, filename, prompt_audio_path in zip(
196
+ batched_input_ids, batched_ref, batched_filename, batched_prompt_audio_path
197
+ ):
198
+ responses = model.generate(
199
+ input_ids=input_ids.cuda(),
200
+ # temperature=0.2,
201
+ # top_p=0.8,
202
+ # do_sample=False,
203
+ # temperature=1.0,
204
+ max_new_tokens=1024,
205
+ min_new_tokens=1,
206
+ )
207
+
208
+ response = responses[0][len(input_ids[0]) :]
209
+
210
+ text_tokens = []
211
+ audio_tokens = []
212
+ for token_id in response:
213
+ if token_id >= audio_offset:
214
+ audio_tokens.append(token_id - audio_offset)
215
+ else:
216
+ text_tokens.append(token_id)
217
+
218
+ if len(audio_tokens) == 0:
219
+ continue
220
+
221
+ tts_speech = audio_tokenizer.decode(audio_tokens, source_speech_16k=prompt_audio_path)
222
+
223
+ wav_path = os.path.join(output_dir, filename + ".wav")
224
+ os.makedirs(os.path.dirname(wav_path), exist_ok=True)
225
+ torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
226
+
227
+ outputs.append((wav_path, filename))
228
+
229
+ print("")
230
+ print("=" * 100)
231
+ # print(f"{len(input_id)=}")
232
+ # print(f"{len(response)=}")
233
+ print(f"{tokenizer.decode(response, skip_special_tokens=False)}")
234
+ print(f"{filename=}")
235
+
236
+ return outputs
237
+
238
+
239
+ if __name__ == "__main__":
240
+ parser = argparse.ArgumentParser(
241
+ description="",
242
+ formatter_class=argparse.RawDescriptionHelpFormatter,
243
+ )
244
+
245
+ parser.add_argument("--model_name_or_path", type=str, required=True, help="model_name_or_path")
246
+ parser.add_argument(
247
+ "--audio_tokenizer_path", type=str, required=True, help="audio_tokenizer_path"
248
+ )
249
+ parser.add_argument(
250
+ "--audio_tokenizer_type", type=str, required=True, help="audio_tokenizer_type"
251
+ )
252
+ parser.add_argument("--flow_path", type=str, required=True, help="flow_path")
253
+
254
+ parser.add_argument("--data_path", type=str, required=True, help="data_path")
255
+ parser.add_argument("--output_dir", type=str, required=True, help="output_dir")
256
+
257
+ parser.add_argument("--batch_size", type=int, default=1)
258
+ parser.add_argument("--num_workers", type=int, default=0)
259
+
260
+ parser.add_argument("--speaker_prompt", action=argparse.BooleanOptionalAction, default=False)
261
+
262
+ args = parser.parse_args()
263
+
264
+ print(f"{args=}")
265
+
266
+ torch.distributed.init_process_group(
267
+ backend="nccl",
268
+ world_size=int(os.getenv("WORLD_SIZE", "1")),
269
+ rank=int(os.getenv("RANK", "0")),
270
+ timeout=timedelta(seconds=7200),
271
+ )
272
+
273
+ torch.cuda.set_device(int(os.getenv("LOCAL_RANK", 0)))
274
+
275
+ random.seed(42)
276
+ torch.manual_seed(42)
277
+
278
+ config = AutoConfig.from_pretrained(
279
+ args.model_name_or_path,
280
+ trust_remote_code=True,
281
+ )
282
+
283
+ # ================================================================
284
+ if "glm" in config.model_type.lower():
285
+ from get_chat_template import glm4_chat_template as chat_template
286
+
287
+ add_generation_prompt = True
288
+
289
+ default_system_message = [
290
+ {
291
+ "role": "system",
292
+ "content": "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens.",
293
+ }
294
+ ]
295
+
296
+ if "qwen2" in config.model_type.lower():
297
+ from get_chat_template import qwen2_chat_template as chat_template
298
+
299
+ add_generation_prompt = True
300
+
301
+ default_system_message = []
302
+
303
+ if "hunyuan" in config.model_type.lower():
304
+ from get_chat_template import hunyuan_chat_template as chat_template
305
+
306
+ add_generation_prompt = False
307
+
308
+ default_system_message = [
309
+ {
310
+ "role": "system",
311
+ "content": "You are a helpful AI assistant.",
312
+ }
313
+ ]
314
+
315
+ # ================================================================
316
+ print("Loading model")
317
+ device = "cuda"
318
+ # device_map = "auto"
319
+ device_map = "cuda"
320
+ # torch_dtype=torch.float16
321
+ torch_dtype = torch.bfloat16
322
+
323
+ rank = torch.distributed.get_rank()
324
+
325
+ audio_tokenizer = get_audio_tokenizer(
326
+ args.audio_tokenizer_path, args.audio_tokenizer_type, flow_path=args.flow_path, rank=rank
327
+ )
328
+
329
+ tokenizer = AutoTokenizer.from_pretrained(
330
+ args.model_name_or_path,
331
+ trust_remote_code=True,
332
+ chat_template=chat_template,
333
+ )
334
+ # print("tokenizer", tokenizer)
335
+
336
+ model = AutoModelForCausalLM.from_pretrained(
337
+ args.model_name_or_path,
338
+ trust_remote_code=True,
339
+ device_map=device_map,
340
+ torch_dtype=torch_dtype,
341
+ attn_implementation="flash_attention_2",
342
+ ).eval()
343
+ # print("model", model)
344
+
345
+ model.generation_config = GenerationConfig.from_pretrained(
346
+ args.model_name_or_path, trust_remote_code=True
347
+ )
348
+
349
+ model.generation_config.max_new_tokens = 4096
350
+ model.generation_config.chat_format = "chatml"
351
+ model.generation_config.max_window_size = 8192
352
+ model.generation_config.use_cache = True
353
+ model.generation_config.do_sample = True
354
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
355
+ if model.config.model_type == "hunyuan":
356
+ model.generation_config.eos_token_id = tokenizer.eos_id
357
+
358
+ # ================================================================
359
+ print("Loading data")
360
+ dataset = SeedTTSDataset(
361
+ data_path=args.data_path,
362
+ tokenizer=tokenizer,
363
+ audio_tokenizer=audio_tokenizer,
364
+ default_system_message=default_system_message,
365
+ speaker_prompt=args.speaker_prompt,
366
+ add_generation_prompt=add_generation_prompt,
367
+ )
368
+
369
+ dataloader = torch.utils.data.DataLoader(
370
+ dataset=dataset,
371
+ sampler=InferenceSampler(len(dataset)),
372
+ batch_size=args.batch_size,
373
+ num_workers=args.num_workers,
374
+ pin_memory=True,
375
+ drop_last=False,
376
+ collate_fn=partial(
377
+ collate_fn,
378
+ ),
379
+ )
380
+
381
+ # ================================================================
382
+ outputs = inference(model, tokenizer, audio_tokenizer, dataloader, args.output_dir)
383
+
384
+ torch.distributed.barrier()
385
+
386
+ world_size = torch.distributed.get_world_size()
387
+ merged_outputs = [None for _ in range(world_size)]
388
+ torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
389
+
390
+ merged_outputs = [json.loads(_) for _ in merged_outputs]
391
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
392
+
393
+ torch.distributed.barrier()
394
+ print("Done.")
evaluation/evaluate_sqa.py ADDED
@@ -0,0 +1,451 @@
1
+ import argparse
2
+ import itertools
3
+ import json
4
+ import os
5
+ import random
6
+ import sys
7
+ import uuid
8
+ from datetime import timedelta
9
+ from functools import partial
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import tqdm
14
+ from datasets import load_dataset
15
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
16
+ from transformers.generation import GenerationConfig
17
+
18
+ import torchaudio
19
+ from vita_audio.data.processor.audio_processor import add_audio_input_contiguous
20
+ from vita_audio.tokenizer import get_audio_tokenizer
21
+
22
+
23
+ def collate_fn(batches):
24
+ input_ids = [sample["input_ids"] for sample in batches]
25
+ audios = [sample["audios"] for sample in batches]
26
+ audio_indices = [sample["audio_indices"] for sample in batches]
27
+
28
+ refs = [sample["ref"] for sample in batches]
29
+ filenames = [sample["filename"] for sample in batches]
30
+
31
+ return input_ids, audios, audio_indices, refs, filenames
32
+
33
+
34
+ class STSDataset(torch.utils.data.Dataset):
35
+ def __init__(self, json_path, tokenizer, audio_tokenizer, default_system_message=None, add_generation_prompt=True):
36
+ data = load_dataset("json", data_files=json_path, keep_in_memory=False)
37
+ self.data = data["train"]
38
+
39
+ self.tokenizer = tokenizer
40
+ self.add_generation_prompt = add_generation_prompt
41
+
42
+ self.audio_tokenizer = audio_tokenizer
43
+ self.default_system_message = default_system_message
44
+
45
+ def __len__(self):
46
+ return len(self.data)
47
+
48
+ def __getitem__(self, idx):
49
+ sample = self.data[idx]
50
+
51
+ assert len(sample["audios"]) == 1
52
+
53
+ audio_path = sample["audios"][0]
54
+
55
+ if self.audio_tokenizer.apply_to_role("user", is_discrete=True):
56
+ # discrete codec
57
+ audio_tokens = self.audio_tokenizer.encode(audio_path)
58
+ audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
59
+ else:
60
+ audio_tokens = None
61
+
62
+ messages = []
63
+
64
+ if len(sample["messages"]) == 2:
65
+ assert len(sample["messages"]) == 2
66
+ assert sample["messages"][0]["role"] == "user"
67
+ assert sample["messages"][1]["role"] == "assistant"
68
+
69
+ if self.default_system_message is not None:
70
+ messages = self.default_system_message + messages
71
+
72
+ elif len(sample["messages"]) == 3:
73
+ assert len(sample["messages"]) == 3
74
+ assert sample["messages"][0]["role"] == "system"
75
+ assert sample["messages"][1]["role"] == "user"
76
+ assert sample["messages"][2]["role"] == "assistant"
77
+
78
+ else:
79
+ raise NotImplementedError
80
+
81
+ for conv in sample["messages"][:-1]:
82
+ new_conv = {}
83
+ new_conv["role"] = conv["role"]
84
+
85
+ content = conv["content"]
86
+ if isinstance(content, list):
87
+ assert len(content) == 1
88
+ content = content[0]
89
+
90
+ if audio_tokens is not None:
91
+ content = content.replace(
92
+ "<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
93
+ )
94
+
95
+ new_conv["content"] = content
96
+ messages.append(new_conv)
97
+
98
+ input_ids = self.tokenizer.apply_chat_template(
99
+ messages,
100
+ tokenize=True,
101
+ add_generation_prompt=self.add_generation_prompt,
102
+ # return_tensors="pt",
103
+ )
104
+
105
+ ref = sample["messages"][-1]["content"]
106
+
107
+ if self.audio_tokenizer.apply_to_role("user", is_contiguous=True):
108
+ # contiguous codec
109
+ input_ids, audios, audio_indices = add_audio_input_contiguous(
110
+ input_ids, [audio_path], self.tokenizer, self.audio_tokenizer
111
+ )
112
+ else:
113
+ audios = None
114
+ audio_indices = None
115
+
116
+ input_ids = torch.tensor([input_ids], dtype=torch.long)
117
+
118
+ filename = os.path.basename(audio_path)
119
+ filename = os.path.splitext(filename)[0]
120
+
121
+ return {
122
+ "input_ids": input_ids,
123
+ "audios": audios,
124
+ "audio_indices": audio_indices,
125
+ "ref": ref,
126
+ "filename": filename,
127
+ }
128
+
129
+
130
+ class InferenceSampler(torch.utils.data.sampler.Sampler):
131
+ def __init__(self, size):
132
+ self._size = int(size)
133
+ assert size > 0
134
+ self._rank = torch.distributed.get_rank()
135
+ self._world_size = torch.distributed.get_world_size()
136
+ self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
137
+
138
+ @staticmethod
139
+ def _get_local_indices(total_size, world_size, rank):
140
+ shard_size = total_size // world_size
141
+ left = total_size % world_size
142
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
143
+
144
+ begin = sum(shard_sizes[:rank])
145
+ end = min(sum(shard_sizes[: rank + 1]), total_size)
146
+ return range(begin, end)
147
+
148
+ def __iter__(self):
149
+ yield from self._local_indices
150
+
151
+ def __len__(self):
152
+ return len(self._local_indices)
153
+
154
+
155
+ def inference(model, tokenizer, audio_tokenizer, dataloader, output_dir, asr_model):
156
+
157
+ audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
158
+
159
+ outputs = []
160
+
161
+ for _, (batched_input_ids, batched_audios, batched_audio_indices, batched_ref, batched_filename) in enumerate(
162
+ tqdm.tqdm(dataloader)
163
+ ):
164
+ for input_ids, audios, audio_indices, ref, filename in zip(
165
+ batched_input_ids, batched_audios, batched_audio_indices, batched_ref, batched_filename
166
+ ):
167
+
168
+ responses = model.generate(
169
+ input_ids=input_ids.cuda(),
170
+ audios=audios,
171
+ audio_indices=audio_indices,
172
+ # temperature=0.2,
173
+ # top_p=0.8,
174
+ # do_sample=False,
175
+ # temperature=1.0,
176
+ max_new_tokens=1024,
177
+ min_new_tokens=1,
178
+ )
179
+
180
+ response = responses[0][len(input_ids[0]) :]
181
+
182
+ text_tokens = []
183
+ audio_tokens = []
184
+ for token_id in response:
185
+ if token_id >= audio_offset:
186
+ audio_tokens.append(token_id - audio_offset)
187
+ else:
188
+ text_tokens.append(token_id)
189
+
190
+ hyp_text = tokenizer.decode(text_tokens, skip_special_tokens=True)
191
+
192
+ if len(audio_tokens) == 0:
193
+ continue
194
+
195
+ tts_speech = audio_tokenizer.decode(audio_tokens)
196
+
197
+ wav_dir = os.path.join(output_dir, "audio")
198
+ wav_path = os.path.join(wav_dir, filename + ".wav")
199
+ os.makedirs(os.path.dirname(wav_path), exist_ok=True)
200
+ torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
201
+
202
+ # hyp_speech = asr_model.transcribe(wav_path)["text"].strip()
203
+ hyp_speech = asr_model(wav_path, return_timestamps=True)["text"].strip()
204
+ # hyp_speech = ""
205
+
206
+ outputs.append((hyp_text, hyp_speech, ref))
207
+
208
+ print("")
209
+ print("=" * 100)
210
+ print(f"{tokenizer.decode(response, skip_special_tokens=False)}")
211
+ print(f" {hyp_text=}")
212
+ print(f"{hyp_speech=}")
213
+ print(f" {ref=}")
214
+ print(f"{filename=}")
215
+
216
+ return outputs
217
+
218
+
219
+ def load_asr_model():
220
+ import torch
221
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
222
+
223
+ rank = torch.distributed.get_rank()
224
+ device = f"cuda:{rank}"
225
+ torch_dtype = torch.float16
226
+
227
+ model_id = "/data/models/openai/whisper-large-v3"
228
+
229
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
230
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
231
+ )
232
+ model.to(device)
233
+
234
+ processor = AutoProcessor.from_pretrained(model_id)
235
+
236
+ pipe = pipeline(
237
+ "automatic-speech-recognition",
238
+ model=model,
239
+ tokenizer=processor.tokenizer,
240
+ feature_extractor=processor.feature_extractor,
241
+ torch_dtype=torch_dtype,
242
+ device=device,
243
+ )
244
+
245
+ return pipe
246
+
247
+
248
+ if __name__ == "__main__":
249
+ parser = argparse.ArgumentParser(
250
+ description="",
251
+ formatter_class=argparse.RawDescriptionHelpFormatter,
252
+ )
253
+
254
+ parser.add_argument("--model_name_or_path", type=str, required=True, help="model_name_or_path")
255
+ parser.add_argument(
256
+ "--audio_tokenizer_path", type=str, required=True, help="audio_tokenizer_path"
257
+ )
258
+ parser.add_argument(
259
+ "--audio_tokenizer_type", type=str, required=True, help="audio_tokenizer_type"
260
+ )
261
+ parser.add_argument("--flow_path", type=str, required=True, help="flow_path")
262
+
263
+ parser.add_argument("--json_path", type=str, required=True, help="json_path")
264
+ parser.add_argument("--output_dir", type=str, required=True, help="output_dir")
265
+
266
+ parser.add_argument("--batch_size", type=int, default=1)
267
+ parser.add_argument("--num_workers", type=int, default=0)
268
+
269
+ args = parser.parse_args()
270
+
271
+ print(f"{args=}")
272
+
273
+ torch.distributed.init_process_group(
274
+ backend="nccl",
275
+ world_size=int(os.getenv("WORLD_SIZE", "1")),
276
+ rank=int(os.getenv("RANK", "0")),
277
+ timeout=timedelta(seconds=7200),
278
+ )
279
+
280
+ torch.cuda.set_device(int(os.getenv("LOCAL_RANK", 0)))
281
+
282
+ random.seed(42)
283
+ torch.manual_seed(42)
284
+
285
+ config = AutoConfig.from_pretrained(
286
+ args.model_name_or_path,
287
+ trust_remote_code=True,
288
+ )
289
+
290
+ # ================================================================
291
+ if "glm" in config.model_type.lower():
292
+ from get_chat_template import glm4_chat_template as chat_template
293
+
294
+ add_generation_prompt = True
295
+
296
+ default_system_message = [
297
+ {
298
+ "role": "system",
299
+ "content": "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens.",
300
+ }
301
+ ]
302
+
303
+ if "qwen2" in config.model_type.lower():
304
+ from get_chat_template import qwen2_chat_template as chat_template
305
+
306
+ add_generation_prompt = True
307
+
308
+ default_system_message = []
309
+
310
+ if "hunyuan" in config.model_type.lower():
311
+ from get_chat_template import hunyuan_chat_template as chat_template
312
+
313
+ add_generation_prompt = False
314
+
315
+ default_system_message = [
316
+ {
317
+ "role": "system",
318
+ "content": "You are a helpful AI assistant.",
319
+ }
320
+ ]
321
+
322
+ default_system_message = [
323
+ {
324
+ "role": "system",
325
+ # "content": "Your Name: Luke\nYour Gender: male\nRespond in a text-audio interleaved manner.",
326
+ # "content": "Your Name: Lucy\nYour Gender: female\nRespond in a text-audio interleaved manner.",
327
+ "content": "Your Name: Omni\nYour Gender: female\nRespond in a text-audio interleaved manner.",
328
+ },
329
+ ]
330
+
331
+ # ================================================================
332
+ print("Loading model")
333
+ device = "cuda"
334
+ # device_map = "auto"
335
+ device_map = "cuda"
336
+ # torch_dtype=torch.float16
337
+ torch_dtype = torch.bfloat16
338
+
339
+ rank = torch.distributed.get_rank()
340
+
341
+ audio_tokenizer = get_audio_tokenizer(
342
+ args.audio_tokenizer_path, args.audio_tokenizer_type, flow_path=args.flow_path, rank=rank
343
+ )
344
+
345
+ tokenizer = AutoTokenizer.from_pretrained(
346
+ args.model_name_or_path,
347
+ trust_remote_code=True,
348
+ chat_template=chat_template,
349
+ )
350
+ # print("tokenizer", tokenizer)
351
+
352
+ model = AutoModelForCausalLM.from_pretrained(
353
+ args.model_name_or_path,
354
+ trust_remote_code=True,
355
+ device_map=device_map,
356
+ torch_dtype=torch_dtype,
357
+ attn_implementation="flash_attention_2",
358
+ ).eval()
359
+ # print("model", model)
360
+
361
+ model.generation_config = GenerationConfig.from_pretrained(
362
+ args.model_name_or_path, trust_remote_code=True
363
+ )
364
+
365
+ model.generation_config.max_new_tokens = 4096
366
+ model.generation_config.chat_format = "chatml"
367
+ model.generation_config.max_window_size = 8192
368
+ model.generation_config.use_cache = True
369
+ model.generation_config.do_sample = False
370
+ model.generation_config.temperature = None
371
+ model.generation_config.top_p = None
372
+ model.generation_config.top_k = None
373
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
374
+ if model.config.model_type == "hunyuan":
375
+ model.generation_config.eos_token_id = tokenizer.eos_id
376
+
377
+ asr_model = load_asr_model()
378
+
379
+ # ================================================================
380
+ print("Loading data")
381
+ dataset = STSDataset(
382
+ json_path=args.json_path,
383
+ tokenizer=tokenizer,
384
+ audio_tokenizer=audio_tokenizer,
385
+ default_system_message=default_system_message,
386
+ add_generation_prompt=add_generation_prompt,
387
+ )
388
+
389
+ dataloader = torch.utils.data.DataLoader(
390
+ dataset=dataset,
391
+ sampler=InferenceSampler(len(dataset)),
392
+ batch_size=args.batch_size,
393
+ num_workers=args.num_workers,
394
+ pin_memory=True,
395
+ drop_last=False,
396
+ collate_fn=partial(
397
+ collate_fn,
398
+ ),
399
+ )
400
+
401
+ # ================================================================
402
+ outputs = inference(model, tokenizer, audio_tokenizer, dataloader, args.output_dir, asr_model)
403
+
404
+ torch.distributed.barrier()
405
+
406
+ world_size = torch.distributed.get_world_size()
407
+ merged_outputs = [None for _ in range(world_size)]
408
+ torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
409
+
410
+ merged_outputs = [json.loads(_) for _ in merged_outputs]
411
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
412
+
413
+ if torch.distributed.get_rank() == 0:
414
+ # json_name = Path("_".join(os.path.normpath(args.json_path).split(os.sep)[-2:])).stem
415
+ json_name = Path(os.path.normpath(args.json_path).split(os.sep)[-1]).stem
416
+ hyp_text_path = os.path.join(args.output_dir, f"{json_name}_hyp_text.txt")
417
+ hyp_speech_path = os.path.join(args.output_dir, f"{json_name}_hyp_speech.txt")
418
+ ref_path = os.path.join(args.output_dir, f"{json_name}_ref.txt")
419
+
420
+ os.makedirs(os.path.dirname(ref_path), exist_ok=True)
421
+ os.makedirs(os.path.dirname(hyp_text_path), exist_ok=True)
422
+ os.makedirs(os.path.dirname(hyp_speech_path), exist_ok=True)
423
+
424
+ hyp_text_file = open(hyp_text_path, "w")
425
+ hyp_speech_file = open(hyp_speech_path, "w")
426
+ ref_file = open(ref_path, "w")
427
+
428
+ for sample_idx, (hyp_text, hyp_speech, ref) in enumerate(merged_outputs):
429
+ hyp_text_file.write(f"{sample_idx} {hyp_text}" + "\n")
430
+ hyp_speech_file.write(f"{sample_idx} {hyp_speech}" + "\n")
431
+ ref_file.write(f"{sample_idx} {ref}" + "\n")
432
+
433
+ hyp_text_file.close()
434
+ hyp_speech_file.close()
435
+ ref_file.close()
436
+
437
+ outputs_speech = [[x[1], x[2]] for x in merged_outputs]
438
+ outputs_text = [[x[0], x[2]] for x in merged_outputs]
439
+
440
+ hyp_ref_path = os.path.join(args.output_dir, f"{json_name}_hyp_ref_text.json")
441
+ hyp_ref_file = open(hyp_ref_path, "w")
442
+ json.dump(outputs_text, hyp_ref_file, indent=4)
443
+ hyp_ref_file.close()
444
+
445
+ hyp_ref_path = os.path.join(args.output_dir, f"{json_name}_hyp_ref_speech.json")
446
+ hyp_ref_file = open(hyp_ref_path, "w")
447
+ json.dump(outputs_speech, hyp_ref_file, indent=4)
448
+ hyp_ref_file.close()
449
+
450
+ torch.distributed.barrier()
451
+ print("Done.")
evaluation/get_chat_template.py ADDED
@@ -0,0 +1,59 @@
1
+ qwen2_chat_template = """
2
+ {%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n
3
+ """
4
+
5
+ qwen3_chat_template = """
6
+ "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false 
%}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
7
+ """
8
+
9
+ hunyuan_chat_template = """
10
+ {% set context = {'has_head': true} %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% if content == '' %}{% set _ = context.update({'has_head': false}) %}{% else %}{% set content = '<|startoftext|>' + content + '<|extra_4|>' %}{% endif %}{% endif %}{% if message['role'] == 'user' %}{% if loop.index0 == 1 and not context.has_head %}{% set content = '<|startoftext|>' + content %}{% endif %}{% if loop.index0 == 1 and context.has_head %}{% set content = content + '<|extra_0|>' %}{% else %}{% set content = '<|startoftext|>' + content + '<|extra_0|>' %}{% endif %}{% elif message['role'] == 'assistant' %}{% set content = content + '<|eos|>' %}{% endif %}{{ content }}{% endfor %}
11
+ """
12
+
13
+ glm4_chat_template = """
14
+ {%- for message in messages %} {%- if (message.role == "system") %} {{- '<|system|>' + '\n' + message.content }} {%- elif (message.role == "user") %} {{- '<|user|>' + '\n' + message.content }} {%- elif message.role == "assistant" %} {{- '<|assistant|>' }} {%- if message.content %} {{- 'streaming_transcription\n' + message.content }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|assistant|>streaming_transcription\n' }} {%- endif %}
15
+ """
16
+
17
+ if __name__ == "__main__":
18
+ from transformers import AutoTokenizer
19
+
20
+ chat = [
21
+ {"role": "system", "content": "You are a helpful assistant."},
22
+ {"role": "user", "content": "Hello, how are you?"},
23
+ {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
24
+ {"role": "user", "content": "I'd like to show off how chat templating works!"},
25
+ ]
26
+
27
+ # print("=" * 100)
28
+ # tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-Nemo-Instruct-2407")
29
+ # print(tokenizer.get_chat_template())
30
+ # message = tokenizer.apply_chat_template(chat, tokenize=False)
31
+ # print(message)
32
+
33
+ print("=" * 100)
34
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
35
+ print(tokenizer.get_chat_template())
36
+ message = tokenizer.apply_chat_template(chat, tokenize=False)
37
+ print(message)
38
+
39
+ print("=" * 100)
40
+ tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3.1-70B-Instruct")
41
+ print(tokenizer.get_chat_template())
42
+ message = tokenizer.apply_chat_template(chat, tokenize=False)
43
+ print(message)
44
+
45
+ print("=" * 100)
46
+ tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
47
+ print(tokenizer.get_chat_template())
48
+ message = tokenizer.apply_chat_template(chat, tokenize=False)
49
+ print(message)
50
+ message = tokenizer.apply_chat_template(chat, tokenize=True)
51
+ print(message)
52
+
53
+ print("=" * 100)
54
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
55
+ print(tokenizer.get_chat_template())
56
+ message = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, enable_thinking=True)
57
+ print(message)
58
+ message = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, enable_thinking=False)
59
+ print(message)
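
Editor's note (not part of the commit): the glm4_chat_template defined above is meant to be attached to a Hugging Face tokenizer as a custom Jinja template. A minimal sketch, assuming the module import path resolves from the repo root and using Qwen/Qwen2.5-7B-Instruct purely as an illustrative checkpoint:

```python
# Minimal sketch: attach the glm4_chat_template defined above to a tokenizer.
# Assumptions: run from the repo root so the import below resolves, and the
# checkpoint name is illustrative only.
from transformers import AutoTokenizer

from evaluation.get_chat_template import glm4_chat_template

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
tokenizer.chat_template = glm4_chat_template.strip()

chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"},
]
# Renders roughly "<|system|>\n...<|user|>\n..." followed by the
# "<|assistant|>streaming_transcription\n" generation prompt.
print(tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True))
```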
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ -r requirements_ds_gpu.txt
requirements_ds_gpu.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ expecttest
2
+ peft
3
+ xlsxwriter
4
+ termcolor
5
+ tabulate
6
+ tiktoken
7
+ matplotlib
8
+ datasets
9
+ einops
10
+ pybind11
11
+ tensorboardX
12
+ pyarrow
13
+ transformers==4.48.3
14
+ deepspeed
15
+ accelerate>=1.1.1
16
+ timm
17
+ flask
18
+ flask_restful
19
+ decord
20
+ natsort
21
+ # setuptools==69.5.1
22
+ setuptools
23
+
24
+ # cosyvoice2
25
+ pyworld
26
+ evaluate
27
+ hyperpyyaml
28
+ diffusers
29
+ conformer
30
+ hydra-core
31
+ lightning
32
+ gdown
33
+ wget
34
+ funasr
35
+ zhconv
36
+ jiwer
37
+ zhon
38
+ WeTextProcessing
39
+ inflect
40
+ openai-whisper
41
+ onnxruntime
42
+ modelscope
43
+ word2number
44
+
scripts/deepspeed/ds_config_zero1.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupCosineLR",
25
+ "params": {
26
+ "total_num_steps": "auto",
27
+ "warmup_min_ratio": 0,
28
+ "warmup_num_steps": "auto",
29
+ "cos_min_ratio": 0.1
30
+ }
31
+ },
32
+
33
+ "zero_optimization": {
34
+ "stage": 1,
35
+ "offload_optimizer": {
36
+ "device": "none",
37
+ "pin_memory": true
38
+ },
39
+ "offload_param": {
40
+ "device": "none",
41
+ "pin_memory": true
42
+ },
43
+ "allgather_partitions": true,
44
+ "allgather_bucket_size": 5e8,
45
+ "overlap_comm": true,
46
+ "reduce_scatter": true,
47
+ "reduce_bucket_size": 5e8,
48
+ "contiguous_gradients": true,
49
+ "round_robin_gradients": true,
50
+ "sub_group_size": 1e12
51
+ },
52
+
53
+ "gradient_accumulation_steps": "auto",
54
+ "gradient_clipping": "auto",
55
+ "steps_per_print": 100,
56
+ "train_batch_size": "auto",
57
+ "train_micro_batch_size_per_gpu": "auto",
58
+ "wall_clock_breakdown": false,
59
+ "dump_state": false
60
+
61
+ }
scripts/deepspeed/ds_config_zero2.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupCosineLR",
25
+ "params": {
26
+ "total_num_steps": "auto",
27
+ "warmup_min_ratio": 0,
28
+ "warmup_num_steps": "auto",
29
+ "cos_min_ratio": 0.1
30
+ }
31
+ },
32
+
33
+ "zero_optimization": {
34
+ "stage": 2,
35
+ "offload_optimizer": {
36
+ "device": "none",
37
+ "pin_memory": true
38
+ },
39
+ "offload_param": {
40
+ "device": "none",
41
+ "pin_memory": true
42
+ },
43
+ "allgather_partitions": true,
44
+ "allgather_bucket_size": 5e8,
45
+ "overlap_comm": true,
46
+ "reduce_scatter": true,
47
+ "reduce_bucket_size": 5e8,
48
+ "contiguous_gradients": true,
49
+ "round_robin_gradients": true,
50
+ "sub_group_size": 1e12
51
+ },
52
+
53
+ "gradient_accumulation_steps": "auto",
54
+ "gradient_clipping": "auto",
55
+ "steps_per_print": 100,
56
+ "train_batch_size": "auto",
57
+ "train_micro_batch_size_per_gpu": "auto",
58
+ "wall_clock_breakdown": false,
59
+ "dump_state": false
60
+
61
+ }
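
Editor's note (not part of the commit): every "auto" entry in this ZeRO-2 config is filled in by the Hugging Face Trainer from its own TrainingArguments when the JSON path is passed via --deepspeed, which is how the finetune scripts later in this commit consume it. A minimal sketch with illustrative values:

```python
# Minimal sketch (illustrative values only): how the "auto" placeholders in
# ds_config_zero2.json get resolved. Passing the JSON through
# TrainingArguments(deepspeed=...) lets the HF Trainer substitute its own
# learning rate, precision flags, and batch sizes before the DeepSpeed engine
# is built.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output",
    deepspeed="scripts/deepspeed/ds_config_zero2.json",  # this file
    per_device_train_batch_size=1,   # fills train_micro_batch_size_per_gpu
    gradient_accumulation_steps=16,  # fills gradient_accumulation_steps
    learning_rate=1e-3,              # fills optimizer.params.lr
    weight_decay=0.0,                # fills optimizer.params.weight_decay
    bf16=True,                       # fills bf16.enabled
)
# Trainer(model=..., args=args, train_dataset=...).train() then launches
# DeepSpeed with the merged configuration; see the finetune_*.sh scripts
# below for the actual command lines used in this repo.
```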
scripts/deepspeed/ds_config_zero2_no_optimizer.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+
14
+ "scheduler": {
15
+ "type": "WarmupCosineLR",
16
+ "params": {
17
+ "total_num_steps": "auto",
18
+ "warmup_min_ratio": 0,
19
+ "warmup_num_steps": "auto",
20
+ "cos_min_ratio": 0.1
21
+ }
22
+ },
23
+
24
+ "zero_optimization": {
25
+ "stage": 2,
26
+ "offload_optimizer": {
27
+ "device": "none",
28
+ "pin_memory": true
29
+ },
30
+ "offload_param": {
31
+ "device": "none",
32
+ "pin_memory": true
33
+ },
34
+ "allgather_partitions": true,
35
+ "allgather_bucket_size": 5e8,
36
+ "overlap_comm": true,
37
+ "reduce_scatter": true,
38
+ "reduce_bucket_size": 5e8,
39
+ "contiguous_gradients": true,
40
+ "round_robin_gradients": true,
41
+ "sub_group_size": 1e12
42
+ },
43
+
44
+ "gradient_accumulation_steps": "auto",
45
+ "gradient_clipping": "auto",
46
+ "steps_per_print": 100,
47
+ "train_batch_size": "auto",
48
+ "train_micro_batch_size_per_gpu": "auto",
49
+ "wall_clock_breakdown": false,
50
+ "dump_state": false
51
+
52
+ }
scripts/deepspeed/ds_config_zero2_offload.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupCosineLR",
25
+ "params": {
26
+ "total_num_steps": "auto",
27
+ "warmup_min_ratio": 0,
28
+ "warmup_num_steps": "auto",
29
+ "cos_min_ratio": 0.1
30
+ }
31
+ },
32
+
33
+ "zero_optimization": {
34
+ "stage": 2,
35
+ "offload_optimizer": {
36
+ "device": "cpu",
37
+ "pin_memory": true
38
+ },
39
+ "offload_param": {
40
+ "device": "cpu",
41
+ "pin_memory": true
42
+ },
43
+ "allgather_partitions": true,
44
+ "allgather_bucket_size": 5e8,
45
+ "overlap_comm": true,
46
+ "reduce_scatter": true,
47
+ "reduce_bucket_size": 5e8,
48
+ "contiguous_gradients": true,
49
+ "round_robin_gradients": true,
50
+ "sub_group_size": 1e12
51
+ },
52
+
53
+ "gradient_accumulation_steps": "auto",
54
+ "gradient_clipping": "auto",
55
+ "steps_per_print": 100,
56
+ "train_batch_size": "auto",
57
+ "train_micro_batch_size_per_gpu": "auto",
58
+ "wall_clock_breakdown": false,
59
+ "dump_state": false
60
+
61
+ }
scripts/deepspeed/ds_config_zero3.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupCosineLR",
25
+ "params": {
26
+ "total_num_steps": "auto",
27
+ "warmup_min_ratio": 0,
28
+ "warmup_num_steps": "auto",
29
+ "cos_min_ratio": 0.1
30
+ }
31
+ },
32
+
33
+ "zero_optimization": {
34
+ "stage": 3,
35
+ "offload_optimizer": {
36
+ "device": "none",
37
+ "pin_memory": true
38
+ },
39
+ "offload_param": {
40
+ "device": "none",
41
+ "pin_memory": true
42
+ },
43
+ "allgather_partitions": true,
44
+ "allgather_bucket_size": 2e8,
45
+ "overlap_comm": true,
46
+ "reduce_scatter": true,
47
+ "reduce_bucket_size": 1e9,
48
+ "contiguous_gradients": true,
49
+ "sub_group_size": 1e9,
50
+ "stage3_prefetch_bucket_size": 1e9,
51
+ "stage3_param_persistence_threshold": 1e9,
52
+ "stage3_max_live_parameters": 1e9,
53
+ "stage3_max_reuse_distance": 1e9,
54
+ "stage3_gather_16bit_weights_on_model_save": true
55
+ },
56
+
57
+ "gradient_accumulation_steps": "auto",
58
+ "gradient_clipping": "auto",
59
+ "steps_per_print": 100,
60
+ "train_batch_size": "auto",
61
+ "train_micro_batch_size_per_gpu": "auto",
62
+ "wall_clock_breakdown": false
63
+ }
scripts/deepspeed/ds_config_zero3_offload.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupCosineLR",
25
+ "params": {
26
+ "total_num_steps": "auto",
27
+ "warmup_min_ratio": 0,
28
+ "warmup_num_steps": "auto",
29
+ "cos_min_ratio": 0.1
30
+ }
31
+ },
32
+
33
+ "zero_optimization": {
34
+ "stage": 3,
35
+ "offload_optimizer": {
36
+ "device": "cpu",
37
+ "pin_memory": true
38
+ },
39
+ "offload_param": {
40
+ "device": "cpu",
41
+ "pin_memory": true
42
+ },
43
+ "allgather_partitions": true,
44
+ "allgather_bucket_size": 5e8,
45
+ "overlap_comm": true,
46
+ "reduce_scatter": true,
47
+ "reduce_bucket_size": 5e8,
48
+ "contiguous_gradients": true,
49
+ "round_robin_gradients": true,
50
+ "sub_group_size": 1e12,
51
+ "stage3_prefetch_bucket_size": 5e8,
52
+ "stage3_param_persistence_threshold": 1e5,
53
+ "stage3_max_live_parameters": 1e9,
54
+ "stage3_max_reuse_distance": 1e9,
55
+ "stage3_gather_16bit_weights_on_model_save": true
56
+ },
57
+
58
+ "flops_profiler": {
59
+ "enabled": false,
60
+ "profile_step": 1,
61
+ "module_depth": -1,
62
+ "top_modules": 1,
63
+ "detailed": true,
64
+ "output_file": null
65
+ },
66
+
67
+ "gradient_accumulation_steps": "auto",
68
+ "gradient_clipping": "auto",
69
+ "steps_per_print": 100,
70
+ "train_batch_size": "auto",
71
+ "train_micro_batch_size_per_gpu": "auto",
72
+ "wall_clock_breakdown": false,
73
+ "dump_state": false
74
+
75
+ }
scripts/deepspeed/evaluate_sts.sh ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H%M%S'`
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt install -y rsync
28
+ mkdir -p ${LOCAL_CODE_PATH}
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME/"
48
+ mkdir -p ${HF_HOME}
49
+ export HF_ENDPOINT=https://hf-mirror.com
50
+
51
+ export MODELSCOPE_CACHE="${ROOT_PATH}/data/MODELSCOPE_CACHE/"
52
+ mkdir -p ${MODELSCOPE_CACHE}
53
+
54
+ export LC_ALL="en_US.utf8"
55
+
56
+ ######################################################################
57
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
58
+ exec &> >(tee -a "$LOG")
59
+ echo Logging output to "$LOG"
60
+
61
+
62
+ ######################################################################
63
+ if true
64
+ #if false
65
+ then
66
+ MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Boost/"
67
+ MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Balance/"
68
+
69
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
70
+ FLOW_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-decoder
71
+ AUDIO_TOKENIZER_TYPE="glm4voice"
72
+
73
+ export PYTHONPATH=${PYTHONPATH}:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/cosyvoice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
74
+
75
+ fi
76
+
77
+ ######################################################################
78
+ DISTRIBUTED_ARGS="
79
+ --nproc_per_node $NPROC_PER_NODE \
80
+ --nnodes $NNODES \
81
+ --node_rank $NODE_RANK \
82
+ --master_addr $MASTER_ADDR \
83
+ --master_port $MASTER_PORT
84
+ "
85
+
86
+ ######################################################################
87
+ if true
88
+ #if false
89
+ then
90
+ apt-get update && apt install -y ffmpeg
91
+
92
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/llama-questions/test.jsonl
93
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
94
+ --json_path ${JSON_PATH} \
95
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
96
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
97
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
98
+ --flow_path ${FLOW_PATH} \
99
+ --output_dir ${OUTPUT_DIR}/llama-questions/
100
+
101
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/llama-questions/test_hyp_ref_text.json
102
+ echo "copypaste ACC: ${JSON_PATH}"
103
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/llama-questions/test_hyp_ref_speech.json
104
+ echo "copypaste ACC: ${JSON_PATH}"
105
+
106
+
107
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/trivia_qa-audio/validation.jsonl
108
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
109
+ --json_path ${JSON_PATH} \
110
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
111
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
112
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
113
+ --flow_path ${FLOW_PATH} \
114
+ --output_dir ${OUTPUT_DIR}/trivia_qa-audio/
115
+
116
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/trivia_qa-audio/validation_hyp_ref_text.json
117
+ echo "copypaste ACC: ${JSON_PATH}"
118
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/trivia_qa-audio/validation_hyp_ref_speech.json
119
+ echo "copypaste ACC: ${JSON_PATH}"
120
+
121
+
122
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/spoken-web-questions/test.jsonl
123
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
124
+ --json_path ${JSON_PATH} \
125
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
126
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
127
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
128
+ --flow_path ${FLOW_PATH} \
129
+ --output_dir ${OUTPUT_DIR}/spoken-web-questions/
130
+
131
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/spoken-web-questions/test_hyp_ref_text.json
132
+ echo "copypaste ACC: ${JSON_PATH}"
133
+ python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/spoken-web-questions/test_hyp_ref_speech.json
134
+ echo "copypaste ACC: ${JSON_PATH}"
135
+
136
+ fi
137
+
138
+
139
+ ######################################################################
140
+ if true
141
+ #if false
142
+ then
143
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/validation.clean.jsonl
144
+
145
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
146
+ --json_path ${JSON_PATH} \
147
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
148
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
149
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
150
+ --flow_path ${FLOW_PATH} \
151
+ --output_dir ${OUTPUT_DIR}/librispeech_asr/
152
+
153
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.clean_hyp.txt
154
+ #echo "copypaste CER: ${JSON_PATH}"
155
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.clean_hyp.txt
156
+ echo "copypaste WER: ${JSON_PATH}"
157
+
158
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/validation.other.jsonl
159
+
160
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
161
+ --json_path ${JSON_PATH} \
162
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
163
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
164
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
165
+ --flow_path ${FLOW_PATH} \
166
+ --output_dir ${OUTPUT_DIR}/librispeech_asr/
167
+
168
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.other_hyp.txt
169
+ #echo "copypaste CER: ${JSON_PATH}"
170
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.other_hyp.txt
171
+ echo "copypaste WER: ${JSON_PATH}"
172
+
173
+
174
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/test.clean.jsonl
175
+
176
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
177
+ --json_path ${JSON_PATH} \
178
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
179
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
180
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
181
+ --flow_path ${FLOW_PATH} \
182
+ --output_dir ${OUTPUT_DIR}/librispeech_asr/
183
+
184
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.clean_hyp.txt
185
+ #echo "copypaste CER: ${JSON_PATH}"
186
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.clean_hyp.txt
187
+ echo "copypaste WER: ${JSON_PATH}"
188
+
189
+ JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/test.other.jsonl
190
+
191
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
192
+ --json_path ${JSON_PATH} \
193
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
194
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
195
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
196
+ --flow_path ${FLOW_PATH} \
197
+ --output_dir ${OUTPUT_DIR}/librispeech_asr/
198
+
199
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.other_hyp.txt
200
+ #echo "copypaste CER: ${JSON_PATH}"
201
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.other_hyp.txt
202
+ echo "copypaste WER: ${JSON_PATH}"
203
+
204
+ fi
205
+
206
+
207
+ ######################################################################
208
+ if true
209
+ #if false
210
+ then
211
+ JSON_PATH=${ROOT_PATH}/data/jsonl/wenet-e2e/wenetspeech/TEST_MEETING.jsonl
212
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
213
+ --json_path ${JSON_PATH} \
214
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
215
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
216
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
217
+ --flow_path ${FLOW_PATH} \
218
+ --output_dir ${OUTPUT_DIR}/wenetspeech/
219
+
220
+ python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_hyp.txt
221
+ echo "copypaste CER: ${JSON_PATH}"
222
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_hyp.txt
223
+ echo "copypaste WER: ${JSON_PATH}"
224
+
225
+ JSON_PATH=${ROOT_PATH}/data/jsonl/wenet-e2e/wenetspeech/TEST_NET.jsonl
226
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
227
+ --json_path ${JSON_PATH} \
228
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
229
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
230
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
231
+ --flow_path ${FLOW_PATH} \
232
+ --output_dir ${OUTPUT_DIR}/wenetspeech/
233
+
234
+ python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_NET_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_NET_hyp.txt
235
+ echo "copypaste CER: ${JSON_PATH}"
236
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_NET_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_NET_hyp.txt
237
+ echo "copypaste WER: ${JSON_PATH}"
238
+ fi
239
+
240
+
241
+ ######################################################################
242
+ if true
243
+ #if false
244
+ then
245
+ JSON_PATH=${ROOT_PATH}/data/jsonl/shenyunhang/AISHELL-1/test.jsonl
246
+
247
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
248
+ --json_path ${JSON_PATH} \
249
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
250
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
251
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
252
+ --flow_path ${FLOW_PATH} \
253
+ --output_dir ${OUTPUT_DIR}/AISHELL-1/
254
+
255
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/AISHELL-1/_test.clean_ref.txt ${OUTPUT_DIR}/AISHELL-1/test.clean_hyp.txt
256
+ #echo "copypaste CER: ${JSON_PATH}"
257
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/AISHELL-1/test_ref.txt ${OUTPUT_DIR}/AISHELL-1/test_hyp.txt
258
+ echo "copypaste WER: ${JSON_PATH}"
259
+
260
+
261
+ fi
262
+
263
+
264
+ ######################################################################
265
+ if true
266
+ #if false
267
+ then
268
+ JSON_PATH=${ROOT_PATH}/data/jsonl/mythicinfinity/libritts/test.clean.jsonl
269
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_libritts.py \
270
+ --json_path ${JSON_PATH} \
271
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
272
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
273
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
274
+ --flow_path ${FLOW_PATH} \
275
+ --output_dir ${OUTPUT_DIR}/libritts/ \
276
+
277
+ #python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/libritts/test.clean_ref.txt ${OUTPUT_DIR}/libritts/test.clean_hyp.txt
278
+ #echo "copypaste CER: ${JSON_PATH}"
279
+ python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/libritts/test.clean_ref.txt ${OUTPUT_DIR}/libritts/test.clean_hyp.txt
280
+ echo "copypaste WER: ${JSON_PATH}"
281
+ fi
282
+
283
+
284
+ ######################################################################
285
+ if true
286
+ #if false
287
+ then
288
+
289
+ DATA_PATH=${ROOT_PATH}/data/BytedanceSpeech/seed-tts-eval/
290
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_seedtts.py \
291
+ --data_path ${DATA_PATH} \
292
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
293
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
294
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
295
+ --flow_path ${FLOW_PATH} \
296
+ --output_dir ${OUTPUT_DIR}/seed-tts/ \
297
+ --speaker_prompt \
298
+
299
+ export ARNOLD_WORKER_GPU=${NPROC_PER_NODE}
300
+ cd ${LOCAL_CODE_PATH}/third_party/seed-tts-eval
301
+
302
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ zh
303
+ echo "copypaste WER: ${DATA_PATH} zh"
304
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ zh
305
+ echo "copypaste WER: ${DATA_PATH} hardcase"
306
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ en
307
+ echo "copypaste WER: ${DATA_PATH} en"
308
+
309
+ bash cal_sim.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ ${DATA_PATH}/wavlm_large_finetune.pth
310
+ echo "copypaste SIM: ${DATA_PATH} zh"
311
+ bash cal_sim.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ ${DATA_PATH}/wavlm_large_finetune.pth
312
+ echo "copypaste SIM: ${DATA_PATH} hardcase"
313
+ bash cal_sim.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ ${DATA_PATH}/wavlm_large_finetune.pth
314
+ echo "copypaste SIM: ${DATA_PATH} en"
315
+
316
+ cd ${LOCAL_CODE_PATH}
317
+
318
+ fi
319
+
320
+
321
+ ######################################################################
322
+ if false
323
+ then
324
+ DATA_PATH=${ROOT_PATH}/data/BytedanceSpeech/seed-tts-eval/
325
+ torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_seedtts.py \
326
+ --data_path ${DATA_PATH} \
327
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
328
+ --audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
329
+ --audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
330
+ --flow_path ${FLOW_PATH} \
331
+ --output_dir ${OUTPUT_DIR}/seed-tts/ \
332
+
333
+ export ARNOLD_WORKER_GPU=${NPROC_PER_NODE}
334
+ cd ${LOCAL_CODE_PATH}/third_party/seed-tts-eval
335
+
336
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ zh
337
+ echo "copypaste WER: ${DATA_PATH} zh"
338
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ zh
339
+ echo "copypaste WER: ${DATA_PATH} hardcase"
340
+ bash cal_wer.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ en
341
+ echo "copypaste WER: ${DATA_PATH} en"
342
+
343
+ cd ${LOCAL_CODE_PATH}
344
+
345
+ fi
346
+
347
+
348
+ set +x
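
Editor's note (not part of the commit): a quick way to cross-check the WER figures that compute-wer.py prints in the script above, assuming the *_ref.txt / *_hyp.txt files written by evaluate_asr.py use one "utt_id transcript" pair per line (adjust the parsing otherwise). jiwer is already pinned in requirements_ds_gpu.txt; the paths below are illustrative.

```python
# Minimal sketch: recompute WER over the ref/hyp text files produced above.
# Assumption: each line is "utt_id<space>transcript"; change load() if the
# actual layout differs.
import jiwer


def load(path):
    texts = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            utt_id, _, text = line.strip().partition(" ")
            if utt_id:
                texts[utt_id] = text
    return texts


refs = load("output/librispeech_asr/test.clean_ref.txt")
hyps = load("output/librispeech_asr/test.clean_hyp.txt")
keys = sorted(set(refs) & set(hyps))
print("utterances:", len(keys))
print("WER:", jiwer.wer([refs[k] for k in keys], [hyps[k] for k in keys]))
```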
scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage1.sh ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp1_stage1.sh/20250313_040353/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name vita_audio/models/qwen2_mtp_v4_48_3/config_7B_mtp10.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 8000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 1.00e-3 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --language-model-freeze \
126
+ --text-audio-interval-ratio 1 10 4 10 \
127
+
128
+ #--language-model-freeze \
129
+ #--dataset_joint false \
130
+ #--variable_length true \
131
+ #--tokenizer_name_or_path Qwen2Tokenizer \
132
+
133
+ #--bf16 True \
134
+ #--fp16 True \
135
+ #--tf32 True \
136
+
137
+ set +x
scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage2.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/s2s_qwen25/finetune_glm4voice_mtp10_stage1.sh/20250315_022047/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${MODEL_NAME_OR_PATH} \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 4000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 5.00e-5 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.1 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2_no_optimizer.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --mtp_model_lr_mult 1.00e1 \
126
+ --text-audio-interval-ratio 1 10 4 10 \
127
+
128
+ #--language-model-freeze \
129
+ #--dataset_joint false \
130
+ #--variable_length true \
131
+ #--tokenizer_name_or_path Qwen2Tokenizer \
132
+
133
+ #--bf16 True \
134
+ #--fp16 True \
135
+ #--tf32 True \
136
+
137
+ set +x
scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp1_stage1.sh ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/s2s_qwen25/finetune_glm4voice_stage1.sh/20250222_043913/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name vita_audio/models/qwen2_mtp_v4_48_3/config_7B_mtp1.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 4000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 1.00e-3 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --language-model-freeze \
126
+ --text-audio-interval-ratio 1 10 4 10 \
127
+
128
+ #--language-model-freeze \
129
+ #--dataset_joint false \
130
+ #--variable_length true \
131
+ #--tokenizer_name_or_path Qwen2Tokenizer \
132
+
133
+ #--bf16 True \
134
+ #--fp16 True \
135
+ #--tf32 True \
136
+
137
+ set +x
scripts/deepspeed/sts_qwen25/finetune_glm4voice_stage1.sh ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/models/Qwen/Qwen2.5-7B-Instruct/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${MODEL_NAME_OR_PATH} \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 8000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 6.00e-5 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --text-audio-interval-ratio 1 10 4 10 \
126
+
127
+ #--language-model-freeze \
128
+ #--dataset_joint false \
129
+ #--variable_length true \
130
+ #--tokenizer_name_or_path Qwen2Tokenizer \
131
+
132
+ #--bf16 True \
133
+ #--fp16 True \
134
+ #--tf32 True \
135
+
136
+ set +x
scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage1.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh/20250418_075843/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp10.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "sensevoice_glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 8000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 1.00e-3 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --audio-model-freeze \
126
+ --language-model-freeze \
127
+ --text-audio-interval-ratio 1 10 4 10 \
128
+
129
+ #--language-model-freeze \
130
+ #--dataset_joint false \
131
+ #--variable_length true \
132
+ #--tokenizer_name_or_path Qwen2Tokenizer \
133
+
134
+ #--bf16 True \
135
+ #--fp16 True \
136
+ #--tf32 True \
137
+
138
+ set +x
scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage2.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage2.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage1.sh/20250421_180624/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp10.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "sensevoice_glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 4000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 5.00e-5 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.1 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2_no_optimizer.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 2 \
125
+ --mtp_model_lr_mult 1.00e1 \
126
+ --audio-model-freeze \
127
+ --text-audio-interval-ratio 1 10 4 10 \
128
+
129
+ #--language-model-freeze \
130
+ #--dataset_joint false \
131
+ #--variable_length true \
132
+ #--tokenizer_name_or_path Qwen2Tokenizer \
133
+
134
+ #--bf16 True \
135
+ #--fp16 True \
136
+ #--tf32 True \
137
+
138
+ set +x
scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh ADDED
@@ -0,0 +1,138 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh/20250409_161438/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp1.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "sensevoice_glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 4000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 1.00e-3 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --audio-model-freeze \
126
+ --language-model-freeze \
127
+ --text-audio-interval-ratio 1 10 4 10 \
128
+
129
+ #--language-model-freeze \
130
+ #--dataset_joint false \
131
+ #--variable_length true \
132
+ #--tokenizer_name_or_path Qwen2Tokenizer \
133
+
134
+ #--bf16 True \
135
+ #--fp16 True \
136
+ #--tf32 True \
137
+
138
+ set +x
scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh ADDED
@@ -0,0 +1,137 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ SEQ_LENGTH="$1"
7
+ if [ -z "$SEQ_LENGTH" ]
8
+ then
9
+ SEQ_LENGTH=32768
10
+ fi
11
+
12
+ timestamp="$2"
13
+ if [ -z "$timestamp" ]
14
+ then
15
+ timestamp=`date +'%Y%m%d_%H'`0000
16
+ fi
17
+
18
+ ######################################################################
19
+ export ROOT_PATH=/data/
20
+ export CODE_PATH=${ROOT_PATH}/VITA-Audio/
21
+
22
+ export LOCAL_ROOT_PATH=/data_local/
23
+ export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
24
+ mkdir -p ${LOCAL_ROOT_PATH}
25
+ mkdir -p ${LOCAL_CODE_PATH}
26
+
27
+ apt update
28
+ apt install -y rsync
29
+ rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
30
+
31
+ cd ${LOCAL_CODE_PATH}
32
+ rm -fr datasets
33
+ ln -s ${ROOT_PATH}/data datasets
34
+
35
+ ######################################################################
36
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
37
+ source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
38
+ pip3 install transformers==4.48.3
39
+ #pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
40
+
41
+ ######################################################################
42
+ OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
43
+
44
+ mkdir -p ${OUTPUT_DIR}
45
+ rsync -avh $0 ${OUTPUT_DIR}
46
+
47
+ export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
48
+ mkdir -p ${HF_HOME}
49
+
50
+ export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
51
+
52
+ export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
53
+
54
+ ######################################################################
55
+ LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
56
+ exec &> >(tee -a "$LOG")
57
+ echo Logging output to "$LOG"
58
+
59
+ echo ${@}
60
+
61
+ ######################################################################
62
+ DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
63
+
64
+ MODEL_NAME_OR_PATH=${ROOT_PATH}/models/Qwen/Qwen2.5-7B-Instruct/
65
+
66
+ AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
67
+
68
+ rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
69
+
70
+ ######################################################################
71
+ DISTRIBUTED_ARGS="
72
+ --nproc_per_node $NPROC_PER_NODE \
73
+ --nnodes $NNODES \
74
+ --node_rank $NODE_RANK \
75
+ --master_addr $MASTER_ADDR \
76
+ --master_port $MASTER_PORT
77
+ "
78
+
79
+ torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
80
+ --log_level "info" \
81
+ --do_train \
82
+ --overwrite_output_dir \
83
+ --config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp0.json \
84
+ --tokenizer_name $MODEL_NAME_OR_PATH \
85
+ --model_name_or_path $MODEL_NAME_OR_PATH \
86
+ --audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
87
+ --audio_tokenizer_type "sensevoice_glm4voice" \
88
+ --dataset_name $DATA_PATH \
89
+ --bf16 True \
90
+ --tf32 True \
91
+ --torch_dtype bfloat16 \
92
+ --output_dir $OUTPUT_DIR \
93
+ --num_train_epochs 1 \
94
+ --max_steps 8000 \
95
+ --per_device_train_batch_size 1 \
96
+ --per_device_eval_batch_size 1 \
97
+ --gradient_accumulation_steps 16 \
98
+ --save_strategy "steps" \
99
+ --save_steps 0.1 \
100
+ --save_total_limit 2 \
101
+ --learning_rate 6.00e-5 \
102
+ --max_grad_norm 1.0 \
103
+ --weight_decay 0.0 \
104
+ --adam_beta1 0.9 \
105
+ --adam_beta2 0.95 \
106
+ --adam_epsilon 1e-8 \
107
+ --warmup_ratio 0.03 \
108
+ --lr_scheduler_type "cosine" \
109
+ --logging_steps 1 \
110
+ --report_to "tensorboard" \
111
+ --model_max_length ${SEQ_LENGTH} \
112
+ --gradient_checkpointing True \
113
+ --deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
114
+ --trust_remote_code False \
115
+ --ddp_timeout 7200 \
116
+ --ddp_backend ${DISTRIBUTED_BACKEND} \
117
+ --attn_implementation flash_attention_2 \
118
+ --seed 42 \
119
+ --data_seed 42 \
120
+ --reset_attention_mask \
121
+ --reset_position_ids \
122
+ --create_attention_mask false \
123
+ --create_attention_mask_2d false \
124
+ --dataloader_num_workers 8 \
125
+ --audio-model-freeze \
126
+ --text-audio-interval-ratio 1 10 4 10 \
127
+
128
+ #--language-model-freeze \
129
+ #--dataset_joint false \
130
+ #--variable_length true \
131
+ #--tokenizer_name_or_path Qwen2Tokenizer \
132
+
133
+ #--bf16 True \
134
+ #--fp16 True \
135
+ #--tf32 True \
136
+
137
+ set +x
scripts/set_env_ds_gpu.sh ADDED
@@ -0,0 +1,53 @@
1
+ #set -e
2
+ #set -x
3
+
4
+ ######################################################################
5
+ #export NCCL_NET=IB
6
+
7
+ #export NCCL_SOCKET_IFNAME="bond1"
8
+ #export GLOO_SOCKET_IFNAME="bond1"
9
+ #export NCCL_DEBUG=INFO
10
+ #export NCCL_IB_QPS_PER_CONNECTION=2
11
+
12
+ #export GLOO_SOCKET_IFNAME=eth0
13
+ #export NCCL_DEBUG=INFO
14
+ #export NCCL_IB_QPS_PER_CONNECTION=2
15
+
16
+ #export NCCL_IB_DISABLE=1
17
+
18
+ export DISTRIBUTED_BACKEND="nccl"
19
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
20
+
21
+ ######################################################################
22
+ pip3 install -r requirements_ds_gpu.txt
23
+ #pip3 install --no-index --find-links=/data/software/ -r requirements_ds_gpu.txt
24
+
25
+ pip3 install deepspeed==0.15.4
26
+ #pip3 install --no-index --find-links=/data/software/ deepspeed==0.15.4
27
+ #pip3 install deepspeed==0.16.1
28
+ #pip3 install deepspeed==0.14.2
29
+
30
+ pip3 install -e `pwd`
31
+
32
+ ######################################################################
33
+ #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
34
+
35
+ #apt update
36
+ #apt install -y openssh-server
37
+ #apt install -y rsync
38
+
39
+ ######################################################################
40
+
41
+ export NNODES=${WORLD_SIZE}
42
+ export NODE_RANK=${RANK}
43
+ export MASTER_PORT=34567
44
+
45
+ if [ -z "$NPROC_PER_NODE" ]
46
+ then
47
+ export NPROC_PER_NODE=8
48
+ export NNODES=1
49
+ export NODE_RANK=0
50
+ export MASTER_ADDR=127.0.0.1
51
+ fi
52
+
53
+ ######################################################################
setup.py ADDED
@@ -0,0 +1,12 @@
1
+ from setuptools import find_packages, setup
2
+
3
+ setup(
4
+ name='vita_audio',
5
+ version='0.0.1',
6
+ packages=[
7
+ "vita_audio",
8
+ ],
9
+ install_requires=[
10
+ ],
11
+ )
12
+
third_party/GLM-4-Voice/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *venv
2
+ *.DS_Store
3
+ *.idea/
4
+ test*
third_party/GLM-4-Voice/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "third_party/Matcha-TTS"]
2
+ path = third_party/Matcha-TTS
3
+ url = https://github.com/shivammehta25/Matcha-TTS
third_party/GLM-4-Voice/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024 GLM-4-Voice Model Team @ Zhipu AI
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
third_party/GLM-4-Voice/README.md ADDED
@@ -0,0 +1,159 @@
1
+ # GLM-4-Voice
2
+ <p align="center">
3
+ 📄<a href="https://arxiv.org/abs/2412.02612" target="_blank"> Report </a> • 🤗 <a href="https://huggingface.co/THUDM/glm-4-voice-9b" target="_blank">HF Repo</a> • 🤖 <a href="https://modelscope.cn/studios/ZhipuAI/GLM-4-Voice-Demo" target="_blank">Demo</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
4
+ </p>
5
+
6
+ Read this in [English](./README_en.md)
7
+
8
+ GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够遵循用户的指令要求改变语音的情感、语调、语速、方言等属性。
9
+
10
+ ## Model Architecture
11
+ ![Model Architecture](./resources/architecture.jpeg)
12
+
13
+ GLM-4-Voice 由三个部分组成:
14
+ * GLM-4-Voice-Tokenizer: 通过在 [Whisper](https://github.com/openai/whisper) 的 Encoder 部分增加 Vector Quantization 并在 ASR 数据上有监督训练,将连续的语音输入转化为离散的 token。每秒音频平均只需要用 12.5 个离散 token 表示。
15
+ * GLM-4-Voice-Decoder: 基于 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 的 Flow Matching 模型结构训练的支持流式推理的语音解码器,将离散化的语音 token 转化为连续的语音输出。最少只需要 10 个语音 token 即可开始生成,降低端到端对话延迟。
16
+ * GLM-4-Voice-9B: 在 [GLM-4-9B](https://github.com/THUDM/GLM-4) 的基础上进行语音模态的预训练和对齐,从而能够理解和生成离散化的语音 token。
17
+
18
+ 预训练方面,为了攻克模型在语音模态下的智商和合成表现力两个难关,我们将 Speech2Speech 任务解耦合为“根据用户音频做出文本回复”和“根据文本回复和用户语音合成回复语音”两个任务,并设计两种预训练目标,分别基于文本预训练数据和无监督音频数据合成语音-文本交错数据以适配这两种任务形式。GLM-4-Voice-9B 在 GLM-4-9B 的基座模型基础之上,经过了数百万小时音频和数千亿 token 的音频文本交错数据预训练,拥有很强的音频理解和建模能力。
19
+
20
+ 对齐方面,为了支持高质量的语音对话,我们设计了一套流式思考架构:根据用户语音,GLM-4-Voice 可以流式交替输出文本和语音两个模态的内容,其中语音模态以文本作为参照保证回复内容的高质量,并根据用户的语音指令要求做出相应的声音变化,在最大程度保留语言模型智商的情况下仍然具有端到端建模的能力,同时具备低延迟性,最低只需要输出 20 个 token 便可以合成语音。
21
+
22
+ ## Model List
23
+
24
+ | Model | Type | Download |
25
+ |:---------------------:|:----------------:|:------------------------------------------------------------------------------------------------------------------------------------------------:|
26
+ | GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-tokenizer) |
27
+ | GLM-4-Voice-9B | Chat Model | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-9b) |
28
+ | GLM-4-Voice-Decoder | Speech Decoder | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-decoder) |
29
+
30
+ ## Usage
31
+ 我们提供了可以直接启动的 Web Demo。用户可以输入语音或文本,模型会同时给出语音和文字回复。
32
+
33
+ ![](resources/web_demo.png)
34
+
35
+ ### Preparation
36
+
37
+ 首先下载仓库
38
+ ```shell
39
+ git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
40
+ cd GLM-4-Voice
41
+ ```
42
+ 然后安装依赖。也可以使用我们提供的镜像 `zhipuai/glm-4-voice:0.1` 以跳过这一步。
43
+ ```shell
44
+ pip install -r requirements.txt
45
+ ```
46
+ 由于 Decoder 模型不支持通过 `transformers` 初始化,因此 checkpoint 需要单独下载。
47
+
48
+ ```shell
49
+ # git 模型下载,请确保已安装 git-lfs
50
+ git lfs install
51
+ git clone https://huggingface.co/THUDM/glm-4-voice-decoder
52
+ ```
53
+
54
+ ### Launch Web Demo
55
+
56
+ 1. 启动模型服务
57
+
58
+ ```shell
59
+ python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
60
+ ```
61
+
62
+ 如果你需要使用 Int4 精度启动,请运行
63
+
64
+ ```shell
65
+ python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype int4 --device cuda:0
66
+ ```
67
+
68
+ 此命令会自动下载 `glm-4-voice-9b`。如果网络条件不好,也手动下载之后通过 `--model-path` 指定本地的路径。
69
+
70
+ 2. 启动 web 服务
71
+
72
+ ```shell
73
+ python web_demo.py --tokenizer-path THUDM/glm-4-voice-tokenizer --model-path THUDM/glm-4-voice-9b --flow-path ./glm-4-voice-decoder
74
+ ```
75
+
76
+ 即可在 http://127.0.0.1:8888 访问 web demo。
77
+
78
+ 此命令会自动下载 `glm-4-voice-tokenizer` 和 `glm-4-voice-9b`。 请注意,`glm-4-voice-decoder` 需要手动下载。
79
+
80
+ 如果网络条件不好,可以手动下载这三个模型之后通过 `--tokenizer-path`, `--flow-path` 和 `--model-path` 指定本地的路径。
81
+
82
+ ### Known Issues
83
+
84
+ * Gradio 的流式音频播放效果不稳定。在生成完成后点击对话框中的音频质量会更高。
85
+
86
+ ## Cases
87
+
88
+ 我们提供了 GLM-4-Voice 的部分对话案例,包括控制情绪、改变语速、生成方言等。
89
+
90
+ * 用轻柔的声音引导我放松
91
+
92
+ https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0
93
+
94
+ * 用激动的声音解说足球比赛
95
+
96
+ https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b
97
+
98
+ * 用哀怨的声音讲一个鬼故事
99
+
100
+ https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a
101
+
102
+ * 用东北话介绍一下冬天有多冷
103
+
104
+ https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037
105
+
106
+ * 用重庆话念“吃葡萄不吐葡萄皮”
107
+
108
+ https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b
109
+
110
+ * 用北京话念一句绕口令
111
+
112
+ https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651
113
+
114
+ * 加快语速
115
+
116
+ https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f
117
+
118
+ * 再快一点
119
+
120
+ https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d
121
+
122
+ ## Acknowledgements
123
+
124
+ 本项目的部分代码来自:
125
+ * [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
126
+ * [transformers](https://github.com/huggingface/transformers)
127
+ * [GLM-4](https://github.com/THUDM/GLM-4)
128
+
129
+ ## 协议
130
+
131
+ + GLM-4 模型的权重的使用则需要遵循 [模型协议](https://huggingface.co/THUDM/glm-4-voice-9b/blob/main/LICENSE)。
132
+
133
+ + 本开源仓库的代码则遵循 [Apache 2.0](LICENSE) 协议。
134
+
135
+ ## 引用
136
+
137
+ ```
138
+ @misc{zeng2024glm4,
139
+ title={GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot},
140
+ author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Kedong Wang and Shengmin Jiang and Lei Zhao and Yuxiao Dong and Jie Tang},
141
+ year={2024},
142
+ eprint={2412.02612},
143
+ archivePrefix={arXiv},
144
+ primaryClass={cs.CL},
145
+ url={https://arxiv.org/abs/2412.02612},
146
+ }
147
+ ```
148
+
149
+ ```
150
+ @misc{zeng2024scaling,
151
+ title={Scaling Speech-Text Pre-training with Synthetic Interleaved Data},
152
+ author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Lei Zhang and Shengmin Jiang and Yuxiao Dong and Jie Tang},
153
+ year={2024},
154
+ eprint={2411.17607},
155
+ archivePrefix={arXiv},
156
+ primaryClass={cs.CL},
157
+ url={https://arxiv.org/abs/2411.17607},
158
+ }
159
+ ```
third_party/GLM-4-Voice/README_en.md ADDED
@@ -0,0 +1,148 @@
1
+ # GLM-4-Voice
2
+ <p align="center">
3
+ 📄<a href="https://arxiv.org/abs/2412.02612" target="_blank"> Report </a> • 🤗 <a href="https://huggingface.co/THUDM/glm-4-voice-9b" target="_blank">HF Repo</a> • 🤖 <a href="https://modelscope.cn/studios/ZhipuAI/GLM-4-Voice-Demo" target="_blank">Demo</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
4
+ </p>
5
+
6
+ GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions.
7
+
8
+ ## Model Architecture
9
+
10
+ ![Model Architecture](./resources/architecture.jpeg)
11
+ We provide the three components of GLM-4-Voice:
12
+ * GLM-4-Voice-Tokenizer: Trained by adding vector quantization to the encoder part of [Whisper](https://github.com/openai/whisper), converting continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens.
13
+ * GLM-4-Voice-9B: Pre-trained and aligned on speech modality based on [GLM-4-9B](https://github.com/THUDM/GLM-4), enabling understanding and generation of discretized speech.
14
+ * GLM-4-Voice-Decoder: A speech decoder supporting streaming inference, retrained based on [CosyVoice](https://github.com/FunAudioLLM/CosyVoice), converting discrete speech tokens into continuous speech output. Generation can start with as few as 10 audio tokens, reducing conversation latency.
15
+
16
+ ## Model List
17
+
18
+ | Model | Type | Download |
19
+ |:---------------------:|:----------------:|:--------------------------------------------------------------------:|
20
+ | GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) |
21
+ | GLM-4-Voice-9B | Chat Model | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b) |
22
+ | GLM-4-Voice-Decoder | Speech Decoder | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder) |
23
+
24
+ ## Usage
25
+ We provide a Web Demo that can be launched directly. Users can input speech or text, and the model will respond with both speech and text.
26
+
27
+ ![](resources/web_demo.png)
28
+
29
+ ### Preparation
30
+
31
+ First, download the repository
32
+ ```shell
33
+ git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
34
+ cd GLM-4-Voice
35
+ ```
36
+ Then, install the dependencies. You can also use our pre-built docker image `zhipuai/glm-4-voice:0.1` to skip this step.
37
+ ```shell
38
+ pip install -r requirements.txt
39
+ ```
40
+ Since the Decoder model does not support initialization via `transformers`, the checkpoint needs to be downloaded separately.
41
+
42
+ ```shell
43
+ # Git model download, please ensure git-lfs is installed
44
+ git clone https://huggingface.co/THUDM/glm-4-voice-decoder
45
+ ```
46
+
47
+ ### Launch Web Demo
48
+
49
+ 1. Start the model server
50
+
51
+ ```shell
52
+ python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
53
+ ```
54
+
55
+ If you need to launch with Int4 precision, run
56
+
57
+ ```shell
58
+ python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype int4 --device cuda:0
59
+ ```
60
+
61
+ This command will automatically download `glm-4-voice-9b`. If network conditions are poor, you can manually download it and specify the local path using `--model-path`.
62
+
63
+ 2. Start the web service
64
+
65
+ ```shell
66
+ python web_demo.py --tokenizer-path THUDM/glm-4-voice-tokenizer --model-path THUDM/glm-4-voice-9b --flow-path ./glm-4-voice-decoder
67
+ ```
68
+
69
+ You can access the web demo at [http://127.0.0.1:8888](http://127.0.0.1:8888).
70
+ This command will automatically download `glm-4-voice-tokenizer` and `glm-4-voice-9b`. Please note that `glm-4-voice-decoder` needs to be downloaded manually.
71
+ If the network connection is poor, you can manually download these three models and specify the local paths using `--tokenizer-path`, `--flow-path`, and `--model-path`.
72
+
73
+ ### Known Issues
74
+ * Gradio’s streaming audio playback can be unstable. The audio quality will be higher when clicking on the audio in the dialogue box after generation is complete.
75
+
76
+ ## Examples
77
+ We provide some dialogue cases for GLM-4-Voice, including emotion control, speech rate alteration, dialect generation, etc. (The examples are in Chinese.)
78
+
79
+ * Use a gentle voice to guide me to relax
80
+
81
+ https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0
82
+
83
+ * Use an excited voice to commentate a football match
84
+
85
+ https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b
86
+
87
+ * Tell a ghost story with a mournful voice
88
+
89
+ https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a
90
+
91
+ * Introduce how cold winter is with a Northeastern dialect
92
+
93
+ https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037
94
+
95
+ * Say "Eat grapes without spitting out the skins" in Chongqing dialect
96
+
97
+ https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b
98
+
99
+ * Recite a tongue twister with a Beijing accent
100
+
101
+ https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651
102
+
103
+ * Increase the speech rate
104
+
105
+ https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f
106
+
107
+ * Even faster
108
+
109
+ https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d
110
+
111
+ ## Acknowledgements
112
+
113
+ Some code in this project is from:
114
+ * [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
115
+ * [transformers](https://github.com/huggingface/transformers)
116
+ * [GLM-4](https://github.com/THUDM/GLM-4)
117
+
118
+ ## License Agreement
119
+
120
+ + The use of GLM-4 model weights must follow the [Model License Agreement](https://huggingface.co/THUDM/glm-4-voice-9b/blob/main/LICENSE).
121
+
122
+ + The code in this open-source repository is licensed under the [Apache 2.0](LICENSE) License.
123
+
124
+ ## Citation
125
+
126
+ ```
127
+ @misc{zeng2024glm4,
128
+ title={GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot},
129
+ author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Kedong Wang and Shengmin Jiang and Lei Zhao and Yuxiao Dong and Jie Tang},
130
+ year={2024},
131
+ eprint={2412.02612},
132
+ archivePrefix={arXiv},
133
+ primaryClass={cs.CL},
134
+ url={https://arxiv.org/abs/2412.02612},
135
+ }
136
+ ```
137
+
138
+ ```
139
+ @misc{zeng2024scaling,
140
+ title={Scaling Speech-Text Pre-training with Synthetic Interleaved Data},
141
+ author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Lei Zhang and Shengmin Jiang and Yuxiao Dong and Jie Tang},
142
+ year={2024},
143
+ eprint={2411.17607},
144
+ archivePrefix={arXiv},
145
+ primaryClass={cs.CL},
146
+ url={https://arxiv.org/abs/2411.17607},
147
+ }
148
+ ```
third_party/GLM-4-Voice/audio_process.py ADDED
@@ -0,0 +1,93 @@
1
+ import os
2
+ import librosa
3
+ import soundfile as sf
4
+ import numpy as np
5
+ from pathlib import Path
6
+ import io
7
+
8
+ # Split audio stream at silence points to prevent playback stuttering issues
9
+ # caused by AAC encoder frame padding when streaming audio through Gradio audio components.
10
+ class AudioStreamProcessor:
11
+ def __init__(self, sr=22050, min_silence_duration=0.1, threshold_db=-40):
12
+ self.sr = sr
13
+ self.min_silence_duration = min_silence_duration
14
+ self.threshold_db = threshold_db
15
+ self.buffer = np.array([])
16
+
17
+
18
+ def process(self, audio_data, last=False):
19
+ """
20
+ Add audio data and process it
21
+ params:
22
+ audio_data: audio data in numpy array
23
+ last: whether this is the last chunk of data
24
+ returns:
25
+ Processed audio data, returns None if no split point is found
26
+ """
27
+
28
+ # Add new data to buffer
29
+ self.buffer = np.concatenate([self.buffer, audio_data]) if len(self.buffer) > 0 else audio_data
30
+
31
+ if last:
32
+ result = self.buffer
33
+ self.buffer = np.array([])
34
+ return self._to_wav_bytes(result)
35
+
36
+ # Find silence boundary
37
+ split_point = self._find_silence_boundary(self.buffer)
38
+
39
+ if split_point is not None:
40
+ # Modified: Extend split point to the end of silence
41
+ silence_end = self._find_silence_end(split_point)
42
+ result = self.buffer[:silence_end]
43
+ self.buffer = self.buffer[silence_end:]
44
+ return self._to_wav_bytes(result)
45
+
46
+ return None
47
+
48
+ def _find_silence_boundary(self, audio):
49
+ """
50
+ Find the starting point of silence boundary in audio
51
+ """
52
+ # Convert audio to decibels
53
+ db = librosa.amplitude_to_db(np.abs(audio), ref=np.max)
54
+
55
+ # Find points below threshold
56
+ silence_points = np.where(db < self.threshold_db)[0]
57
+
58
+ if len(silence_points) == 0:
59
+ return None
60
+
61
+ # Calculate minimum silence samples
62
+ min_silence_samples = int(self.min_silence_duration * self.sr)
63
+
64
+ # Search backwards for continuous silence segment starting point
65
+ for i in range(len(silence_points) - min_silence_samples, -1, -1):
66
+ if i < 0:
67
+ break
68
+ if np.all(np.diff(silence_points[i:i+min_silence_samples]) == 1):
69
+ return silence_points[i]
70
+
71
+ return None
72
+
73
+ def _find_silence_end(self, start_point):
74
+ """
75
+ Find the end point of silence segment
76
+ """
77
+ db = librosa.amplitude_to_db(np.abs(self.buffer[start_point:]), ref=np.max)
78
+ silence_points = np.where(db >= self.threshold_db)[0]
79
+
80
+ if len(silence_points) == 0:
81
+ return len(self.buffer)
82
+
83
+ return start_point + silence_points[0]
84
+
85
+ def _to_wav_bytes(self, audio_data):
86
+ """
87
+ trans_to_wav_bytes
88
+ """
89
+ wav_buffer = io.BytesIO()
90
+ sf.write(wav_buffer, audio_data, self.sr, format='WAV')
91
+ return wav_buffer.getvalue()
92
+
93
+
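
The `AudioStreamProcessor` defined in `audio_process.py` above buffers incoming audio and only emits a chunk once it finds a sufficiently long silent stretch (or when flushed with `last=True`). A minimal usage sketch follows; the synthetic sine input, chunk size, and output file names are illustrative assumptions, not part of the repository, and it assumes `audio_process.py` is importable from the working directory:

```python
import numpy as np

from audio_process import AudioStreamProcessor  # assumes third_party/GLM-4-Voice is on PYTHONPATH

processor = AudioStreamProcessor(sr=22050, min_silence_duration=0.1, threshold_db=-40)

# Synthetic signal: 1 s tone, 0.5 s silence, 1 s tone. Real code would feed decoder output instead.
stream = np.concatenate([
    0.5 * np.sin(2 * np.pi * 440 * np.arange(22050) / 22050),
    np.zeros(11025),
    0.5 * np.sin(2 * np.pi * 220 * np.arange(22050) / 22050),
]).astype(np.float32)

idx = 0
for start in range(0, len(stream), 11025):  # feed ~0.5 s chunks
    piece = stream[start:start + 11025]
    wav_bytes = processor.process(piece, last=(start + 11025 >= len(stream)))
    if wav_bytes is not None:  # a complete segment ending in silence, or the final flush
        with open(f"segment_{idx}.wav", "wb") as f:
            f.write(wav_bytes)
        idx += 1
```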
third_party/GLM-4-Voice/cosyvoice/__init__.py ADDED
File without changes
third_party/GLM-4-Voice/cosyvoice/bin/inference.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+
22
+ import torch
23
+ from torch.utils.data import DataLoader
24
+ import torchaudio
25
+ from hyperpyyaml import load_hyperpyyaml
26
+ from tqdm import tqdm
27
+ from cosyvoice.cli.model import CosyVoiceModel
28
+
29
+ from cosyvoice.dataset.dataset import Dataset
30
+
31
+ def get_args():
32
+ parser = argparse.ArgumentParser(description='inference with your model')
33
+ parser.add_argument('--config', required=True, help='config file')
34
+ parser.add_argument('--prompt_data', required=True, help='prompt data file')
35
+ parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
36
+ parser.add_argument('--tts_text', required=True, help='tts input file')
37
+ parser.add_argument('--llm_model', required=True, help='llm model file')
38
+ parser.add_argument('--flow_model', required=True, help='flow model file')
39
+ parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
40
+ parser.add_argument('--gpu',
41
+ type=int,
42
+ default=-1,
43
+ help='gpu id for this rank, -1 for cpu')
44
+ parser.add_argument('--mode',
45
+ default='sft',
46
+ choices=['sft', 'zero_shot'],
47
+ help='inference mode')
48
+ parser.add_argument('--result_dir', required=True, help='asr result file')
49
+ args = parser.parse_args()
50
+ print(args)
51
+ return args
52
+
53
+
54
+ def main():
55
+ args = get_args()
56
+ logging.basicConfig(level=logging.DEBUG,
57
+ format='%(asctime)s %(levelname)s %(message)s')
58
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
59
+
60
+ # Init cosyvoice models from configs
61
+ use_cuda = args.gpu >= 0 and torch.cuda.is_available()
62
+ device = torch.device('cuda' if use_cuda else 'cpu')
63
+ with open(args.config, 'r') as f:
64
+ configs = load_hyperpyyaml(f)
65
+
66
+ model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
67
+ model.load(args.llm_model, args.flow_model, args.hifigan_model)
68
+
69
+ test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
70
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
71
+
72
+ del configs
73
+ os.makedirs(args.result_dir, exist_ok=True)
74
+ fn = os.path.join(args.result_dir, 'wav.scp')
75
+ f = open(fn, 'w')
76
+ with torch.no_grad():
77
+ for batch_idx, batch in tqdm(enumerate(test_data_loader)):
78
+ utts = batch["utts"]
79
+ assert len(utts) == 1, "inference mode only supports batch size 1"
80
+ text = batch["text"]
81
+ text_token = batch["text_token"].to(device)
82
+ text_token_len = batch["text_token_len"].to(device)
83
+ tts_text = batch["tts_text"]
84
+ tts_index = batch["tts_index"]
85
+ tts_text_token = batch["tts_text_token"].to(device)
86
+ tts_text_token_len = batch["tts_text_token_len"].to(device)
87
+ speech_token = batch["speech_token"].to(device)
88
+ speech_token_len = batch["speech_token_len"].to(device)
89
+ speech_feat = batch["speech_feat"].to(device)
90
+ speech_feat_len = batch["speech_feat_len"].to(device)
91
+ utt_embedding = batch["utt_embedding"].to(device)
92
+ spk_embedding = batch["spk_embedding"].to(device)
93
+ if args.mode == 'sft':
94
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
95
+ 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
96
+ else:
97
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
98
+ 'prompt_text': text_token, 'prompt_text_len': text_token_len,
99
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
100
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
101
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
102
+ 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
103
+ model_output = model.inference(**model_input)
104
+ tts_key = '{}_{}'.format(utts[0], tts_index[0])
105
+ tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
106
+ torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
107
+ f.write('{} {}\n'.format(tts_key, tts_fn))
108
+ f.flush()
109
+ f.close()
110
+ logging.info('Result wav.scp saved in {}'.format(fn))
111
+
112
+
113
+ if __name__ == '__main__':
114
+ main()
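
A note on the output of `cosyvoice/bin/inference.py` above: it writes a Kaldi-style `wav.scp` into `--result_dir`, one `<utt_key> <wav_path>` pair per line. A minimal sketch for loading that index back (for example before scoring the synthesized wavs) could look like the following; the `exp/tts_results` path is only an illustrative placeholder for whatever `--result_dir` was set to:

```python
import os


def read_wav_scp(result_dir):
    """Parse the wav.scp written by cosyvoice/bin/inference.py into {utt_key: wav_path}."""
    entries = {}
    with open(os.path.join(result_dir, "wav.scp")) as f:
        for line in f:
            if not line.strip():
                continue
            key, path = line.strip().split(maxsplit=1)
            entries[key] = path
    return entries


# Example: wavs = read_wav_scp("exp/tts_results")
```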
third_party/GLM-4-Voice/cosyvoice/bin/train.py ADDED
@@ -0,0 +1,140 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+ import argparse
17
+ import datetime
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ from copy import deepcopy
21
+ import torch
22
+ import torch.distributed as dist
23
+ # import deepspeed
24
+ import pdb
25
+ from hyperpyyaml import load_hyperpyyaml
26
+
27
+ from torch.distributed.elastic.multiprocessing.errors import record
28
+
29
+ from cosyvoice.utils.executor import Executor
30
+ from cosyvoice.utils.train_utils import (
31
+ init_distributed,
32
+ init_dataset_and_dataloader,
33
+ init_optimizer_and_scheduler,
34
+ init_summarywriter, save_model,
35
+ wrap_cuda_model, check_modify_and_save_config)
36
+
37
+
38
+ def get_args():
39
+ parser = argparse.ArgumentParser(description='training your network')
40
+ parser.add_argument('--train_engine',
41
+ default='torch_ddp',
42
+ choices=['torch_ddp', 'deepspeed'],
43
+ help='Engine for paralleled training')
44
+ parser.add_argument('--model', required=True, help='model which will be trained')
45
+ parser.add_argument('--config', required=True, help='config file')
46
+ parser.add_argument('--train_data', required=True, help='train data file')
47
+ parser.add_argument('--cv_data', required=True, help='cv data file')
48
+ parser.add_argument('--checkpoint', help='checkpoint model')
49
+ parser.add_argument('--model_dir', required=True, help='save model dir')
50
+ parser.add_argument('--tensorboard_dir',
51
+ default='tensorboard',
52
+ help='tensorboard log dir')
53
+ parser.add_argument('--ddp.dist_backend',
54
+ dest='dist_backend',
55
+ default='nccl',
56
+ choices=['nccl', 'gloo'],
57
+ help='distributed backend')
58
+ parser.add_argument('--num_workers',
59
+ default=0,
60
+ type=int,
61
+ help='num of subprocess workers for reading')
62
+ parser.add_argument('--prefetch',
63
+ default=100,
64
+ type=int,
65
+ help='prefetch number')
66
+ parser.add_argument('--pin_memory',
67
+ action='store_true',
68
+ default=False,
69
+ help='Use pinned memory buffers used for reading')
70
+ parser.add_argument('--deepspeed.save_states',
71
+ dest='save_states',
72
+ default='model_only',
73
+ choices=['model_only', 'model+optimizer'],
74
+ help='save model/optimizer states')
75
+ parser.add_argument('--timeout',
76
+ default=30,
77
+ type=int,
78
+ help='timeout (in seconds) of cosyvoice_join.')
79
+ # parser = deepspeed.add_config_arguments(parser)
80
+ args = parser.parse_args()
81
+ return args
82
+
83
+
84
+ @record
85
+ def main():
86
+ args = get_args()
87
+ logging.basicConfig(level=logging.DEBUG,
88
+ format='%(asctime)s %(levelname)s %(message)s')
89
+
90
+ override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
91
+ with open(args.config, 'r') as f:
92
+ configs = load_hyperpyyaml(f, overrides=override_dict)
93
+ configs['train_conf'].update(vars(args))
94
+
95
+ # Init env for ddp
96
+ init_distributed(args)
97
+
98
+ # Get dataset & dataloader
99
+ train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
100
+ init_dataset_and_dataloader(args, configs)
101
+
102
+ # Do some sanity checks and save config to args.model_dir
103
+ configs = check_modify_and_save_config(args, configs)
104
+
105
+ # Tensorboard summary
106
+ writer = init_summarywriter(args)
107
+
108
+ # load checkpoint
109
+ model = configs[args.model]
110
+ if args.checkpoint is not None:
111
+ model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
112
+
113
+ # Dispatch model from cpu to gpu
114
+ model = wrap_cuda_model(args, model)
115
+
116
+ # Get optimizer & scheduler
117
+ model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
118
+ # pdb.set_trace()
119
+ # Save init checkpoints
120
+ info_dict = deepcopy(configs['train_conf'])
121
+ save_model(model, 'init', info_dict)
122
+
123
+ # Get executor
124
+ executor = Executor()
125
+
126
+ # Start training loop
127
+ for epoch in range(info_dict['max_epoch']):
128
+ executor.epoch = epoch
129
+ train_dataset.set_epoch(epoch)
130
+ dist.barrier()
131
+ # try:
132
+ # dist.barrier()
133
+ # except RuntimeError as e:
134
+ # logging.info('except RuntimeError as e: {}'.format(e))
135
+ group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
136
+ executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
137
+ dist.destroy_process_group(group_join)
138
+
139
+ if __name__ == '__main__':
140
+ main()
third_party/GLM-4-Voice/cosyvoice/cli/__init__.py ADDED
File without changes
third_party/GLM-4-Voice/cosyvoice/cli/cosyvoice.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import torch
16
+ from hyperpyyaml import load_hyperpyyaml
17
+ from modelscope import snapshot_download
18
+ from cosyvoice.cli.frontend import CosyVoiceFrontEnd
19
+ from cosyvoice.cli.model import CosyVoiceModel
20
+
21
+ class CosyVoice:
22
+
23
+ def __init__(self, model_dir):
24
+ instruct = True if '-Instruct' in model_dir else False
25
+ self.model_dir = model_dir
26
+ if not os.path.exists(model_dir):
27
+ model_dir = snapshot_download(model_dir)
28
+ with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
29
+ configs = load_hyperpyyaml(f)
30
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
31
+ configs['feat_extractor'],
32
+ '{}/campplus.onnx'.format(model_dir),
33
+ '{}/speech_tokenizer_v1.onnx'.format(model_dir),
34
+ '{}/spk2info.pt'.format(model_dir),
35
+ instruct,
36
+ configs['allowed_special'])
37
+ self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
38
+ self.model.load('{}/llm.pt'.format(model_dir),
39
+ '{}/flow.pt'.format(model_dir),
40
+ '{}/hift.pt'.format(model_dir))
41
+ del configs
42
+
43
+ def list_avaliable_spks(self):
44
+ spks = list(self.frontend.spk2info.keys())
45
+ return spks
46
+
47
+ def inference_sft(self, tts_text, spk_id):
48
+ tts_speeches = []
49
+ for i in self.frontend.text_normalize(tts_text, split=True):
50
+ model_input = self.frontend.frontend_sft(i, spk_id)
51
+ model_output = self.model.inference(**model_input)
52
+ tts_speeches.append(model_output['tts_speech'])
53
+ return {'tts_speech': torch.concat(tts_speeches, dim=1)}
54
+
55
+ def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
56
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False)
57
+ tts_speeches = []
58
+ for i in self.frontend.text_normalize(tts_text, split=True):
59
+ model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
60
+ model_output = self.model.inference(**model_input)
61
+ tts_speeches.append(model_output['tts_speech'])
62
+ return {'tts_speech': torch.concat(tts_speeches, dim=1)}
63
+
64
+ def inference_cross_lingual(self, tts_text, prompt_speech_16k):
65
+ if self.frontend.instruct is True:
66
+ raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
67
+ tts_speeches = []
68
+ for i in self.frontend.text_normalize(tts_text, split=True):
69
+ model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
70
+ model_output = self.model.inference(**model_input)
71
+ tts_speeches.append(model_output['tts_speech'])
72
+ return {'tts_speech': torch.concat(tts_speeches, dim=1)}
73
+
74
+ def inference_instruct(self, tts_text, spk_id, instruct_text):
75
+ if self.frontend.instruct is False:
76
+ raise ValueError('{} does not support instruct inference'.format(self.model_dir))
77
+ instruct_text = self.frontend.text_normalize(instruct_text, split=False)
78
+ tts_speeches = []
79
+ for i in self.frontend.text_normalize(tts_text, split=True):
80
+ model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
81
+ model_output = self.model.inference(**model_input)
82
+ tts_speeches.append(model_output['tts_speech'])
83
+ return {'tts_speech': torch.concat(tts_speeches, dim=1)}
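A minimal usage sketch for the CosyVoice wrapper above, assuming a locally available CosyVoice-300M-SFT checkpoint directory; the model directory, speaker choice, and output path are placeholders, and the 22050 Hz rate mirrors the frontend's resample rate.

# Hypothetical usage of the CosyVoice class above (paths and speaker id are placeholders).
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')   # falls back to a modelscope download if not local
spk_id = cosyvoice.list_avaliable_spks()[0]                     # speaker ids come from spk2info.pt
output = cosyvoice.inference_sft('Hello, this is a short test.', spk_id)
torchaudio.save('sft_output.wav', output['tts_speech'], 22050)  # tts_speech is a (1, num_samples) tensor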
third_party/GLM-4-Voice/cosyvoice/cli/frontend.py ADDED
@@ -0,0 +1,168 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ import onnxruntime
16
+ import torch
17
+ import numpy as np
18
+ import whisper
19
+ from typing import Callable
20
+ import torchaudio.compliance.kaldi as kaldi
21
+ import torchaudio
22
+ import os
23
+ import re
24
+ import inflect
25
+ try:
26
+ import ttsfrd
27
+ use_ttsfrd = True
28
+ except ImportError:
29
+ print("failed to import ttsfrd, use WeTextProcessing instead")
30
+ from tn.chinese.normalizer import Normalizer as ZhNormalizer
31
+ from tn.english.normalizer import Normalizer as EnNormalizer
32
+ use_ttsfrd = False
33
+ from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
34
+
35
+
36
+ class CosyVoiceFrontEnd:
37
+
38
+ def __init__(self,
39
+ get_tokenizer: Callable,
40
+ feat_extractor: Callable,
41
+ campplus_model: str,
42
+ speech_tokenizer_model: str,
43
+ spk2info: str = '',
44
+ instruct: bool = False,
45
+ allowed_special: str = 'all'):
46
+ self.tokenizer = get_tokenizer()
47
+ self.feat_extractor = feat_extractor
48
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
49
+ option = onnxruntime.SessionOptions()
50
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
51
+ option.intra_op_num_threads = 1
52
+ self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
53
+ self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=["CUDAExecutionProvider" if torch.cuda.is_available() else "CPUExecutionProvider"])
54
+ if os.path.exists(spk2info):
55
+ self.spk2info = torch.load(spk2info, map_location=self.device)
56
+ self.instruct = instruct
57
+ self.allowed_special = allowed_special
58
+ self.inflect_parser = inflect.engine()
59
+ self.use_ttsfrd = use_ttsfrd
60
+ if self.use_ttsfrd:
61
+ self.frd = ttsfrd.TtsFrontendEngine()
62
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
63
+ assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource'
64
+ self.frd.set_lang_type('pinyin')
65
+ self.frd.enable_pinyin_mix(True)
66
+ self.frd.set_breakmodel_index(1)
67
+ else:
68
+ self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
69
+ self.en_tn_model = EnNormalizer()
70
+
71
+ def _extract_text_token(self, text):
72
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
73
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
74
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
75
+ return text_token, text_token_len
76
+
77
+ def _extract_speech_token(self, speech):
78
+ feat = whisper.log_mel_spectrogram(speech, n_mels=128)
79
+ speech_token = self.speech_tokenizer_session.run(None, {self.speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
80
+ self.speech_tokenizer_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
81
+ speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
82
+ speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
83
+ return speech_token, speech_token_len
84
+
85
+ def _extract_spk_embedding(self, speech):
86
+ feat = kaldi.fbank(speech,
87
+ num_mel_bins=80,
88
+ dither=0,
89
+ sample_frequency=16000)
90
+ feat = feat - feat.mean(dim=0, keepdim=True)
91
+ embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
92
+ embedding = torch.tensor([embedding]).to(self.device)
93
+ return embedding
94
+
95
+ def _extract_speech_feat(self, speech):
96
+ speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
97
+ speech_feat = speech_feat.unsqueeze(dim=0)
98
+ speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
99
+ return speech_feat, speech_feat_len
100
+
101
+ def text_normalize(self, text, split=True):
102
+ text = text.strip()
103
+ if contains_chinese(text):
104
+ if self.use_ttsfrd:
105
+ text = self.frd.get_frd_extra_info(text, 'input')
106
+ else:
107
+ text = self.zh_tn_model.normalize(text)
108
+ text = text.replace("\n", "")
109
+ text = replace_blank(text)
110
+ text = replace_corner_mark(text)
111
+ text = text.replace(".", "、")
112
+ text = text.replace(" - ", ",")
113
+ text = remove_bracket(text)
114
+ text = re.sub(r'[,,]+$', '。', text)
115
+ texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
116
+ token_min_n=60, merge_len=20,
117
+ comma_split=False)]
118
+ else:
119
+ if self.use_ttsfrd:
120
+ text = self.frd.get_frd_extra_info(text, 'input')
121
+ else:
122
+ text = self.en_tn_model.normalize(text)
123
+ text = spell_out_number(text, self.inflect_parser)
124
+ texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
125
+ token_min_n=60, merge_len=20,
126
+ comma_split=False)]
127
+ if split is False:
128
+ return text
129
+ return texts
130
+
131
+ def frontend_sft(self, tts_text, spk_id):
132
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
133
+ embedding = self.spk2info[spk_id]['embedding']
134
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
135
+ return model_input
136
+
137
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
138
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
139
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
140
+ prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
141
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
142
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
143
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
144
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
145
+ 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
146
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
147
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
148
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
149
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
150
+ return model_input
151
+
152
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
153
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
154
+ # in cross lingual mode, we remove prompt in llm
155
+ del model_input['prompt_text']
156
+ del model_input['prompt_text_len']
157
+ del model_input['llm_prompt_speech_token']
158
+ del model_input['llm_prompt_speech_token_len']
159
+ return model_input
160
+
161
+ def frontend_instruct(self, tts_text, spk_id, instruct_text):
162
+ model_input = self.frontend_sft(tts_text, spk_id)
163
+ # in instruct mode, we remove spk_embedding in llm due to information leakage
164
+ del model_input['llm_embedding']
165
+ instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
166
+ model_input['prompt_text'] = instruct_text_token
167
+ model_input['prompt_text_len'] = instruct_text_token_len
168
+ return model_input
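A sketch of how these frontend methods are typically driven, mirroring CosyVoice.inference_zero_shot above; frontend and model are assumed to be constructed exactly as in CosyVoice.__init__, and prompt.wav plus both transcripts are placeholders.

# Hypothetical zero-shot call path using the frontend/model pair above.
import torch
import torchaudio

prompt_speech, sr = torchaudio.load('prompt.wav')                                   # placeholder prompt audio
prompt_speech_16k = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(prompt_speech)
prompt_text = frontend.text_normalize('Transcript of the prompt audio.', split=False)
pieces = []
for seg in frontend.text_normalize('Target sentence to synthesize.', split=True):  # sentence-level splitting
    model_input = frontend.frontend_zero_shot(seg, prompt_text, prompt_speech_16k)
    pieces.append(model.inference(**model_input)['tts_speech'])
speech = torch.concat(pieces, dim=1)                                                # 22.05 kHz waveform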
third_party/GLM-4-Voice/cosyvoice/cli/model.py ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+
16
+ class CosyVoiceModel:
17
+
18
+ def __init__(self,
19
+ llm: torch.nn.Module,
20
+ flow: torch.nn.Module,
21
+ hift: torch.nn.Module):
22
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23
+ self.llm = llm
24
+ self.flow = flow
25
+ self.hift = hift
26
+
27
+ def load(self, llm_model, flow_model, hift_model):
28
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
29
+ self.llm.to(self.device).eval()
30
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
31
+ self.flow.to(self.device).eval()
32
+ self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
33
+ self.hift.to(self.device).eval()
34
+
35
+ def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
36
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
37
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
38
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39
+ prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
40
+ tts_speech_token = self.llm.inference(text=text.to(self.device),
41
+ text_len=text_len.to(self.device),
42
+ prompt_text=prompt_text.to(self.device),
43
+ prompt_text_len=prompt_text_len.to(self.device),
44
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
45
+ prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
46
+ embedding=llm_embedding.to(self.device),
47
+ beam_size=1,
48
+ sampling=25,
49
+ max_token_text_ratio=30,
50
+ min_token_text_ratio=3)
51
+ tts_mel = self.flow.inference(token=tts_speech_token,
52
+ token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
53
+ prompt_token=flow_prompt_speech_token.to(self.device),
54
+ prompt_token_len=flow_prompt_speech_token_len.to(self.device),
55
+ prompt_feat=prompt_speech_feat.to(self.device),
56
+ prompt_feat_len=prompt_speech_feat_len.to(self.device),
57
+ embedding=flow_embedding.to(self.device))
58
+ tts_speech = self.hift.inference(mel=tts_mel).cpu()
59
+ torch.cuda.empty_cache()
60
+ return {'tts_speech': tts_speech}
61
+
62
+ def text_to_token(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
63
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
64
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
65
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
66
+ prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
67
+ tts_speech_token = self.llm.inference(text=text.to(self.device),
68
+ text_len=text_len.to(self.device),
69
+ prompt_text=prompt_text.to(self.device),
70
+ prompt_text_len=prompt_text_len.to(self.device),
71
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
72
+ prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
73
+ embedding=llm_embedding.to(self.device),
74
+ beam_size=1,
75
+ sampling=25,
76
+ max_token_text_ratio=30,
77
+ min_token_text_ratio=3)
78
+ return tts_speech_token
79
+
80
+ def token_to_speech(self, tts_speech_token, flow_embedding, llm_embedding=torch.zeros(0, 192),
81
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
82
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
83
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
84
+ prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
85
+
86
+ tts_mel = self.flow.inference(token=tts_speech_token,
87
+ token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
88
+ prompt_token=flow_prompt_speech_token.to(self.device),
89
+ prompt_token_len=flow_prompt_speech_token_len.to(self.device),
90
+ prompt_feat=prompt_speech_feat.to(self.device),
91
+ prompt_feat_len=prompt_speech_feat_len.to(self.device),
92
+ embedding=flow_embedding.to(self.device))
93
+ tts_speech = self.hift.inference(mel=tts_mel).cpu()
94
+ torch.cuda.empty_cache()
95
+ return {'tts_speech': tts_speech}
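The text_to_token / token_to_speech pair above splits inference into its LLM half and its flow + HiFT vocoder half; a rough sketch of that split, assuming model_input was produced by one of the frontend_* methods and model is a loaded CosyVoiceModel.

# Hypothetical two-stage decode with the split API above (model_input assumed from the frontend).
speech_token = model.text_to_token(**model_input)                  # discrete speech tokens from the LLM
result = model.token_to_speech(speech_token,
                               flow_embedding=model_input['flow_embedding'])
wav = result['tts_speech']                                         # same output format as model.inference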
third_party/GLM-4-Voice/cosyvoice/dataset/__init__.py ADDED
File without changes
third_party/GLM-4-Voice/cosyvoice/dataset/dataset.py ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import random
17
+ import json
18
+ import math
19
+ from functools import partial
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ from torch.utils.data import IterableDataset
24
+ from cosyvoice.utils.file_utils import read_lists, read_json_lists
25
+
26
+
27
+ class Processor(IterableDataset):
28
+
29
+ def __init__(self, source, f, *args, **kw):
30
+ assert callable(f)
31
+ self.source = source
32
+ self.f = f
33
+ self.args = args
34
+ self.kw = kw
35
+
36
+ def set_epoch(self, epoch):
37
+ self.source.set_epoch(epoch)
38
+
39
+ def __iter__(self):
40
+ """ Return an iterator over the source dataset processed by the
41
+ given processor.
42
+ """
43
+ assert self.source is not None
44
+ assert callable(self.f)
45
+ return self.f(iter(self.source), *self.args, **self.kw)
46
+
47
+ def apply(self, f):
48
+ assert callable(f)
49
+ return Processor(self, f, *self.args, **self.kw)
50
+
51
+
52
+ class DistributedSampler:
53
+
54
+ def __init__(self, shuffle=True, partition=True):
55
+ self.epoch = -1
56
+ self.update()
57
+ self.shuffle = shuffle
58
+ self.partition = partition
59
+
60
+ def update(self):
61
+ assert dist.is_available()
62
+ if dist.is_initialized():
63
+ self.rank = dist.get_rank()
64
+ self.world_size = dist.get_world_size()
65
+ else:
66
+ self.rank = 0
67
+ self.world_size = 1
68
+ worker_info = torch.utils.data.get_worker_info()
69
+ if worker_info is None:
70
+ self.worker_id = 0
71
+ self.num_workers = 1
72
+ else:
73
+ self.worker_id = worker_info.id
74
+ self.num_workers = worker_info.num_workers
75
+ return dict(rank=self.rank,
76
+ world_size=self.world_size,
77
+ worker_id=self.worker_id,
78
+ num_workers=self.num_workers)
79
+
80
+ def set_epoch(self, epoch):
81
+ self.epoch = epoch
82
+
83
+ def sample(self, data):
84
+ """ Sample data according to rank/world_size/num_workers
85
+
86
+ Args:
87
+ data(List): input data list
88
+
89
+ Returns:
90
+ List: data list after sample
91
+ """
92
+ data = list(range(len(data)))
93
+ # force datalist even
94
+ if self.partition:
95
+ if self.shuffle:
96
+ random.Random(self.epoch).shuffle(data)
97
+ if len(data) < self.world_size:
98
+ data = data * math.ceil(self.world_size / len(data))
99
+ data = data[:self.world_size]
100
+ data = data[self.rank::self.world_size]
101
+ if len(data) < self.num_workers:
102
+ data = data * math.ceil(self.num_workers / len(data))
103
+ data = data[:self.num_workers]
104
+ data = data[self.worker_id::self.num_workers]
105
+ return data
106
+
107
+
108
+ class DataList(IterableDataset):
109
+
110
+ def __init__(self, lists, shuffle=True, partition=True):
111
+ self.lists = lists
112
+ self.sampler = DistributedSampler(shuffle, partition)
113
+
114
+ def set_epoch(self, epoch):
115
+ self.sampler.set_epoch(epoch)
116
+
117
+ def __iter__(self):
118
+ sampler_info = self.sampler.update()
119
+ indexes = self.sampler.sample(self.lists)
120
+ for index in indexes:
121
+ data = dict(src=self.lists[index])
122
+ data.update(sampler_info)
123
+ yield data
124
+
125
+
126
+ def Dataset(data_list_file,
127
+ data_pipeline,
128
+ mode='train',
129
+ shuffle=True,
130
+ partition=True,
131
+ tts_file='',
132
+ prompt_utt2data=''):
133
+ """ Construct dataset from arguments
134
+
135
+ We have two shuffle stages in the Dataset. The first is global
136
+ shuffle at shards tar/raw file level. The second is global shuffle
137
+ at training samples level.
138
+
139
+ Args:
140
+ data_type(str): raw/shard
141
+ tokenizer (BaseTokenizer): tokenizer to tokenize
142
+ partition(bool): whether to do data partition in terms of rank
143
+ """
144
+ assert mode in ['train', 'inference']
145
+ lists = read_lists(data_list_file)
146
+ # import pdb
147
+ # pdb.set_trace()
148
+ if mode == 'inference':
149
+ with open(tts_file) as f:
150
+ tts_data = json.load(f)
151
+ utt2lists = read_json_lists(prompt_utt2data)
152
+ # filter unnecessary file in inference mode
153
+ lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists]))
154
+ dataset = DataList(lists, shuffle=shuffle, partition=partition)
155
+ if mode == 'inference':
156
+ # map partial arg tts_data in inference mode
157
+ data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
158
+ for func in data_pipeline:
159
+ dataset = Processor(dataset, func, mode=mode)
160
+ return dataset
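A sketch of how Dataset() composes a pipeline out of the processor functions defined in cosyvoice/dataset/processor.py below; in the real recipes the stage list, its arguments, and the feature extractor come from cosyvoice.yaml via hyperpyyaml, so this pipeline is deliberately abbreviated (tokenize, compute_fbank, and padding would sit between these stages) and train.list is a placeholder shard list.

# Hypothetical pipeline assembly around the Dataset() factory above.
from functools import partial
from torch.utils.data import DataLoader
from cosyvoice.dataset.dataset import Dataset
import cosyvoice.dataset.processor as processor

data_pipeline = [
    processor.parquet_opener,                                      # read samples out of parquet shards
    partial(processor.resample, resample_rate=22050),              # unify the sample rate
    partial(processor.shuffle, shuffle_size=1000),                 # buffered local shuffle
    partial(processor.batch, batch_type='static', batch_size=16),  # group samples into lists
]
train_dataset = Dataset('train.list', data_pipeline=data_pipeline,
                        mode='train', shuffle=True, partition=True)
train_dataset.set_epoch(0)
train_loader = DataLoader(train_dataset, batch_size=None, num_workers=2)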
third_party/GLM-4-Voice/cosyvoice/dataset/processor.py ADDED
@@ -0,0 +1,965 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+ import json
17
+ import tarfile
18
+ import json
19
+ import io
20
+ import pyarrow.parquet as pq
21
+ from io import BytesIO
22
+ import torch
23
+ import torchaudio
24
+ from torch.nn.utils.rnn import pad_sequence
25
+ import torch.nn.functional as F
26
+ import tarfile
27
+ import json
28
+ import io
29
+ import wave
30
+ import numpy as np
31
+ import torchaudio
32
+ import os
33
+ import sys
34
+ import json
35
+ import random
36
+ import pickle
37
+ import argparse
38
+ import itertools
39
+ import mmap
40
+ import struct
41
+ import collections
42
+
43
+
44
+
45
+ import shutil
46
+ import multiprocessing as mp
47
+ from pathlib import Path
48
+
49
+ from tqdm import tqdm
50
+ from collections import defaultdict
51
+ from copy import deepcopy
52
+ from datetime import datetime
53
+ import pickle
54
+
55
+ from wids import wids
56
+ import math
57
+
58
+ torchaudio.set_audio_backend('soundfile')
59
+
60
+ AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
61
+
62
+ try:
63
+ MAIN_SPK_EMBEDDING=torch.load("/workspace/audio_checkpoints/flow_model/spk_embedding/0909/mean_embedding.pt")
64
+ GPT_SPK_EMBEDDING=torch.load("/workspace/audio_checkpoints/flow_model/spk_embedding/0909/spk_mean_embeddings.pt")
65
+ except:
66
+ MAIN_SPK_EMBEDDING=torch.zeros(1,192)
67
+ GPT_SPK_EMBEDDING=torch.zeros(1,192)
68
+
69
+ def parquet_opener(data, mode='train', tts_data={}):
70
+ """ Give url or local file, return file descriptor
71
+ Inplace operation.
72
+
73
+ Args:
74
+ data(Iterable[str]): url or local file list
75
+
76
+ Returns:
77
+ Iterable[{src, stream}]
78
+ """
79
+ for sample in data:
80
+ assert 'src' in sample
81
+ url = sample['src']
82
+ try:
83
+ df = pq.read_table(url).to_pandas()
84
+ for i in range(len(df)):
85
+ if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
86
+ continue
87
+ sample.update(dict(df.loc[i]))
88
+ if mode == 'train':
89
+ # NOTE do not return sample directly, must initialize a new dict
90
+ yield {**sample}
91
+ else:
92
+ for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
93
+ yield {**sample, 'tts_index': index, 'tts_text': text}
94
+ except Exception as ex:
95
+ logging.warning('Failed to open {}, ex info {}'.format(url, ex))
96
+
97
+
98
+
99
+
100
+ def parse_tar_header(header_bytes):
101
+ header = struct.unpack("!100s8s8s8s12s12s8s1s100s6s2s32s32s8s8s155s", header_bytes)
102
+ return TarHeader(*header)
103
+
104
+ TarHeader = collections.namedtuple(
105
+ "TarHeader",
106
+ [
107
+ "name",
108
+ "mode",
109
+ "uid",
110
+ "gid",
111
+ "size",
112
+ "mtime",
113
+ "chksum",
114
+ "typeflag",
115
+ "linkname",
116
+ "magic",
117
+ "version",
118
+ "uname",
119
+ "gname",
120
+ "devmajor",
121
+ "devminor",
122
+ "prefix",
123
+ ],
124
+ )
125
+
126
+ class MMTar:
127
+ def __init__(self, file_path: Path | str):
128
+ self.stream = open(file_path, "rb")
129
+ self.mmap = mmap.mmap(self.stream.fileno(), 0, access=mmap.ACCESS_READ)
130
+
131
+ def __del__(self):
132
+ try:
133
+ self.mmap.close()
134
+ self.stream.close()
135
+ except: # noqa
136
+ pass
137
+
138
+ def get_at_offset(self, offset) -> tuple[str, bytes]:
139
+ header = parse_tar_header(self.mmap[offset : offset + 500])
140
+ name = header.name.decode("utf-8").strip("\x00")
141
+ start = offset + 512
142
+ end = start + int(header.size.decode("utf-8")[:-1], 8)
143
+ return name, self.mmap[start:end]
144
+
145
+
146
+ class Tar:
147
+ def __init__(self, path: Path):
148
+ self.tar = MMTar(path)
149
+ indices_path = path.with_suffix(".index")
150
+ self.index = pickle.loads(indices_path.read_bytes())
151
+ self.name_mapping = {}
152
+ for name, offset, _ in self.index:
153
+ self.name_mapping[name] = offset
154
+
155
+ def read(self, name: str) -> bytes:
156
+ return self.tar.get_at_offset(self.name_mapping[name])[1]
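A small usage sketch for the MMTar/Tar helpers above, assuming a shard part-00000.tar with a pickled part-00000.index sidecar (a list of (name, offset, ...) tuples, as the loop above expects); both paths and the member name are placeholders.

# Hypothetical read of a single member from an indexed tar shard (paths are placeholders).
from pathlib import Path

shard = Tar(Path('/data/shards/part-00000.tar'))     # also requires /data/shards/part-00000.index next to it
wav_bytes = shard.read('utt_0001.wav')               # raw bytes located via the mmap'ed header offset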
157
+
158
+ def cosy_jsonl_opener(data, mode='train', tts_data={}):
159
+ """ Give url or local file, return file descriptor
160
+ Inplace operation.
161
+
162
+ Args:
163
+ data(Iterable[str]): url or local file list
164
+
165
+ Returns:
166
+ Iterable[{src, stream}]
167
+ """
168
+ for sample in data:
169
+ assert 'src' in sample
170
+ cosy_jsonl_path = sample['src']
171
+ tar_file_path=cosy_jsonl_path.replace(".vq0907.jsonl",".tar")
172
+ try:
173
+ tar_data=Tar(Path(tar_file_path))
174
+ with open(cosy_jsonl_path, 'r') as f:
175
+ for line in f:
176
+ item=json.loads(line)
177
+ cosy_token = item['cosy_token']
178
+ sample['speech_token']=torch.tensor(cosy_token)
179
+ sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename'])))
180
+ # print(item['filename'])
181
+ yield {**sample}
182
+
183
+ except Exception as ex:
184
+ logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex))
185
+
186
+
187
+ def cosy_jsonl_opener_vq0918_nopool(data, mode='train', tts_data={}):
188
+ """ Give url or local file, return file descriptor
189
+ Inplace operation.
190
+
191
+ Args:
192
+ data(Iterable[str]): url or local file list
193
+
194
+ Returns:
195
+ Iterable[{src, stream}]
196
+ """
197
+ for sample in data:
198
+ assert 'src' in sample
199
+ cosy_jsonl_path = sample['src']
200
+ tar_file_path=cosy_jsonl_path.replace(".vq0918-nopool.jsonl",".tar")
201
+
202
+
203
+ try:
204
+ tar_data=Tar(Path(tar_file_path))
205
+ with open(cosy_jsonl_path, 'r') as f:
206
+ # cosy_data = [json.loads(line) for line in f]
207
+ for line in f:
208
+ item=json.loads(line)
209
+ cosy_token = item['cosy_token']
210
+ sample['speech_token']=torch.tensor(cosy_token)
211
+ sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename'])))
212
+ # print(item['filename'])
213
+ yield {**sample}
214
+
215
+ except Exception as ex:
216
+ logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex))
217
+
218
+
219
+
220
+ def cosy_jsonl_opener_vq0918_pool2(data, mode='train', tts_data={}):
221
+ """ Give url or local file, return file descriptor
222
+ Inplace operation.
223
+
224
+ Args:
225
+ data(Iterable[str]): url or local file list
226
+
227
+ Returns:
228
+ Iterable[{src, stream}]
229
+ """
230
+ for sample in data:
231
+ assert 'src' in sample
232
+ cosy_jsonl_path = sample['src']
233
+ tar_file_path=cosy_jsonl_path.replace(".vq0918-pool2.jsonl",".tar")
234
+
235
+ try:
236
+ tar_data=Tar(Path(tar_file_path))
237
+ with open(cosy_jsonl_path, 'r') as f:
238
+ for line in f:
239
+ item=json.loads(line)
240
+ cosy_token = item['cosy_token']
241
+ sample['speech_token']=torch.tensor(cosy_token)
242
+ sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename'])))
243
+
244
+ yield {**sample}
245
+
246
+ except Exception as ex:
247
+ logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex))
248
+
249
+
250
+ def cosy_jsonl_opener_vq0918_pool4(data, mode='train', tts_data={}):
251
+ """ Give url or local file, return file descriptor
252
+ Inplace operation.
253
+
254
+ Args:
255
+ data(Iterable[str]): url or local file list
256
+
257
+ Returns:
258
+ Iterable[{src, stream}]
259
+ """
260
+ for sample in data:
261
+ assert 'src' in sample
262
+ cosy_jsonl_path = sample['src']
263
+ tar_file_path=cosy_jsonl_path.replace(".vq0918-pool4.jsonl",".tar")
264
+ try:
265
+ tar_data=Tar(Path(tar_file_path))
266
+ with open(cosy_jsonl_path, 'r') as f:
267
+ # cosy_data = [json.loads(line) for line in f]
268
+ for line in f:
269
+ item=json.loads(line)
270
+ cosy_token = item['cosy_token']
271
+ sample['speech_token']=torch.tensor(cosy_token)
272
+ sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename'])))
273
+ # print(item['filename'])
274
+ yield {**sample}
275
+
276
+ except Exception as ex:
277
+ logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex))
278
+
279
+
280
+ def cosy_jsonl_opener_vq0918_pool8(data, mode='train', tts_data={}):
281
+ """ Give url or local file, return file descriptor
282
+ Inplace operation.
283
+
284
+ Args:
285
+ data(Iterable[str]): url or local file list
286
+
287
+ Returns:
288
+ Iterable[{src, stream}]
289
+ """
290
+ for sample in data:
291
+ assert 'src' in sample
292
+ cosy_jsonl_path = sample['src']
293
+ tar_file_path=cosy_jsonl_path.replace(".vq0918-pool8.jsonl",".tar")
294
+
295
+ try:
296
+ tar_data=Tar(Path(tar_file_path))
297
+ with open(cosy_jsonl_path, 'r') as f:
298
+ # cosy_data = [json.loads(line) for line in f]
299
+ for line in f:
300
+ item=json.loads(line)
301
+ cosy_token = item['cosy_token']
302
+ sample['speech_token']=torch.tensor(cosy_token)
303
+ sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename'])))
304
+ # print(item['filename'])
305
+ yield {**sample}
306
+
307
+ except Exception as ex:
308
+ logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex))
309
+
310
+
311
+
312
+ def process_sft_vq0918_pool4(data, mode='train', tts_data={}):
313
+ for sample in data:
314
+ assert 'src' in sample
315
+
316
+ token_npy_path = sample['src']
317
+ wav_path=token_npy_path.replace(".vq0918-pool4.npy","")
318
+
319
+ # wav_path,token_npy_path=sample['src'].split(' ')
320
+ try:
321
+ sample['speech_token']=torch.tensor(np.load(token_npy_path))
322
+ sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
323
+ if sample['speech'].shape[0] > 1:
324
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
325
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
326
+ yield {**sample}
327
+ except Exception as ex:
328
+ logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
329
+ logging.warning('Failed to open {}'.format(wav_path))
330
+
331
+
332
+ def process_sft_vq0918_pool4_split(data, mode='train',split_token=25, tts_data={}):
333
+ for sample in data:
334
+ assert 'src' in sample
335
+
336
+ token_npy_path = sample['src']
337
+ wav_path=token_npy_path.replace(".vq0918-pool4.npy","")
338
+
339
+ # wav_path,token_npy_path=sample['src'].split(' ')
340
+ try:
341
+ # sample['speech_token']=torch.tensor(np.load(token_npy_path))
342
+ # sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
343
+ # if sample['speech'].shape[0] > 1:
344
+ # sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
345
+
346
+
347
+ # sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
348
+
349
+
350
+ speech_token=torch.tensor(np.load(token_npy_path))
351
+ speech,sample_rate= torchaudio.load(wav_path)
352
+ # split_speech=int(split_token / 12.5 * sample_rate)
353
+ if speech.shape[0] > 1:
354
+ speech = speech.mean(dim=0, keepdim=True)
355
+
356
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
357
+ sample['sample_rate']=sample_rate
358
+
359
+ num_splits = (speech_token.size(0) + split_token - 1) // split_token
360
+
361
+ for split_id in range(num_splits):
362
+ end_token_idx = min((split_id + 1) * split_token, speech_token.size(0))
363
+ end_speech_idx=int(np.ceil(end_token_idx / 12.5 * sample_rate))
364
+ sample['speech_token']=speech_token[:end_token_idx]
365
+ sample['speech']=speech[:,:end_speech_idx]
366
+ print(sample['speech_token'].size(),sample['speech'].size())
367
+ yield {**sample}
368
+ except Exception as ex:
369
+ logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
370
+ logging.warning('Failed to open {}'.format(wav_path))
371
+
372
+
373
+ def process_sft_vq0918_pool2(data, mode='train', tts_data={}):
374
+ for sample in data:
375
+ assert 'src' in sample
376
+
377
+ token_npy_path = sample['src'].replace(".vq0918-pool4.npy",".vq0918-pool2.npy")
378
+ wav_path=token_npy_path.replace(".vq0918-pool2.npy","")
379
+
380
+ # wav_path,token_npy_path=sample['src'].split(' ')
381
+ try:
382
+ sample['speech_token']=torch.tensor(np.load(token_npy_path))
383
+ sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
384
+ if sample['speech'].shape[0] > 1:
385
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
386
+
387
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
388
+ yield {**sample}
389
+ except Exception as ex:
390
+ logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
391
+ logging.warning('Failed to open {}'.format(wav_path))
392
+
393
+
394
+ def process_sft_vq0918_pool2_split(data, mode='train',split_token=50, tts_data={}):
395
+ for sample in data:
396
+ assert 'src' in sample
397
+
398
+ token_npy_path = sample['src']
399
+ wav_path=token_npy_path.replace(".vq0918-pool2.npy","")
400
+
401
+ # wav_path,token_npy_path=sample['src'].split(' ')
402
+ try:
403
+ # sample['speech_token']=torch.tensor(np.load(token_npy_path))
404
+ # sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
405
+ # if sample['speech'].shape[0] > 1:
406
+ # sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
407
+
408
+
409
+ # sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
410
+
411
+
412
+ speech_token=torch.tensor(np.load(token_npy_path))
413
+ speech,sample_rate= torchaudio.load(wav_path)
414
+ # split_speech=int(split_token / 12.5 * sample_rate)
415
+ if speech.shape[0] > 1:
416
+ speech = speech.mean(dim=0, keepdim=True)
417
+
418
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
419
+ sample['sample_rate']=sample_rate
420
+
421
+ num_splits = (speech_token.size(0) + split_token - 1) // split_token
422
+
423
+ for split_id in range(num_splits):
424
+ end_token_idx = min((split_id + 1) * split_token, speech_token.size(0))
425
+ end_speech_idx=int(np.ceil(end_token_idx / 25 * sample_rate))
426
+ sample['speech_token']=speech_token[:end_token_idx]
427
+ sample['speech']=speech[:,:end_speech_idx]
428
+ print(sample['speech_token'].size(),sample['speech'].size())
429
+ yield {**sample}
430
+ except Exception as ex:
431
+ logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
432
+ logging.warning('Failed to open {}'.format(wav_path))
433
+
434
+ def process_sft_vq0918_pool4_gpt(data, mode='train', tts_data={}):
435
+ for sample in data:
436
+ assert 'src' in sample
437
+ try:
438
+ entry=json.loads(sample['src'])
439
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
440
+
441
+ for conv in entry["conversations"]:
442
+ if "response_wav" in conv:
443
+ wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}"
444
+ token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy")
445
+ sample['speech_token']=torch.tensor(np.load(token_npy_path))
446
+ sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
447
+ if sample['speech'].shape[0] > 1:
448
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
449
+ sample['spk_embedding']=GPT_SPK_EMBEDDING  # assumed fix: original line referenced an undefined 'spk_embedding'
450
+ yield {**sample}
451
+ except Exception as ex:
452
+ # logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
453
+ logging.warning('Failed to open {}'.format(wav_path))
454
+
455
+
456
+ def process_sft_vq0918_pool4_gpt_1010(data, mode='train', tts_data={}):
457
+ for sample in data:
458
+ assert 'src' in sample
459
+ try:
460
+ entry=json.loads(sample['src'])
461
+ sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING)
462
+
463
+ for conv in entry["conversations"]:
464
+ if "response_wav" in conv:
465
+ wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}"
466
+ token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy")
467
+ sample['speech_token']=torch.tensor(np.load(token_npy_path))
468
+ sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
469
+ if sample['speech'].shape[0] > 1:
470
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
471
+ sample['spk_embedding']=GPT_SPK_EMBEDDING  # assumed fix: original line referenced an undefined 'spk_embedding'
472
+ yield {**sample}
473
+ if "prompt_wav" in conv:
474
+ wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}"
475
+ token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy")
476
+ sample['speech_token']=torch.tensor(np.load(token_npy_path))
477
+ sample['speech'], sample['sample_rate']= torchaudio.load(wav_path)
478
+ if sample['speech'].shape[0] > 1:
479
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
480
+ sample['spk_embedding']=GPT_SPK_EMBEDDING  # assumed fix: original line referenced an undefined 'spk_embedding'
481
+ yield {**sample}
482
+ except Exception as ex:
483
+ # logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex))
484
+ logging.warning('Failed to open {}'.format(wav_path))
485
+
486
+
487
+ def filter(data,
488
+ max_length=10240,
489
+ min_length=10,
490
+ token_max_length=200,
491
+ token_min_length=1,
492
+ min_output_input_ratio=0.0005,
493
+ max_output_input_ratio=1,
494
+ mode='train'):
495
+ """ Filter sample according to feature and label length
496
+ Inplace operation.
497
+
498
+ Args:
499
+ data: Iterable[{key, wav, label, sample_rate}]
500
+ max_length: drop utterance which is greater than max_length(10ms)
501
+ min_length: drop utterance which is less than min_length(10ms)
502
+ token_max_length: drop utterance which is greater than
503
+ token_max_length, especially when use char unit for
504
+ english modeling
505
+ token_min_length: drop utterance which is
506
+ less than token_min_length
507
+ min_output_input_ratio: minimal ratio of
508
+ token_length / feats_length(10ms)
509
+ max_output_input_ratio: maximum ratio of
510
+ token_length / feats_length(10ms)
511
+
512
+ Returns:
513
+ Iterable[{key, wav, label, sample_rate}]
514
+ """
515
+ for sample in data:
516
+ # sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
517
+ # del sample['audio_data']
518
+ # sample['wav'] is torch.Tensor, we have 100 frames every second
519
+ num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
520
+ if num_frames < min_length:
521
+ continue
522
+ if num_frames > max_length:
523
+ continue
524
+ if len(sample['text_token']) < token_min_length:
525
+ continue
526
+ if len(sample['text_token']) > token_max_length:
527
+ continue
528
+ if len(sample['speech_token']) == 0:
529
+ continue
530
+ if num_frames != 0:
531
+ if len(sample['text_token']) / num_frames < min_output_input_ratio:
532
+ continue
533
+ if len(sample['text_token']) / num_frames > max_output_input_ratio:
534
+ continue
535
+ yield sample
536
+
537
+
538
+ def filter_speech_token(data,
539
+ max_length=10240,
540
+ min_length=10,
541
+ token_max_length=5000,
542
+ token_min_length=1,
543
+ min_output_input_ratio=0.0005,
544
+ max_output_input_ratio=30,
545
+ mode='train'):
546
+ """ Filter sample according to feature and label length
547
+ Inplace operation.
548
+
549
+ Args:
550
+ data: Iterable[{key, wav, label, sample_rate}]
551
+ max_length: drop utterance which is greater than max_length(10ms)
552
+ min_length: drop utterance which is less than min_length(10ms)
553
+ token_max_length: drop utterance which is greater than
554
+ token_max_length, especially when use char unit for
555
+ english modeling
556
+ token_min_length: drop utterance which is
557
+ less than token_min_length
558
+ min_output_input_ratio: minimal ratio of
559
+ token_length / feats_length(10ms)
560
+ max_output_input_ratio: maximum ratio of
561
+ token_length / feats_length(10ms)
562
+
563
+ Returns:
564
+ Iterable[{key, wav, label, sample_rate}]
565
+ """
566
+ for sample in data:
567
+ # sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
568
+ # del sample['audio_data']
569
+ # sample['wav'] is torch.Tensor, we have 100 frames every second
570
+ num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
571
+ if num_frames < min_length:
572
+ continue
573
+ if num_frames > max_length:
574
+ continue
575
+ if len(sample['speech_token']) < token_min_length:
576
+ continue
577
+ if len(sample['speech_token']) > token_max_length:
578
+ continue
579
+ if len(sample['speech_token']) == 0:
580
+ continue
581
+ if num_frames != 0:
582
+ if len(sample['speech_token']) / num_frames < min_output_input_ratio:
583
+ continue
584
+ if len(sample['speech_token']) / num_frames > max_output_input_ratio:
585
+ continue
586
+ yield sample
587
+
588
+
589
+ def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
590
+ """ Resample data.
591
+ Inplace operation.
592
+
593
+ Args:
594
+ data: Iterable[{key, wav, label, sample_rate}]
595
+ resample_rate: target resample rate
596
+
597
+ Returns:
598
+ Iterable[{key, wav, label, sample_rate}]
599
+ """
600
+ for sample in data:
601
+ assert 'sample_rate' in sample
602
+ assert 'speech' in sample
603
+ sample_rate = sample['sample_rate']
604
+ waveform = sample['speech']
605
+ if sample_rate != resample_rate:
606
+ if sample_rate < min_sample_rate:
607
+ continue
608
+ sample['sample_rate'] = resample_rate
609
+ sample['speech'] = torchaudio.transforms.Resample(
610
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
611
+ max_val = sample['speech'].abs().max()
612
+ if max_val > 1:
613
+ sample['speech'] /= max_val
614
+ yield sample
615
+
616
+
617
+ def compute_fbank(data,
618
+ feat_extractor,
619
+ mode='train'):
620
+ """ Extract fbank
621
+
622
+ Args:
623
+ data: Iterable[{key, wav, label, sample_rate}]
624
+
625
+ Returns:
626
+ Iterable[{key, feat, label}]
627
+ """
628
+ for sample in data:
629
+ assert 'sample_rate' in sample
630
+ assert 'speech' in sample
631
+ # assert 'utt' in sample
632
+ # assert 'text_token' in sample
633
+ waveform = sample['speech']
634
+ mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
635
+ sample['speech_feat'] = mat
636
+ del sample['speech']
637
+ yield sample
638
+
639
+
640
+ def parse_embedding(data, normalize, mode='train'):
641
+ """ Parse utt_embedding/spk_embedding
642
+
643
+ Args:
644
+ data: Iterable[{key, wav, label, sample_rate}]
645
+
646
+ Returns:
647
+ Iterable[{key, feat, label}]
648
+ """
649
+ for sample in data:
650
+ sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
651
+ sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
652
+ if normalize:
653
+ sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
654
+ sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
655
+ yield sample
656
+
657
+
658
+ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
659
+ """ Decode text to chars or BPE
660
+ Inplace operation
661
+
662
+ Args:
663
+ data: Iterable[{key, wav, txt, sample_rate}]
664
+
665
+ Returns:
666
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
667
+ """
668
+ tokenizer = get_tokenizer()
669
+ for sample in data:
670
+ assert 'text' in sample
671
+ sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
672
+ if mode == 'inference':
673
+ sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
674
+ yield sample
675
+
676
+
677
+ def shuffle(data, shuffle_size=10000, mode='train'):
678
+ """ Local shuffle the data
679
+
680
+ Args:
681
+ data: Iterable[{key, feat, label}]
682
+ shuffle_size: buffer size for shuffle
683
+
684
+ Returns:
685
+ Iterable[{key, feat, label}]
686
+ """
687
+ buf = []
688
+ for sample in data:
689
+ buf.append(sample)
690
+ if len(buf) >= shuffle_size:
691
+ random.shuffle(buf)
692
+ for x in buf:
693
+ yield x
694
+ buf = []
695
+ # The sample left over
696
+ random.shuffle(buf)
697
+ for x in buf:
698
+ yield x
699
+
700
+
701
+ def sort(data, sort_size=500, mode='train'):
702
+ """ Sort the data by feature length.
703
+ Sort is used after shuffle and before batch, so we can group
704
+ utts with similar lengths into a batch, and `sort_size` should
705
+ be less than `shuffle_size`
706
+
707
+ Args:
708
+ data: Iterable[{key, feat, label}]
709
+ sort_size: buffer size for sort
710
+
711
+ Returns:
712
+ Iterable[{key, feat, label}]
713
+ """
714
+
715
+ buf = []
716
+ for sample in data:
717
+ buf.append(sample)
718
+ if len(buf) >= sort_size:
719
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
720
+ for x in buf:
721
+ yield x
722
+ buf = []
723
+ # The sample left over
724
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
725
+ for x in buf:
726
+ yield x
727
+
728
+
729
+ def static_batch(data, batch_size=16):
730
+ """ Static batch the data by `batch_size`
731
+
732
+ Args:
733
+ data: Iterable[{key, feat, label}]
734
+ batch_size: batch size
735
+
736
+ Returns:
737
+ Iterable[List[{key, feat, label}]]
738
+ """
739
+ buf = []
740
+ for sample in data:
741
+ buf.append(sample)
742
+ if len(buf) >= batch_size:
743
+ yield buf
744
+ buf = []
745
+ if len(buf) > 0:
746
+ yield buf
747
+
748
+
749
+ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
750
+ """ Dynamic batch the data until the total frames in batch
751
+ reach `max_frames_in_batch`
752
+
753
+ Args:
754
+ data: Iterable[{key, feat, label}]
755
+ max_frames_in_batch: max_frames in one batch
756
+
757
+ Returns:
758
+ Iterable[List[{key, feat, label}]]
759
+ """
760
+ buf = []
761
+ longest_frames = 0
762
+ for sample in data:
763
+ assert 'speech_feat' in sample
764
+ assert isinstance(sample['speech_feat'], torch.Tensor)
765
+ new_sample_frames = sample['speech_feat'].size(0)
766
+ longest_frames = max(longest_frames, new_sample_frames)
767
+ frames_after_padding = longest_frames * (len(buf) + 1)
768
+ if frames_after_padding > max_frames_in_batch:
769
+ yield buf
770
+ buf = [sample]
771
+ longest_frames = new_sample_frames
772
+ else:
773
+ buf.append(sample)
774
+ if len(buf) > 0:
775
+ yield buf
776
+
777
+
778
+ def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
779
+ """ Wrapper for static/dynamic batch
780
+ """
781
+ if mode == 'inference':
782
+ return static_batch(data, 1)
783
+ else:
784
+ if batch_type == 'static':
785
+ return static_batch(data, batch_size)
786
+ elif batch_type == 'dynamic':
787
+ return dynamic_batch(data, max_frames_in_batch)
788
+ else:
789
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
790
+
791
+
792
+ def padding(data, use_spk_embedding, mode='train'):
793
+ """ Padding the data into training data
794
+
795
+ Args:
796
+ data: Iterable[List[{key, feat, label}]]
797
+
798
+ Returns:
799
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
800
+ """
801
+ for sample in data:
802
+ assert isinstance(sample, list)
803
+ speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
804
+ dtype=torch.int32)
805
+ order = torch.argsort(speech_feat_len, descending=True)
806
+
807
+ utts = [sample[i]['utt'] for i in order]
808
+ speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
809
+ speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
810
+ speech_token = pad_sequence(speech_token,
811
+ batch_first=True,
812
+ padding_value=0)
813
+ speech_feat = [sample[i]['speech_feat'] for i in order]
814
+ speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
815
+ speech_feat = pad_sequence(speech_feat,
816
+ batch_first=True,
817
+ padding_value=0)
818
+ text = [sample[i]['text'] for i in order]
819
+ text_token = [torch.tensor(sample[i]['text_token']) for i in order]
820
+ text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
821
+ text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
822
+ utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
823
+ spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
824
+ batch = {
825
+ "utts": utts,
826
+ "speech_token": speech_token,
827
+ "speech_token_len": speech_token_len,
828
+ "speech_feat": speech_feat,
829
+ "speech_feat_len": speech_feat_len,
830
+ "text": text,
831
+ "text_token": text_token,
832
+ "text_token_len": text_token_len,
833
+ "utt_embedding": utt_embedding,
834
+ "spk_embedding": spk_embedding,
835
+ }
836
+ if mode == 'inference':
837
+ tts_text = [sample[i]['tts_text'] for i in order]
838
+ tts_index = [sample[i]['tts_index'] for i in order]
839
+ tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
840
+ tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
841
+ tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
842
+ batch.update({'tts_text': tts_text,
843
+ 'tts_index': tts_index,
844
+ 'tts_text_token': tts_text_token,
845
+ 'tts_text_token_len': tts_text_token_len})
846
+ if use_spk_embedding is True:
847
+ batch["embedding"] = batch["spk_embedding"]
848
+ else:
849
+ batch["embedding"] = batch["utt_embedding"]
850
+ yield batch
851
+
852
+
853
+
854
+ def padding_speech_token(data, use_spk_embedding, mode='train'):
855
+ """ Padding the data into training data
856
+
857
+ Args:
858
+ data: Iterable[List[{key, feat, label}]]
859
+
860
+ Returns:
861
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
862
+ """
863
+ for sample in data:
864
+ assert isinstance(sample, list)
865
+ speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
866
+ dtype=torch.int32)
867
+ order = torch.argsort(speech_feat_len, descending=True)
868
+
869
+ # utts = [sample[i]['utt'] for i in order]
870
+ # speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
871
+ try:
872
+ speech_token = [sample[i]['speech_token'].clone().detach() for i in order]
873
+ speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
874
+ speech_token = pad_sequence(speech_token,
875
+ batch_first=True,
876
+ padding_value=0)
877
+ speech_feat = [sample[i]['speech_feat'] for i in order]
878
+ speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
879
+ speech_feat = pad_sequence(speech_feat,
880
+ batch_first=True,
881
+ padding_value=0)
882
+ batch = {
883
+ "speech_token": speech_token,
884
+ "speech_token_len": speech_token_len,
885
+ "speech_feat": speech_feat,
886
+ "speech_feat_len": speech_feat_len,
887
+ }
888
+ if mode == 'inference':
889
+ tts_text = [sample[i]['tts_text'] for i in order]
890
+ tts_index = [sample[i]['tts_index'] for i in order]
891
+ tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
892
+ tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
893
+ tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
894
+ batch.update({'tts_text': tts_text,
895
+ 'tts_index': tts_index,
896
+ 'tts_text_token': tts_text_token,
897
+ 'tts_text_token_len': tts_text_token_len})
898
+ # if use_spk_embedding is True:
899
+ # batch["embedding"] = batch["spk_embedding"]
900
+ # else:
901
+ # batch["embedding"] = batch["utt_embedding"]
902
+ batch["embedding"]=torch.zeros((batch["speech_feat"].size(0),192),device=batch["speech_feat"].device)
903
+ yield batch
904
+ except Exception as ex:
905
+ logging.warning(' ex info {}'.format(ex))
906
+ # assert False
907
+
908
+
909
+
910
+ def padding_speech_token_spk(data, use_spk_embedding, mode='train'):
911
+ """ Padding the data into training data
912
+
913
+ Args:
914
+ data: Iterable[List[{key, feat, label}]]
915
+
916
+ Returns:
917
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
918
+ """
919
+ for sample in data:
920
+ assert isinstance(sample, list)
921
+ speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
922
+ dtype=torch.int32)
923
+ order = torch.argsort(speech_feat_len, descending=True)
924
+
925
+ # utts = [sample[i]['utt'] for i in order]
926
+ # speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
927
+ try:
928
+ speech_token = [sample[i]['speech_token'].clone().detach() for i in order]
929
+ speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
930
+ speech_token = pad_sequence(speech_token,
931
+ batch_first=True,
932
+ padding_value=0)
933
+ speech_feat = [sample[i]['speech_feat'] for i in order]
934
+ speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
935
+ speech_feat = pad_sequence(speech_feat,
936
+ batch_first=True,
937
+ padding_value=0)
938
+ spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
939
+ batch = {
940
+ "speech_token": speech_token,
941
+ "speech_token_len": speech_token_len,
942
+ "speech_feat": speech_feat,
943
+ "speech_feat_len": speech_feat_len,
944
+ "spk_embedding": spk_embedding,
945
+ }
946
+ if mode == 'inference':
947
+ tts_text = [sample[i]['tts_text'] for i in order]
948
+ tts_index = [sample[i]['tts_index'] for i in order]
949
+ tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
950
+ tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
951
+ tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
952
+ batch.update({'tts_text': tts_text,
953
+ 'tts_index': tts_index,
954
+ 'tts_text_token': tts_text_token,
955
+ 'tts_text_token_len': tts_text_token_len})
956
+ # if use_spk_embedding is True:
957
+ # batch["embedding"] = batch["spk_embedding"]
958
+ # else:
959
+ # batch["embedding"] = batch["utt_embedding"]
960
+ # batch["embedding"]=torch.zeros((batch["speech_feat"].size(0),192),device=batch["speech_feat"].device)
961
+ batch["embedding"] = batch["spk_embedding"]
962
+ yield batch
963
+ except Exception as ex:
964
+ logging.warning(' ex info {}'.format(ex))
965
+ # assert False
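dynamic_batch above caps the padded frame count of a batch (longest utterance times batch size) rather than the number of utterances; a toy check of that rule, with hand-made samples standing in for real pipeline output.

# Toy illustration of the dynamic_batch rule above: frames_after_padding = longest * (len(buf) + 1).
import torch
from cosyvoice.dataset.processor import dynamic_batch

def fake_samples(lengths):
    for n in lengths:
        yield {'speech_feat': torch.zeros(n, 80)}   # only the frame count matters here

batches = list(dynamic_batch(fake_samples([100, 200, 900, 50]), max_frames_in_batch=1000))
print([len(b) for b in batches])                    # [2, 1, 1]: adding the 900-frame sample would pad the batch to 3 x 900 frames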
third_party/GLM-4-Voice/cosyvoice/flow/decoder.py ADDED
@@ -0,0 +1,222 @@
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import torch
+ import torch.nn as nn
+ from einops import pack, rearrange, repeat
+ from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
+ from matcha.models.components.transformer import BasicTransformerBlock
+
+
+ class ConditionalDecoder(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         channels=(256, 256),
+         dropout=0.05,
+         attention_head_dim=64,
+         n_blocks=1,
+         num_mid_blocks=2,
+         num_heads=4,
+         act_fn="snake",
+     ):
+         """
+         This decoder requires an input with the same shape as the target, so if your text content
+         is shorter or longer than the output, please resample it before feeding it to the decoder.
+         """
+         super().__init__()
+         channels = tuple(channels)
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+
+         self.time_embeddings = SinusoidalPosEmb(in_channels)
+         time_embed_dim = channels[0] * 4
+         self.time_mlp = TimestepEmbedding(
+             in_channels=in_channels,
+             time_embed_dim=time_embed_dim,
+             act_fn="silu",
+         )
+         self.down_blocks = nn.ModuleList([])
+         self.mid_blocks = nn.ModuleList([])
+         self.up_blocks = nn.ModuleList([])
+
+         output_channel = in_channels
+         for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+             input_channel = output_channel
+             output_channel = channels[i]
+             is_last = i == len(channels) - 1
+             resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+             transformer_blocks = nn.ModuleList(
+                 [
+                     BasicTransformerBlock(
+                         dim=output_channel,
+                         num_attention_heads=num_heads,
+                         attention_head_dim=attention_head_dim,
+                         dropout=dropout,
+                         activation_fn=act_fn,
+                     )
+                     for _ in range(n_blocks)
+                 ]
+             )
+             downsample = (
+                 Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+             )
+             self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+
+         for i in range(num_mid_blocks):
+             input_channel = channels[-1]
+             out_channels = channels[-1]
+             resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+
+             transformer_blocks = nn.ModuleList(
+                 [
+                     BasicTransformerBlock(
+                         dim=output_channel,
+                         num_attention_heads=num_heads,
+                         attention_head_dim=attention_head_dim,
+                         dropout=dropout,
+                         activation_fn=act_fn,
+                     )
+                     for _ in range(n_blocks)
+                 ]
+             )
+
+             self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+
+         channels = channels[::-1] + (channels[0],)
+         for i in range(len(channels) - 1):
+             input_channel = channels[i] * 2
+             output_channel = channels[i + 1]
+             is_last = i == len(channels) - 2
+             resnet = ResnetBlock1D(
+                 dim=input_channel,
+                 dim_out=output_channel,
+                 time_emb_dim=time_embed_dim,
+             )
+             transformer_blocks = nn.ModuleList(
+                 [
+                     BasicTransformerBlock(
+                         dim=output_channel,
+                         num_attention_heads=num_heads,
+                         attention_head_dim=attention_head_dim,
+                         dropout=dropout,
+                         activation_fn=act_fn,
+                     )
+                     for _ in range(n_blocks)
+                 ]
+             )
+             upsample = (
+                 Upsample1D(output_channel, use_conv_transpose=True)
+                 if not is_last
+                 else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+             )
+             self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+         self.final_block = Block1D(channels[-1], channels[-1])
+         self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+         self.initialize_weights()
+
+
+     def initialize_weights(self):
+         for m in self.modules():
+             if isinstance(m, nn.Conv1d):
+                 nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                 if m.bias is not None:
+                     nn.init.constant_(m.bias, 0)
+             elif isinstance(m, nn.GroupNorm):
+                 nn.init.constant_(m.weight, 1)
+                 nn.init.constant_(m.bias, 0)
+             elif isinstance(m, nn.Linear):
+                 nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                 if m.bias is not None:
+                     nn.init.constant_(m.bias, 0)
+
+     def forward(self, x, mask, mu, t, spks=None, cond=None):
+         """Forward pass of the UNet1DConditional model.
+
+         Args:
+             x (torch.Tensor): shape (batch_size, in_channels, time)
+             mask (_type_): shape (batch_size, 1, time)
+             t (_type_): shape (batch_size)
+             spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+             cond (_type_, optional): placeholder for future use. Defaults to None.
+
+         Raises:
+             ValueError: _description_
+             ValueError: _description_
+
+         Returns:
+             _type_: _description_
+         """
+
+         t = self.time_embeddings(t)
+         t = self.time_mlp(t)
+
+         x = pack([x, mu], "b * t")[0]
+
+         if spks is not None:
+             spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+             x = pack([x, spks], "b * t")[0]
+         if cond is not None:
+             x = pack([x, cond], "b * t")[0]
+
+         hiddens = []
+         masks = [mask]
+         for resnet, transformer_blocks, downsample in self.down_blocks:
+             mask_down = masks[-1]
+             x = resnet(x, mask_down, t)
+             x = rearrange(x, "b c t -> b t c").contiguous()
+             attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+             for transformer_block in transformer_blocks:
+                 x = transformer_block(
+                     hidden_states=x,
+                     attention_mask=attn_mask,
+                     timestep=t,
+                 )
+             x = rearrange(x, "b t c -> b c t").contiguous()
+             hiddens.append(x)  # Save hidden states for skip connections
+             x = downsample(x * mask_down)
+             masks.append(mask_down[:, :, ::2])
+         masks = masks[:-1]
+         mask_mid = masks[-1]
+
+         for resnet, transformer_blocks in self.mid_blocks:
+             x = resnet(x, mask_mid, t)
+             x = rearrange(x, "b c t -> b t c").contiguous()
+             attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+             for transformer_block in transformer_blocks:
+                 x = transformer_block(
+                     hidden_states=x,
+                     attention_mask=attn_mask,
+                     timestep=t,
+                 )
+             x = rearrange(x, "b t c -> b c t").contiguous()
+
+         for resnet, transformer_blocks, upsample in self.up_blocks:
+             mask_up = masks.pop()
+             skip = hiddens.pop()
+             x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+             x = resnet(x, mask_up, t)
+             x = rearrange(x, "b c t -> b t c").contiguous()
+             attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+             for transformer_block in transformer_blocks:
+                 x = transformer_block(
+                     hidden_states=x,
+                     attention_mask=attn_mask,
+                     timestep=t,
+                 )
+             x = rearrange(x, "b t c -> b c t").contiguous()
+             x = upsample(x * mask_up)
+         x = self.final_block(x, mask_up)
+         output = self.final_proj(x * mask_up)
+         return output * mask
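For orientation, `ConditionalDecoder` is a 1-D U-Net over mel frames: each down block is a ResnetBlock1D plus a transformer stack followed by a stride-2 downsample, the mid blocks keep resolution, and the up blocks consume the saved skip connections before `final_proj` maps back to `out_channels`. The sketch below is a shape-level smoke test only; the channel sizes, 80 mel bins, and 100 frames are illustrative assumptions (the real values come from the flow config), it presumes `matcha-tts` and this repo's `cosyvoice` package are importable, and it passes `act_fn="gelu"` rather than relying on the `"snake"` default.

```python
# Illustrative sketch only: a shape-level smoke test of ConditionalDecoder, not the repo's flow config.
import torch
from cosyvoice.flow.decoder import ConditionalDecoder

batch, feat_dim, frames = 2, 80, 100             # 80 mel bins, 100 frames (assumed toy shapes)
decoder = ConditionalDecoder(
    in_channels=feat_dim * 2,                    # x and mu are packed along the channel axis in forward()
    out_channels=feat_dim,
    channels=(256, 256),
    act_fn="gelu",                               # assumed; configs typically set this explicitly
)

x = torch.randn(batch, feat_dim, frames)         # noisy sample at flow-matching step t
mu = torch.randn(batch, feat_dim, frames)        # conditioning features aligned to the target length
mask = torch.ones(batch, 1, frames)              # all frames valid
t = torch.rand(batch)                            # flow-matching timestep per batch element

out = decoder(x, mask, mu, t)                    # spks/cond omitted; when used they add channels,
print(out.shape)                                 # so in_channels must be enlarged accordingly
                                                 # -> torch.Size([2, 80, 100])
```

Because the frame axis is halved once on the way down and doubled once on the way up, an even number of frames keeps the skip connections aligned without relying on the `x[:, :, :skip.shape[-1]]` crop.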