Spaces:

fb700
/

chatglm-fitness-RLHF

Runtime error

App Files Files Community

fb700 committed on Jul 29, 2023

Commit

8c9c9c7

1 Parent(s): 4202a0b

Upload 171 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.flake8 +1 -1
.gitattributes +18 -1
.gitignore +159 -0
Dockerfile +59 -28
LICENSE +21 -674
README.md +8 -7
app old.py +608 -0
app.py +435 -176
checkpoint/__init__.py +0 -0
checkpoint/freevc-24.pth +3 -0
checkpoints/BFM_Fitting/01_MorphableModel.mat +0 -0
checkpoints/BFM_Fitting/BFM09_model_info.mat +0 -0
checkpoints/BFM_Fitting/BFM_exp_idx.mat +0 -0
checkpoints/BFM_Fitting/BFM_front_idx.mat +0 -0
checkpoints/BFM_Fitting/facemodel_info.mat +0 -0
checkpoints/BFM_Fitting/select_vertex_id.mat +0 -0
checkpoints/BFM_Fitting/similarity_Lm3D_all.mat +0 -0
checkpoints/BFM_Fitting/std_exp.txt +0 -0
checkpoints/shape_predictor_68_face_landmarks.dat +0 -0
commons.py +171 -0
configs/freevc-24.json +54 -0
mel_processing.py +112 -0
models.py +351 -0
modules.py +342 -0
packages.txt +2 -0
requirements.txt +30 -4
speaker_encoder/__init__.py +1 -0
speaker_encoder/audio.py +107 -0
speaker_encoder/ckpt/__init__.py +1 -0
speaker_encoder/ckpt/pretrained_bak_5805000.pt +3 -0
speaker_encoder/compute_embed.py +40 -0
speaker_encoder/config.py +45 -0
speaker_encoder/data_objects/__init__.py +2 -0
speaker_encoder/data_objects/random_cycler.py +37 -0
speaker_encoder/data_objects/speaker.py +40 -0
speaker_encoder/data_objects/speaker_batch.py +12 -0
speaker_encoder/data_objects/speaker_verification_dataset.py +56 -0
speaker_encoder/data_objects/utterance.py +26 -0
speaker_encoder/hparams.py +31 -0
speaker_encoder/inference.py +177 -0
speaker_encoder/model.py +135 -0
speaker_encoder/params_data.py +29 -0
speaker_encoder/params_model.py +11 -0
speaker_encoder/preprocess.py +285 -0
speaker_encoder/train.py +125 -0
speaker_encoder/visualizations.py +178 -0
speaker_encoder/voice_encoder.py +173 -0
src/audio2exp_models/audio2exp.py +41 -0
src/audio2exp_models/networks.py +74 -0
src/audio2pose_models/audio2pose.py +94 -0

.flake8 CHANGED Viewed

@@ -18,4 +18,4 @@ exclude =
   dist,
   .venv
   pad*.py
-max-complexity = 25

   dist,
   .venv
   pad*.py
+max-complexity = 25

.gitattributes CHANGED Viewed

@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/BFM_Fitting/01_MorphableModel.mat filter=lfs diff=lfs merge=lfs -text
+checkpoints/BFM_Fitting/BFM09_model_info.mat filter=lfs diff=lfs merge=lfs -text
+checkpoints/facevid2vid_00189-model.pth.tar filter=lfs diff=lfs merge=lfs -text
+checkpoints/mapping_00229-model.pth.tar filter=lfs diff=lfs merge=lfs -text
+checkpoints/shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/chinese_news.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/deyu.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/eluosi.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/fayu.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/imagine.wav filter=lfs diff=lfs merge=lfs -text
+examples/driven_audio/japanese.wav filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_16.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_17.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_3.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_4.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_5.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_8.png filter=lfs diff=lfs merge=lfs -text
+examples/source_image/art_9.png filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -0,0 +1,159 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+results/
+checkpoints/
+gradio_cached_examples/
+gfpgan/
+start.sh

Dockerfile CHANGED Viewed

@@ -1,28 +1,59 @@
-# 此Dockerfile适用于“无本地模型”的环境构建，如果需要使用chatglm等本地模型，请参考 docs/Dockerfile+ChatGLM
-# 如何构建: 先修改 `config.py`， 然后 docker build -t gpt-academic .
-# 如何运行: docker run --rm -it --net=host gpt-academic
-FROM python:3.11
-RUN echo '[global]' > /etc/pip.conf && \
-    echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \
-    echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf
-WORKDIR /gpt
-# 安装依赖
-COPY requirements.txt ./
-COPY ./docs/gradio-3.32.2-py3-none-any.whl ./docs/gradio-3.32.2-py3-none-any.whl
-RUN pip3 install -r requirements.txt
-# 装载项目文件
-COPY . .
-RUN pip3 install -r requirements.txt
-# 可选步骤，用于预热模块
-RUN python3  -c 'from check_proxy import warm_up_modules; warm_up_modules()'
-# 启动
-CMD ["python3", "-u", "main.py"]

+FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    git \
+    zip \
+    unzip \
+    git-lfs \
+    wget \
+    curl \
+    # ffmpeg \
+    ffmpeg \
+    x264 \
+    # python build dependencies \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:${PATH}
+WORKDIR ${HOME}/app
+RUN curl https://pyenv.run | bash
+ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ENV PYTHON_VERSION=3.10.9
+RUN pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
+    pyenv rehash && \
+    pip install --no-cache-dir -U pip setuptools wheel
+RUN pip install --no-cache-dir -U torch==1.12.1 torchvision==0.13.1
+COPY --chown=1000 requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
+COPY --chown=1000 . ${HOME}/app
+RUN ls -a
+ENV PYTHONPATH=${HOME}/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+CMD ["python", "app.py"]

LICENSE CHANGED Viewed

@@ -1,674 +1,21 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
- Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-                            Preamble
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-  The precise terms and conditions for copying, distribution and
-modification follow.
-                       TERMS AND CONDITIONS
-  0. Definitions.
-  "This License" refers to version 3 of the GNU General Public License.
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-  1. Source Code.
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-  The Corresponding Source for a work in source code form is that
-same work.
-  2. Basic Permissions.
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-  4. Conveying Verbatim Copies.
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-  5. Conveying Modified Source Versions.
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-  6. Conveying Non-Source Forms.
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-  7. Additional Terms.
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-  8. Termination.
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-  9. Acceptance Not Required for Having Copies.
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-  10. Automatic Licensing of Downstream Recipients.
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-  11. Patents.
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-  12. No Surrender of Others' Freedom.
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-  13. Use with the GNU Affero General Public License.
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-  14. Revised Versions of this License.
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-  15. Disclaimer of Warranty.
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-  16. Limitation of Liability.
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-  17. Interpretation of Sections 15 and 16.
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-                     END OF TERMS AND CONDITIONS
-            How to Apply These Terms to Your New Programs
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <https://www.gnu.org/licenses/>.
-Also add information on how to contact you by electronic and paper mail.
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<https://www.gnu.org/licenses/>.
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.

+MIT License
+Copyright (c) 2023 Tencent AI Lab
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
-title: Bofan Chatglm Fitness RLHF lora
-emoji: 🍀
-colorFrom: yellow
-colorTo: yellow
 sdk: gradio
-sdk_version: 3.38.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: ChatGLM2-SadTalker
+emoji: 📺
+colorFrom: purple
+colorTo: green
 sdk: gradio
+sdk_version: 3.23.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app old.py ADDED Viewed

	@@ -0,0 +1,608 @@

+import os, sys
+import tempfile
+import gradio as gr
+from src.gradio_demo import SadTalker
+# from src.utils.text2speech import TTSTalker
+from huggingface_hub import snapshot_download
+import torch
+import librosa
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+import time
+from textwrap import dedent
+import mdtex2html
+from loguru import logger
+from transformers import AutoModel, AutoTokenizer
+from tts_voice import tts_order_voice
+import edge_tts
+import tempfile
+import anyio
+def get_source_image(image):
+        """Return ``image`` unchanged (identity pass-through)."""
+        return image
+try:
+    import webui  # in webui
+    in_webui = True
+except:
+    in_webui = False
+def toggle_audio_file(choice):
+    if choice == False:
+        return gr.update(visible=True), gr.update(visible=False)
+    else:
+        return gr.update(visible=False), gr.update(visible=True)
+def ref_video_fn(path_of_ref_video):
+    if path_of_ref_video is not None:
+        return gr.update(value=True)
+    else:
+        return gr.update(value=False)
+def download_model():
+    REPO_ID = 'vinthony/SadTalker-V002rc'
+    snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
+def sadtalker_demo():
+    """Fetch the SadTalker checkpoints and build a ``SadTalker`` instance.
+
+    NOTE(review): as visible here the instance is only bound to a local name
+    and nothing is returned; the same two steps are repeated at module level
+    immediately after this function. Confirm whether this body is complete.
+    """
+    download_model()
+    sad_talker = SadTalker(lazy_load=True)
+    # tts_talker = TTSTalker()
+# Import-time setup: download the checkpoints and create the module-level
+# SadTalker instance.
+download_model()
+sad_talker = SadTalker(lazy_load=True)
+# ChatGLM2 & FreeVC
+# The bare string below is a disabled helper kept for reference; it is never
+# executed.
+'''
+def get_wavlm():
+    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+    shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+# Use CUDA when torch reports it as available, otherwise the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Speaker encoder used by convert() to embed the target utterance.
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+print("Loading FreeVC(24k)...")
+# Hyper-parameters are read from the JSON config and drive the synthesizer
+# constructor arguments below.
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+    hps.data.filter_length // 2 + 1,
+    hps.train.segment_size // hps.data.hop_length,
+    **hps.model).to(device)
+# Switch to eval mode, then load the checkpoint weights into the model.
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoint/freevc-24.pth", freevc_24, None)
+print("Loading WavLM for content...")
+# WavLM model whose last hidden state convert() feeds to the synthesizer.
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+def convert(model, src, tgt):
+    with torch.no_grad():
+        # tgt
+        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+        if model == "FreeVC" or model == "FreeVC (24kHz)":
+            g_tgt = smodel.embed_utterance(wav_tgt)
+            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+        else:
+            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
+            mel_tgt = mel_spectrogram_torch(
+                wav_tgt,
+                hps.data.filter_length,
+                hps.data.n_mel_channels,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                hps.data.mel_fmin,
+                hps.data.mel_fmax
+            )
+        # src
+        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+        # infer
+        if model == "FreeVC":
+            audio = freevc.infer(c, g=g_tgt)
+        elif model == "FreeVC-s":
+            audio = freevc_s.infer(c, mel=mel_tgt)
+        else:
+            audio = freevc_24.infer(c, g=g_tgt)
+        audio = audio[0][0].data.cpu().float().numpy()
+        if model == "FreeVC" or model == "FreeVC-s":
+            write("out.wav", hps.data.sampling_rate, audio)
+        else:
+            write("out.wav", 24000, audio)
+    out = "out.wav"
+    return out
+# GLM2
+# Voice table used by the TTS dropdown and by text_to_speech_edge().
+language_dict = tts_order_voice
+# fix timezone in Linux
+os.environ["TZ"] = "Asia/Shanghai"
+try:
+    time.tzset()  # type: ignore # pylint: disable=no-member
+except Exception:
+    # Windows
+    logger.warning("Windows, cant run time.tzset()")
+# model_name = "THUDM/chatglm2-6b"
+model_name = "THUDM/chatglm2-6b-int4"
+# Module-level default; the UI later rebinds this name to a hidden checkbox.
+RETRY_FLAG = False
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
+# 4/8 bit
+# model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).quantize(4).cuda()
+has_cuda = torch.cuda.is_available()
+# has_cuda = False  # force cpu
+# Load the chat model in half precision on CUDA, or in float32 on the CPU.
+if has_cuda:
+    model_glm = (
+        AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().half()
+    )  # 3.92G
+else:
+    model_glm = AutoModel.from_pretrained(
+        model_name, trust_remote_code=True
+    ).float()  # .float() .half().float()
+model_glm = model_glm.eval()
+# The string assigned to ``_`` is only a label for the override defined next.
+_ = """Override Chatbot.postprocess"""
+def postprocess(self, y):
+    if y is None:
+        return []
+    for i, (message, response) in enumerate(y):
+        y[i] = (
+            None if message is None else mdtex2html.convert((message)),
+            None if response is None else mdtex2html.convert(response),
+        )
+    return y
+gr.Chatbot.postprocess = postprocess
+def parse_text(text):
+    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split("`")
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = "<br></code></pre>"
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", r"\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+def predict(
+    RETRY_FLAG, input, chatbot, max_length, top_p, temperature, history, past_key_values
+):
+    try:
+        chatbot.append((parse_text(input), ""))
+    except Exception as exc:
+        logger.error(exc)
+        logger.debug(f"{chatbot=}")
+        _ = """
+        if chatbot:
+            chatbot[-1] = (parse_text(input), str(exc))
+            yield chatbot, history, past_key_values
+        # """
+        yield chatbot, history, past_key_values
+    for response, history, past_key_values in model_glm.stream_chat(
+        tokenizer,
+        input,
+        history,
+        past_key_values=past_key_values,
+        return_past_key_values=True,
+        max_length=max_length,
+        top_p=top_p,
+        temperature=temperature,
+    ):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+        # chatbot[-1][-1] = parse_text(response)
+        yield chatbot, history, past_key_values, parse_text(response)
+def trans_api(input, max_length=4096, top_p=0.8, temperature=0.2):
+    if max_length < 10:
+        max_length = 4096
+    if top_p < 0.1 or top_p > 1:
+        top_p = 0.85
+    if temperature <= 0 or temperature > 1:
+        temperature = 0.01
+    try:
+        res, _ = model_glm.chat(
+            tokenizer,
+            input,
+            history=[],
+            past_key_values=None,
+            max_length=max_length,
+            top_p=top_p,
+            temperature=temperature,
+        )
+        # logger.debug(f"{res=} \n{_=}")
+    except Exception as exc:
+        logger.error(f"{exc=}")
+        res = str(exc)
+    return res
+def reset_user_input():
+    """Return a Gradio update that sets the target component's value to ""."""
+    return gr.update(value="")
+def reset_state():
+    """Return cleared values: two empty lists, ``None`` and an empty string.
+
+    The UI wires these, in order, to the chatbot, the history, the
+    past_key_values state and the latest-answer textbox.
+    """
+    return [], [], None, ""
+# Delete last turn
+def delete_last_turn(chat, history):
+    if chat and history:
+        chat.pop(-1)
+        history.pop(-1)
+    return chat, history
+# Regenerate response
+def retry_last_answer(
+    user_input, chatbot, max_length, top_p, temperature, history, past_key_values
+):
+    if chatbot and history:
+        # Removing the previous conversation from chat
+        chatbot.pop(-1)
+        # Setting up a flag to capture a retry
+        RETRY_FLAG = True
+        # Getting last message from user
+        user_input = history[-1][0]
+        # Removing bot response from the history
+        history.pop(-1)
+    yield from predict(
+        RETRY_FLAG,  # type: ignore
+        user_input,
+        chatbot,
+        max_length,
+        top_p,
+        temperature,
+        history,
+        past_key_values,
+    )
+# print
+def print(text):
+    """Return ``text`` unchanged.
+
+    NOTE(review): this rebinds the module-level name ``print``. Any
+    ``print(...)`` call executed in this module after this definition reaches
+    this function and returns its argument instead of calling the builtin.
+    """
+    return text
+# TTS
+async def text_to_speech_edge(text, language_code):
+    voice = language_dict[language_code]
+    communicate = edge_tts.Communicate(text, voice)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+    await communicate.save(tmp_path)
+    return tmp_path
+with gr.Blocks(title="ChatGLM2-6B-int4", theme=gr.themes.Soft(text_size="sm"), analytics_enabled=False) as demo:
+    gr.HTML("<center>"
+            "<h1>📺💕🎶 - ChatGLM2+声音克隆+视频对话：和喜欢的角色畅所欲言吧！</h1>"
+            "</center>")
+    gr.Markdown("## <center>🥳 - ChatGLM2+FreeVC+SadTalker，为您打造沉浸式的视频对话体验，支持中英双语</center>")
+    gr.Markdown("## <center>🌊 - 更多精彩应用，尽在[滔滔AI](http://www.talktalkai.com)；滔滔AI，为爱滔滔！💕</center>")
+    gr.Markdown("### <center>⭐ - 如果您喜欢这个程序，欢迎给我的[GitHub项目](https://github.com/KevinWang676/ChatGLM2-Voice-Cloning)点赞支持！</center>")
+    with gr.Tab("🍻 - ChatGLM2聊天区"):
+        with gr.Accordion("📒 相关信息", open=False):
+            _ = f""" ChatGLM2的可选参数信息：
+                * Low temperature: responses will be more deterministic and focused; High temperature: responses more creative.
+                * Suggested temperatures -- translation: up to 0.3; chatting: > 0.4
+                * Top P controls dynamic vocabulary selection based on context.\n
+                如果您想让ChatGLM2进行角色扮演并与之对话，请先输入恰当的提示词，如“请你扮演成动漫角色蜡笔小新并和我进行对话”；您也可以为ChatGLM2提供自定义的角色设定\n
+                当您使用声音克隆功能时，请先在此程序的对应位置上传一段您喜欢的音频
+                """
+            gr.Markdown(dedent(_))
+        chatbot = gr.Chatbot(height=300)
+        with gr.Row():
+            with gr.Column(scale=4):
+                with gr.Column(scale=12):
+                    user_input = gr.Textbox(
+                        label="请在此处和GLM2聊天 (按回车键即可发送)",
+                        placeholder="聊点什么吧",
+                    )
+                    RETRY_FLAG = gr.Checkbox(value=False, visible=False)
+        with gr.Column(min_width=32, scale=1):
+            with gr.Row():
+                submitBtn = gr.Button("开始和GLM2交流吧", variant="primary")
+                deleteBtn = gr.Button("删除最新一轮对话", variant="secondary")
+                retryBtn = gr.Button("重新生成最新一轮对话", variant="secondary")
+        with gr.Accordion("🔧 更多设置", open=False):
+            with gr.Row():
+                emptyBtn = gr.Button("清空所有聊天记录")
+                max_length = gr.Slider(
+                    0,
+                    32768,
+                    value=8192,
+                    step=1.0,
+                    label="Maximum length",
+                    interactive=True,
+                )
+                top_p = gr.Slider(
+                    0, 1, value=0.85, step=0.01, label="Top P", interactive=True
+                )
+                temperature = gr.Slider(
+                    0.01, 1, value=0.95, step=0.01, label="Temperature", interactive=True
+                )
+        with gr.Row():
+            test1 = gr.Textbox(label="GLM2的最新回答 (可编辑)", lines = 3)
+            with gr.Column():
+                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
+                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
+            output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
+        tts_btn.click(text_to_speech_edge, inputs=[test1, language], outputs=[output_audio])
+        with gr.Row():
+            model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
+            audio1 = output_audio
+            audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
+            clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
+            audio_cloned =  gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
+        clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
+        history = gr.State([])
+        past_key_values = gr.State(None)
+        user_input.submit(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+        )
+        submitBtn.click(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+            api_name="predict",
+        )
+        submitBtn.click(reset_user_input, [], [user_input])
+        emptyBtn.click(
+            reset_state, outputs=[chatbot, history, past_key_values, test1], show_progress="full"
+        )
+        retryBtn.click(
+            retry_last_answer,
+            inputs=[
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            # outputs = [chatbot, history, last_user_message, user_message]
+            outputs=[chatbot, history, past_key_values, test1],
+        )
+        deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
+        with gr.Accordion("📔 提示词示例", open=False):
+            etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
+            examples = gr.Examples(
+                examples=[
+                    ["Explain the plot of Cinderella in a sentence."],
+                    [
+                        "How long does it take to become proficient in French, and what are the best methods for retaining information?"
+                    ],
+                    ["What are some common mistakes to avoid when writing code?"],
+                    ["Build a prompt to generate a beautiful portrait of a horse"],
+                    ["Suggest four metaphors to describe the benefits of AI"],
+                    ["Write a pop song about leaving home for the sandy beaches."],
+                    ["Write a summary demonstrating my ability to tame lions"],
+                    ["鲁迅和周树人什么关系"],
+                    ["从前有一头牛，这头牛后面有什么？"],
+                    ["正无穷大加一大于正无穷大吗？"],
+                    ["正无穷大加正无穷大大于正无穷大吗？"],
+                    ["-2的平方根等于什么"],
+                    ["树上有5只鸟，猎人开枪打死了一只。树上还有几只鸟？"],
+                    ["树上有11只鸟，猎人开枪打死了一只。树上还有几只鸟？提示：需考虑鸟可能受惊吓飞走。"],
+                    ["鲁迅和周树人什么关系 用英文回答"],
+                    ["以红楼梦的行文风格写一张委婉的请假条。不少于320字。"],
+                    [f"{etext} 翻成中文，列出3个版本"],
+                    [f"{etext} \n 翻成中文，保留原意，但使用文学性的语言。不要写解释。列出3个版本"],
+                    ["js 判断一个数是不是质数"],
+                    ["js 实现python 的 range(10)"],
+                    ["js 实现python 的 [*(range(10)]"],
+                    ["假定 1 + 2 = 4, 试求 7 + 8"],
+                    ["Erkläre die Handlung von Cinderella in einem Satz."],
+                    ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
+                ],
+                inputs=[user_input],
+                examples_per_page=30,
+            )
+        with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+            input_text = gr.Text()
+            tr_btn = gr.Button("Go", variant="primary")
+            out_text = gr.Text()
+        tr_btn.click(
+            trans_api,
+            [input_text, max_length, top_p, temperature],
+            out_text,
+            # show_progress="full",
+            api_name="tr",
+        )
+        _ = """
+        input_text.submit(
+            trans_api,
+            [input_text, max_length, top_p, temperature],
+            out_text,
+            show_progress="full",
+            api_name="tr1",
+        )
+        # """
+    with gr.Tab("📺 - 视频聊天区"):
+        with gr.Row().style(equal_height=False):
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_source_image"):
+                    with gr.TabItem('图片上传'):
+                        with gr.Row():
+                            source_image = gr.Image(label="请上传一张您喜欢角色的图片", source="upload", type="filepath", elem_id="img2img_image").style(width=512)
+                with gr.Tabs(elem_id="sadtalker_driven_audio"):
+                    with gr.TabItem('💡您还可以将视频下载到本地'):
+                        with gr.Row():
+                            driven_audio = audio_cloned
+                            driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
+                            with gr.Column():
+                                use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
+                                length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.", visible=False)
+                                use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
+                        with gr.Row():
+                            ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref", visible=False).style(width=512)
+                            with gr.Column():
+                                use_ref_video = gr.Checkbox(label="Use Reference Video", visible=False)
+                                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))", visible=False)
+                            ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_checkbox"):
+                    with gr.TabItem('视频设置'):
+                        with gr.Column(variant='panel'):
+                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
+                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
+                            with gr.Row():
+                                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0, visible=False) #
+                                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1, visible=False) #
+                                blink_every = gr.Checkbox(label="use eye blink", value=True, visible=False)
+                            with gr.Row():
+                                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?", visible=False) #
+                                preprocess_type = gr.Radio(['crop', 'full'], value='crop', label='是否聚焦角色面部', info="crop：视频会聚焦角色面部；full：视频会显示图片全貌")
+                            with gr.Row():
+                                is_still_mode = gr.Checkbox(label="静态模式 (开启静态模式，角色的面部动作会减少；默认开启)", value=True)
+                                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?", visible=False)
+                            with gr.Row():
+                                batch_size = gr.Slider(label="Batch size (数值越大，生成速度越快；若显卡性能好，可增大数值)", step=1, maximum=32, value=2)
+                                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer", value=True, visible=False)
+                            submit = gr.Button('开始视频聊天吧', elem_id="sadtalker_generate", variant='primary')
+                with gr.Tabs(elem_id="sadtalker_genearted"):
+                        gen_video = gr.Video(label="为您生成的专属视频", format="mp4").style(width=256)
+        submit.click(
+                fn=sad_talker.test,
+                inputs=[source_image,
+                        driven_audio,
+                        preprocess_type,
+                        is_still_mode,
+                        enhancer,
+                        batch_size,
+                        size_of_image,
+                        pose_style,
+                        facerender,
+                        exp_weight,
+                        use_ref_video,
+                        ref_video,
+                        ref_info,
+                        use_idle_mode,
+                        length_of_audio,
+                        blink_every
+                        ],
+                outputs=[gen_video]
+                )
+    gr.Markdown("### <center>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</center>")
+    gr.Markdown("<center>💡- 如何使用此程序：输入您对ChatGLM的提问后，依次点击“开始和GLM2交流吧”、“生成对应的音频吧”、“开始AI声音克隆吧”、“开始视频聊天吧”四个按键即可；使用声音克隆功能时，请先上传一段您喜欢的音频</center>")
+    gr.HTML('''
+        <div class="footer">
+                    <p>🌊🏞️🎶 - 江水东流急，滔滔无尽声。 明·顾璘
+                    </p>
+        </div>
+    ''')
+demo.queue().launch(show_error=True, debug=True)

app.py CHANGED Viewed

@@ -1,22 +1,137 @@
-"""Credit to https://github.com/THUDM/ChatGLM2-6B/blob/main/web_demo.py while mistakes are mine."""
-# pylint: disable=broad-exception-caught, redefined-outer-name, missing-function-docstring, missing-module-docstring, too-many-arguments, line-too-long, invalid-name, redefined-builtin, redefined-argument-from-local
-# import gradio as gr
-# model_name = "models/THUDM/chatglm2-6b-int4"
-# gr.load(model_name).lauch()
-# %%writefile demo-4bit.py
-import os
 import time
 from textwrap import dedent
-import gradio as gr
 import mdtex2html
-import torch
 from loguru import logger
 from transformers import AutoModel, AutoTokenizer
 # fix timezone in Linux
 os.environ["TZ"] = "Asia/Shanghai"
 try:
@@ -25,16 +140,32 @@ except Exception:
     # Windows
     logger.warning("Windows, cant run time.tzset()")
-model_name = "fb700/chatglm-fitness-RLHF"
 RETRY_FLAG = False
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-#model = AutoModel.from_pretrained(model_name, trust_remote_code=True).quantize(4).half().cuda()
-model = AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda()
-model = model.eval()
 _ = """Override Chatbot.postprocess"""
@@ -54,6 +185,7 @@ gr.Chatbot.postprocess = postprocess
 def parse_text(text):
     lines = text.split("\n")
     lines = [line for line in lines if line != ""]
     count = 0
@@ -99,8 +231,8 @@ def predict(
             yield chatbot, history, past_key_values
         # """
         yield chatbot, history, past_key_values
-    """
-    for response, history, past_key_values in model.stream_chat(
         tokenizer,
         input,
         history,
@@ -110,23 +242,21 @@ def predict(
         top_p=top_p,
         temperature=temperature,
     ):
-    """
-    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
-                                               temperature=temperature):
         chatbot[-1] = (parse_text(input), parse_text(response))
-        yield chatbot, history, past_key_values
-def trans_api(input, max_length=40960, top_p=0.8, temperature=0.2):
     if max_length < 10:
-        max_length = 40960
     if top_p < 0.1 or top_p > 1:
         top_p = 0.85
     if temperature <= 0 or temperature > 1:
         temperature = 0.01
     try:
-        res, _ = model.chat(
             tokenizer,
             input,
             history=[],
@@ -148,7 +278,7 @@ def reset_user_input():
 def reset_state():
-    return [], [], None
 # Delete last turn
@@ -184,131 +314,177 @@ def retry_last_answer(
         past_key_values,
     )
-with gr.Blocks(title="Bofan Ai", theme=gr.themes.Soft(text_size="sm")) as demo:
-    # gr.HTML("""<h1 align="center">ChatGLM2-6B-int4</h1>""")
-    gr.HTML(
-        """<center><a href="https://huggingface.co/spaces/mikeee/chatglm2-6b-4bit?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>It's beyond Fitness,模型由[帛凡]基于ChatGLM-6b进行微调后，在健康（全科）、心理等领域达至少60分的专业水准，而且中文总结能力超越了GPT3.5各版本。</center>"""
-        """<center>特别声明：本应用仅为模型能力演示，无任何商业行为，部署资源为Huggingface官方免费提供，任何通过此项目产生的知识仅用于学术参考，作者和网站均不承担任何责任。</center>"""
-        """<h1 align="center">帛凡 Fitness AI 演示</h1>"""
-        """<center><a href="https://huggingface.co/fb700/chatglm-fitness-RLHF">Bofan基于chatglm-6的微调模型</a>如果喜欢请给个 ❤ 。遇到任何问题可邮件和我联系👉 [email protected]</center>"""
-    )
-    with gr.Accordion("🎈 Info", open=False):
-        _ = f"""
-            ## {model_name}
-            ChatGLM-6B 是开源中英双语对话模型，本次训练基于ChatGLM-6B 的第一代版本，在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上开展训练。
-            本项目经过多位网友实测，中文总结能力超越了GPT3.5各版本，健康咨询水平优于其它同量级模型，且经优化目前可以支持无限context，远大于4k、8K、16K......，可能是任何个人和中小企业首选模型。
-            *首先，用40万条高质量数据进行强化训练，以提高模型的基础能力；
-            *第二，使用30万条人类反馈数据，构建一个表达方式规范优雅的语言模式（RM模型）；
-            *第三，在保留SFT阶段三分之一训练数据的同时，增加了30万条fitness数据，叠加RM模型，对ChatGLM-6B进行强化训练。
-            通过训练我们对模型有了更深刻的认知，LLM在一直在进化，好的方法和数据可以挖掘出模型的更大潜能。
-            训练中特别强化了中英文学术论文的翻译和总结，可以成为普通用户和科研人员的得力助手。
-            免责声明：本应用仅为模型能力演示，无任何商业行为，部署资源为huggingface官方免费提供，任何通过此项目产生的知识仅用于学术参考，作者和网站均不承担任何责任   。
-            The T4 GPU is sponsored by a community GPU grant from Huggingface. Thanks a lot!
-            [模型下载地址](https://huggingface.co/fb700/chatglm-fitness-RLHF)
-            """
-        gr.Markdown(dedent(_))
-    chatbot = gr.Chatbot()
-    with gr.Row():
-        with gr.Column(scale=4):
-            with gr.Column(scale=12):
-                user_input = gr.Textbox(
-                    show_label=False,
-                    placeholder="请输入内容Input...",
-                ).style(container=False)
-                RETRY_FLAG = gr.Checkbox(value=False, visible=False)
-            with gr.Column(min_width=32, scale=1):
-                with gr.Row():
-                    submitBtn = gr.Button("发送Submit", variant="primary")
-                    deleteBtn = gr.Button("删除最后一条对话", variant="secondary")
-                    retryBtn = gr.Button("重新生成Regenerate", variant="secondary")
-        with gr.Column(scale=1):
-            emptyBtn = gr.Button("清空对话Clear History")
-            max_length = gr.Slider(
-                0,
-                32768,
-                value=8192,
-                step=1.0,
-                label="Maximum length",
-                interactive=True,
-            )
-            top_p = gr.Slider(
-                0, 1, value=0.2, step=0.01, label="Top P", interactive=True
-            )
-            temperature = gr.Slider(
-                0.01, 1, value=0.85, step=0.01, label="Temperature", interactive=True
-            )
-    history = gr.State([])
-    past_key_values = gr.State(None)
-    user_input.submit(
-        predict,
-        [
-            RETRY_FLAG,
-            user_input,
-            chatbot,
-            max_length,
-            top_p,
-            temperature,
-            history,
-            past_key_values,
-        ],
-        [chatbot, history, past_key_values],
-        show_progress="full",
-    )
-    submitBtn.click(
-        predict,
-        [
-            RETRY_FLAG,
-            user_input,
-            chatbot,
-            max_length,
-            top_p,
-            temperature,
-            history,
-            past_key_values,
-        ],
-        [chatbot, history, past_key_values],
-        show_progress="full",
-        api_name="predict",
-    )
-    submitBtn.click(reset_user_input, [], [user_input])
-    emptyBtn.click(
-        reset_state, outputs=[chatbot, history, past_key_values], show_progress="full"
-    )
-    retryBtn.click(
-        retry_last_answer,
-        inputs=[
-            user_input,
-            chatbot,
-            max_length,
-            top_p,
-            temperature,
-            history,
-            past_key_values,
-        ],
-        # outputs = [chatbot, history, last_user_message, user_message]
-        outputs=[chatbot, history, past_key_values],
-    )
-    deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
-    with gr.Accordion("Example inputs", open=True):
-        etext0 = """ "act": "作为基于文本的冒险游戏",\n "prompt": "我想让你扮演一个基于文本的冒险游戏。我在这个基于文本的冒险游戏中扮演一个角色。请尽可能具体地描述角色所看到的内容和环境，并在游戏输出1、2、3让用户选择进行回复，而不是其它方式。我将输入命令来告诉角色该做什么，而你需要回复角色的行动结果以推动游戏的进行。我的第一个命令是'醒来'，请从这里开始故事 “ """
         etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
         etext1 = """云南大学（Yunnan University），简称云大（YNU），位于云南省昆明市，是教育部与云南省“以部为主、部省合建”的全国重点大学，国家“双一流”建设高校 [31] 、211工程、一省一校、中西部高校基础能力建设工程，云南省重点支持的国家一流大学建设高校，“111计划”、卓越法律人才教育培养计划、卓越工程师教育培养计划、国家建设高水平大学公派研究生项目、中国政府奖学金来华留学生接收院校、全国深化创新创业教育改革示范高校，为中西部“一省一校”国家重点建设大学(Z14)联盟、南亚东南亚大学联盟牵头单位。 [1]
 云南大学始建于1922年，时为私立东陆大学。1930年，改为省立东陆大学。1934年更名为省立云南大学。1938年改为国立云南大学。1946年，《不列颠百科全书》将云南大学列为中国15所在世界最具影响的大学之一。1950年定名为云南大学。1958年，云南大学由中央高教部划归云南省管理。1978年，云南大学被国务院确定为88所全国重点大学之一。1996年首批列入国家“211工程”重点建设大学。1999年，云南政法高等专科学校并入云南大学。 [2] [23]
@@ -370,37 +546,120 @@ with gr.Blocks(title="Bofan Ai", theme=gr.themes.Soft(text_size="sm")) as demo:
                 ["Erkläre die Handlung von Cinderella in einem Satz."],
                 ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
             ],
-            inputs=[user_input],
-            examples_per_page=50,
         )
-    with gr.Accordion("For Chat/Translation API", open=False, visible=False):
-        input_text = gr.Text()
-        tr_btn = gr.Button("Go", variant="primary")
-        out_text = gr.Text()
-    tr_btn.click(
-        trans_api,
-        [input_text, max_length, top_p, temperature],
-        out_text,
-        # show_progress="full",
-        api_name="tr",
-    )
-    _ = """
-    input_text.submit(
-        trans_api,
-        [input_text, max_length, top_p, temperature],
-        out_text,
-        show_progress="full",
-        api_name="tr1",
-    )
-    # """
-# demo.queue().launch(share=False, inbrowser=True)
-# demo.queue().launch(share=True, inbrowser=True, debug=True)
-# concurrency_count > 1 requires more memory, max_size: queue size
-# T4 medium: 30GB, model size: ~4G concurrency_count = 6
-# leave one for api access
-# reduce to 5 if OOM occurs to often
-demo.queue(concurrency_count=6, max_size=30).launch(debug=True)

+import os, sys
+import tempfile
+import gradio as gr
+from src.gradio_demo import SadTalker
+# from src.utils.text2speech import TTSTalker
+from huggingface_hub import snapshot_download
+import torch
+import librosa
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
 import time
 from textwrap import dedent
 import mdtex2html
 from loguru import logger
 from transformers import AutoModel, AutoTokenizer
+from tts_voice import tts_order_voice
+import edge_tts
+import tempfile
+import anyio
+def get_source_image(image):
+    """Return *image* unchanged."""
+    return image
+# Probe for the optional ``webui`` module; ``in_webui`` records whether the
+# import succeeded.
+try:
+    import webui  # noqa: F401  # in webui
+    in_webui = True
+except Exception:
+    # Was a bare ``except:``, which also trapped SystemExit and
+    # KeyboardInterrupt. Any ordinary import failure still means "not in webui".
+    in_webui = False
+def toggle_audio_file(choice):
+    """Build a pair of visibility updates from a flag.
+
+    Returns two ``gr.update`` objects: the first visible and the second hidden
+    when ``choice == False``; otherwise the visibilities are swapped.
+    """
+    # Equality (not truthiness) is the original test, so it is kept as-is.
+    if choice == False:  # noqa: E712
+        first_visible, second_visible = True, False
+    else:
+        first_visible, second_visible = False, True
+    return gr.update(visible=first_visible), gr.update(visible=second_visible)
+def ref_video_fn(path_of_ref_video):
+    """Return a ``gr.update`` whose value is True iff a path was supplied.
+
+    "Supplied" means ``path_of_ref_video is not None``.
+    """
+    has_reference = path_of_ref_video is not None
+    return gr.update(value=has_reference)
+def download_model():
+    """Download the ``vinthony/SadTalker-V002rc`` snapshot into ``./checkpoints``."""
+    snapshot_download(
+        repo_id='vinthony/SadTalker-V002rc',
+        local_dir='./checkpoints',
+        local_dir_use_symlinks=True,
+    )
+def sadtalker_demo():
+    """Download the SadTalker snapshot and build a lazily loaded ``SadTalker``.
+
+    Returns:
+        The ``SadTalker(lazy_load=True)`` instance. Previously the instance was
+        bound to an unused local and the function returned ``None``, so the
+        object was discarded.
+    """
+    download_model()
+    return SadTalker(lazy_load=True)
+# Import-time setup: everything below runs once when the module is loaded.
+# Download the SadTalker snapshot into ./checkpoints, then build the talker
+# that the video tab's submit handler calls (``sad_talker.test``).
+download_model()
+sad_talker = SadTalker(lazy_load=True)
+# ChatGLM2 & FreeVC
+# The triple-quoted text below is a bare string expression, not executed code:
+# it keeps a disabled ``get_wavlm`` helper for reference only.
+'''
+def get_wavlm():
+    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+    shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+# Use CUDA when available, otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Speaker encoder used by ``convert`` to embed the target utterance.
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+print("Loading FreeVC(24k)...")
+# Hyperparameters for the 24 kHz FreeVC variant; ``convert`` also reads
+# ``hps.data`` for audio loading and mel settings.
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+# Build the synthesizer from the config values and move it to ``device``.
+freevc_24 = SynthesizerTrn(
+    hps.data.filter_length // 2 + 1,
+    hps.train.segment_size // hps.data.hop_length,
+    **hps.model).to(device)
+# ``_`` discards the return values of the next two calls.
+_ = freevc_24.eval()
+# NOTE(review): the third argument is None -- presumably the optimizer slot of
+# ``utils.load_checkpoint``; confirm against utils.py.
+_ = utils.load_checkpoint("checkpoint/freevc-24.pth", freevc_24, None)
+print("Loading WavLM for content...")
+# WavLM model whose ``last_hidden_state`` is used as content in ``convert``.
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+def convert(model, src, tgt):
+    """Run FreeVC voice conversion and write the result to ``out.wav``.
+
+    Args:
+        model: Variant name. "FreeVC" and "FreeVC-s" select the ``freevc`` and
+            ``freevc_s`` synthesizers; any other value selects ``freevc_24``.
+        src: Path of the audio passed through ``cmodel`` (WavLM) for content.
+        tgt: Path of the audio used as the conversion target (speaker
+            embedding, or mel spectrogram for "FreeVC-s").
+
+    Returns:
+        The string "out.wav", the file the converted audio is written to.
+
+    Raises:
+        RuntimeError: If the selected synthesizer is not defined in this module.
+    """
+    # Only ``freevc_24`` is built at import time in this file. Resolve the
+    # synthesizer by name before any heavy work so an unavailable variant fails
+    # with a clear message rather than a NameError after the WavLM forward pass.
+    synth_name = {"FreeVC": "freevc", "FreeVC-s": "freevc_s"}.get(model, "freevc_24")
+    synthesizer = globals().get(synth_name)
+    if synthesizer is None:
+        raise RuntimeError(
+            f"Voice-conversion model {model!r} is not loaded ({synth_name} is undefined)."
+        )
+    # "FreeVC-s" is conditioned on a target mel spectrogram; every other variant
+    # is conditioned on a speaker embedding. (Previously an unrecognised name
+    # built the mel and then read an unset ``g_tgt`` at inference time.)
+    uses_mel = model == "FreeVC-s"
+    with torch.no_grad():
+        # tgt
+        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+        if uses_mel:
+            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
+            mel_tgt = mel_spectrogram_torch(
+                wav_tgt,
+                hps.data.filter_length,
+                hps.data.n_mel_channels,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                hps.data.mel_fmin,
+                hps.data.mel_fmax
+            )
+        else:
+            g_tgt = smodel.embed_utterance(wav_tgt)
+            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+        # src
+        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+        # infer
+        if uses_mel:
+            audio = synthesizer.infer(c, mel=mel_tgt)
+        else:
+            audio = synthesizer.infer(c, g=g_tgt)
+        audio = audio[0][0].data.cpu().float().numpy()
+        # "FreeVC" / "FreeVC-s" are written at the config sampling rate; every
+        # other variant is written at a fixed 24000 Hz.
+        if model == "FreeVC" or model == "FreeVC-s":
+            write("out.wav", hps.data.sampling_rate, audio)
+        else:
+            write("out.wav", 24000, audio)
+    out = "out.wav"
+    return out
+# BofanAi
+language_dict = tts_order_voice
 # fix timezone in Linux
 os.environ["TZ"] = "Asia/Shanghai"
 try:
     # Windows
     logger.warning("Windows, cant run time.tzset()")
+# model_name = "THUDM/chatglm2-6b"
+model_name = "fb700/chatglm-fitness-RLHF"
 RETRY_FLAG = False
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
+# 4/8 bit
+# model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).quantize(4).cuda()
+# Load the chat model named by ``model_name``: on CUDA it is moved to the GPU
+# and converted with ``.half()``; without CUDA it is converted with ``.float()``.
+has_cuda = torch.cuda.is_available()
+# has_cuda = False  # force cpu
+if has_cuda:
+    model_glm = (
+        AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().half()
+    )  # 3.92G
+else:
+    # CPU fallback.
+    model_glm = AutoModel.from_pretrained(
+        model_name, trust_remote_code=True
+    ).float()  # .float() .half().float()
 _ = """Override Chatbot.postprocess"""
 def parse_text(text):
+    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
     lines = text.split("\n")
     lines = [line for line in lines if line != ""]
     count = 0
             yield chatbot, history, past_key_values
         # """
         yield chatbot, history, past_key_values
+    for response, history, past_key_values in model_glm.stream_chat(
         tokenizer,
         input,
         history,
         top_p=top_p,
         temperature=temperature,
     ):
         chatbot[-1] = (parse_text(input), parse_text(response))
+        # chatbot[-1][-1] = parse_text(response)
+        yield chatbot, history, past_key_values, parse_text(response)
+def trans_api(input, max_length=4096, top_p=0.8, temperature=0.2):
     if max_length < 10:
+        max_length = 4096
     if top_p < 0.1 or top_p > 1:
         top_p = 0.85
     if temperature <= 0 or temperature > 1:
         temperature = 0.01
     try:
+        res, _ = model_glm.chat(
             tokenizer,
             input,
             history=[],
 def reset_state():
+    # Cleared values, in order: two empty lists, None, and an empty string.
+    # The clear-history button wires these to
+    # [chatbot, history, past_key_values, test1].
+    return [], [], None, ""
 # Delete last turn
         past_key_values,
     )
+# print
+def print(text):
+    """Return *text* unchanged.
+
+    NOTE(review): this rebinds the module-level name ``print``. Once this
+    definition has executed, ``print(...)`` calls in this module resolve to
+    this function and return their argument instead of writing to stdout.
+    """
+    return text
+# TTS
+async def text_to_speech_edge(text, language_code):
+    """Synthesize *text* to an MP3 file with edge-tts and return the file path.
+
+    ``language_code`` is looked up in ``language_dict``; the mapped value is
+    passed to ``edge_tts.Communicate`` as the voice. The temporary file is
+    created with ``delete=False`` and is not removed here.
+    """
+    speaker = language_dict[language_code]
+    synthesis = edge_tts.Communicate(text, speaker)
+    # Reserve a named ".mp3" path; the handle is closed before saving into it.
+    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    try:
+        mp3_path = handle.name
+    finally:
+        handle.close()
+    await synthesis.save(mp3_path)
+    return mp3_path
+with gr.Blocks(title="Bofan Ai", theme=gr.themes.Soft(text_size="sm"), analytics_enabled=False) as demo:
+    gr.HTML("<center>"
+            "<h1>📺💕🎶 - BofanAi+声音克隆+视频对话：和喜欢的角色畅所欲言吧！</h1>"
+            "</center>"
+            """<center><a href="https://huggingface.co/fb700/chatglm-fitness-RLHF">Bofan基于chatglm-6的微调模型</a>如果喜欢请给个 ❤ 。遇到任何问题可邮件和我联系👉 [email protected]</center>"""
+    )
+    # Disclaimer banner. It must sit at the ``with gr.Blocks`` body indent
+    # (4 spaces, like the ``gr.HTML`` call above); the previous 7-space indent
+    # is an "unexpected indent" IndentationError when the module is loaded.
+    gr.Markdown("## <center>帛凡 Fitness AI 演示</center>"
+        """<center>特别声明：本应用仅为模型能力演示，无任何商业行为，部署资源为Huggingface官方免费提供，任何通过此项目产生的知识仅用于学术参考，作者和网站均不承担任何责任。</center>"""
+    )
+    with gr.Tab("🍻 - BofanAi聊天区"):
+        with gr.Accordion("📒 相关信息", open=False):
+            _ = f""" BofanAi的可选参数信息：
+                * Low temperature: responses will be more deterministic and focused; High temperature: responses more creative.
+                * Suggested temperatures -- translation: up to 0.3; chatting: > 0.4
+                * Top P controls dynamic vocabulary selection based on context.\n
+                如果您想让BofanAi进行角色扮演并与之对话，请先输入恰当的提示词，如“请你扮演成动漫角色蜡笔小新并和我进行对话”；您也可以为BofanAi提供自定义的角色设定\n
+                当您使用声音克隆功能时，请先在此程序的对应位置上传一段您喜欢的音频
+                ## {model_name}
+                ChatGLM-6B 是开源中英双语对话模型，本次训练基于ChatGLM-6B 的第一代版本，在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上开展训练。
+                本项目经过多位网友实测，中文总结能力超越了GPT3.5各版本，健康咨询水平优于其它同量级模型，且经优化目前可以支持无限context，远大于4k、8K、16K......，可能是任何个人和中小企业首选模型。
+                *首先，用40万条高质量数据进行强化训练，以提高模型的基础能力；
+                *第二，使用30万条人类反馈数据，构建一个表达方式规范优雅的语言模式（RM模型）；
+                *第三，在保留SFT阶段三分之一训练数据的同时，增加了30万条fitness数据，叠加RM模型，对ChatGLM-6B进行强化训练。
+                通过训练我们对模型有了更深刻的认知，LLM在一直在进化，好的方法和数据可以挖掘出模型的更大潜能。
+                训练中特别强化了中英文学术论文的翻译和总结，可以成为普通用户和科研人员的得力助手。
+                免责声明：本应用仅为模型能力演示，无任何商业行为，部署资源为huggingface官方免费提供，任何通过此项目产生的知识仅用于学术参考，作者和网站均不承担任何责任   。
+                The T4 GPU is sponsored by a community GPU grant from Huggingface. Thanks a lot!
+                [模型下载地址](https://huggingface.co/fb700/chatglm-fitness-RLHF)
+                """
+            gr.Markdown(dedent(_))
+        chatbot = gr.Chatbot(height=300)
+        with gr.Row():
+            with gr.Column(scale=4):
+                with gr.Column(scale=12):
+                    user_input = gr.Textbox(
+                        label="请在此处和BofanAi聊天 (按回车键即可发送)",
+                        placeholder="聊点什么吧",
+                    )
+                    RETRY_FLAG = gr.Checkbox(value=False, visible=False)
+        with gr.Column(min_width=32, scale=1):
+            with gr.Row():
+                submitBtn = gr.Button("开始和BofanAi交流吧", variant="primary")
+                deleteBtn = gr.Button("删除最新一轮对话", variant="secondary")
+                retryBtn = gr.Button("重新生成最新一轮对话", variant="secondary")
+        with gr.Accordion("🔧 更多设置", open=False):
+            with gr.Row():
+                emptyBtn = gr.Button("清空所有聊天记录")
+                max_length = gr.Slider(
+                    0,
+                    32768,
+                    value=8192,
+                    step=1.0,
+                    label="Maximum length",
+                    interactive=True,
+                )
+                top_p = gr.Slider(
+                    0, 1, value=0.2, step=0.01, label="Top P", interactive=True
+                )
+                temperature = gr.Slider(
+                    0.01, 1, value=0.85, step=0.01, label="Temperature", interactive=True
+                )
+        with gr.Row():
+            test1 = gr.Textbox(label="BofanAi的最新回答 (可编辑)", lines = 3)
+            with gr.Column():
+                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
+                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
+            output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
+        tts_btn.click(text_to_speech_edge, inputs=[test1, language], outputs=[output_audio])
+        with gr.Row():
+            model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
+            audio1 = output_audio
+            audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
+            clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
+            audio_cloned =  gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
+        clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
+        history = gr.State([])
+        past_key_values = gr.State(None)
+        user_input.submit(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+        )
+        submitBtn.click(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+            api_name="predict",
+        )
+        submitBtn.click(reset_user_input, [], [user_input])
+        emptyBtn.click(
+            reset_state, outputs=[chatbot, history, past_key_values, test1], show_progress="full"
+        )
+        retryBtn.click(
+            retry_last_answer,
+            inputs=[
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            # outputs = [chatbot, history, last_user_message, user_message]
+            outputs=[chatbot, history, past_key_values, test1],
+        )
+        deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
+        with gr.Accordion("📔 模型应用测试", open=True):
+            etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
+            examples = gr.Examples(
+                examples=[
+                    etext0 = """ "act": "作为基于文本的冒险游戏",\n "prompt": "我想让你扮演一个基于文本的冒险游戏。我在这个基于文本的冒险游戏中扮演一个角色。请尽可能具体地描述角色所看到的内容和环境，并在游戏输出1、2、3让用户选择进行回复，而不是其它方式。我将输入命令来告诉角色该做什么，而你需要回复角色的行动结果以推动游戏的进行。我的第一个命令是'醒来'，请从这里开始故事 “ """
         etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
         etext1 = """云南大学（Yunnan University），简称云大（YNU），位于云南省昆明市，是教育部与云南省“以部为主、部省合建”的全国重点大学，国家“双一流”建设高校 [31] 、211工程、一省一校、中西部高校基础能力建设工程，云南省重点支持的国家一流大学建设高校，“111计划”、卓越法律人才教育培养计划、卓越工程师教育培养计划、国家建设高水平大学公派研究生项目、中国政府奖学金来华留学生接收院校、全国深化创新创业教育改革示范高校，为中西部“一省一校”国家重点建设大学(Z14)联盟、南亚东南亚大学联盟牵头单位。 [1]
 云南大学始建于1922年，时为私立东陆大学。1930年，改为省立东陆大学。1934年更名为省立云南大学。1938年改为国立云南大学。1946年，《不列颠百科全书》将云南大学列为中国15所在世界最具影响的大学之一。1950年定名为云南大学。1958年，云南大学由中央高教部划归云南省管理。1978年，云南大学被国务院确定为88所全国重点大学之一。1996年首批列入国家“211工程”重点建设大学。1999年，云南政法高等专科学校并入云南大学。 [2] [23]
                 ["Erkläre die Handlung von Cinderella in einem Satz."],
                 ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
             ],
+                inputs=[user_input],
+                examples_per_page=50,
+            )
+        with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+            input_text = gr.Text()
+            tr_btn = gr.Button("Go", variant="primary")
+            out_text = gr.Text()
+        tr_btn.click(
+            trans_api,
+            [input_text, max_length, top_p, temperature],
+            out_text,
+            # show_progress="full",
+            api_name="tr",
         )
+        _ = """
+        input_text.submit(
+            trans_api,
+            [input_text, max_length, top_p, temperature],
+            out_text,
+            show_progress="full",
+            api_name="tr1",
+        )
+        # """
+    with gr.Tab("📺 - 视频聊天区"):
+        with gr.Row().style(equal_height=False):
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_source_image"):
+                    with gr.TabItem('图片上传'):
+                        with gr.Row():
+                            source_image = gr.Image(label="请上传一张您喜欢角色的图片", source="upload", type="filepath", elem_id="img2img_image").style(width=512)
+                with gr.Tabs(elem_id="sadtalker_driven_audio"):
+                    with gr.TabItem('💡您还可以将视频下载到本地'):
+                        with gr.Row():
+                            driven_audio = audio_cloned
+                            driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
+                            with gr.Column():
+                                use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
+                                length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.", visible=False)
+                                use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
+                        with gr.Row():
+                            ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref", visible=False).style(width=512)
+                            with gr.Column():
+                                use_ref_video = gr.Checkbox(label="Use Reference Video", visible=False)
+                                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))", visible=False)
+                            ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_checkbox"):
+                    with gr.TabItem('视频设置'):
+                        with gr.Column(variant='panel'):
+                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
+                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
+                            with gr.Row():
+                                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0, visible=False) #
+                                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1, visible=False) #
+                                blink_every = gr.Checkbox(label="use eye blink", value=True, visible=False)
+                            with gr.Row():
+                                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?", visible=False) #
+                                preprocess_type = gr.Radio(['crop', 'full'], value='crop', label='是否聚焦角色面部', info="crop：视频会聚焦角色面部；full：视频会显示图片全貌")
+                            with gr.Row():
+                                is_still_mode = gr.Checkbox(label="静态模式 (开启静态模式，角色的面部动作会减少；默认开启)", value=True)
+                                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?", visible=False)
+                            with gr.Row():
+                                batch_size = gr.Slider(label="Batch size (数值越大，生成速度越快；若显卡性能好，可增大数值)", step=1, maximum=32, value=2)
+                                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer", value=True, visible=False)
+                            submit = gr.Button('开始视频聊天吧', elem_id="sadtalker_generate", variant='primary')
+                with gr.Tabs(elem_id="sadtalker_genearted"):
+                        gen_video = gr.Video(label="为您生成的专属视频", format="mp4").style(width=256)
+        submit.click(
+                fn=sad_talker.test,
+                inputs=[source_image,
+                        driven_audio,
+                        preprocess_type,
+                        is_still_mode,
+                        enhancer,
+                        batch_size,
+                        size_of_image,
+                        pose_style,
+                        facerender,
+                        exp_weight,
+                        use_ref_video,
+                        ref_video,
+                        ref_info,
+                        use_idle_mode,
+                        length_of_audio,
+                        blink_every
+                        ],
+                outputs=[gen_video]
+                )
+    gr.Markdown("### <center>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</center>")
+    gr.Markdown("<center>💡- 如何使用此程序：输入您对ChatGLM的提问后，依次点击“开始和BofanAi交流吧”、“生成对应的音频吧”、“开始AI声音克隆吧”、“开始视频聊天吧”四个按键即可；使用声音克隆功能时，请先上传一段您喜欢的音频</center>")
+    gr.HTML('''
+        <div class="footer">
+                    <p>🌊🏞️🎶 - 江水东流急，滔滔无尽声。 明·顾璘
+                    </p>
+        </div>
+    ''')
+demo.queue().launch(show_error=True, debug=True)

checkpoint/__init__.py ADDED Viewed

File without changes

checkpoint/freevc-24.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b39a86fefbc9ec6e30be8d26ee2a6aa5ffe6d235f6ab15773d01cdf348e5b20
+size 472644351

checkpoints/BFM_Fitting/01_MorphableModel.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/BFM09_model_info.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/BFM_exp_idx.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/BFM_front_idx.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/facemodel_info.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/select_vertex_id.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/similarity_Lm3D_all.mat ADDED Viewed

File without changes

checkpoints/BFM_Fitting/std_exp.txt ADDED Viewed

File without changes

checkpoints/shape_predictor_68_face_landmarks.dat ADDED Viewed

File without changes

commons.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+def init_weights(m, mean=0.0, std=0.01):
+  classname = m.__class__.__name__
+  if classname.find("Conv") != -1:
+    m.weight.data.normal_(mean, std)
+def get_padding(kernel_size, dilation=1):
+  return int((kernel_size*dilation - dilation)/2)
+def convert_pad_shape(pad_shape):
+  l = pad_shape[::-1]
+  pad_shape = [item for sublist in l for item in sublist]
+  return pad_shape
+def intersperse(lst, item):
+  result = [item] * (len(lst) * 2 + 1)
+  result[1::2] = lst
+  return result
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+  """KL(P||Q)"""
+  kl = (logs_q - logs_p) - 0.5
+  kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+  return kl
+def rand_gumbel(shape):
+  """Sample from the Gumbel distribution, protect from overflows."""
+  uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+  return -torch.log(-torch.log(uniform_samples))
+def rand_gumbel_like(x):
+  g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+  return g
+def slice_segments(x, ids_str, segment_size=4):
+  ret = torch.zeros_like(x[:, :, :segment_size])
+  for i in range(x.size(0)):
+    idx_str = ids_str[i]
+    idx_end = idx_str + segment_size
+    ret[i] = x[i, :, idx_str:idx_end]
+  return ret
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+  b, d, t = x.size()
+  if x_lengths is None:
+    x_lengths = t
+  ids_str_max = x_lengths - segment_size + 1
+  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+  ret = slice_segments(x, ids_str, segment_size)
+  return ret, ids_str
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+  b, d, t = x.size()
+  if x_lengths is None:
+    x_lengths = t
+  ids_str_max = x_lengths - segment_size
+  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+  ret = slice_segments(x, ids_str, segment_size)
+  return ret, ids_str
+def get_timing_signal_1d(
+    length, channels, min_timescale=1.0, max_timescale=1.0e4):
+  position = torch.arange(length, dtype=torch.float)
+  num_timescales = channels // 2
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (num_timescales - 1))
+  inv_timescales = min_timescale * torch.exp(
+      torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+  scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+  signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+  signal = F.pad(signal, [0, 0, 0, channels % 2])
+  signal = signal.view(1, channels, length)
+  return signal
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+  b, channels, length = x.size()
+  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+  return x + signal.to(dtype=x.dtype, device=x.device)
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+  b, channels, length = x.size()
+  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+  return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+def subsequent_mask(length):
+  mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+  return mask
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+  """Gated activation: tanh(first channels) * sigmoid(remaining channels) of a + b.
+
+  ``n_channels`` is indexed with ``[0]`` to get the split point along
+  dimension 1. The first ``n_channels[0]`` channels of ``input_a + input_b``
+  go through tanh, the rest through sigmoid, and the two are multiplied.
+  """
+  n_channels_int = n_channels[0]
+  in_act = input_a + input_b
+  # Split along the channel axis (dim 1) into the tanh and sigmoid halves.
+  t_act = torch.tanh(in_act[:, :n_channels_int, :])
+  s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+  acts = t_act * s_act
+  return acts
+# NOTE(review): this repeats, with an identical body, the definition of
+# convert_pad_shape that appears earlier in this module; this later binding
+# is the one in effect after the module has loaded.
+def convert_pad_shape(pad_shape):
+  """Flatten a list of per-dimension pad pairs, last dimension first."""
+  l = pad_shape[::-1]
+  pad_shape = [item for sublist in l for item in sublist]
+  return pad_shape
+def shift_1d(x):
+  x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+  return x
+def sequence_mask(length, max_length=None):
+  if max_length is None:
+    max_length = length.max()
+  x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+  return x.unsqueeze(0) < length.unsqueeze(1)
+def generate_path(duration, mask):
+  """
+  Build a path tensor from per-step durations.
+
+  duration: [b, 1, t_x]
+  mask: [b, 1, t_y, t_x]
+
+  Returns a tensor shaped like ``mask`` (same dtype), already multiplied by it.
+  """
+  device = duration.device
+  b, _, t_y, t_x = mask.shape
+  # Running end position of every input step along the last axis.
+  cum_duration = torch.cumsum(duration, -1)
+  cum_duration_flat = cum_duration.view(b * t_x)
+  # One row per (batch, step): True for positions below the cumulative end.
+  path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+  path = path.view(b, t_x, t_y)
+  # Subtract the previous step's row so that only the span belonging to the
+  # current step remains set.
+  path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+  # Reorder to [b, 1, t_y, t_x] and apply the mask.
+  path = path.unsqueeze(1).transpose(2,3) * mask
+  return path
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+  if isinstance(parameters, torch.Tensor):
+    parameters = [parameters]
+  parameters = list(filter(lambda p: p.grad is not None, parameters))
+  norm_type = float(norm_type)
+  if clip_value is not None:
+    clip_value = float(clip_value)
+  total_norm = 0
+  for p in parameters:
+    param_norm = p.grad.data.norm(norm_type)
+    total_norm += param_norm.item() ** norm_type
+    if clip_value is not None:
+      p.grad.data.clamp_(min=-clip_value, max=clip_value)
+  total_norm = total_norm ** (1. / norm_type)
+  return total_norm

configs/freevc-24.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 10000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 8640,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 128,
+    "port": "8008"
+  },
+  "data": {
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 16000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,4,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 1024,
+    "use_spk": true
+  }
+}

mel_processing.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+MAX_WAV_VALUE = 32768.0
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+def dynamic_range_decompression_torch(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
+def spectral_normalize_torch(magnitudes):
+    output = dynamic_range_compression_torch(magnitudes)
+    return output
+def spectral_de_normalize_torch(magnitudes):
+    output = dynamic_range_decompression_torch(magnitudes)
+    return output
+# Module-level caches filled lazily by the spectrogram functions below.
+# Both are keyed by strings assembled inside those functions from their
+# arguments plus the tensor dtype and device.
+mel_basis = {}  # cached mel filterbank tensors
+hann_window = {}  # cached Hann window tensors
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+    if torch.min(y) < -1.:
+        print('min value is ', torch.min(y))
+    if torch.max(y) > 1.:
+        print('max value is ', torch.max(y))
+    global hann_window
+    dtype_device = str(y.dtype) + '_' + str(y.device)
+    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = y.squeeze(1)
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+    return spec
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+    global mel_basis
+    dtype_device = str(spec.dtype) + '_' + str(spec.device)
+    fmax_dtype_device = str(fmax) + '_' + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec
+def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
+    if torch.min(y) < -1.:
+        print('min value is ', torch.min(y))
+    if torch.max(y) > 1.:
+        print('max value is ', torch.max(y))
+    global mel_basis, hann_window
+    dtype_device = str(y.dtype) + '_' + str(y.device)
+    fmax_dtype_device = str(fmax) + '_' + dtype_device
+    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = y.squeeze(1)
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec

models.py ADDED Viewed

	@@ -0,0 +1,351 @@

+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+import commons
+import modules
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from commons import init_weights, get_padding
+class ResidualCouplingBlock(nn.Module):
+  def __init__(self,
+      channels,
+      hidden_channels,
+      kernel_size,
+      dilation_rate,
+      n_layers,
+      n_flows=4,
+      gin_channels=0):
+    super().__init__()
+    self.channels = channels
+    self.hidden_channels = hidden_channels
+    self.kernel_size = kernel_size
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.n_flows = n_flows
+    self.gin_channels = gin_channels
+    self.flows = nn.ModuleList()
+    for i in range(n_flows):
+      self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+      self.flows.append(modules.Flip())
+  def forward(self, x, x_mask, g=None, reverse=False):
+    if not reverse:
+      for flow in self.flows:
+        x, _ = flow(x, x_mask, g=g, reverse=reverse)
+    else:
+      for flow in reversed(self.flows):
+        x = flow(x, x_mask, g=g, reverse=reverse)
+    return x
+class Encoder(nn.Module):
+  def __init__(self,
+      in_channels,
+      out_channels,
+      hidden_channels,
+      kernel_size,
+      dilation_rate,
+      n_layers,
+      gin_channels=0):
+    super().__init__()
+    self.in_channels = in_channels
+    self.out_channels = out_channels
+    self.hidden_channels = hidden_channels
+    self.kernel_size = kernel_size
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.gin_channels = gin_channels
+    self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+    self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+  def forward(self, x, x_lengths, g=None):
+    x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+    x = self.pre(x) * x_mask
+    x = self.enc(x, x_mask, g=g)
+    stats = self.proj(x) * x_mask
+    m, logs = torch.split(stats, self.out_channels, dim=1)
+    z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+    return z, m, logs, x_mask
+class Generator(torch.nn.Module):
+    """Upsampling decoder: pre-conv, transposed-conv stages with residual blocks, post-conv and tanh."""
+    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        # The string '1' selects ResBlock1; any other value selects ResBlock2.
+        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        # Each upsampling stage halves the channel count.
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(weight_norm(
+                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+                                k, u, padding=(k-u)//2)))
+        self.resblocks = nn.ModuleList()
+        # num_kernels residual blocks per upsampling stage, stored flat.
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel//(2**(i+1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d))
+        # `ch` still holds the channel count of the last stage here.
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        # The conditioning projection exists only when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, g=None):
+        """Run the decoder on ``x``; ``g`` (if given) is projected by ``self.cond`` and added after the pre-conv."""
+        x = self.conv_pre(x)
+        if g is not None:
+          x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            # Average the outputs of this stage's residual blocks.
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i*self.num_kernels+j](x)
+                else:
+                    xs += self.resblocks[i*self.num_kernels+j](x)
+            x = xs / self.num_kernels
+        # NOTE: this final leaky_relu uses the function's default slope,
+        # not modules.LRELU_SLOPE.
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        """Strip weight normalisation from the upsampling layers and residual blocks."""
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+        ])
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+    def forward(self, x):
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0: # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2,3,5,7,11]
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+        self.discriminators = nn.ModuleList(discs)
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class SpeakerEncoder(torch.nn.Module):
+    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
+        super(SpeakerEncoder, self).__init__()
+        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+        self.relu = nn.ReLU()
+    def forward(self, mels):
+        self.lstm.flatten_parameters()
+        _, (hidden, _) = self.lstm(mels)
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+        mel_slices = []
+        for i in range(0, total_frames-partial_frames, partial_hop):
+            mel_range = torch.arange(i, i+partial_frames)
+            mel_slices.append(mel_range)
+        return mel_slices
+    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
+        mel_len = mel.size(1)
+        last_mel = mel[:,-partial_frames:]
+        if mel_len > partial_frames:
+            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+            mels = list(mel[:,s] for s in mel_slices)
+            mels.append(last_mel)
+            mels = torch.stack(tuple(mels), 0).squeeze(1)
+            with torch.no_grad():
+                partial_embeds = self(mels)
+            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
+            #embed = embed / torch.linalg.norm(embed, 2)
+        else:
+            with torch.no_grad():
+                embed = self(last_mel)
+        return embed
+class SynthesizerTrn(nn.Module):
+  """
+  Synthesizer for Training
+  """
+  def __init__(self,
+    spec_channels,
+    segment_size,
+    inter_channels,
+    hidden_channels,
+    filter_channels,
+    n_heads,
+    n_layers,
+    kernel_size,
+    p_dropout,
+    resblock,
+    resblock_kernel_sizes,
+    resblock_dilation_sizes,
+    upsample_rates,
+    upsample_initial_channel,
+    upsample_kernel_sizes,
+    gin_channels,
+    ssl_dim,
+    use_spk,
+    **kwargs):
+    super().__init__()
+    self.spec_channels = spec_channels
+    self.inter_channels = inter_channels
+    self.hidden_channels = hidden_channels
+    self.filter_channels = filter_channels
+    self.n_heads = n_heads
+    self.n_layers = n_layers
+    self.kernel_size = kernel_size
+    self.p_dropout = p_dropout
+    self.resblock = resblock
+    self.resblock_kernel_sizes = resblock_kernel_sizes
+    self.resblock_dilation_sizes = resblock_dilation_sizes
+    self.upsample_rates = upsample_rates
+    self.upsample_initial_channel = upsample_initial_channel
+    self.upsample_kernel_sizes = upsample_kernel_sizes
+    self.segment_size = segment_size
+    self.gin_channels = gin_channels
+    self.ssl_dim = ssl_dim
+    self.use_spk = use_spk
+    self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
+    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+    self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+    if not self.use_spk:
+      self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
+  def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
+    if c_lengths == None:
+      c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+    if spec_lengths == None:
+      spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
+    if not self.use_spk:
+      g = self.enc_spk(mel.transpose(1,2))
+    g = g.unsqueeze(-1)
+    _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
+    z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+    z_p = self.flow(z, spec_mask, g=g)
+    z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
+    o = self.dec(z_slice, g=g)
+    return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+  def infer(self, c, g=None, mel=None, c_lengths=None):
+    if c_lengths == None:
+      c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+    if not self.use_spk:
+      g = self.enc_spk.embed_utterance(mel.transpose(1,2))
+    g = g.unsqueeze(-1)
+    z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
+    z = self.flow(z_p, c_mask, g=g, reverse=True)
+    o = self.dec(z * c_mask, g=g)
+    return o

modules.py ADDED Viewed

	@@ -0,0 +1,342 @@

+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+import commons
+from commons import init_weights, get_padding
+LRELU_SLOPE = 0.1
+class LayerNorm(nn.Module):
+  def __init__(self, channels, eps=1e-5):
+    super().__init__()
+    self.channels = channels
+    self.eps = eps
+    self.gamma = nn.Parameter(torch.ones(channels))
+    self.beta = nn.Parameter(torch.zeros(channels))
+  def forward(self, x):
+    x = x.transpose(1, -1)
+    x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+    return x.transpose(1, -1)
+class ConvReluNorm(nn.Module):
+  def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+    super().__init__()
+    self.in_channels = in_channels
+    self.hidden_channels = hidden_channels
+    self.out_channels = out_channels
+    self.kernel_size = kernel_size
+    self.n_layers = n_layers
+    self.p_dropout = p_dropout
+    assert n_layers > 1, "Number of layers should be larger than 0."
+    self.conv_layers = nn.ModuleList()
+    self.norm_layers = nn.ModuleList()
+    self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+    self.norm_layers.append(LayerNorm(hidden_channels))
+    self.relu_drop = nn.Sequential(
+        nn.ReLU(),
+        nn.Dropout(p_dropout))
+    for _ in range(n_layers-1):
+      self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+      self.norm_layers.append(LayerNorm(hidden_channels))
+    self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+    self.proj.weight.data.zero_()
+    self.proj.bias.data.zero_()
+  def forward(self, x, x_mask):
+    x_org = x
+    for i in range(self.n_layers):
+      x = self.conv_layers[i](x * x_mask)
+      x = self.norm_layers[i](x)
+      x = self.relu_drop(x)
+    x = x_org + self.proj(x)
+    return x * x_mask
+class DDSConv(nn.Module):
+  """
+  Dialted and Depth-Separable Convolution
+  """
+  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+    super().__init__()
+    self.channels = channels
+    self.kernel_size = kernel_size
+    self.n_layers = n_layers
+    self.p_dropout = p_dropout
+    self.drop = nn.Dropout(p_dropout)
+    self.convs_sep = nn.ModuleList()
+    self.convs_1x1 = nn.ModuleList()
+    self.norms_1 = nn.ModuleList()
+    self.norms_2 = nn.ModuleList()
+    for i in range(n_layers):
+      dilation = kernel_size ** i
+      padding = (kernel_size * dilation - dilation) // 2
+      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+          groups=channels, dilation=dilation, padding=padding
+      ))
+      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+      self.norms_1.append(LayerNorm(channels))
+      self.norms_2.append(LayerNorm(channels))
+  def forward(self, x, x_mask, g=None):
+    if g is not None:
+      x = x + g
+    for i in range(self.n_layers):
+      y = self.convs_sep[i](x * x_mask)
+      y = self.norms_1[i](y)
+      y = F.gelu(y)
+      y = self.convs_1x1[i](y)
+      y = self.norms_2[i](y)
+      y = F.gelu(y)
+      y = self.drop(y)
+      x = x + y
+    return x * x_mask
+class WN(torch.nn.Module):
+  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+    super(WN, self).__init__()
+    assert(kernel_size % 2 == 1)
+    self.hidden_channels =hidden_channels
+    self.kernel_size = kernel_size,
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.gin_channels = gin_channels
+    self.p_dropout = p_dropout
+    self.in_layers = torch.nn.ModuleList()
+    self.res_skip_layers = torch.nn.ModuleList()
+    self.drop = nn.Dropout(p_dropout)
+    if gin_channels != 0:
+      cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
+      self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+    for i in range(n_layers):
+      dilation = dilation_rate ** i
+      padding = int((kernel_size * dilation - dilation) / 2)
+      in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
+                                 dilation=dilation, padding=padding)
+      in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+      self.in_layers.append(in_layer)
+      # last one is not necessary
+      if i < n_layers - 1:
+        res_skip_channels = 2 * hidden_channels
+      else:
+        res_skip_channels = hidden_channels
+      res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+      res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+      self.res_skip_layers.append(res_skip_layer)
+  def forward(self, x, x_mask, g=None, **kwargs):
+    output = torch.zeros_like(x)
+    n_channels_tensor = torch.IntTensor([self.hidden_channels])
+    if g is not None:
+      g = self.cond_layer(g)
+    for i in range(self.n_layers):
+      x_in = self.in_layers[i](x)
+      if g is not None:
+        cond_offset = i * 2 * self.hidden_channels
+        g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+      else:
+        g_l = torch.zeros_like(x_in)
+      acts = commons.fused_add_tanh_sigmoid_multiply(
+          x_in,
+          g_l,
+          n_channels_tensor)
+      acts = self.drop(acts)
+      res_skip_acts = self.res_skip_layers[i](acts)
+      if i < self.n_layers - 1:
+        res_acts = res_skip_acts[:,:self.hidden_channels,:]
+        x = (x + res_acts) * x_mask
+        output = output + res_skip_acts[:,self.hidden_channels:,:]
+      else:
+        output = output + res_skip_acts
+    return output * x_mask
+  def remove_weight_norm(self):
+    if self.gin_channels != 0:
+      torch.nn.utils.remove_weight_norm(self.cond_layer)
+    for l in self.in_layers:
+      torch.nn.utils.remove_weight_norm(l)
+    for l in self.res_skip_layers:
+     torch.nn.utils.remove_weight_norm(l)
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1])))
+        ])
+        self.convs.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+class Log(nn.Module):
+  def forward(self, x, x_mask, reverse=False, **kwargs):
+    if not reverse:
+      y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+      logdet = torch.sum(-y, [1, 2])
+      return y, logdet
+    else:
+      x = torch.exp(x) * x_mask
+      return x
+class Flip(nn.Module):
+  def forward(self, x, *args, reverse=False, **kwargs):
+    x = torch.flip(x, [1])
+    if not reverse:
+      logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+      return x, logdet
+    else:
+      return x
+class ElementwiseAffine(nn.Module):
+  def __init__(self, channels):
+    super().__init__()
+    self.channels = channels
+    self.m = nn.Parameter(torch.zeros(channels,1))
+    self.logs = nn.Parameter(torch.zeros(channels,1))
+  def forward(self, x, x_mask, reverse=False, **kwargs):
+    if not reverse:
+      y = self.m + torch.exp(self.logs) * x
+      y = y * x_mask
+      logdet = torch.sum(self.logs * x_mask, [1,2])
+      return y, logdet
+    else:
+      x = (x - self.m) * torch.exp(-self.logs) * x_mask
+      return x
+class ResidualCouplingLayer(nn.Module):
+  def __init__(self,
+      channels,
+      hidden_channels,
+      kernel_size,
+      dilation_rate,
+      n_layers,
+      p_dropout=0,
+      gin_channels=0,
+      mean_only=False):
+    assert channels % 2 == 0, "channels should be divisible by 2"
+    super().__init__()
+    self.channels = channels
+    self.hidden_channels = hidden_channels
+    self.kernel_size = kernel_size
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.half_channels = channels // 2
+    self.mean_only = mean_only
+    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+    self.post.weight.data.zero_()
+    self.post.bias.data.zero_()
+  def forward(self, x, x_mask, g=None, reverse=False):
+    x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+    h = self.pre(x0) * x_mask
+    h = self.enc(h, x_mask, g=g)
+    stats = self.post(h) * x_mask
+    if not self.mean_only:
+      m, logs = torch.split(stats, [self.half_channels]*2, 1)
+    else:
+      m = stats
+      logs = torch.zeros_like(m)
+    if not reverse:
+      x1 = m + x1 * torch.exp(logs) * x_mask
+      x = torch.cat([x0, x1], 1)
+      logdet = torch.sum(logs, [1,2])
+      return x, logdet
+    else:
+      x1 = (x1 - m) * torch.exp(-logs) * x_mask
+      x = torch.cat([x0, x1], 1)
+      return x

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ libsndfile1

requirements.txt CHANGED Viewed

@@ -1,9 +1,35 @@
 protobuf
-transformers==4.30.2
 cpm_kernels
-torch>=2.0
-# gradio
 mdtex2html
 sentencepiece
 accelerate
-loguru

+torch
+torchvision
+torchaudio
+numpy==1.22.0
+face_alignment==1.3.0
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+librosa==0.8.1
+numba
+resampy==0.3.1
+pydub==0.25.1
+scipy
+kornia==0.6.8
+tqdm
+yacs==0.1.8
+pyyaml
+joblib==1.1.0
+scikit-image==0.19.3
+basicsr==1.4.2
+facexlib==0.3.0
+dlib-bin
+gfpgan
+av
+safetensors
+transformers
+webrtcvad==2.0.10
 protobuf
 cpm_kernels
 mdtex2html
 sentencepiece
 accelerate
+loguru
+edge_tts
+altair
+gradio==3.36.1

speaker_encoder/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

speaker_encoder/audio.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from scipy.ndimage.morphology import binary_dilation
+from speaker_encoder.params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+int16_max = (2 ** 15) - 1
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+                   source_sr: Optional[int] = None):
+    """
+    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+    just .wav), either the waveform as a numpy array of floats.
+    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+    preprocessing. After preprocessing, the waveform's sampling rate will match the data
+    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+    this argument will be ignored.
+    """
+    # Load the wav from disk if needed
+    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
+    else:
+        wav = fpath_or_wav
+    # Resample the wav if needed
+    if source_sr is not None and source_sr != sampling_rate:
+        wav = librosa.resample(wav, source_sr, sampling_rate)
+    # Apply the preprocessing: normalize volume and shorten long silences
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+    return wav
+def wav_to_mel_spectrogram(wav):
+    """
+    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+    Note: this not a log-mel spectrogram.
+    """
+    frames = librosa.feature.melspectrogram(
+        y=wav,
+        sr=sampling_rate,
+        n_fft=int(sampling_rate * mel_window_length / 1000),
+        hop_length=int(sampling_rate * mel_window_step / 1000),
+        n_mels=mel_n_channels
+    )
+    return frames.astype(np.float32).T
+def trim_long_silences(wav):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+    return wav[audio_mask == True]
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+    if increase_only and decrease_only:
+        raise ValueError("Both increase only and decrease only are set")
+    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+        return wav
+    return wav * (10 ** (dBFS_change / 20))

speaker_encoder/ckpt/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

speaker_encoder/ckpt/pretrained_bak_5805000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca
+size 17090379

speaker_encoder/compute_embed.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from speaker_encoder import inference as encoder
+from multiprocessing.pool import Pool
+from functools import partial
+from pathlib import Path
+# from utils import logmmse
+# from tqdm import tqdm
+# import numpy as np
+# import librosa
+def embed_utterance(fpaths, encoder_model_fpath):
+    if not encoder.is_loaded():
+        encoder.load_model(encoder_model_fpath)
+    # Compute the speaker embedding of the utterance
+    wav_fpath, embed_fpath = fpaths
+    wav = np.load(wav_fpath)
+    wav = encoder.preprocess_wav(wav)
+    embed = encoder.embed_utterance(wav)
+    np.save(embed_fpath, embed, allow_pickle=False)
+def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
+    wav_dir = outdir_root.joinpath("audio")
+    metadata_fpath = synthesizer_root.joinpath("train.txt")
+    assert wav_dir.exists() and metadata_fpath.exists()
+    embed_dir = synthesizer_root.joinpath("embeds")
+    embed_dir.mkdir(exist_ok=True)
+    # Gather the input wave filepath and the target output embed filepath
+    with metadata_fpath.open("r") as metadata_file:
+        metadata = [line.split("|") for line in metadata_file]
+        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+    # Embed the utterances in separate threads
+    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+    job = Pool(n_processes).imap(func, fpaths)
+    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

speaker_encoder/config.py ADDED Viewed

	@@ -0,0 +1,45 @@

+librispeech_datasets = {
+    "train": {
+        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+        "other": ["LibriSpeech/train-other-500"]
+    },
+    "test": {
+        "clean": ["LibriSpeech/test-clean"],
+        "other": ["LibriSpeech/test-other"]
+    },
+    "dev": {
+        "clean": ["LibriSpeech/dev-clean"],
+        "other": ["LibriSpeech/dev-other"]
+    },
+}
+libritts_datasets = {
+    "train": {
+        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+        "other": ["LibriTTS/train-other-500"]
+    },
+    "test": {
+        "clean": ["LibriTTS/test-clean"],
+        "other": ["LibriTTS/test-other"]
+    },
+    "dev": {
+        "clean": ["LibriTTS/dev-clean"],
+        "other": ["LibriTTS/dev-other"]
+    },
+}
+voxceleb_datasets = {
+    "voxceleb1" : {
+        "train": ["VoxCeleb1/wav"],
+        "test": ["VoxCeleb1/test_wav"]
+    },
+    "voxceleb2" : {
+        "train": ["VoxCeleb2/dev/aac"],
+        "test": ["VoxCeleb2/test_wav"]
+    }
+}
+other_datasets = [
+    "LJSpeech-1.1",
+    "VCTK-Corpus/wav48",
+]
+anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]

speaker_encoder/data_objects/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2	+ from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

speaker_encoder/data_objects/random_cycler.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import random
+class RandomCycler:
+    """
+    Creates an internal copy of a sequence and allows access to its items in a constrained random
+    order. For a source sequence of n items and one or several consecutive queries of a total
+    of m items, the following guarantees hold (one implies the other):
+        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
+        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
+    """
+    def __init__(self, source):
+        if len(source) == 0:
+            raise Exception("Can't create RandomCycler from an empty collection")
+        self.all_items = list(source)
+        self.next_items = []
+    def sample(self, count: int):
+        shuffle = lambda l: random.sample(l, len(l))
+        out = []
+        while count > 0:
+            if count >= len(self.all_items):
+                out.extend(shuffle(list(self.all_items)))
+                count -= len(self.all_items)
+                continue
+            n = min(count, len(self.next_items))
+            out.extend(self.next_items[:n])
+            count -= n
+            self.next_items = self.next_items[n:]
+            if len(self.next_items) == 0:
+                self.next_items = shuffle(list(self.all_items))
+        return out
+    def __next__(self):
+        return self.sample(1)[0]

speaker_encoder/data_objects/speaker.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.utterance import Utterance
+from pathlib import Path
+# Contains the set of utterances of a single speaker
+class Speaker:
+    def __init__(self, root: Path):
+        self.root = root
+        self.name = root.name
+        self.utterances = None
+        self.utterance_cycler = None
+    def _load_utterances(self):
+        with self.root.joinpath("_sources.txt").open("r") as sources_file:
+            sources = [l.split(",") for l in sources_file]
+        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+        self.utterance_cycler = RandomCycler(self.utterances)
+    def random_partial(self, count, n_frames):
+        """
+        Samples a batch of <count> unique partial utterances from the disk in a way that all
+        utterances come up at least once every two cycles and in a random order every time.
+        :param count: The number of partial utterances to sample from the set of utterances from
+        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
+        the number of utterances available.
+        :param n_frames: The number of frames in the partial utterance.
+        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+        frames are the frames of the partial utterances and range is the range of the partial
+        utterance with regard to the complete utterance.
+        """
+        if self.utterances is None:
+            self._load_utterances()
+        utterances = self.utterance_cycler.sample(count)
+        a = [(u,) + u.random_partial(n_frames) for u in utterances]
+        return a

speaker_encoder/data_objects/speaker_batch.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import numpy as np
+from typing import List
+from speaker_encoder.data_objects.speaker import Speaker
+class SpeakerBatch:
+    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+        self.speakers = speakers
+        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])

speaker_encoder/data_objects/speaker_verification_dataset.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.speaker_batch import SpeakerBatch
+from speaker_encoder.data_objects.speaker import Speaker
+from speaker_encoder.params_data import partials_n_frames
+from torch.utils.data import Dataset, DataLoader
+from pathlib import Path
+# TODO: improve with a pool of speakers for data efficiency
+class SpeakerVerificationDataset(Dataset):
+    def __init__(self, datasets_root: Path):
+        self.root = datasets_root
+        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+        if len(speaker_dirs) == 0:
+            raise Exception("No speakers found. Make sure you are pointing to the directory "
+                            "containing all preprocessed speaker directories.")
+        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+        self.speaker_cycler = RandomCycler(self.speakers)
+    def __len__(self):
+        return int(1e10)
+    def __getitem__(self, index):
+        return next(self.speaker_cycler)
+    def get_logs(self):
+        log_string = ""
+        for log_fpath in self.root.glob("*.txt"):
+            with log_fpath.open("r") as log_file:
+                log_string += "".join(log_file.readlines())
+        return log_string
+class SpeakerVerificationDataLoader(DataLoader):
+    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+                 worker_init_fn=None):
+        self.utterances_per_speaker = utterances_per_speaker
+        super().__init__(
+            dataset=dataset,
+            batch_size=speakers_per_batch,
+            shuffle=False,
+            sampler=sampler,
+            batch_sampler=batch_sampler,
+            num_workers=num_workers,
+            collate_fn=self.collate,
+            pin_memory=pin_memory,
+            drop_last=False,
+            timeout=timeout,
+            worker_init_fn=worker_init_fn
+        )
+    def collate(self, speakers):
+        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)

speaker_encoder/data_objects/utterance.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import numpy as np
+class Utterance:
+    def __init__(self, frames_fpath, wave_fpath):
+        self.frames_fpath = frames_fpath
+        self.wave_fpath = wave_fpath
+    def get_frames(self):
+        return np.load(self.frames_fpath)
+    def random_partial(self, n_frames):
+        """
+        Crops the frames into a partial utterance of n_frames
+        :param n_frames: The number of frames of the partial utterance
+        :return: the partial utterance frames and a tuple indicating the start and end of the
+        partial utterance in the complete utterance.
+        """
+        frames = self.get_frames()
+        if frames.shape[0] == n_frames:
+            start = 0
+        else:
+            start = np.random.randint(0, frames.shape[0] - n_frames)
+        end = start + n_frames
+        return frames[start:end], (start, end)

speaker_encoder/hparams.py ADDED Viewed

	@@ -0,0 +1,31 @@

+## Mel-filterbank
+mel_window_length = 25  # In milliseconds
+mel_window_step = 10    # In milliseconds
+mel_n_channels = 40
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160     # 1600 ms
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30  # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3

speaker_encoder/inference.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from speaker_encoder.params_data import *
+from speaker_encoder.model import SpeakerEncoder
+from speaker_encoder.audio import preprocess_wav   # We want to expose this function from here
+from matplotlib import cm
+from speaker_encoder import audio
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+_model = None # type: SpeakerEncoder
+_device = None # type: torch.device
+def load_model(weights_fpath: Path, device=None):
+    """
+    Loads the model in memory. If this function is not explicitely called, it will be run on the
+    first call to embed_frames() with the default weights file.
+    :param weights_fpath: the path to saved model weights.
+    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+    model will be loaded and will run on this device. Outputs will however always be on the cpu.
+    If None, will default to your GPU if it"s available, otherwise your CPU.
+    """
+    # TODO: I think the slow loading of the encoder might have something to do with the device it
+    #   was saved on. Worth investigating.
+    global _model, _device
+    if device is None:
+        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    elif isinstance(device, str):
+        _device = torch.device(device)
+    _model = SpeakerEncoder(_device, torch.device("cpu"))
+    checkpoint = torch.load(weights_fpath)
+    _model.load_state_dict(checkpoint["model_state"])
+    _model.eval()
+    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
+def is_loaded():
+    return _model is not None
+def embed_frames_batch(frames_batch):
+    """
+    Computes embeddings for a batch of mel spectrogram.
+    :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
+    (batch_size, n_frames, n_channels)
+    :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
+    """
+    if _model is None:
+        raise Exception("Model was not loaded. Call load_model() before inference.")
+    frames = torch.from_numpy(frames_batch).to(_device)
+    embed = _model.forward(frames).detach().cpu().numpy()
+    return embed
+def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+                           min_pad_coverage=0.75, overlap=0.5):
+    """
+    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
+    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+    its spectrogram. This function assumes that the mel spectrogram parameters used are those
+    defined in params_data.py.
+    The returned ranges may be indexing further than the length of the waveform. It is
+    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+    :param n_samples: the number of samples in the waveform
+    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+    utterance
+    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+    utterance, this parameter is ignored so that the function always returns at least 1 slice.
+    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+    utterances are entirely disjoint.
+    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+    respectively the waveform and the mel spectrogram with these slices to obtain the partial
+    utterances.
+    """
+    assert 0 <= overlap < 1
+    assert 0 < min_pad_coverage <= 1
+    # Hop size of the mel spectrogram, in samples
+    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+    # Distance, in frames, between the starts of two consecutive partials (at least 1)
+    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+    # Compute the slices
+    wav_slices, mel_slices = [], []
+    # max(1, ...) guarantees at least one slice, even for very short inputs
+    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+    for i in range(0, steps, frame_step):
+        mel_range = np.array([i, i + partial_utterance_n_frames])
+        wav_range = mel_range * samples_per_frame
+        mel_slices.append(slice(*mel_range))
+        wav_slices.append(slice(*wav_range))
+    # Evaluate whether extra padding is warranted or not
+    last_wav_range = wav_slices[-1]
+    # Fraction of the last slice that is covered by actual samples
+    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+    if coverage < min_pad_coverage and len(mel_slices) > 1:
+        mel_slices = mel_slices[:-1]
+        wav_slices = wav_slices[:-1]
+    return wav_slices, mel_slices
+def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+    """
+    Computes an embedding for a single utterance.
+    # TODO: handle multiple wavs to benefit from batching on GPU
+    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+    :param using_partials: if True, then the utterance is split in partial utterances of
+    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
+    normalized average. If False, the utterance is instead computed from feeding the entire
+    spectogram to the network.
+    :param return_partials: if True, the partial embeddings will also be returned along with the
+    wav slices that correspond to the partial embeddings.
+    :param kwargs: additional arguments to compute_partial_splits()
+    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+    returned. If <using_partials> is simultaneously set to False, both these values will be None
+    instead.
+    """
+    # Process the entire utterance if not using partials
+    if not using_partials:
+        frames = audio.wav_to_mel_spectrogram(wav)
+        embed = embed_frames_batch(frames[None, ...])[0]
+        if return_partials:
+            return embed, None, None
+        return embed
+    # Compute where to split the utterance into partials and pad if necessary
+    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+    max_wave_length = wave_slices[-1].stop
+    if max_wave_length >= len(wav):
+        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+    # Split the utterance into partials
+    frames = audio.wav_to_mel_spectrogram(wav)
+    frames_batch = np.array([frames[s] for s in mel_slices])
+    partial_embeds = embed_frames_batch(frames_batch)
+    # Compute the utterance embedding from the partial embeddings
+    raw_embed = np.mean(partial_embeds, axis=0)
+    embed = raw_embed / np.linalg.norm(raw_embed, 2)
+    if return_partials:
+        return embed, partial_embeds, wave_slices
+    return embed
+def embed_speaker(wavs, **kwargs):
+    raise NotImplemented()
+def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+    if ax is None:
+        ax = plt.gca()
+    if shape is None:
+        height = int(np.sqrt(len(embed)))
+        shape = (height, -1)
+    embed = embed.reshape(shape)
+    cmap = cm.get_cmap()
+    mappable = ax.imshow(embed, cmap=cmap)
+    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+    cbar.set_clim(*color_range)
+    ax.set_xticks([]), ax.set_yticks([])
+    ax.set_title(title)

speaker_encoder/model.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from speaker_encoder.params_model import *
+from speaker_encoder.params_data import *
+from scipy.interpolate import interp1d
+from sklearn.metrics import roc_curve
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
+class SpeakerEncoder(nn.Module):
+    def __init__(self, device, loss_device):
+        super().__init__()
+        self.loss_device = loss_device
+        # Network defition
+        self.lstm = nn.LSTM(input_size=mel_n_channels,     # 40
+                            hidden_size=model_hidden_size, # 256
+                            num_layers=model_num_layers,   # 3
+                            batch_first=True).to(device)
+        self.linear = nn.Linear(in_features=model_hidden_size,
+                                out_features=model_embedding_size).to(device)
+        self.relu = torch.nn.ReLU().to(device)
+        # Cosine similarity scaling (with fixed initial parameter values)
+        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
+        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
+        # Loss
+        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
+    def do_gradient_ops(self):
+        # Gradient scale
+        self.similarity_weight.grad *= 0.01
+        self.similarity_bias.grad *= 0.01
+        # Gradient clipping
+        clip_grad_norm_(self.parameters(), 3, norm_type=2)
+    def forward(self, utterances, hidden_init=None):
+        """
+        Computes the embeddings of a batch of utterance spectrograms.
+        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+        (batch_size, n_frames, n_channels)
+        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+        batch_size, hidden_size). Will default to a tensor of zeros if None.
+        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+        """
+        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+        # and the final cell state.
+        out, (hidden, cell) = self.lstm(utterances, hidden_init)
+        # We take only the hidden state of the last layer
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        # L2-normalize it
+        embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+        return embeds
+    def similarity_matrix(self, embeds):
+        """
+        Computes the similarity matrix according the section 2.1 of GE2E.
+        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+        utterances_per_speaker, embedding_size)
+        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
+        utterances_per_speaker, speakers_per_batch)
+        """
+        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
+        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
+        centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
+        # Exclusive centroids (1 per utterance)
+        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
+        centroids_excl /= (utterances_per_speaker - 1)
+        centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
+        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
+        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
+        # We vectorize the computation for efficiency.
+        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
+                                 speakers_per_batch).to(self.loss_device)
+        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
+        for j in range(speakers_per_batch):
+            mask = np.where(mask_matrix[j])[0]
+            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
+            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
+        ## Even more vectorized version (slower maybe because of transpose)
+        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
+        #                           ).to(self.loss_device)
+        # eye = np.eye(speakers_per_batch, dtype=np.int)
+        # mask = np.where(1 - eye)
+        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
+        # mask = np.where(eye)
+        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
+        # sim_matrix2 = sim_matrix2.transpose(1, 2)
+        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
+        return sim_matrix
+    def loss(self, embeds):
+        """
+        Computes the softmax loss according the section 2.1 of GE2E.
+        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+        utterances_per_speaker, embedding_size)
+        :return: the loss and the EER for this batch of embeddings.
+        """
+        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+        # Loss
+        sim_matrix = self.similarity_matrix(embeds)
+        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
+                                         speakers_per_batch))
+        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
+        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
+        loss = self.loss_fn(sim_matrix, target)
+        # EER (not backpropagated)
+        with torch.no_grad():
+            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
+            labels = np.array([inv_argmax(i) for i in ground_truth])
+            preds = sim_matrix.detach().cpu().numpy()
+            # Snippet from https://yangcha.github.io/EER-ROC/
+            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
+            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
+        return loss, eer

speaker_encoder/params_data.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Data hyperparameters of the speaker encoder. This module is star-imported by the model and
+# the preprocessing code, and every non-dunder name defined here is written to the dataset log
+# and to the visdom parameter panel.
+## Mel-filterbank
+mel_window_length = 25  # In milliseconds
+mel_window_step = 10    # In milliseconds
+mel_n_channels = 40
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+# (utterances whose spectrogram is shorter than this are discarded during preprocessing)
+partials_n_frames = 160     # 1600 ms
+# Number of spectrogram frames at inference
+inference_n_frames = 80     #  800 ms
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30  # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+## Audio volume normalization
+audio_norm_target_dBFS = -30

speaker_encoder/params_model.py ADDED Viewed

	@@ -0,0 +1,11 @@

+## Model parameters
+# Hidden size of the LSTM, which is also the input size of the linear layer
+model_hidden_size = 256
+# Output size of the linear layer, i.e. the size of an embedding
+model_embedding_size = 256
+# Number of stacked LSTM layers
+model_num_layers = 3
+## Training parameters
+# Learning rate given to the Adam optimizer (also re-applied when resuming from a checkpoint)
+learning_rate_init = 1e-4
+# Layout of a training batch: the embeddings of a batch are viewed as
+# (speakers_per_batch, utterances_per_speaker, embedding_size) to compute the loss
+speakers_per_batch = 64
+utterances_per_speaker = 10

speaker_encoder/preprocess.py ADDED Viewed

	@@ -0,0 +1,285 @@

+from multiprocess.pool import ThreadPool
+from speaker_encoder.params_data import *
+from speaker_encoder.config import librispeech_datasets, anglophone_nationalites
+from datetime import datetime
+from speaker_encoder import audio
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+class DatasetLog:
+    """
+    Registers metadata about the dataset in a text file.
+    """
+    def __init__(self, root, name):
+        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+        self.sample_data = dict()
+        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+        self.write_line("Creating dataset %s on %s" % (name, start_time))
+        self.write_line("-----")
+        self._log_params()
+    def _log_params(self):
+        from speaker_encoder import params_data
+        self.write_line("Parameter values:")
+        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+            value = getattr(params_data, param_name)
+            self.write_line("\t%s: %s" % (param_name, value))
+        self.write_line("-----")
+    def write_line(self, line):
+        self.text_file.write("%s\n" % line)
+    def add_sample(self, **kwargs):
+        for param_name, value in kwargs.items():
+            if not param_name in self.sample_data:
+                self.sample_data[param_name] = []
+            self.sample_data[param_name].append(value)
+    def finalize(self):
+        self.write_line("Statistics:")
+        for param_name, values in self.sample_data.items():
+            self.write_line("\t%s:" % param_name)
+            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
+            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
+        self.write_line("-----")
+        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+        self.write_line("Finished on %s" % end_time)
+        self.text_file.close()
+def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
+    dataset_root = datasets_root.joinpath(dataset_name)
+    if not dataset_root.exists():
+        print("Couldn\'t find %s, skipping this dataset." % dataset_root)
+        return None, None
+    return dataset_root, DatasetLog(out_dir, dataset_name)
+def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+                             skip_existing, logger):
+    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+    # Function to preprocess utterances for one speaker
+    def preprocess_speaker(speaker_dir: Path):
+        # Give a name to the speaker that includes its dataset
+        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+        # Create an output directory with that name, as well as a txt file containing a
+        # reference to each source file.
+        speaker_out_dir = out_dir.joinpath(speaker_name)
+        speaker_out_dir.mkdir(exist_ok=True)
+        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+        # There's a possibility that the preprocessing was interrupted earlier, check if
+        # there already is a sources file.
+        if sources_fpath.exists():
+            try:
+                with sources_fpath.open("r") as sources_file:
+                    existing_fnames = {line.split(",")[0] for line in sources_file}
+            except:
+                existing_fnames = {}
+        else:
+            existing_fnames = {}
+        # Gather all audio files for that speaker recursively
+        sources_file = sources_fpath.open("a" if skip_existing else "w")
+        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+            # Check if the target output file already exists
+            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+            out_fname = out_fname.replace(".%s" % extension, ".npy")
+            if skip_existing and out_fname in existing_fnames:
+                continue
+            # Load and preprocess the waveform
+            wav = audio.preprocess_wav(in_fpath)
+            if len(wav) == 0:
+                continue
+            # Create the mel spectrogram, discard those that are too short
+            frames = audio.wav_to_mel_spectrogram(wav)
+            if len(frames) < partials_n_frames:
+                continue
+            out_fpath = speaker_out_dir.joinpath(out_fname)
+            np.save(out_fpath, frames)
+            logger.add_sample(duration=len(wav) / sampling_rate)
+            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+        sources_file.close()
+    # Process the utterances for each speaker
+    with ThreadPool(8) as pool:
+        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+                  unit="speakers"))
+    logger.finalize()
+    print("Done preprocessing %s.\n" % dataset_name)
+# Function to preprocess utterances for one speaker
+def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
+        # Give a name to the speaker that includes its dataset
+        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+        # Create an output directory with that name, as well as a txt file containing a
+        # reference to each source file.
+        speaker_out_dir = out_dir.joinpath(speaker_name)
+        speaker_out_dir.mkdir(exist_ok=True)
+        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+        # There's a possibility that the preprocessing was interrupted earlier, check if
+        # there already is a sources file.
+        # if sources_fpath.exists():
+        #     try:
+        #         with sources_fpath.open("r") as sources_file:
+        #             existing_fnames = {line.split(",")[0] for line in sources_file}
+        #     except:
+        #         existing_fnames = {}
+        # else:
+        #     existing_fnames = {}
+        existing_fnames = {}
+        # Gather all audio files for that speaker recursively
+        sources_file = sources_fpath.open("a" if skip_existing else "w")
+        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+            # Check if the target output file already exists
+            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+            out_fname = out_fname.replace(".%s" % extension, ".npy")
+            if skip_existing and out_fname in existing_fnames:
+                continue
+            # Load and preprocess the waveform
+            wav = audio.preprocess_wav(in_fpath)
+            if len(wav) == 0:
+                continue
+            # Create the mel spectrogram, discard those that are too short
+            frames = audio.wav_to_mel_spectrogram(wav)
+            if len(frames) < partials_n_frames:
+                continue
+            out_fpath = speaker_out_dir.joinpath(out_fname)
+            np.save(out_fpath, frames)
+            # logger.add_sample(duration=len(wav) / sampling_rate)
+            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+        sources_file.close()
+        return len(wav)
+def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+                             skip_existing, logger):
+    # from multiprocessing import Pool, cpu_count
+    from pathos.multiprocessing import ProcessingPool as Pool
+    # Function to preprocess utterances for one speaker
+    def __preprocess_speaker(speaker_dir: Path):
+        # Give a name to the speaker that includes its dataset
+        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+        # Create an output directory with that name, as well as a txt file containing a
+        # reference to each source file.
+        speaker_out_dir = out_dir.joinpath(speaker_name)
+        speaker_out_dir.mkdir(exist_ok=True)
+        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+        existing_fnames = {}
+        # Gather all audio files for that speaker recursively
+        sources_file = sources_fpath.open("a" if skip_existing else "w")
+        wav_lens = []
+        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+            # Check if the target output file already exists
+            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+            out_fname = out_fname.replace(".%s" % extension, ".npy")
+            if skip_existing and out_fname in existing_fnames:
+                continue
+            # Load and preprocess the waveform
+            wav = audio.preprocess_wav(in_fpath)
+            if len(wav) == 0:
+                continue
+            # Create the mel spectrogram, discard those that are too short
+            frames = audio.wav_to_mel_spectrogram(wav)
+            if len(frames) < partials_n_frames:
+                continue
+            out_fpath = speaker_out_dir.joinpath(out_fname)
+            np.save(out_fpath, frames)
+            # logger.add_sample(duration=len(wav) / sampling_rate)
+            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+            wav_lens.append(len(wav))
+        sources_file.close()
+        return wav_lens
+    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+    # Process the utterances for each speaker
+    # with ThreadPool(8) as pool:
+    #     list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+    #               unit="speakers"))
+    pool = Pool(processes=20)
+    for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
+        for wav_len in wav_lens:
+            logger.add_sample(duration=wav_len / sampling_rate)
+        print(f'{i}/{len(speaker_dirs)} \r')
+    logger.finalize()
+    print("Done preprocessing %s.\n" % dataset_name)
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
+    for dataset_name in librispeech_datasets["train"]["other"]:
+        # Initialize the preprocessing
+        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+        if not dataset_root:
+            return
+        # Preprocess all speakers
+        speaker_dirs = list(dataset_root.glob("*"))
+        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
+                                 skip_existing, logger)
+def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
+    # Initialize the preprocessing
+    dataset_name = "VoxCeleb1"
+    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+    if not dataset_root:
+        return
+    # Get the contents of the meta file
+    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
+        metadata = [line.split("\t") for line in metafile][1:]
+    # Select the ID and the nationality, filter out non-anglophone speakers
+    nationalities = {line[0]: line[3] for line in metadata}
+    # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
+    #                     nationality.lower() in anglophone_nationalites]
+    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()]
+    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
+          (len(keep_speaker_ids), len(nationalities)))
+    # Get the speaker directories for anglophone speakers only
+    speaker_dirs = dataset_root.joinpath("wav").glob("*")
+    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
+                    speaker_dir.name in keep_speaker_ids]
+    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
+          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
+    # Preprocess all speakers
+    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
+                             skip_existing, logger)
+def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
+    # Initialize the preprocessing
+    dataset_name = "VoxCeleb2"
+    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+    if not dataset_root:
+        return
+    # Get the speaker directories
+    # Preprocess all speakers
+    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
+    _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
+                             skip_existing, logger)

speaker_encoder/train.py ADDED Viewed

	@@ -0,0 +1,125 @@

+from speaker_encoder.visualizations import Visualizations
+from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from speaker_encoder.params_model import *
+from speaker_encoder.model import SpeakerEncoder
+from utils.profiler import Profiler
+from pathlib import Path
+import torch
+def sync(device: torch.device):
+    # FIXME
+    return
+    # For correct profiling (cuda operations are async)
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
+          backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
+          no_visdom: bool):
+    """
+    Trains the speaker encoder, resuming from an existing checkpoint unless told otherwise.
+    :param run_id: the name of the run. The checkpoint is "<run_id>.pt" and the backups go to
+    the "<run_id>_backups" directory, both inside <models_dir>.
+    :param clean_data_root: the directory given to SpeakerVerificationDataset
+    :param models_dir: the directory holding the checkpoint and the backup directory
+    :param umap_every: draw and save the projections every <umap_every> steps, 0 to disable
+    :param save_every: overwrite the checkpoint every <save_every> steps, 0 to disable
+    :param backup_every: write a separate backup every <backup_every> steps, 0 to disable
+    :param vis_every: number of steps between two updates of the visualizations
+    :param force_restart: if True, any existing checkpoint is ignored
+    :param visdom_server: the address of the visdom server
+    :param no_visdom: if True, visdom is not used
+    """
+    # Create a dataset and a dataloader
+    dataset = SpeakerVerificationDataset(clean_data_root)
+    loader = SpeakerVerificationDataLoader(
+        dataset,
+        speakers_per_batch,       # 64
+        utterances_per_speaker,   # 10
+        num_workers=8,
+    )
+    # Setup the device on which to run the forward pass and the loss. These can be different,
+    # because the forward pass is faster on the GPU whereas the loss is often (depending on your
+    # hyperparameters) faster on the CPU.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # FIXME: currently, the gradient is None if loss_device is cuda
+    loss_device = torch.device("cpu")
+    # Create the model and the optimizer
+    model = SpeakerEncoder(device, loss_device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
+    init_step = 1
+    # Configure file path for the model
+    state_fpath = models_dir.joinpath(run_id + ".pt")
+    backup_dir = models_dir.joinpath(run_id + "_backups")
+    # Load any existing model
+    if not force_restart:
+        if state_fpath.exists():
+            print("Found existing model \"%s\", loading it and resuming training." % run_id)
+            checkpoint = torch.load(state_fpath)
+            init_step = checkpoint["step"]
+            model.load_state_dict(checkpoint["model_state"])
+            optimizer.load_state_dict(checkpoint["optimizer_state"])
+            # The learning rate of the current configuration replaces the one of the checkpoint
+            optimizer.param_groups[0]["lr"] = learning_rate_init
+        else:
+            print("No model \"%s\" found, starting training from scratch." % run_id)
+    else:
+        print("Starting the training from scratch.")
+    model.train()
+    # Initialize the visualization environment
+    vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
+    vis.log_dataset(dataset)
+    vis.log_params()
+    device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
+    vis.log_implementation({"Device": device_name})
+    # Training loop. Steps are numbered from init_step so that a resumed run continues counting.
+    profiler = Profiler(summarize_every=10, disabled=False)
+    for step, speaker_batch in enumerate(loader, init_step):
+        profiler.tick("Blocking, waiting for batch (threaded)")
+        # Forward pass
+        inputs = torch.from_numpy(speaker_batch.data).to(device)
+        sync(device)
+        profiler.tick("Data to %s" % device)
+        embeds = model(inputs)
+        sync(device)
+        profiler.tick("Forward pass")
+        # Group the embeddings by speaker and move them to the device computing the loss
+        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
+        loss, eer = model.loss(embeds_loss)
+        sync(loss_device)
+        profiler.tick("Loss")
+        # Backward pass
+        model.zero_grad()
+        loss.backward()
+        profiler.tick("Backward pass")
+        # Gradient scaling and clipping, to be done between the backward pass and the update
+        model.do_gradient_ops()
+        optimizer.step()
+        profiler.tick("Parameter update")
+        # Update visualizations
+        # learning_rate = optimizer.param_groups[0]["lr"]
+        vis.update(loss.item(), eer, step)
+        # Draw projections and save them to the backup folder
+        if umap_every != 0 and step % umap_every == 0:
+            print("Drawing and saving projections (step %d)" % step)
+            backup_dir.mkdir(exist_ok=True)
+            projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
+            embeds = embeds.detach().cpu().numpy()
+            vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
+            vis.save()
+        # Overwrite the latest version of the model
+        if save_every != 0 and step % save_every == 0:
+            print("Saving the model (step %d)" % step)
+            # The stored step is the one to resume from, hence the + 1
+            torch.save({
+                "step": step + 1,
+                "model_state": model.state_dict(),
+                "optimizer_state": optimizer.state_dict(),
+            }, state_fpath)
+        # Make a backup
+        if backup_every != 0 and step % backup_every == 0:
+            print("Making a backup (step %d)" % step)
+            backup_dir.mkdir(exist_ok=True)
+            backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
+            torch.save({
+                "step": step + 1,
+                "model_state": model.state_dict(),
+                "optimizer_state": optimizer.state_dict(),
+            }, backup_fpath)
+        profiler.tick("Extras (visualizations, saving)")

speaker_encoder/visualizations.py ADDED Viewed

	@@ -0,0 +1,178 @@

+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from datetime import datetime
+from time import perf_counter as timer
+import matplotlib.pyplot as plt
+import numpy as np
+# import webbrowser
+import visdom
+import umap
+colormap = np.array([
+    [76, 255, 0],
+    [0, 127, 70],
+    [255, 0, 0],
+    [255, 217, 38],
+    [0, 135, 255],
+    [165, 0, 165],
+    [255, 167, 255],
+    [0, 255, 255],
+    [255, 96, 38],
+    [142, 76, 0],
+    [33, 0, 127],
+    [0, 0, 0],
+    [183, 183, 183],
+], dtype=np.float) / 255
+class Visualizations:
+    def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
+        # Tracking data
+        self.last_update_timestamp = timer()
+        self.update_every = update_every
+        self.step_times = []
+        self.losses = []
+        self.eers = []
+        print("Updating the visualizations every %d steps." % update_every)
+        # If visdom is disabled TODO: use a better paradigm for that
+        self.disabled = disabled
+        if self.disabled:
+            return
+        # Set the environment name
+        now = str(datetime.now().strftime("%d-%m %Hh%M"))
+        if env_name is None:
+            self.env_name = now
+        else:
+            self.env_name = "%s (%s)" % (env_name, now)
+        # Connect to visdom and open the corresponding window in the browser
+        try:
+            self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+        except ConnectionError:
+            raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+                            "start it.")
+        # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+        # Create the windows
+        self.loss_win = None
+        self.eer_win = None
+        # self.lr_win = None
+        self.implementation_win = None
+        self.projection_win = None
+        self.implementation_string = ""
+    def log_params(self):
+        if self.disabled:
+            return
+        from speaker_encoder import params_data
+        from speaker_encoder import params_model
+        param_string = "<b>Model parameters</b>:<br>"
+        for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+            value = getattr(params_model, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        param_string += "<b>Data parameters</b>:<br>"
+        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+            value = getattr(params_data, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        self.vis.text(param_string, opts={"title": "Parameters"})
+    def log_dataset(self, dataset: SpeakerVerificationDataset):
+        if self.disabled:
+            return
+        dataset_string = ""
+        dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
+        dataset_string += "\n" + dataset.get_logs()
+        dataset_string = dataset_string.replace("\n", "<br>")
+        self.vis.text(dataset_string, opts={"title": "Dataset"})
+    def log_implementation(self, params):
+        if self.disabled:
+            return
+        implementation_string = ""
+        for param, value in params.items():
+            implementation_string += "<b>%s</b>: %s\n" % (param, value)
+            implementation_string = implementation_string.replace("\n", "<br>")
+        self.implementation_string = implementation_string
+        self.implementation_win = self.vis.text(
+            implementation_string,
+            opts={"title": "Training implementation"}
+        )
+    def update(self, loss, eer, step):
+        # Update the tracking data
+        now = timer()
+        self.step_times.append(1000 * (now - self.last_update_timestamp))
+        self.last_update_timestamp = now
+        self.losses.append(loss)
+        self.eers.append(eer)
+        print(".", end="")
+        # Update the plots every <update_every> steps
+        if step % self.update_every != 0:
+            return
+        time_string = "Step time:  mean: %5dms  std: %5dms" % \
+                      (int(np.mean(self.step_times)), int(np.std(self.step_times)))
+        print("\nStep %6d   Loss: %.4f   EER: %.4f   %s" %
+              (step, np.mean(self.losses), np.mean(self.eers), time_string))
+        if not self.disabled:
+            self.loss_win = self.vis.line(
+                [np.mean(self.losses)],
+                [step],
+                win=self.loss_win,
+                update="append" if self.loss_win else None,
+                opts=dict(
+                    legend=["Avg. loss"],
+                    xlabel="Step",
+                    ylabel="Loss",
+                    title="Loss",
+                )
+            )
+            self.eer_win = self.vis.line(
+                [np.mean(self.eers)],
+                [step],
+                win=self.eer_win,
+                update="append" if self.eer_win else None,
+                opts=dict(
+                    legend=["Avg. EER"],
+                    xlabel="Step",
+                    ylabel="EER",
+                    title="Equal error rate"
+                )
+            )
+            if self.implementation_win is not None:
+                self.vis.text(
+                    self.implementation_string + ("<b>%s</b>" % time_string),
+                    win=self.implementation_win,
+                    opts={"title": "Training implementation"},
+                )
+        # Reset the tracking
+        self.losses.clear()
+        self.eers.clear()
+        self.step_times.clear()
+    def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
+                         max_speakers=10):
+        """
+        Draws a 2D UMAP projection of utterance embeddings with one color per speaker. The
+        figure is forwarded to self.vis unless the instance is disabled, and is optionally
+        saved to disk.
+        :param embeds: the utterance embeddings, sliceable along their first axis. Rows are
+        presumably grouped by speaker, <utterances_per_speaker> consecutive rows each (the labels
+        below are built that way) - TODO confirm against the caller.
+        :param utterances_per_speaker: the number of consecutive rows that share a label
+        :param step: the step number, only used in the title of the plot
+        :param out_fpath: if not None, the figure is also saved at this path
+        :param max_speakers: upper bound on the number of speakers drawn. It is further capped to
+        the number of entries in <colormap>.
+        """
+        max_speakers = min(max_speakers, len(colormap))
+        # Only keep whole speakers: a trailing incomplete group would yield more embeddings than
+        # color labels, which makes the scatter plot fail.
+        n_speakers = min(max_speakers, len(embeds) // utterances_per_speaker)
+        embeds = embeds[:n_speakers * utterances_per_speaker]
+        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
+        colors = [colormap[i] for i in ground_truth]
+        reducer = umap.UMAP()
+        projected = reducer.fit_transform(embeds)
+        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
+        plt.gca().set_aspect("equal", "datalim")
+        plt.title("UMAP projection (step %d)" % step)
+        if not self.disabled:
+            self.projection_win = self.vis.matplot(plt, win=self.projection_win)
+        if out_fpath is not None:
+            plt.savefig(out_fpath)
+        # Clear the current figure so that the next call starts from an empty plot
+        plt.clf()
+    def save(self):
+        """
+        Saves the environment <env_name> through self.vis. Does nothing when the instance is
+        disabled.
+        """
+        if self.disabled:
+            return
+        self.vis.save([self.env_name])

speaker_encoder/voice_encoder.py ADDED Viewed

	@@ -0,0 +1,173 @@

+from speaker_encoder.hparams import *
+from speaker_encoder import audio
+from pathlib import Path
+from typing import Union, List
+from torch import nn
+from time import perf_counter as timer
+import numpy as np
+import torch
+class SpeakerEncoder(nn.Module):
+    """
+    Speaker encoder made of a stack of LSTM layers followed by a linear projection and a ReLU.
+    It maps mel spectrograms to L2-normed embeddings and provides helpers to embed a whole
+    utterance or a collection of utterances.
+    """
+    def __init__(self, weights_fpath, device: Union[str, torch.device, None]=None, verbose=True):
+        """
+        :param weights_fpath: location of the pretrained checkpoint. It is passed as is to
+        torch.load() and must hold the weights under the key "model_state".
+        :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
+        If None, defaults to cuda if it is available on your machine, otherwise the model will
+        run on cpu. The embed_*() methods always return their outputs on the cpu, as numpy arrays.
+        :param verbose: if True, prints the device and the time it took to load the model
+        """
+        super().__init__()
+        # Define the network
+        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+        self.relu = nn.ReLU()
+        # Get the target device
+        if device is None:
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        self.device = device
+        # Load the pretrained model's weights.
+        # NOTE(review): torch.load() unpickles the file, only load checkpoints from a trusted
+        # source. strict=False also means that missing or unexpected keys are silently ignored.
+        start = timer()
+        checkpoint = torch.load(weights_fpath, map_location="cpu")
+        self.load_state_dict(checkpoint["model_state"], strict=False)
+        self.to(device)
+        if verbose:
+            print("Loaded the voice encoder model on %s in %.2f seconds." %
+                  (device.type, timer() - start))
+    def forward(self, mels: torch.FloatTensor):
+        """
+        Computes the embeddings of a batch of utterance spectrograms.
+        :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
+        (batch_size, n_frames, n_channels)
+        :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size).
+        Embeddings are positive and L2-normed, thus they lay in the range [0, 1].
+        """
+        # Pass the input through the LSTM layers and retrieve the final hidden state of the last
+        # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
+        _, (hidden, _) = self.lstm(mels)
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+    @staticmethod
+    def compute_partial_slices(n_samples: int, rate, min_coverage):
+        """
+        Computes where to split an utterance waveform and its corresponding mel spectrogram to
+        obtain partial utterances of <partials_n_frames> each. Both the waveform and the
+        mel spectrogram slices are returned, so as to make each partial utterance waveform
+        correspond to its spectrogram.
+        The returned ranges may be indexing further than the length of the waveform. It is
+        recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
+        :param n_samples: the number of samples in the waveform
+        :param rate: how many partial utterances should occur per second. Partial utterances must
+        cover the span of the entire utterance, thus the rate should not be lower than the inverse
+        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+        the minimum rate is thus 0.625.
+        :param min_coverage: when reaching the last partial utterance, it may or may not have
+        enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+        it will be discarded. If there aren't enough frames for one partial utterance,
+        this parameter is ignored so that the function always returns at least one slice.
+        :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+        respectively the waveform and the mel spectrogram with these slices to obtain the partial
+        utterances.
+        """
+        assert 0 < min_coverage <= 1
+        # Compute how many frames separate two partial utterances
+        samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+        n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+        frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+        assert 0 < frame_step, "The rate is too high"
+        assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+            (sampling_rate / (samples_per_frame * partials_n_frames))
+        # Compute the slices
+        wav_slices, mel_slices = [], []
+        steps = max(1, n_frames - partials_n_frames + frame_step + 1)
+        for i in range(0, steps, frame_step):
+            mel_range = np.array([i, i + partials_n_frames])
+            wav_range = mel_range * samples_per_frame
+            mel_slices.append(slice(*mel_range))
+            wav_slices.append(slice(*wav_range))
+        # Evaluate whether extra padding is warranted or not
+        last_wav_range = wav_slices[-1]
+        coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+        if coverage < min_coverage and len(mel_slices) > 1:
+            mel_slices = mel_slices[:-1]
+            wav_slices = wav_slices[:-1]
+        return wav_slices, mel_slices
+    def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
+        """
+        Computes an embedding for a single utterance. The utterance is divided in partial
+        utterances and an embedding is computed for each. The complete utterance embedding is the
+        L2-normed average embedding of the partial utterances.
+        TODO: independent batched version of this function
+        :param wav: a preprocessed utterance waveform as a numpy array of float32
+        :param return_partials: if True, the partial embeddings will also be returned along with
+        the wav slices corresponding to each partial utterance.
+        :param rate: how many partial utterances should occur per second. Partial utterances must
+        cover the span of the entire utterance, thus the rate should not be lower than the inverse
+        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+        the minimum rate is thus 0.625.
+        :param min_coverage: when reaching the last partial utterance, it may or may not have
+        enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+        it will be discarded. If there aren't enough frames for one partial utterance,
+        this parameter is ignored so that the function always returns at least one slice.
+        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+        <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+        (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+        returned.
+        """
+        # Compute where to split the utterance into partials and pad the waveform with zeros if
+        # the partial utterances cover a larger range.
+        wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
+        max_wave_length = wav_slices[-1].stop
+        # Padding is only needed when the slices reach beyond the end of the waveform
+        if max_wave_length > len(wav):
+            wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+        # Split the utterance into partials and forward them through the model
+        mel = audio.wav_to_mel_spectrogram(wav)
+        mels = np.array([mel[s] for s in mel_slices])
+        with torch.no_grad():
+            mels = torch.from_numpy(mels).to(self.device)
+            partial_embeds = self(mels).cpu().numpy()
+        # Compute the utterance embedding from the partial embeddings
+        raw_embed = np.mean(partial_embeds, axis=0)
+        embed = raw_embed / np.linalg.norm(raw_embed, 2)
+        if return_partials:
+            return embed, partial_embeds, wav_slices
+        return embed
+    def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
+        """
+        Compute the embedding of a collection of wavs (presumably from the same speaker) by
+        averaging their embedding and L2-normalizing it.
+        :param wavs: list of wavs as numpy arrays of float32.
+        :param kwargs: extra arguments to embed_utterance()
+        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
+        :raises ValueError: if <wavs> is empty
+        """
+        embeds = [self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]
+        # Averaging nothing would only produce NaNs and an obscure error from the norm below
+        if not embeds:
+            raise ValueError("Cannot compute a speaker embedding from an empty collection of wavs.")
+        raw_embed = np.mean(embeds, axis=0)
+        return raw_embed / np.linalg.norm(raw_embed, 2)

src/audio2exp_models/audio2exp.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from tqdm import tqdm
+import torch
+from torch import nn
+class Audio2Exp(nn.Module):
+    """
+    Thin wrapper around a generator network <netG> that predicts expression coefficients from
+    audio features, processing a clip in consecutive windows of frames.
+    """
+    def __init__(self, netG, cfg, device, prepare_training_loss=False):
+        """
+        :param netG: the generator network; it is moved to <device>
+        :param cfg: configuration object, stored as is
+        :param device: the device <netG> is moved to
+        :param prepare_training_loss: not used by this class
+        """
+        super().__init__()
+        self.cfg = cfg
+        self.device = device
+        self.netG = netG.to(device)
+    def test(self, batch):
+        """
+        Runs <netG> over the clip window by window and concatenates the predictions along the
+        frame axis.
+        :param batch: a dict holding 'indiv_mels' (bs T 1 80 16 according to the original
+        annotations), 'ref' (only its first 64 entries along the last axis are used) and
+        'ratio_gt' (bs T) - shapes are not checked here.
+        :return: a dict with the single key 'exp_coeff_pred'
+        """
+        mels = batch['indiv_mels']                              # bs T 1 80 16
+        n_frames = mels.shape[1]
+        window = 10                                             # frames processed per call
+        predictions = []
+        for start in tqdm(range(0, n_frames, window), 'audio2exp:'):
+            stop = start + window
+            mel_chunk = mels[:, start:stop]
+            ref_chunk = batch['ref'][:, :, :64][:, start:stop]
+            ratio_chunk = batch['ratio_gt'][:, start:stop]      # bs T
+            # Fold the frame axis into the batch axis for the generator
+            audiox = mel_chunk.view(-1, 1, 80, 16)              # bs*T 1 80 16
+            predictions.append(self.netG(audiox, ref_chunk, ratio_chunk))   # bs T 64
+        # BS x T x 64
+        return {'exp_coeff_pred': torch.cat(predictions, dim=1)}

src/audio2exp_models/networks.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+import torch.nn.functional as F
+from torch import nn
+class Conv2d(nn.Module):
+    """
+    A 2D convolution followed by batch normalization, with an optional residual connection and
+    an optional ReLU activation.
+    """
+    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act = True, *args, **kwargs):
+        """
+        :param cin: number of input channels
+        :param cout: number of output channels
+        :param kernel_size: kernel size of the convolution
+        :param stride: stride of the convolution
+        :param padding: padding of the convolution
+        :param residual: if True, the input is added to the output of the batch normalization.
+        The input and output shapes must then match.
+        :param use_act: if True, a ReLU is applied to the result
+        :param args: extra positional arguments for nn.Module
+        :param kwargs: extra keyword arguments for nn.Module
+        """
+        super().__init__(*args, **kwargs)
+        self.residual = residual
+        self.use_act = use_act
+        layers = [nn.Conv2d(cin, cout, kernel_size, stride, padding), nn.BatchNorm2d(cout)]
+        self.conv_block = nn.Sequential(*layers)
+        self.act = nn.ReLU()
+    def forward(self, x):
+        """
+        :param x: the input tensor given to the convolution
+        :return: the output of the block, after the optional residual sum and activation
+        """
+        out = self.conv_block(x)
+        if self.residual:
+            out += x
+        return self.act(out) if self.use_act else out
+class SimpleWrapperV2(nn.Module):
+    """
+    Convolutional audio encoder followed by a single linear layer. The linear layer takes the
+    flattened audio features concatenated with a reference vector and a ratio, and outputs 64
+    values per item.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.audio_encoder = nn.Sequential(
+            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
+            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
+            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
+            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),
+            )
+        # No pretrained weights are loaded here: the encoder keeps its default initialization
+        # until a checkpoint is loaded from outside of this class.
+        # 512 audio features + 64 reference values + 1 ratio per item
+        self.mapping1 = nn.Linear(512+64+1, 64)
+        # Only the bias is zeroed, the weight keeps its default initialization
+        nn.init.constant_(self.mapping1.bias, 0.)
+    def forward(self, x, ref, ratio):
+        """
+        :param x: the audio input of the encoder. Its first axis holds the items, presumably
+        bs*T windows of shape (1, 80, 16) - TODO confirm against the caller.
+        :param ref: reference tensor with at least two axes. It is flattened to one row per item
+        and its first two axes give the shape of the output.
+        :param ratio: tensor flattened to one row per item
+        :return: a tensor of shape (ref.shape[0], ref.shape[1], -1)
+        """
+        n_items = x.size(0)
+        audio_feat = self.audio_encoder(x).view(n_items, -1)
+        ref_flat = ref.reshape(n_items, -1)
+        ratio_flat = ratio.reshape(n_items, -1)
+        y = self.mapping1(torch.cat([audio_feat, ref_flat, ratio_flat], dim=1))
+        # The residual sum with <ref> is deliberately left out
+        return y.reshape(ref.shape[0], ref.shape[1], -1)

src/audio2pose_models/audio2pose.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import torch
+from torch import nn
+from src.audio2pose_models.cvae import CVAE
+from src.audio2pose_models.discriminator import PoseSequenceDiscriminator
+from src.audio2pose_models.audio_encoder import AudioEncoder
+class Audio2Pose(nn.Module):
+    def __init__(self, cfg, wav2lip_checkpoint, device='cuda'):
+        """
+        :param cfg: configuration object. MODEL.CVAE.SEQ_LEN and MODEL.CVAE.LATENT_SIZE are read
+        here, and the whole object is given to the CVAE and to the discriminator.
+        :param wav2lip_checkpoint: given as is to AudioEncoder
+        :param device: given as is to AudioEncoder and stored on the instance
+        """
+        super().__init__()
+        cvae_cfg = cfg.MODEL.CVAE
+        self.cfg = cfg
+        self.seq_len = cvae_cfg.SEQ_LEN
+        self.latent_dim = cvae_cfg.LATENT_SIZE
+        self.device = device
+        # The audio encoder is frozen: evaluation mode and no gradients for its parameters
+        self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device)
+        self.audio_encoder.eval()
+        for weight in self.audio_encoder.parameters():
+            weight.requires_grad_(False)
+        self.netG = CVAE(cfg)
+        self.netD_motion = PoseSequenceDiscriminator(cfg)
+    def forward(self, x):
+        """
+        Training pass: predicts the pose motion of a sequence relative to its first frame.
+        Inputs are moved to self.device rather than to the current cuda device, so that the
+        model also runs on cpu or on the specific gpu it was created for.
+        :param x: a dict with the keys 'gt', 'class' and 'indiv_mels'. Each value has a leading
+        axis of size 1 that is squeezed out. Entries 64:70 of the last axis of 'gt' are used as
+        the pose.
+        :return: the batch dict returned by netG, completed with 'pose_pred' and 'pose_gt'
+        """
+        batch = {}
+        coeff_gt = x['gt'].to(self.device).squeeze(0)           #bs frame_len+1 73
+        first_pose = coeff_gt[:, :1, 64:70]                     #bs 1 6
+        # The motion is expressed relatively to the first frame
+        batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - first_pose  #bs frame_len 6
+        batch['ref'] = coeff_gt[:, 0, 64:70]  #bs  6
+        batch['class'] = x['class'].squeeze(0).to(self.device) # bs
+        indiv_mels = x['indiv_mels'].to(self.device).squeeze(0) # bs seq_len+1 80 16
+        # forward, the first frame is skipped as it is the reference
+        audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2)) #bs seq_len 512
+        batch['audio_emb'] = audio_emb
+        batch = self.netG(batch)
+        pose_motion_pred = batch['pose_motion_pred']           # bs frame_len 6
+        pose_gt = coeff_gt[:, 1:, 64:70].clone()               # bs frame_len 6
+        pose_pred = first_pose + pose_motion_pred              # bs frame_len 6
+        batch['pose_pred'] = pose_pred
+        batch['pose_gt'] = pose_gt
+        return batch
+    def test(self, x):
+        """
+        Inference pass: generates the pose motion of a whole clip, one chunk of <seq_len> frames
+        at a time, with a fresh random latent for every chunk.
+        :param x: a dict with the keys 'ref' (bs 1 70 according to the original annotations, its
+        last 6 entries are used as the pose), 'class', 'indiv_mels' (bs T 1 80 16) and
+        'num_frames'. The first frame is regarded as the reference and is not predicted.
+        :return: the batch dict, with 'pose_motion_pred' holding the motion of every frame (zeros
+        for the first one) and 'pose_pred' the motion added to the reference pose
+        """
+        ref = x['ref']                            #bs 1 70
+        batch = {'ref': ref[:, 0, -6:], 'class': x['class']}
+        bs = ref.shape[0]
+        mels = x['indiv_mels'][:, 1:]             # bs T-1 1 80 16, the ref is the first frame
+        n_frames = int(x['num_frames']) - 1
+        n_chunks, n_left = divmod(n_frames, self.seq_len)
+        # The reference frame has no motion
+        first = batch['ref'].unsqueeze(1)
+        motions = [torch.zeros(first.shape, dtype=batch['ref'].dtype, device=batch['ref'].device)]
+        for k in range(n_chunks):
+            batch['z'] = torch.randn(bs, self.latent_dim).to(ref.device)
+            lo = k * self.seq_len
+            hi = lo + self.seq_len
+            batch['audio_emb'] = self.audio_encoder(mels[:, lo:hi, :, :, :])   #bs seq_len 512
+            batch = self.netG.test(batch)
+            motions.append(batch['pose_motion_pred'])  #list of bs seq_len 6
+        if n_left != 0:
+            # Run the last <seq_len> frames again and only keep the frames not predicted yet
+            batch['z'] = torch.randn(bs, self.latent_dim).to(ref.device)
+            audio_emb = self.audio_encoder(mels[:, -1*self.seq_len:, :, :, :]) #bs seq_len  512
+            missing = self.seq_len - audio_emb.shape[1]
+            if missing != 0:
+                # Clips shorter than a chunk are padded in front by repeating their first frame
+                front = audio_emb[:, :1].repeat(1, missing, 1)
+                audio_emb = torch.cat([front, audio_emb], 1)
+            batch['audio_emb'] = audio_emb
+            batch = self.netG.test(batch)
+            motions.append(batch['pose_motion_pred'][:, -1*n_left:, :])
+        batch['pose_motion_pred'] = torch.cat(motions, dim = 1)
+        batch['pose_pred'] = ref[:, :1, -6:] + batch['pose_motion_pred']  # bs T 6
+        return batch