.gitignore ADDED
@@ -0,0 +1,2 @@
+ myenv
+ venv
api.py ADDED
@@ -0,0 +1,76 @@
+ import torch
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import numpy as np
+ import base64
+ import io
+ from scipy.io.wavfile import write
+ import sounddevice as sd
+
+ # Project-local modules
+ import commons
+ import utils
+ from models import SynthesizerTrn
+ from text.symbols import symbols
+ from text import text_to_sequence
+
+ # Print the PyTorch version
+ print(torch.__version__)
+
+ # Check whether CUDA is available
+ print(torch.cuda.is_available())
+
+ # Print the CUDA version PyTorch was built with
+ print(torch.version.cuda)
+
+ # FastAPI application
+ app = FastAPI()
+
+ # Request body model
+ class TextRequest(BaseModel):
+     text: str
+
+ # Load configuration and model
+ config_path = "configs/steins_gate_base.json"
+ checkpoint_path = "G_265000.pth"
+ hps = utils.get_hparams_from_file(config_path)
+ net_g = SynthesizerTrn(
+     len(symbols),
+     hps.data.filter_length // 2 + 1,
+     hps.train.segment_size // hps.data.hop_length,
+     **hps.model,
+ ).eval()
+ utils.load_checkpoint(checkpoint_path, net_g, None)
+
+ # Text-to-speech synthesis
+ def text_to_speech(content):
+     stn_tst = text_to_sequence(content, hps.data.text_cleaners)
+     if hps.data.add_blank:
+         stn_tst = commons.intersperse(stn_tst, 0)
+     stn_tst = torch.LongTensor(stn_tst)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+         audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)[0][0, 0].data.float().numpy()
+
+     return hps.data.sampling_rate, audio
+
+ # API route: text to speech
+ @app.post("/synthesize")
+ def synthesize(request: TextRequest):
+     # text_to_speech returns the sampling rate and the raw audio samples
+     sampling_rate, audio = text_to_speech(request.text)
+
+     # Write the audio to an in-memory WAV file
+     wav_bytes = io.BytesIO()
+     write(wav_bytes, sampling_rate, (audio * 32767).astype(np.int16))
+     wav_bytes.seek(0)  # rewind to the start of the buffer
+
+     # Encode the WAV file as Base64
+     audio_base64 = base64.b64encode(wav_bytes.read()).decode("utf-8")
+     return {"audio": audio_base64}
+
+ # Entry point
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="127.0.0.1", port=8000)
api_example.py ADDED
@@ -0,0 +1,47 @@
+ import requests
+ import json
+ import base64
+ import io
+ import sounddevice as sd
+ import soundfile as sf
+
+ # API endpoint
+ url = "http://127.0.0.1:8000/synthesize"
+
+ # Request body
+ payload = {"text": "Hello, world!"}
+
+ # Request headers
+ headers = {"Content-Type": "application/json"}
+
+ # Send the POST request
+ try:
+     print("Sending request to the API...")
+     response = requests.post(url, data=json.dumps(payload), headers=headers)
+     response.raise_for_status()  # raise if the request failed
+     result = response.json()
+     print("Response received.")
+
+     # Check the returned audio data
+     if "audio" in result:
+         print("Audio data received as Base64.")
+
+         # Decode the Base64 audio data
+         audio_base64 = result["audio"]
+         audio_bytes = base64.b64decode(audio_base64)
+
+         # Read the audio data directly from memory with soundfile
+         try:
+             audio_data, samplerate = sf.read(io.BytesIO(audio_bytes))
+             print(f"Playing audio (sample rate: {samplerate} Hz)...")
+             sd.play(audio_data, samplerate)
+             sd.wait()  # wait for playback to finish
+             print("Audio playback finished.")
+         except sf.LibsndfileError as e:
+             print(f"Error reading audio data: {e}")
+     else:
+         print("Unexpected response format. No audio data found.")
+ except requests.exceptions.RequestException as e:
+     print(f"Request failed: {e}")
+ except Exception as e:
+     print(f"An error occurred: {e}")
app.py CHANGED
@@ -1,25 +1,11 @@
-
-
-
-
- import os
- os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
- import gradio as gr
-
- import matplotlib.pyplot as plt
- import IPython.display as ipd
-
  import os
- import json
- import math
- import torch
- from torch import nn
- from torch.nn import functional as F
- from torch.utils.data import DataLoader
+ import subprocess
  import logging
+ import torch
+ import gradio as gr
+ from scipy.io.wavfile import write

- numba_logger = logging.getLogger('numba')
- numba_logger.setLevel(logging.WARNING)
+ # Project-local modules
  import commons
  import utils
  from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
@@ -27,44 +13,111 @@ from models import SynthesizerTrn
  from text.symbols import symbols
  from text import text_to_sequence

- from scipy.io.wavfile import write
-
-
- def get_text(text, hps):
-     text_norm = text_to_sequence(text, hps.data.text_cleaners)
-     if hps.data.add_blank:
-         text_norm = commons.intersperse(text_norm, 0)
-     text_norm = torch.LongTensor(text_norm)
-     return text_norm
-
- hps = utils.get_hparams_from_file("configs/steins_gate_base.json")
-
- net_g = SynthesizerTrn(
-     len(symbols),
-     hps.data.filter_length // 2 + 1,
-     hps.train.segment_size // hps.data.hop_length,
-     **hps.model)
- _ = net_g.eval()
-
- _ = utils.load_checkpoint("G_265000.pth", net_g, None)
-
- def syn(content):
-     stn_tst = get_text(content, hps)
-     with torch.no_grad():
-         x_tst = stn_tst.unsqueeze(0)
-         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-         audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
-     return (hps.data.sampling_rate,audio)
-     #ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))
-
-
- app = gr.Blocks()
- with app:
-     with gr.Tabs():
-         with gr.TabItem("Basic"):
-             input1 = gr.Textbox()
-             submit = gr.Button("Convert", variant="primary")
-             output1 = gr.Audio(label="Output Audio")
-             submit.click(syn,input1,output1)
-
- app.launch()
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Build the monotonic_align extension module
+ def compile_monotonic_align():
+     try:
+         subprocess.run(
+             ["python", "setup.py", "build_ext", "--inplace"], cwd="monotonic_align", check=True
+         )
+         logger.info("Successfully compiled monotonic_align.")
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Failed to compile monotonic_align: {e}")
+         raise RuntimeError("Compilation of monotonic_align failed.")
+
+ # Load configuration and model
+ def load_config_and_model(config_path, checkpoint_path):
+     if not os.path.exists(config_path):
+         raise FileNotFoundError(f"Config file not found: {config_path}")
+     if not os.path.exists(checkpoint_path):
+         raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")
+
+     # Load hyperparameters
+     hps = utils.get_hparams_from_file(config_path)
+     logger.info("Loaded hyperparameters from config file.")
+
+     # Initialize the model
+     net_g = SynthesizerTrn(
+         len(symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         **hps.model,
+     )
+     net_g.eval()
+     logger.info("Initialized SynthesizerTrn model.")
+
+     # Load the pretrained weights
+     utils.load_checkpoint(checkpoint_path, net_g, None)
+     logger.info(f"Loaded model checkpoint from {checkpoint_path}.")
+
+     return hps, net_g
+
+ # Text-to-speech synthesis
+ def text_to_speech(content, hps, net_g):
+     if not content or not isinstance(content, str):
+         raise ValueError("Input text is empty or invalid.")
+
+     try:
+         # Convert the text to a symbol sequence
+         stn_tst = text_to_sequence(content, hps.data.text_cleaners)
+         if hps.data.add_blank:
+             stn_tst = commons.intersperse(stn_tst, 0)
+         stn_tst = torch.LongTensor(stn_tst)
+
+         # Model inference
+         with torch.no_grad():
+             x_tst = stn_tst.unsqueeze(0)
+             x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+             audio = net_g.infer(
+                 x_tst, x_tst_lengths, noise_scale=0.667, noise_scale_w=0.8, length_scale=1
+             )[0][0, 0].data.float().numpy()
+
+         return hps.data.sampling_rate, audio
+     except Exception as e:
+         logger.error(f"Error during text-to-speech synthesis: {e}")
+         raise RuntimeError("Failed to generate audio.")
+
+ # Gradio interface
+ def create_gradio_interface(hps, net_g):
+     def safe_syn(content):
+         try:
+             return text_to_speech(content, hps, net_g)
+         except Exception as e:
+             logger.error(f"Error in Gradio interface: {e}")
+             return None
+
+     app = gr.Blocks()
+     with app:
+         with gr.Tabs():
+             with gr.TabItem("Basic"):
+                 input1 = gr.Textbox(label="Input Text", placeholder="Enter text here...")
+                 submit = gr.Button("Convert", variant="primary")
+                 output1 = gr.Audio(label="Output Audio")
+                 submit.click(safe_syn, input1, output1)
+
+     return app
+
+ # Entry point
+ def main():
+     try:
+         # Build monotonic_align
+         compile_monotonic_align()
+
+         # Load configuration and model
+         config_path = "configs/steins_gate_base.json"
+         checkpoint_path = "G_265000.pth"
+         hps, net_g = load_config_and_model(config_path, checkpoint_path)
+
+         # Create the Gradio interface
+         app = create_gradio_interface(hps, net_g)
+         logger.info("Starting Gradio interface...")
+         app.launch()
+     except Exception as e:
+         logger.critical(f"Fatal error: {e}")
+         exit(1)
+
+ if __name__ == "__main__":
+     main()
monotonic_align/.gitignore ADDED
@@ -0,0 +1,161 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+ .vscode/
monotonic_align/LICENSE.md ADDED
@@ -0,0 +1,22 @@
+ MIT License
+
+ Copyright (c) 2023 Moonsik Park
+ Copyright (c) 2021 Jaehyeon Kim
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
monotonic_align/PKG-INFO ADDED
@@ -0,0 +1,32 @@
+ Metadata-Version: 2.1
+ Name: monotonic_align
+ Version: 1.0.0
+ Summary: Monotonic Alignment Search module
+ Author-email: Moonsik Park <[email protected]>
+ Project-URL: repository, https://github.com/moonsikpark/monotonic_align
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Cython
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.md
+
+ # Monotonic Alignment Search
+
+ This module performs the Monotonic Alignment Search required by the [Vits](https://github.com/jaywalnut310/vits) TTS model and has been copied from [this source](https://github.com/jaywalnut310/vits/tree/2b91ceff252082644bd507d13476a49ea260cadf/monotonic_align).
+
+ The module has been packaged per [PEP 621](https://peps.python.org/pep-0621/) and ships a binary distribution compiled for nearly every UNIX environment.
+
+ ## License
+
+ Please note that the author(s) are not liable for any claim, damages or other liability thereof.
+
+ ```
+ MIT License
+
+ Copyright (c) 2023 Moonsik Park
+ Copyright (c) 2021 Jaehyeon Kim
+ ```
monotonic_align/README.md ADDED
@@ -0,0 +1,16 @@
+ # Monotonic Alignment Search
+
+ This module performs the Monotonic Alignment Search required by the [Vits](https://github.com/jaywalnut310/vits) TTS model and has been copied from [this source](https://github.com/jaywalnut310/vits/tree/2b91ceff252082644bd507d13476a49ea260cadf/monotonic_align).
+
+ The module has been packaged per [PEP 621](https://peps.python.org/pep-0621/) and ships a binary distribution compiled for nearly every UNIX environment.
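+
+ ## Usage
+
+ A minimal sketch of calling the module, assuming the packaged build exposes the same `maximum_path` helper as the original VITS code (the tensor names and shapes below are illustrative, not part of this package's documented API):
+
+ ```python
+ import torch
+ import monotonic_align
+
+ # neg_cent: per-pair alignment scores [batch, text_len, mel_len]
+ neg_cent = torch.randn(1, 12, 40)
+ # mask: valid positions, same shape, values in {0, 1}
+ mask = torch.ones(1, 12, 40)
+
+ # maximum_path returns the hard monotonic alignment as a 0/1 path tensor
+ path = monotonic_align.maximum_path(neg_cent, mask)
+ print(path.shape)  # torch.Size([1, 12, 40])
+ ```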
+
+ ## License
+
+ Please note that the author(s) are not liable for any claim, damages or other liability thereof.
+
+ ```
+ MIT License
+
+ Copyright (c) 2023 Moonsik Park
+ Copyright (c) 2021 Jaehyeon Kim
+ ```
monotonic_align/__init__.py DELETED
@@ -1,19 +0,0 @@
- import numpy as np
- import torch
- from .monotonic_align.core import maximum_path_c
-
-
- def maximum_path(neg_cent, mask):
-     """ Cython optimized version.
-     neg_cent: [b, t_t, t_s]
-     mask: [b, t_t, t_s]
-     """
-     device = neg_cent.device
-     dtype = neg_cent.dtype
-     neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
-     path = np.zeros(neg_cent.shape, dtype=np.int32)
-
-     t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
-     t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
-     maximum_path_c(path, neg_cent, t_t_max, t_s_max)
-     return torch.from_numpy(path).to(device=device, dtype=dtype)
monotonic_align/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (804 Bytes)
 
monotonic_align/build/temp.win-amd64-3.9/Release/core.cp39-win_amd64.exp DELETED
Binary file (746 Bytes)
 
monotonic_align/build/temp.win-amd64-3.9/Release/core.cp39-win_amd64.lib DELETED
Binary file (1.94 kB)
 
monotonic_align/build/temp.win-amd64-3.9/Release/core.obj DELETED
Binary file (720 kB)
 
monotonic_align/monotonic_align/core.cp39-win_amd64.pyd DELETED
Binary file (151 kB)
 
monotonic_align/pyproject.toml ADDED
@@ -0,0 +1,33 @@
+ [build-system]
+ requires = ["setuptools==67.4.0", "setuptools-scm[toml]==7.1.0", "Cython==0.29.33"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "monotonic_align"
+ authors = [
+     {name = "Moonsik Park", email = "[email protected]"},
+ ]
+ description = "Monotonic Alignment Search module"
+ readme = "README.md"
+ requires-python = ">=3.7"
+ classifiers = [
+     "Development Status :: 5 - Production/Stable",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Cython",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+ dynamic = ["version"]
+
+ [project.urls]
+ repository = "https://github.com/moonsikpark/monotonic_align"
+
+ [tool.setuptools]
+ include-package-data = false
+
+ [tool.setuptools_scm]
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
monotonic_align/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
monotonic_align/setup.py CHANGED
@@ -1,9 +1,10 @@
- from distutils.core import setup
- from Cython.Build import cythonize
- import numpy
+ from setuptools import Extension, setup

  setup(
-     name = 'monotonic_align',
-     ext_modules = cythonize("core.pyx"),
-     include_dirs=[numpy.get_include()]
+     ext_modules=[
+         Extension(
+             name="monotonic_align.core",
+             sources=["src/core.pyx"],
+         ),
+     ]
  )
monotonic_align/{core.c → src/core.c} RENAMED
The diff for this file is too large to render. See raw diff
 
monotonic_align/{core.pyx → src/core.pyx} RENAMED
@@ -1,3 +1,4 @@
+ #cython: language_level=3
  cimport cython
  from cython.parallel import prange

requirements.txt CHANGED
@@ -1,16 +1,19 @@
- Cython==0.29.21
- librosa==0.8.0
- matplotlib==3.3.1
- numpy==1.21.6
- phonemizer==2.2.1
- scipy==1.5.2
- tensorboard==2.3.0
+ Cython
+ librosa
+ matplotlib
+ numpy
+ phonemizer
+ scipy
+ tensorboard
  torch
  torchvision
- Unidecode==1.1.1
- jamo==0.4.1
- pypinyin==0.44.0
- jieba==0.42.1
- cn2an==0.5.17
+ Unidecode
+ jamo
+ pypinyin
+ jieba
+ cn2an
  IPython
- pyopenjtalk==0.2.0
+ pyopenjtalk
+
+ Pillow
+ gradio
utils.py CHANGED
@@ -18,6 +18,8 @@ logger = logging
  def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+   #checkpoint_dict = torch.load(checkpoint_path, map_location='cuda:0', weights_only=True)
+
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
@@ -85,7 +87,7 @@ def plot_spectrogram_to_numpy(spectrogram):
    mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np
-
+
    fig, ax = plt.subplots(figsize=(10,2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
@@ -147,7 +149,7 @@ def get_hparams(init=True):
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model name')
-
+
    args = parser.parse_args()
    model_dir = os.path.join("../drive/MyDrive", args.model)

@@ -165,7 +167,7 @@
    with open(config_save_path, "r") as f:
      data = f.read()
    config = json.loads(data)
-
+
    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
@@ -215,7 +217,7 @@ def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)
-
+
    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
      os.makedirs(model_dir)
@@ -232,7 +234,7 @@ class HParams():
        if type(v) == dict:
          v = HParams(**v)
        self[k] = v
-
+
    def keys(self):
      return self.__dict__.keys()
