Spaces: Runtime error

Commit 5034c86 · esnya committed
1 Parent(s): 0016e60
:tada: feat!: First commit
Files changed:
- Pipfile +22 -0
- Pipfile.lock +0 -0
- app.py +102 -0
- flagged/log.csv +5 -0
- flagged/output 0/4ed6077f815534fa386a54ff81e6b16cd4e341d7/audio.wav +0 -0
- flagged/output 0/a8f2aef3e7de612867c3cfc431eb23507893b8fb/audio.wav +0 -0
- flagged/output 0/e83a2fdf4d1eea785fad09023d6fab00343e3329/audio.wav +0 -0
- flagged/output 0/f3b115fb191a3158b81ce9c14ea8be68d6ccbb13/audio.wav +0 -0
- flagged/speaker_embedding random generated/tmpe5hqy2sj.json +1 -0
- flagged/speaker_embedding random generated/tmpi0ftk1a4.json +1 -0
- flagged/speaker_embedding random generated/tmpi_0pz0y_.json +1 -0
- flagged/speaker_embedding random generated/tmpvxdd17vj.json +1 -0
- requirements.txt +1 -0
- speecht5_openjtalk_tokenizer.py +129 -0
Pipfile
ADDED
@@ -0,0 +1,22 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[[source]]
url = "https://download.pytorch.org/whl/cu118"
verify_ssl = true
name = "downloadpytorch"

[packages]
torch = {version = "*", index = "downloadpytorch"}
transformers = "*"
sentencepiece = "*"
gradio = "*"
pyopenjtalk-prebuilt = "*"

[dev-packages]
bandit = "*"

[requires]
python_version = "3.10"
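For context, the second [[source]] above pins torch to the CUDA 11.8 wheel index. A quick sanity check that the resolved build matches (an illustrative snippet, not part of the commit):

import torch

# Expect a "+cu118" build string when the downloadpytorch source was used;
# CUDA availability still depends on the host having a compatible GPU/driver.
print(torch.__version__)         # e.g. "2.0.1+cu118"
print(torch.version.cuda)        # e.g. "11.8"
print(torch.cuda.is_available())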
Pipfile.lock
ADDED
The diff for this file is too large to render.
app.py
ADDED
@@ -0,0 +1,102 @@
from typing import cast
import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer
import pandas as pd

import transformers

# Register the custom tokenizer on the transformers module so that
# SpeechT5Processor.from_pretrained can resolve it by class name.
setattr(transformers, SpeechT5OpenjtalkTokenizer.__name__, SpeechT5OpenjtalkTokenizer)


class SpeechT5OpenjtalkProcessor(SpeechT5Processor):
    tokenizer_class = SpeechT5OpenjtalkTokenizer.__name__


model = SpeechT5ForTextToSpeech.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(model, SpeechT5ForTextToSpeech)

processor = SpeechT5OpenjtalkProcessor.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(processor, SpeechT5OpenjtalkProcessor)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
assert isinstance(vocoder, SpeechT5HifiGan)


if torch.cuda.is_available():
    model = model.cuda()
    vocoder = vocoder.cuda()


def convert_float32_to_int16(wav: np.ndarray) -> np.ndarray:
    # Scale float32 audio in [-1, 1] to 16-bit PCM.
    assert wav.dtype == np.float32
    return np.clip(wav * 32768.0, -32768.0, 32767.0).astype(np.int16)


@torch.inference_mode()
def text_to_speech(
    text: str,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 10.0,
):
    # No speaker reference is used: draw a random speaker embedding
    # uniformly from [-1, 1).
    speaker_embeddings = (
        torch.rand(
            (1, model.config.speaker_embedding_dim),
            dtype=torch.float32,
            device=model.device,
        )
        * 2
        - 1
    )

    input_ids = processor(text=text, return_tensors="pt")
    assert input_ids is not None
    input_ids = input_ids.input_ids.to(model.device)

    speaker_embeddings = cast(torch.FloatTensor, speaker_embeddings)

    wav = model.generate_speech(
        input_ids,
        speaker_embeddings,
        threshold=threshold,
        minlenratio=minlenratio,
        maxlenratio=maxlenratio,
        vocoder=vocoder,
    )
    wav = cast(torch.FloatTensor, wav)

    wav = convert_float32_to_int16(wav.reshape(-1).cpu().float().numpy())

    # Return the audio for gr.Audio and the embedding values for gr.BarPlot.
    return [
        (vocoder.config.sampling_rate, wav),
        pd.DataFrame(
            {
                "dim": range(speaker_embeddings.shape[-1]),
                "value": speaker_embeddings[0].cpu().float().numpy(),
            }
        ),
    ]


demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        "text",
        gr.Slider(0, 0.5, 0.5, label="threshold"),
        gr.Slider(0, 100, 0, label="minlenratio"),
        gr.Slider(0, 100, 10, label="maxlenratio"),
    ],
    outputs=[
        "audio",
        gr.BarPlot(
            label="speaker_embedding (random generated)",
            x="dim",
            y="value",
            y_lim=[-1, 1],
        ),
    ],
)
demo.launch()
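For a quick check outside the Gradio UI, text_to_speech can also be called directly; a minimal sketch (hypothetical usage, writing the int16 output with the standard-library wave module):

import wave

(rate, pcm), _embedding_df = text_to_speech("吾輩は猫である。")

with wave.open("sample.wav", "wb") as wf:
    wf.setnchannels(1)     # generate_speech returns a mono waveform
    wf.setsampwidth(2)     # int16 samples from convert_float32_to_int16
    wf.setframerate(rate)  # vocoder.config.sampling_rate
    wf.writeframes(pcm.tobytes())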
flagged/log.csv
ADDED
@@ -0,0 +1,5 @@
component 0,threshold,minlenratio,maxlenratio,output 0,speaker_embedding (random generated),flag,username,timestamp
吾輩は猫である。名前はまだ無い。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\f3b115fb191a3158b81ce9c14ea8be68d6ccbb13\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpi0ftk1a4.json,,,2023-08-09 19:10:28.404847
吾輩は猫である。名前はまだ無い。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\a8f2aef3e7de612867c3cfc431eb23507893b8fb\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpi_0pz0y_.json,,,2023-08-09 19:10:59.342286
エリス帰りぬと答ふる間もなく。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\e83a2fdf4d1eea785fad09023d6fab00343e3329\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpe5hqy2sj.json,,,2023-08-09 19:11:12.202799
エリス帰りぬと答ふる間もなく。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\4ed6077f815534fa386a54ff81e6b16cd4e341d7\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpvxdd17vj.json,,,2023-08-09 19:11:33.145021
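These rows were written by Gradio's default flagging logger; they can be reloaded for inspection, for example (illustrative):

import pandas as pd

log = pd.read_csv("flagged/log.csv")
print(log[["component 0", "threshold", "maxlenratio", "timestamp"]])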
flagged/output 0/4ed6077f815534fa386a54ff81e6b16cd4e341d7/audio.wav
ADDED
Binary file (96.3 kB).
flagged/output 0/a8f2aef3e7de612867c3cfc431eb23507893b8fb/audio.wav
ADDED
Binary file (130 kB).
flagged/output 0/e83a2fdf4d1eea785fad09023d6fab00343e3329/audio.wav
ADDED
Binary file (104 kB).
flagged/output 0/f3b115fb191a3158b81ce9c14ea8be68d6ccbb13/audio.wav
ADDED
Binary file (126 kB).
flagged/speaker_embedding random generated/tmpe5hqy2sj.json
ADDED
@@ -0,0 +1 @@
{"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-0ae9df14f2f8f47d301fa5647272660a\"\n },\n \"datasets\": {\n \"data-0ae9df14f2f8f47d301fa5647272660a\": [\n {\n \"dim\": 0,\n \"value\": 0.865778923034668\n },\n {\n \"dim\": 1,\n \"value\": 0.598870038986206\n },\n {\n \"dim\": 2,\n \"value\": 0.42185306549072266\n },\n {\n \"dim\": 3,\n \"value\": 0.2511407136917114\n },\n {\n \"dim\": 4,\n \"value\": -0.3122257590293884\n },\n {\n \"dim\": 5,\n \"value\": 0.3540457487106323\n },\n {\n \"dim\": 6,\n \"value\": 0.06121230125427246\n },\n {\n \"dim\": 7,\n \"value\": -0.6955808401107788\n },\n {\n \"dim\": 8,\n \"value\": 0.5161528587341309\n },\n {\n \"dim\": 9,\n \"value\": -0.34081584215164185\n },\n {\n \"dim\": 10,\n \"value\": 0.2803090810775757\n },\n {\n \"dim\": 11,\n \"value\": 0.4033799171447754\n },\n {\n \"dim\": 12,\n \"value\": -0.9365829825401306\n },\n {\n \"dim\": 13,\n \"value\": -0.2028934359550476\n },\n {\n \"dim\": 14,\n \"value\": -0.6850658655166626\n },\n {\n \"dim\": 15,\n \"value\": 0.9078857898712158\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_46\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpi0ftk1a4.json
ADDED
@@ -0,0 +1 @@
{"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-9c5b21524e1befbb2ba9cb7a56a0869d\"\n },\n \"datasets\": {\n \"data-9c5b21524e1befbb2ba9cb7a56a0869d\": [\n {\n \"dim\": 0,\n \"value\": 0.13475024700164795\n },\n {\n \"dim\": 1,\n \"value\": -0.6327165961265564\n },\n {\n \"dim\": 2,\n \"value\": 0.142317533493042\n },\n {\n \"dim\": 3,\n \"value\": 0.21169781684875488\n },\n {\n \"dim\": 4,\n \"value\": -0.16417354345321655\n },\n {\n \"dim\": 5,\n \"value\": -0.8099290728569031\n },\n {\n \"dim\": 6,\n \"value\": 0.007316946983337402\n },\n {\n \"dim\": 7,\n \"value\": -0.07458484172821045\n },\n {\n \"dim\": 8,\n \"value\": 0.08054876327514648\n },\n {\n \"dim\": 9,\n \"value\": -0.21262741088867188\n },\n {\n \"dim\": 10,\n \"value\": 0.18876373767852783\n },\n {\n \"dim\": 11,\n \"value\": 0.9569865465164185\n },\n {\n \"dim\": 12,\n \"value\": -0.6338413953781128\n },\n {\n \"dim\": 13,\n \"value\": 0.4344193935394287\n },\n {\n \"dim\": 14,\n \"value\": 0.4842950105667114\n },\n {\n \"dim\": 15,\n \"value\": 0.8240410089492798\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_39\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpi_0pz0y_.json
ADDED
@@ -0,0 +1 @@
{"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-6fb308b55e0eb4e1b35e48e0e7d97e92\"\n },\n \"datasets\": {\n \"data-6fb308b55e0eb4e1b35e48e0e7d97e92\": [\n {\n \"dim\": 0,\n \"value\": 0.29687047004699707\n },\n {\n \"dim\": 1,\n \"value\": 0.9556258916854858\n },\n {\n \"dim\": 2,\n \"value\": -0.6389535665512085\n },\n {\n \"dim\": 3,\n \"value\": -0.611727774143219\n },\n {\n \"dim\": 4,\n \"value\": -0.21317017078399658\n },\n {\n \"dim\": 5,\n \"value\": 0.7883336544036865\n },\n {\n \"dim\": 6,\n \"value\": -0.018283069133758545\n },\n {\n \"dim\": 7,\n \"value\": 0.35531842708587646\n },\n {\n \"dim\": 8,\n \"value\": 0.26885783672332764\n },\n {\n \"dim\": 9,\n \"value\": -0.4866262674331665\n },\n {\n \"dim\": 10,\n \"value\": -0.23639953136444092\n },\n {\n \"dim\": 11,\n \"value\": 0.7181340456008911\n },\n {\n \"dim\": 12,\n \"value\": -0.4331989884376526\n },\n {\n \"dim\": 13,\n \"value\": 0.2988170385360718\n },\n {\n \"dim\": 14,\n \"value\": -0.46783244609832764\n },\n {\n \"dim\": 15,\n \"value\": 0.12830126285552979\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_45\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpvxdd17vj.json
ADDED
@@ -0,0 +1 @@
{"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-3d7bb59ef86b225300e3b1339e0be1fd\"\n },\n \"datasets\": {\n \"data-3d7bb59ef86b225300e3b1339e0be1fd\": [\n {\n \"dim\": 0,\n \"value\": -0.6320345401763916\n },\n {\n \"dim\": 1,\n \"value\": -0.45626479387283325\n },\n {\n \"dim\": 2,\n \"value\": 0.9872877597808838\n },\n {\n \"dim\": 3,\n \"value\": 0.5087642669677734\n },\n {\n \"dim\": 4,\n \"value\": -0.5172603726387024\n },\n {\n \"dim\": 5,\n \"value\": -0.5950517058372498\n },\n {\n \"dim\": 6,\n \"value\": -0.8185831308364868\n },\n {\n \"dim\": 7,\n \"value\": -0.6966900825500488\n },\n {\n \"dim\": 8,\n \"value\": 0.5786945819854736\n },\n {\n \"dim\": 9,\n \"value\": -0.9495899677276611\n },\n {\n \"dim\": 10,\n \"value\": 0.7928179502487183\n },\n {\n \"dim\": 11,\n \"value\": 0.0871659517288208\n },\n {\n \"dim\": 12,\n \"value\": 0.3648104667663574\n },\n {\n \"dim\": 13,\n \"value\": 0.451604962348938\n },\n {\n \"dim\": 14,\n \"value\": -0.3141704201698303\n },\n {\n \"dim\": 15,\n \"value\": -0.6407181024551392\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_50\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
requirements.txt
ADDED
@@ -0,0 +1 @@
pyopenjtalk-prebuilt==0.3.0
speecht5_openjtalk_tokenizer.py
ADDED
@@ -0,0 +1,129 @@
import json
import logging
from pathlib import Path
import re
from transformers import SpeechT5Tokenizer
from transformers.models.speecht5.tokenization_speecht5 import (
    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
)
from itertools import chain
from typing import List, Optional


logger = logging.getLogger(__name__)

# Characters (ASCII and full-width punctuation) that pass through as-is
# instead of being converted to phonemes.
NP_CHARCTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"


def _g2p_with_np(text: str, np_list: str) -> List[str]:
    from pyopenjtalk import g2p

    np_pattern = re.compile(f"([{re.escape(np_list)}])")

    # Split around non-phoneme characters, keep them as single tokens, and run
    # grapheme-to-phoneme conversion on the remaining segments.
    return list(
        chain.from_iterable(
            [
                (text,) if text in np_list else g2p(text, kana=False, join=False)
                for text in np_pattern.split(text)
                if len(text) > 0
            ]
        )
    )


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
    },
}


class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        non_phenome_characters: str = NP_CHARCTERS,
        **kwargs,
    ):
        try:
            super().__init__(
                vocab_file=None,
                bos_token=bos_token,
                eos_token=eos_token,
                unk_token=unk_token,
                pad_token=pad_token,
                **kwargs,
            )
        except TypeError:
            # SpeechT5Tokenizer expects a SentencePiece model file; the JSON
            # vocabulary is loaded manually below instead.
            pass

        self.non_phenome_characters = non_phenome_characters
        self.vocab_file = vocab_file

        self._load_vocab()

    def _load_vocab(self):
        if isinstance(self.vocab_file, str) and self.vocab_file.endswith(".json"):
            with open(self.vocab_file, encoding="utf-8") as f:
                self.label2id = json.load(f)
                self.id2label = {v: k for k, v in self.label2id.items()}

    @property
    def bos_token_id(self) -> int | None:
        return super().bos_token_id

    @property
    def vocab_size(self):
        return len(self.label2id)

    def get_vocab(self):
        return self.label2id

    def __getstate__(self):
        state = super().__getstate__()
        # The SentencePiece model is unused here and not picklable.
        del state["sp_model"]
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self._load_vocab()

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ):
        if filename_prefix is None:
            filename_prefix = ".json"

        save_path = Path(save_directory)
        if not save_path.is_dir():
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        vocab_path = Path(save_directory) / Path(f"vocab{filename_prefix}")
        vocab_path.parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.label2id, f, ensure_ascii=False, indent=2)

        return (str(vocab_path),)

    def _tokenize(self, text: str) -> List[str]:
        return _g2p_with_np(text, self.non_phenome_characters)

    def _convert_token_to_id(self, token):
        return self.label2id.get(token, self.label2id.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.id2label.get(index, self.unk_token)
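To illustrate the tokenization path, _g2p_with_np keeps non-phoneme characters as single tokens and runs everything else through pyopenjtalk (a sketch; the exact phoneme output depends on the OpenJTalk dictionary):

from speecht5_openjtalk_tokenizer import _g2p_with_np, NP_CHARCTERS

# "。" is in NP_CHARCTERS, so it survives as its own token.
print(_g2p_with_np("吾輩は猫である。", NP_CHARCTERS))
# e.g. ['w', 'a', 'g', 'a', 'h', 'a', 'i', 'w', 'a', ..., '。']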