esnya committed
Commit 5034c86 · 1 Parent(s): 0016e60

:tada: feat!: First commit

Pipfile ADDED
@@ -0,0 +1,22 @@
+ [[source]]
+ url = "https://pypi.org/simple"
+ verify_ssl = true
+ name = "pypi"
+
+ [[source]]
+ url = "https://download.pytorch.org/whl/cu118"
+ verify_ssl = true
+ name = "downloadpytorch"
+
+ [packages]
+ torch = {version = "*", index = "downloadpytorch"}
+ transformers = "*"
+ sentencepiece = "*"
+ gradio = "*"
+ pyopenjtalk-prebuilt = "*"
+
+ [dev-packages]
+ bandit = "*"
+
+ [requires]
+ python_version = "3.10"
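The second `[[source]]` points at PyTorch's CUDA 11.8 wheel index, and `torch` is the only package pinned to it; everything else resolves from PyPI. A quick sanity check (a minimal sketch; assumes the environment was created from this Pipfile) that the CUDA build was actually installed:

```python
# Check which torch build was resolved; the CPU-only PyPI wheel reports
# torch.version.cuda as None, while the cu118 wheel reports "11.8".
import torch

print(torch.__version__)          # e.g. "2.0.1+cu118" for the CUDA build
print(torch.version.cuda)         # expected: "11.8"
print(torch.cuda.is_available())  # True only with a compatible GPU and driver
```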
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,102 @@
+ from typing import cast
+ import gradio as gr
+ import numpy as np
+ import torch
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
+ from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer
+ import pandas as pd
+
+ import transformers
+
+ setattr(transformers, SpeechT5OpenjtalkTokenizer.__name__, SpeechT5OpenjtalkTokenizer)
+
+
+ class SpeechT5OpenjtalkProcessor(SpeechT5Processor):
+     tokenizer_class = SpeechT5OpenjtalkTokenizer.__name__
+
+
+ model = SpeechT5ForTextToSpeech.from_pretrained("esnya/japanese_speecht5_tts")
+ assert isinstance(model, SpeechT5ForTextToSpeech)
+
+ processor = SpeechT5OpenjtalkProcessor.from_pretrained("esnya/japanese_speecht5_tts")
+ assert isinstance(processor, SpeechT5OpenjtalkProcessor)
+
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ assert isinstance(vocoder, SpeechT5HifiGan)
+
+
+ if torch.cuda.is_available():
+     model = model.cuda()
+     vocoder = vocoder.cuda()
+
+
+ def convert_float32_to_int16(wav: np.ndarray) -> np.ndarray:
+     assert wav.dtype == np.float32
+     return np.clip(wav * 32768.0, -32768.0, 32767.0).astype(np.int16)
+
+
+ @torch.inference_mode()
+ def text_to_speech(
+     text: str,
+     threshold: float = 0.5,
+     minlenratio: float = 0.0,
+     maxlenratio: float = 10.0,
+ ):
+     speaker_embeddings = (
+         torch.rand(
+             (1, model.config.speaker_embedding_dim),
+             dtype=torch.float32,
+             device=model.device,
+         )
+         * 2
+         - 1
+     )
+
+     input_ids = processor(text=text, return_tensors="pt")
+     assert input_ids is not None
+     input_ids = input_ids.input_ids.to(model.device)
+
+     speaker_embeddings = cast(torch.FloatTensor, speaker_embeddings)
+
+     wav = model.generate_speech(
+         input_ids,
+         speaker_embeddings,
+         threshold=threshold,
+         minlenratio=minlenratio,
+         maxlenratio=maxlenratio,
+         vocoder=vocoder,
+     )
+     wav = cast(torch.FloatTensor, wav)
+
+     wav = convert_float32_to_int16(wav.reshape(-1).cpu().float().numpy())
+
+     return [
+         (vocoder.config.sampling_rate, wav),
+         pd.DataFrame(
+             {
+                 "dim": range(speaker_embeddings.shape[-1]),
+                 "value": speaker_embeddings[0].cpu().float().numpy(),
+             }
+         ),
+     ]
+
+
+ demo = gr.Interface(
+     fn=text_to_speech,
+     inputs=[
+         "text",
+         gr.Slider(0, 0.5, 0.5, label="threshold"),
+         gr.Slider(0, 100, 0, label="minlenratio"),
+         gr.Slider(0, 100, 10, label="maxlenratio"),
+     ],
+     outputs=[
+         "audio",
+         gr.BarPlot(
+             label="speaker_embedding (random generated)",
+             x="dim",
+             y="value",
+             y_lim=[-1, 1],
+         ),
+     ],
+ )
+ demo.launch()
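Two implementation details worth noting. The speaker embedding is not a real x-vector: `torch.rand(...) * 2 - 1` draws it uniformly from [-1, 1), so every call synthesizes a new random voice, which is also why the app plots the embedding back to the user. And `convert_float32_to_int16` rescales the float waveform into int16 PCM for Gradio, clipping so +1.0 cannot overflow past the int16 maximum. The conversion is easy to verify standalone:

```python
import numpy as np

# Same conversion as app.py: scale float32 audio in [-1, 1] by 32768 and clip,
# so -1.0 maps to the int16 minimum and +1.0 saturates at 32767.
def convert_float32_to_int16(wav: np.ndarray) -> np.ndarray:
    assert wav.dtype == np.float32
    return np.clip(wav * 32768.0, -32768.0, 32767.0).astype(np.int16)

x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
print(convert_float32_to_int16(x))  # [-32768 -16384      0  16384  32767]
```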
flagged/log.csv ADDED
@@ -0,0 +1,5 @@
+ component 0,threshold,minlenratio,maxlenratio,output 0,speaker_embedding (random generated),flag,username,timestamp
+ 吾輩は猫である。名前はまだ無い。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\f3b115fb191a3158b81ce9c14ea8be68d6ccbb13\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpi0ftk1a4.json,,,2023-08-09 19:10:28.404847
+ 吾輩は猫である。名前はまだ無い。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\a8f2aef3e7de612867c3cfc431eb23507893b8fb\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpi_0pz0y_.json,,,2023-08-09 19:10:59.342286
+ エリス帰りぬと答ふる間もなく。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\e83a2fdf4d1eea785fad09023d6fab00343e3329\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpe5hqy2sj.json,,,2023-08-09 19:11:12.202799
+ エリス帰りぬと答ふる間もなく。,0.5,0,10,C:\Users\esnya\Documents\space-japanese-tts\flagged\output 0\4ed6077f815534fa386a54ff81e6b16cd4e341d7\audio.wav,C:\Users\esnya\Documents\space-japanese-tts\flagged\speaker_embedding random generated\tmpvxdd17vj.json,,,2023-08-09 19:11:33.145021
flagged/output 0/4ed6077f815534fa386a54ff81e6b16cd4e341d7/audio.wav ADDED
Binary file (96.3 kB). View file
 
flagged/output 0/a8f2aef3e7de612867c3cfc431eb23507893b8fb/audio.wav ADDED
Binary file (130 kB). View file
 
flagged/output 0/e83a2fdf4d1eea785fad09023d6fab00343e3329/audio.wav ADDED
Binary file (104 kB). View file
 
flagged/output 0/f3b115fb191a3158b81ce9c14ea8be68d6ccbb13/audio.wav ADDED
Binary file (126 kB). View file
 
flagged/speaker_embedding random generated/tmpe5hqy2sj.json ADDED
@@ -0,0 +1 @@
+ {"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-0ae9df14f2f8f47d301fa5647272660a\"\n },\n \"datasets\": {\n \"data-0ae9df14f2f8f47d301fa5647272660a\": [\n {\n \"dim\": 0,\n \"value\": 0.865778923034668\n },\n {\n \"dim\": 1,\n \"value\": 0.598870038986206\n },\n {\n \"dim\": 2,\n \"value\": 0.42185306549072266\n },\n {\n \"dim\": 3,\n \"value\": 0.2511407136917114\n },\n {\n \"dim\": 4,\n \"value\": -0.3122257590293884\n },\n {\n \"dim\": 5,\n \"value\": 0.3540457487106323\n },\n {\n \"dim\": 6,\n \"value\": 0.06121230125427246\n },\n {\n \"dim\": 7,\n \"value\": -0.6955808401107788\n },\n {\n \"dim\": 8,\n \"value\": 0.5161528587341309\n },\n {\n \"dim\": 9,\n \"value\": -0.34081584215164185\n },\n {\n \"dim\": 10,\n \"value\": 0.2803090810775757\n },\n {\n \"dim\": 11,\n \"value\": 0.4033799171447754\n },\n {\n \"dim\": 12,\n \"value\": -0.9365829825401306\n },\n {\n \"dim\": 13,\n \"value\": -0.2028934359550476\n },\n {\n \"dim\": 14,\n \"value\": -0.6850658655166626\n },\n {\n \"dim\": 15,\n \"value\": 0.9078857898712158\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_46\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpi0ftk1a4.json ADDED
@@ -0,0 +1 @@
+ {"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-9c5b21524e1befbb2ba9cb7a56a0869d\"\n },\n \"datasets\": {\n \"data-9c5b21524e1befbb2ba9cb7a56a0869d\": [\n {\n \"dim\": 0,\n \"value\": 0.13475024700164795\n },\n {\n \"dim\": 1,\n \"value\": -0.6327165961265564\n },\n {\n \"dim\": 2,\n \"value\": 0.142317533493042\n },\n {\n \"dim\": 3,\n \"value\": 0.21169781684875488\n },\n {\n \"dim\": 4,\n \"value\": -0.16417354345321655\n },\n {\n \"dim\": 5,\n \"value\": -0.8099290728569031\n },\n {\n \"dim\": 6,\n \"value\": 0.007316946983337402\n },\n {\n \"dim\": 7,\n \"value\": -0.07458484172821045\n },\n {\n \"dim\": 8,\n \"value\": 0.08054876327514648\n },\n {\n \"dim\": 9,\n \"value\": -0.21262741088867188\n },\n {\n \"dim\": 10,\n \"value\": 0.18876373767852783\n },\n {\n \"dim\": 11,\n \"value\": 0.9569865465164185\n },\n {\n \"dim\": 12,\n \"value\": -0.6338413953781128\n },\n {\n \"dim\": 13,\n \"value\": 0.4344193935394287\n },\n {\n \"dim\": 14,\n \"value\": 0.4842950105667114\n },\n {\n \"dim\": 15,\n \"value\": 0.8240410089492798\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_39\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpi_0pz0y_.json ADDED
@@ -0,0 +1 @@
+ {"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-6fb308b55e0eb4e1b35e48e0e7d97e92\"\n },\n \"datasets\": {\n \"data-6fb308b55e0eb4e1b35e48e0e7d97e92\": [\n {\n \"dim\": 0,\n \"value\": 0.29687047004699707\n },\n {\n \"dim\": 1,\n \"value\": 0.9556258916854858\n },\n {\n \"dim\": 2,\n \"value\": -0.6389535665512085\n },\n {\n \"dim\": 3,\n \"value\": -0.611727774143219\n },\n {\n \"dim\": 4,\n \"value\": -0.21317017078399658\n },\n {\n \"dim\": 5,\n \"value\": 0.7883336544036865\n },\n {\n \"dim\": 6,\n \"value\": -0.018283069133758545\n },\n {\n \"dim\": 7,\n \"value\": 0.35531842708587646\n },\n {\n \"dim\": 8,\n \"value\": 0.26885783672332764\n },\n {\n \"dim\": 9,\n \"value\": -0.4866262674331665\n },\n {\n \"dim\": 10,\n \"value\": -0.23639953136444092\n },\n {\n \"dim\": 11,\n \"value\": 0.7181340456008911\n },\n {\n \"dim\": 12,\n \"value\": -0.4331989884376526\n },\n {\n \"dim\": 13,\n \"value\": 0.2988170385360718\n },\n {\n \"dim\": 14,\n \"value\": -0.46783244609832764\n },\n {\n \"dim\": 15,\n \"value\": 0.12830126285552979\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_45\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
flagged/speaker_embedding random generated/tmpvxdd17vj.json ADDED
@@ -0,0 +1 @@
+ {"type": "altair", "plot": "{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.8.0.json\",\n \"background\": \"transparent\",\n \"config\": {\n \"view\": {\n \"continuousHeight\": 300,\n \"continuousWidth\": 300\n }\n },\n \"data\": {\n \"name\": \"data-3d7bb59ef86b225300e3b1339e0be1fd\"\n },\n \"datasets\": {\n \"data-3d7bb59ef86b225300e3b1339e0be1fd\": [\n {\n \"dim\": 0,\n \"value\": -0.6320345401763916\n },\n {\n \"dim\": 1,\n \"value\": -0.45626479387283325\n },\n {\n \"dim\": 2,\n \"value\": 0.9872877597808838\n },\n {\n \"dim\": 3,\n \"value\": 0.5087642669677734\n },\n {\n \"dim\": 4,\n \"value\": -0.5172603726387024\n },\n {\n \"dim\": 5,\n \"value\": -0.5950517058372498\n },\n {\n \"dim\": 6,\n \"value\": -0.8185831308364868\n },\n {\n \"dim\": 7,\n \"value\": -0.6966900825500488\n },\n {\n \"dim\": 8,\n \"value\": 0.5786945819854736\n },\n {\n \"dim\": 9,\n \"value\": -0.9495899677276611\n },\n {\n \"dim\": 10,\n \"value\": 0.7928179502487183\n },\n {\n \"dim\": 11,\n \"value\": 0.0871659517288208\n },\n {\n \"dim\": 12,\n \"value\": 0.3648104667663574\n },\n {\n \"dim\": 13,\n \"value\": 0.451604962348938\n },\n {\n \"dim\": 14,\n \"value\": -0.3141704201698303\n },\n {\n \"dim\": 15,\n \"value\": -0.6407181024551392\n }\n ]\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"dim\",\n \"title\": \"dim\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"aggregate\": \"sum\",\n \"field\": \"value\",\n \"scale\": {\n \"domain\": [\n -1,\n 1\n ]\n },\n \"title\": \"value\",\n \"type\": \"quantitative\"\n }\n },\n \"mark\": {\n \"type\": \"bar\"\n },\n \"params\": [\n {\n \"bind\": \"scales\",\n \"name\": \"param_50\",\n \"select\": {\n \"encodings\": [\n \"x\",\n \"y\"\n ],\n \"type\": \"interval\"\n }\n }\n ]\n}", "chart": "bar"}
requirements.txt ADDED
@@ -0,0 +1 @@
+ pyopenjtalk-prebuilt==0.3.0
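Only `pyopenjtalk-prebuilt` is pinned here; presumably this file exists because Hugging Face Spaces installs `requirements.txt` directly, while the Pipfile serves local development. A quick smoke test (assumes the package is installed) for the grapheme-to-phoneme backend the custom tokenizer wraps:

```python
# pyopenjtalk.g2p is the same call speecht5_openjtalk_tokenizer.py makes
# internally (there with kana=False, join=False to get a token list).
import pyopenjtalk

print(pyopenjtalk.g2p("こんにちは"))  # space-joined phonemes, e.g. "k o N n i ch i w a"
```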
speecht5_openjtalk_tokenizer.py ADDED
@@ -0,0 +1,129 @@
+ import json
+ import logging
+ from pathlib import Path
+ import re
+ from transformers import SpeechT5Tokenizer
+ from transformers.models.speecht5.tokenization_speecht5 import (
+     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
+ )
+ from itertools import chain
+ from typing import List, Optional
+
+
+ logger = logging.getLogger(__name__)
+
+ NP_CHARACTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"
+
+
+ def _g2p_with_np(text: str, np_list: str) -> List[str]:
+     from pyopenjtalk import g2p
+
+     np_pattern = re.compile(f"([{re.escape(np_list)}])")
+
+     return list(
+         chain.from_iterable(
+             [
+                 (text,) if text in np_list else g2p(text, kana=False, join=False)
+                 for text in np_pattern.split(text)
+                 if len(text) > 0
+             ]
+         )
+     )
+
+
+ VOCAB_FILES_NAMES = {
+     "vocab_file": "vocab.json",
+ }
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
+     },
+ }
+
+
+ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: str = "<s>",
+         eos_token: str = "</s>",
+         unk_token: str = "<unk>",
+         pad_token: str = "<pad>",
+         non_phenome_characters: str = NP_CHARACTERS,
+         **kwargs,
+     ):
+         try:
+             super().__init__(
+                 vocab_file=None,
+                 bos_token=bos_token,
+                 eos_token=eos_token,
+                 unk_token=unk_token,
+                 pad_token=pad_token,
+                 **kwargs,
+             )
+         except TypeError:
+             pass
+
+         self.non_phenome_characters = non_phenome_characters
+         self.vocab_file = vocab_file
+
+         self._load_vocab()
+
+     def _load_vocab(self):
+         if isinstance(self.vocab_file, str) and self.vocab_file.endswith(".json"):
+             with open(self.vocab_file, encoding="utf-8") as f:
+                 self.label2id = json.load(f)
+                 self.id2label = {v: k for k, v in self.label2id.items()}
+
+     @property
+     def bos_token_id(self) -> int | None:
+         return super().bos_token_id
+
+     @property
+     def vocab_size(self):
+         return len(self.label2id)
+
+     def get_vocab(self):
+         return self.label2id
+
+     def __getstate__(self):
+         state = super().__getstate__()
+         del state["sp_model"]
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+         self._load_vocab()
+
+     def save_vocabulary(
+         self, save_directory: str, filename_prefix: Optional[str] = None
+     ):
+         if filename_prefix is None:
+             filename_prefix = ".json"
+
+         save_path = Path(save_directory)
+         if not save_path.is_dir():
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+
+         vocab_path = Path(save_directory) / Path(f"vocab{filename_prefix}")
+         vocab_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(vocab_path, "w", encoding="utf-8") as f:
+             json.dump(self.label2id, f, ensure_ascii=False, indent=2)
+
+         return (str(vocab_path),)
+
+     def _tokenize(self, text: str) -> List[str]:
+         return _g2p_with_np(text, self.non_phenome_characters)
+
+     def _convert_token_to_id(self, token):
+         return self.label2id.get(token, self.label2id.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         return self.id2label.get(index, self.unk_token)
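The core of the tokenizer is `_g2p_with_np`: input text is split on a list of non-phoneme characters (ASCII and full-width punctuation), those characters pass through as literal tokens, and every other span is converted to phonemes by pyopenjtalk's `g2p`. A minimal sketch of that flow with a stubbed g2p so it runs without pyopenjtalk installed; `fake_g2p` and the shortened `NP` list are illustrative stand-ins:

```python
import re
from itertools import chain
from typing import List


def fake_g2p(text: str) -> List[str]:
    # Stand-in for pyopenjtalk.g2p(text, kana=False, join=False),
    # which returns a list of phoneme labels.
    return list(text)


NP = "、。"  # tiny subset of the real non-phoneme character list


def g2p_with_np(text: str, np_list: str) -> List[str]:
    # Split on non-phoneme characters while keeping them as delimiters
    # (the capture group makes re.split retain them), then convert every
    # other segment; same shape as _g2p_with_np above.
    np_pattern = re.compile(f"([{re.escape(np_list)}])")
    return list(
        chain.from_iterable(
            (seg,) if seg in np_list else fake_g2p(seg)
            for seg in np_pattern.split(text)
            if seg
        )
    )


print(g2p_with_np("こんにちは、世界。", NP))
# ['こ', 'ん', 'に', 'ち', 'は', '、', '世', '界', '。']
```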