doberst commited on
Commit
c2cb40e
·
verified ·
1 Parent(s): 05155df

Upload 19 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": 1876,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.52.4",
88
+ "use_cache": true,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.52.4"
9
+ }
openvino_decoder_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ef7ec83da8cbd071694df76dd432dc7293a376990504e6ab14d0aeb867b1376
3
+ size 238175084
openvino_decoder_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
openvino_detokenizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7e523a020fcd210e6f4f2349b37371c2b315f8e7f7f691be547a4109b6fe2d4
3
+ size 238520
openvino_detokenizer.xml ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="detokenizer" version="11">
3
+ <layers>
4
+ <layer id="0" name="Parameter_67208" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="i64" />
6
+ <output>
7
+ <port id="0" precision="I64" names="Parameter_67208">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="Constant_67180" type="Const" version="opset1">
14
+ <data element_type="u8" shape="238520" offset="0" size="238520" />
15
+ <output>
16
+ <port id="0" precision="U8">
17
+ <dim>238520</dim>
18
+ </port>
19
+ </output>
20
+ </layer>
21
+ <layer id="2" name="Convert_67386" type="Convert" version="opset1">
22
+ <data destination_type="i32" />
23
+ <input>
24
+ <port id="0" precision="I64">
25
+ <dim>-1</dim>
26
+ <dim>-1</dim>
27
+ </port>
28
+ </input>
29
+ <output>
30
+ <port id="1" precision="I32">
31
+ <dim>-1</dim>
32
+ <dim>-1</dim>
33
+ </port>
34
+ </output>
35
+ </layer>
36
+ <layer id="3" name="SentencepieceDetokenizer_67209" type="SentencepieceDetokenizer" version="extension">
37
+ <input>
38
+ <port id="0" precision="U8">
39
+ <dim>238520</dim>
40
+ </port>
41
+ <port id="1" precision="I32">
42
+ <dim>-1</dim>
43
+ <dim>-1</dim>
44
+ </port>
45
+ </input>
46
+ <output>
47
+ <port id="2" precision="I32">
48
+ <dim>-1</dim>
49
+ </port>
50
+ <port id="3" precision="I32">
51
+ <dim>-1</dim>
52
+ </port>
53
+ <port id="4" precision="U8">
54
+ <dim>-1</dim>
55
+ </port>
56
+ </output>
57
+ </layer>
58
+ <layer id="4" name="UTF8Validate_67210" type="UTF8Validate" version="extension">
59
+ <data replace_mode="true" />
60
+ <input>
61
+ <port id="0" precision="I32">
62
+ <dim>-1</dim>
63
+ </port>
64
+ <port id="1" precision="I32">
65
+ <dim>-1</dim>
66
+ </port>
67
+ <port id="2" precision="U8">
68
+ <dim>-1</dim>
69
+ </port>
70
+ </input>
71
+ <output>
72
+ <port id="3" precision="I32">
73
+ <dim>-1</dim>
74
+ </port>
75
+ <port id="4" precision="I32">
76
+ <dim>-1</dim>
77
+ </port>
78
+ <port id="5" precision="U8">
79
+ <dim>-1</dim>
80
+ </port>
81
+ </output>
82
+ </layer>
83
+ <layer id="5" name="StringTensorPack_67211" type="StringTensorPack" version="opset15">
84
+ <input>
85
+ <port id="0" precision="I32">
86
+ <dim>-1</dim>
87
+ </port>
88
+ <port id="1" precision="I32">
89
+ <dim>-1</dim>
90
+ </port>
91
+ <port id="2" precision="U8">
92
+ <dim>-1</dim>
93
+ </port>
94
+ </input>
95
+ <output>
96
+ <port id="3" precision="STRING" names="string_output">
97
+ <dim>-1</dim>
98
+ </port>
99
+ </output>
100
+ </layer>
101
+ <layer id="6" name="Result_67212" type="Result" version="opset1" output_names="string_output">
102
+ <input>
103
+ <port id="0" precision="STRING">
104
+ <dim>-1</dim>
105
+ </port>
106
+ </input>
107
+ </layer>
108
+ </layers>
109
+ <edges>
110
+ <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
111
+ <edge from-layer="1" from-port="0" to-layer="3" to-port="0" />
112
+ <edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
113
+ <edge from-layer="3" from-port="2" to-layer="4" to-port="0" />
114
+ <edge from-layer="3" from-port="3" to-layer="4" to-port="1" />
115
+ <edge from-layer="3" from-port="4" to-layer="4" to-port="2" />
116
+ <edge from-layer="4" from-port="3" to-layer="5" to-port="0" />
117
+ <edge from-layer="4" from-port="4" to-layer="5" to-port="1" />
118
+ <edge from-layer="4" from-port="5" to-layer="5" to-port="2" />
119
+ <edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
120
+ </edges>
121
+ <rt_info>
122
+ <add_attention_mask value="True" />
123
+ <add_prefix_space />
124
+ <add_special_tokens value="True" />
125
+ <bos_token_id value="0" />
126
+ <clean_up_tokenization_spaces value="False" />
127
+ <detokenizer_input_type value="i64" />
128
+ <eos_token_id value="2" />
129
+ <handle_special_tokens_with_re value="False" />
130
+ <max_length />
131
+ <number_of_inputs value="1" />
132
+ <openvino_tokenizers_version value="2025.2.0.1-567-7885335c24b" />
133
+ <openvino_version value="2025.2.0-19140-c01cd93e24d-releases/2025/2" />
134
+ <original_tokenizer_class value="&lt;class 'transformers.models.speecht5.tokenization_speecht5.SpeechT5Tokenizer'>" />
135
+ <pad_token_id value="1" />
136
+ <sentencepiece_version value="0.2.0" />
137
+ <skip_special_tokens value="True" />
138
+ <streaming_detokenizer value="False" />
139
+ <tokenizer_output_type value="i64" />
140
+ <tokenizers_version value="0.21.2" />
141
+ <transformers_version value="4.52.4" />
142
+ <use_max_padding value="False" />
143
+ <use_sentencepiece_backend value="False" />
144
+ <utf8_replace_mode value="replace" />
145
+ <with_detokenizer value="True" />
146
+ </rt_info>
147
+ </net>
openvino_encoder_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3afa2d6a7a5689a3484a23bd4865d341a623fa5b179daec1af16389850c947ac
3
+ size 342398248
openvino_encoder_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
openvino_postnet.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efbf9ebdcfd6db466709837f69b09b1d5d07166e3105250d553e5d543f475e95
3
+ size 4755828
openvino_postnet.xml ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model6" version="11">
3
+ <layers>
4
+ <layer id="0" name="raw_spectrogram" type="Parameter" version="opset1">
5
+ <data shape="?,?,2,80" element_type="f32" />
6
+ <output>
7
+ <port id="0" precision="FP32" names="raw_spectrogram">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ <dim>2</dim>
11
+ <dim>80</dim>
12
+ </port>
13
+ </output>
14
+ </layer>
15
+ <layer id="1" name="aten::transpose/Constant" type="Const" version="opset1">
16
+ <data element_type="i32" shape="4" offset="0" size="16" />
17
+ <output>
18
+ <port id="0" precision="I32">
19
+ <dim>4</dim>
20
+ </port>
21
+ </output>
22
+ </layer>
23
+ <layer id="2" name="aten::transpose/Transpose" type="Transpose" version="opset1">
24
+ <input>
25
+ <port id="0" precision="FP32">
26
+ <dim>-1</dim>
27
+ <dim>-1</dim>
28
+ <dim>2</dim>
29
+ <dim>80</dim>
30
+ </port>
31
+ <port id="1" precision="I32">
32
+ <dim>4</dim>
33
+ </port>
34
+ </input>
35
+ <output>
36
+ <port id="2" precision="FP32" names="19">
37
+ <dim>-1</dim>
38
+ <dim>-1</dim>
39
+ <dim>2</dim>
40
+ <dim>80</dim>
41
+ </port>
42
+ </output>
43
+ </layer>
44
+ <layer id="3" name="Constant_57260" type="Const" version="opset1">
45
+ <data element_type="i32" shape="1" offset="16" size="4" />
46
+ <output>
47
+ <port id="0" precision="I32">
48
+ <dim>1</dim>
49
+ </port>
50
+ </output>
51
+ </layer>
52
+ <layer id="4" name="aten::flatten/Constant_2" type="Const" version="opset1">
53
+ <data element_type="i32" shape="1" offset="20" size="4" />
54
+ <output>
55
+ <port id="0" precision="I32">
56
+ <dim>1</dim>
57
+ </port>
58
+ </output>
59
+ </layer>
60
+ <layer id="5" name="ShapeOf_57144" type="ShapeOf" version="opset3">
61
+ <data output_type="i32" />
62
+ <input>
63
+ <port id="0" precision="FP32">
64
+ <dim>-1</dim>
65
+ <dim>-1</dim>
66
+ <dim>2</dim>
67
+ <dim>80</dim>
68
+ </port>
69
+ </input>
70
+ <output>
71
+ <port id="1" precision="I32">
72
+ <dim>4</dim>
73
+ </port>
74
+ </output>
75
+ </layer>
76
+ <layer id="6" name="Constant_57151" type="Const" version="opset1">
77
+ <data element_type="i64" shape="1" offset="24" size="8" />
78
+ <output>
79
+ <port id="0" precision="I64">
80
+ <dim>1</dim>
81
+ </port>
82
+ </output>
83
+ </layer>
84
+ <layer id="7" name="Constant_57152" type="Const" version="opset1">
85
+ <data element_type="i64" shape="" offset="32" size="8" />
86
+ <output>
87
+ <port id="0" precision="I64" />
88
+ </output>
89
+ </layer>
90
+ <layer id="8" name="Gather_57153" type="Gather" version="opset8">
91
+ <data batch_dims="0" />
92
+ <input>
93
+ <port id="0" precision="I32">
94
+ <dim>4</dim>
95
+ </port>
96
+ <port id="1" precision="I64">
97
+ <dim>1</dim>
98
+ </port>
99
+ <port id="2" precision="I64" />
100
+ </input>
101
+ <output>
102
+ <port id="3" precision="I32">
103
+ <dim>1</dim>
104
+ </port>
105
+ </output>
106
+ </layer>
107
+ <layer id="9" name="aten::flatten/Concat" type="Concat" version="opset1">
108
+ <data axis="0" />
109
+ <input>
110
+ <port id="0" precision="I32">
111
+ <dim>1</dim>
112
+ </port>
113
+ <port id="1" precision="I32">
114
+ <dim>1</dim>
115
+ </port>
116
+ <port id="2" precision="I32">
117
+ <dim>1</dim>
118
+ </port>
119
+ </input>
120
+ <output>
121
+ <port id="3" precision="I32">
122
+ <dim>3</dim>
123
+ </port>
124
+ </output>
125
+ </layer>
126
+ <layer id="10" name="aten::flatten/Reshape" type="Reshape" version="opset1">
127
+ <data special_zero="true" />
128
+ <input>
129
+ <port id="0" precision="FP32">
130
+ <dim>-1</dim>
131
+ <dim>-1</dim>
132
+ <dim>2</dim>
133
+ <dim>80</dim>
134
+ </port>
135
+ <port id="1" precision="I32">
136
+ <dim>3</dim>
137
+ </port>
138
+ </input>
139
+ <output>
140
+ <port id="2" precision="FP32" names="22,hidden_states">
141
+ <dim>-1</dim>
142
+ <dim>-1</dim>
143
+ <dim>80</dim>
144
+ </port>
145
+ </output>
146
+ </layer>
147
+ <layer id="11" name="aten::transpose/Constant_1" type="Const" version="opset1">
148
+ <data element_type="i32" shape="3" offset="40" size="12" />
149
+ <output>
150
+ <port id="0" precision="I32">
151
+ <dim>3</dim>
152
+ </port>
153
+ </output>
154
+ </layer>
155
+ <layer id="12" name="aten::transpose/Transpose_1" type="Transpose" version="opset1">
156
+ <input>
157
+ <port id="0" precision="FP32">
158
+ <dim>-1</dim>
159
+ <dim>-1</dim>
160
+ <dim>80</dim>
161
+ </port>
162
+ <port id="1" precision="I32">
163
+ <dim>3</dim>
164
+ </port>
165
+ </input>
166
+ <output>
167
+ <port id="2" precision="FP32" names="25,input.1">
168
+ <dim>-1</dim>
169
+ <dim>80</dim>
170
+ <dim>-1</dim>
171
+ </port>
172
+ </output>
173
+ </layer>
174
+ <layer id="13" name="Multiply_57081" type="Const" version="opset1">
175
+ <data element_type="f32" shape="256, 80, 5" offset="52" size="409600" />
176
+ <output>
177
+ <port id="0" precision="FP32">
178
+ <dim>256</dim>
179
+ <dim>80</dim>
180
+ <dim>5</dim>
181
+ </port>
182
+ </output>
183
+ </layer>
184
+ <layer id="14" name="Multiply_57044" type="Convolution" version="opset1">
185
+ <data strides="1" dilations="1" pads_begin="2" pads_end="2" auto_pad="explicit" />
186
+ <input>
187
+ <port id="0" precision="FP32">
188
+ <dim>-1</dim>
189
+ <dim>80</dim>
190
+ <dim>-1</dim>
191
+ </port>
192
+ <port id="1" precision="FP32">
193
+ <dim>256</dim>
194
+ <dim>80</dim>
195
+ <dim>5</dim>
196
+ </port>
197
+ </input>
198
+ <output>
199
+ <port id="2" precision="FP32">
200
+ <dim>-1</dim>
201
+ <dim>256</dim>
202
+ <dim>-1</dim>
203
+ </port>
204
+ </output>
205
+ </layer>
206
+ <layer id="15" name="Constant_57049" type="Const" version="opset1">
207
+ <data element_type="f32" shape="1, 256, 1" offset="409652" size="1024" />
208
+ <output>
209
+ <port id="0" precision="FP32">
210
+ <dim>1</dim>
211
+ <dim>256</dim>
212
+ <dim>1</dim>
213
+ </port>
214
+ </output>
215
+ </layer>
216
+ <layer id="16" name="__module.speech_decoder_postnet.layers.0.batch_norm/aten::batch_norm/BatchNormInference" type="Add" version="opset1">
217
+ <data auto_broadcast="numpy" />
218
+ <input>
219
+ <port id="0" precision="FP32">
220
+ <dim>-1</dim>
221
+ <dim>256</dim>
222
+ <dim>-1</dim>
223
+ </port>
224
+ <port id="1" precision="FP32">
225
+ <dim>1</dim>
226
+ <dim>256</dim>
227
+ <dim>1</dim>
228
+ </port>
229
+ </input>
230
+ <output>
231
+ <port id="2" precision="FP32" names="60,input.5">
232
+ <dim>-1</dim>
233
+ <dim>256</dim>
234
+ <dim>-1</dim>
235
+ </port>
236
+ </output>
237
+ </layer>
238
+ <layer id="17" name="__module.speech_decoder_postnet.layers.0.activation/aten::tanh/Tanh" type="Tanh" version="opset1">
239
+ <input>
240
+ <port id="0" precision="FP32">
241
+ <dim>-1</dim>
242
+ <dim>256</dim>
243
+ <dim>-1</dim>
244
+ </port>
245
+ </input>
246
+ <output>
247
+ <port id="1" precision="FP32" names="61,input.7">
248
+ <dim>-1</dim>
249
+ <dim>256</dim>
250
+ <dim>-1</dim>
251
+ </port>
252
+ </output>
253
+ </layer>
254
+ <layer id="18" name="Multiply_57085" type="Const" version="opset1">
255
+ <data element_type="f32" shape="256, 256, 5" offset="410676" size="1310720" />
256
+ <output>
257
+ <port id="0" precision="FP32">
258
+ <dim>256</dim>
259
+ <dim>256</dim>
260
+ <dim>5</dim>
261
+ </port>
262
+ </output>
263
+ </layer>
264
+ <layer id="19" name="Multiply_57051" type="Convolution" version="opset1">
265
+ <data strides="1" dilations="1" pads_begin="2" pads_end="2" auto_pad="explicit" />
266
+ <input>
267
+ <port id="0" precision="FP32">
268
+ <dim>-1</dim>
269
+ <dim>256</dim>
270
+ <dim>-1</dim>
271
+ </port>
272
+ <port id="1" precision="FP32">
273
+ <dim>256</dim>
274
+ <dim>256</dim>
275
+ <dim>5</dim>
276
+ </port>
277
+ </input>
278
+ <output>
279
+ <port id="2" precision="FP32">
280
+ <dim>-1</dim>
281
+ <dim>256</dim>
282
+ <dim>-1</dim>
283
+ </port>
284
+ </output>
285
+ </layer>
286
+ <layer id="20" name="Constant_57056" type="Const" version="opset1">
287
+ <data element_type="f32" shape="1, 256, 1" offset="1721396" size="1024" />
288
+ <output>
289
+ <port id="0" precision="FP32">
290
+ <dim>1</dim>
291
+ <dim>256</dim>
292
+ <dim>1</dim>
293
+ </port>
294
+ </output>
295
+ </layer>
296
+ <layer id="21" name="__module.speech_decoder_postnet.layers.1.batch_norm/aten::batch_norm/BatchNormInference" type="Add" version="opset1">
297
+ <data auto_broadcast="numpy" />
298
+ <input>
299
+ <port id="0" precision="FP32">
300
+ <dim>-1</dim>
301
+ <dim>256</dim>
302
+ <dim>-1</dim>
303
+ </port>
304
+ <port id="1" precision="FP32">
305
+ <dim>1</dim>
306
+ <dim>256</dim>
307
+ <dim>1</dim>
308
+ </port>
309
+ </input>
310
+ <output>
311
+ <port id="2" precision="FP32" names="86,input.13">
312
+ <dim>-1</dim>
313
+ <dim>256</dim>
314
+ <dim>-1</dim>
315
+ </port>
316
+ </output>
317
+ </layer>
318
+ <layer id="22" name="__module.speech_decoder_postnet.layers.1.activation/aten::tanh/Tanh" type="Tanh" version="opset1">
319
+ <input>
320
+ <port id="0" precision="FP32">
321
+ <dim>-1</dim>
322
+ <dim>256</dim>
323
+ <dim>-1</dim>
324
+ </port>
325
+ </input>
326
+ <output>
327
+ <port id="1" precision="FP32" names="87,input.15">
328
+ <dim>-1</dim>
329
+ <dim>256</dim>
330
+ <dim>-1</dim>
331
+ </port>
332
+ </output>
333
+ </layer>
334
+ <layer id="23" name="Multiply_57089" type="Const" version="opset1">
335
+ <data element_type="f32" shape="256, 256, 5" offset="1722420" size="1310720" />
336
+ <output>
337
+ <port id="0" precision="FP32">
338
+ <dim>256</dim>
339
+ <dim>256</dim>
340
+ <dim>5</dim>
341
+ </port>
342
+ </output>
343
+ </layer>
344
+ <layer id="24" name="Multiply_57058" type="Convolution" version="opset1">
345
+ <data strides="1" dilations="1" pads_begin="2" pads_end="2" auto_pad="explicit" />
346
+ <input>
347
+ <port id="0" precision="FP32">
348
+ <dim>-1</dim>
349
+ <dim>256</dim>
350
+ <dim>-1</dim>
351
+ </port>
352
+ <port id="1" precision="FP32">
353
+ <dim>256</dim>
354
+ <dim>256</dim>
355
+ <dim>5</dim>
356
+ </port>
357
+ </input>
358
+ <output>
359
+ <port id="2" precision="FP32">
360
+ <dim>-1</dim>
361
+ <dim>256</dim>
362
+ <dim>-1</dim>
363
+ </port>
364
+ </output>
365
+ </layer>
366
+ <layer id="25" name="Constant_57063" type="Const" version="opset1">
367
+ <data element_type="f32" shape="1, 256, 1" offset="3033140" size="1024" />
368
+ <output>
369
+ <port id="0" precision="FP32">
370
+ <dim>1</dim>
371
+ <dim>256</dim>
372
+ <dim>1</dim>
373
+ </port>
374
+ </output>
375
+ </layer>
376
+ <layer id="26" name="__module.speech_decoder_postnet.layers.2.batch_norm/aten::batch_norm/BatchNormInference" type="Add" version="opset1">
377
+ <data auto_broadcast="numpy" />
378
+ <input>
379
+ <port id="0" precision="FP32">
380
+ <dim>-1</dim>
381
+ <dim>256</dim>
382
+ <dim>-1</dim>
383
+ </port>
384
+ <port id="1" precision="FP32">
385
+ <dim>1</dim>
386
+ <dim>256</dim>
387
+ <dim>1</dim>
388
+ </port>
389
+ </input>
390
+ <output>
391
+ <port id="2" precision="FP32" names="112,input.21">
392
+ <dim>-1</dim>
393
+ <dim>256</dim>
394
+ <dim>-1</dim>
395
+ </port>
396
+ </output>
397
+ </layer>
398
+ <layer id="27" name="__module.speech_decoder_postnet.layers.2.activation/aten::tanh/Tanh" type="Tanh" version="opset1">
399
+ <input>
400
+ <port id="0" precision="FP32">
401
+ <dim>-1</dim>
402
+ <dim>256</dim>
403
+ <dim>-1</dim>
404
+ </port>
405
+ </input>
406
+ <output>
407
+ <port id="1" precision="FP32" names="113,input.23">
408
+ <dim>-1</dim>
409
+ <dim>256</dim>
410
+ <dim>-1</dim>
411
+ </port>
412
+ </output>
413
+ </layer>
414
+ <layer id="28" name="Multiply_57093" type="Const" version="opset1">
415
+ <data element_type="f32" shape="256, 256, 5" offset="3034164" size="1310720" />
416
+ <output>
417
+ <port id="0" precision="FP32">
418
+ <dim>256</dim>
419
+ <dim>256</dim>
420
+ <dim>5</dim>
421
+ </port>
422
+ </output>
423
+ </layer>
424
+ <layer id="29" name="Multiply_57065" type="Convolution" version="opset1">
425
+ <data strides="1" dilations="1" pads_begin="2" pads_end="2" auto_pad="explicit" />
426
+ <input>
427
+ <port id="0" precision="FP32">
428
+ <dim>-1</dim>
429
+ <dim>256</dim>
430
+ <dim>-1</dim>
431
+ </port>
432
+ <port id="1" precision="FP32">
433
+ <dim>256</dim>
434
+ <dim>256</dim>
435
+ <dim>5</dim>
436
+ </port>
437
+ </input>
438
+ <output>
439
+ <port id="2" precision="FP32">
440
+ <dim>-1</dim>
441
+ <dim>256</dim>
442
+ <dim>-1</dim>
443
+ </port>
444
+ </output>
445
+ </layer>
446
+ <layer id="30" name="Constant_57070" type="Const" version="opset1">
447
+ <data element_type="f32" shape="1, 256, 1" offset="4344884" size="1024" />
448
+ <output>
449
+ <port id="0" precision="FP32">
450
+ <dim>1</dim>
451
+ <dim>256</dim>
452
+ <dim>1</dim>
453
+ </port>
454
+ </output>
455
+ </layer>
456
+ <layer id="31" name="__module.speech_decoder_postnet.layers.3.batch_norm/aten::batch_norm/BatchNormInference" type="Add" version="opset1">
457
+ <data auto_broadcast="numpy" />
458
+ <input>
459
+ <port id="0" precision="FP32">
460
+ <dim>-1</dim>
461
+ <dim>256</dim>
462
+ <dim>-1</dim>
463
+ </port>
464
+ <port id="1" precision="FP32">
465
+ <dim>1</dim>
466
+ <dim>256</dim>
467
+ <dim>1</dim>
468
+ </port>
469
+ </input>
470
+ <output>
471
+ <port id="2" precision="FP32" names="138,input.29">
472
+ <dim>-1</dim>
473
+ <dim>256</dim>
474
+ <dim>-1</dim>
475
+ </port>
476
+ </output>
477
+ </layer>
478
+ <layer id="32" name="__module.speech_decoder_postnet.layers.3.activation/aten::tanh/Tanh" type="Tanh" version="opset1">
479
+ <input>
480
+ <port id="0" precision="FP32">
481
+ <dim>-1</dim>
482
+ <dim>256</dim>
483
+ <dim>-1</dim>
484
+ </port>
485
+ </input>
486
+ <output>
487
+ <port id="1" precision="FP32" names="139,input.31">
488
+ <dim>-1</dim>
489
+ <dim>256</dim>
490
+ <dim>-1</dim>
491
+ </port>
492
+ </output>
493
+ </layer>
494
+ <layer id="33" name="Multiply_57097" type="Const" version="opset1">
495
+ <data element_type="f32" shape="80, 256, 5" offset="4345908" size="409600" />
496
+ <output>
497
+ <port id="0" precision="FP32">
498
+ <dim>80</dim>
499
+ <dim>256</dim>
500
+ <dim>5</dim>
501
+ </port>
502
+ </output>
503
+ </layer>
504
+ <layer id="34" name="Multiply_57072" type="Convolution" version="opset1">
505
+ <data strides="1" dilations="1" pads_begin="2" pads_end="2" auto_pad="explicit" />
506
+ <input>
507
+ <port id="0" precision="FP32">
508
+ <dim>-1</dim>
509
+ <dim>256</dim>
510
+ <dim>-1</dim>
511
+ </port>
512
+ <port id="1" precision="FP32">
513
+ <dim>80</dim>
514
+ <dim>256</dim>
515
+ <dim>5</dim>
516
+ </port>
517
+ </input>
518
+ <output>
519
+ <port id="2" precision="FP32">
520
+ <dim>-1</dim>
521
+ <dim>80</dim>
522
+ <dim>-1</dim>
523
+ </port>
524
+ </output>
525
+ </layer>
526
+ <layer id="35" name="Constant_57077" type="Const" version="opset1">
527
+ <data element_type="f32" shape="1, 80, 1" offset="4755508" size="320" />
528
+ <output>
529
+ <port id="0" precision="FP32">
530
+ <dim>1</dim>
531
+ <dim>80</dim>
532
+ <dim>1</dim>
533
+ </port>
534
+ </output>
535
+ </layer>
536
+ <layer id="36" name="__module.speech_decoder_postnet.layers.4.batch_norm/aten::batch_norm/BatchNormInference" type="Add" version="opset1">
537
+ <data auto_broadcast="numpy" />
538
+ <input>
539
+ <port id="0" precision="FP32">
540
+ <dim>-1</dim>
541
+ <dim>80</dim>
542
+ <dim>-1</dim>
543
+ </port>
544
+ <port id="1" precision="FP32">
545
+ <dim>1</dim>
546
+ <dim>80</dim>
547
+ <dim>1</dim>
548
+ </port>
549
+ </input>
550
+ <output>
551
+ <port id="2" precision="FP32" names="163,input">
552
+ <dim>-1</dim>
553
+ <dim>80</dim>
554
+ <dim>-1</dim>
555
+ </port>
556
+ </output>
557
+ </layer>
558
+ <layer id="37" name="aten::transpose/Constant_2" type="Const" version="opset1">
559
+ <data element_type="i32" shape="3" offset="40" size="12" />
560
+ <output>
561
+ <port id="0" precision="I32">
562
+ <dim>3</dim>
563
+ </port>
564
+ </output>
565
+ </layer>
566
+ <layer id="38" name="aten::transpose/Transpose_2" type="Transpose" version="opset1">
567
+ <input>
568
+ <port id="0" precision="FP32">
569
+ <dim>-1</dim>
570
+ <dim>80</dim>
571
+ <dim>-1</dim>
572
+ </port>
573
+ <port id="1" precision="I32">
574
+ <dim>3</dim>
575
+ </port>
576
+ </input>
577
+ <output>
578
+ <port id="2" precision="FP32" names="33">
579
+ <dim>-1</dim>
580
+ <dim>-1</dim>
581
+ <dim>80</dim>
582
+ </port>
583
+ </output>
584
+ </layer>
585
+ <layer id="39" name="aten::add/Add" type="Add" version="opset1">
586
+ <data auto_broadcast="numpy" />
587
+ <input>
588
+ <port id="0" precision="FP32">
589
+ <dim>-1</dim>
590
+ <dim>-1</dim>
591
+ <dim>80</dim>
592
+ </port>
593
+ <port id="1" precision="FP32">
594
+ <dim>-1</dim>
595
+ <dim>-1</dim>
596
+ <dim>80</dim>
597
+ </port>
598
+ </input>
599
+ <output>
600
+ <port id="2" precision="FP32" names="postnet_spectrogram">
601
+ <dim>-1</dim>
602
+ <dim>-1</dim>
603
+ <dim>80</dim>
604
+ </port>
605
+ </output>
606
+ </layer>
607
+ <layer id="40" name="Result_54709" type="Result" version="opset1" output_names="postnet_spectrogram">
608
+ <input>
609
+ <port id="0" precision="FP32">
610
+ <dim>-1</dim>
611
+ <dim>-1</dim>
612
+ <dim>80</dim>
613
+ </port>
614
+ </input>
615
+ </layer>
616
+ </layers>
617
+ <edges>
618
+ <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
619
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="1" />
620
+ <edge from-layer="2" from-port="2" to-layer="5" to-port="0" />
621
+ <edge from-layer="2" from-port="2" to-layer="10" to-port="0" />
622
+ <edge from-layer="3" from-port="0" to-layer="9" to-port="0" />
623
+ <edge from-layer="4" from-port="0" to-layer="9" to-port="1" />
624
+ <edge from-layer="5" from-port="1" to-layer="8" to-port="0" />
625
+ <edge from-layer="6" from-port="0" to-layer="8" to-port="1" />
626
+ <edge from-layer="7" from-port="0" to-layer="8" to-port="2" />
627
+ <edge from-layer="8" from-port="3" to-layer="9" to-port="2" />
628
+ <edge from-layer="9" from-port="3" to-layer="10" to-port="1" />
629
+ <edge from-layer="10" from-port="2" to-layer="12" to-port="0" />
630
+ <edge from-layer="10" from-port="2" to-layer="39" to-port="0" />
631
+ <edge from-layer="11" from-port="0" to-layer="12" to-port="1" />
632
+ <edge from-layer="12" from-port="2" to-layer="14" to-port="0" />
633
+ <edge from-layer="13" from-port="0" to-layer="14" to-port="1" />
634
+ <edge from-layer="14" from-port="2" to-layer="16" to-port="0" />
635
+ <edge from-layer="15" from-port="0" to-layer="16" to-port="1" />
636
+ <edge from-layer="16" from-port="2" to-layer="17" to-port="0" />
637
+ <edge from-layer="17" from-port="1" to-layer="19" to-port="0" />
638
+ <edge from-layer="18" from-port="0" to-layer="19" to-port="1" />
639
+ <edge from-layer="19" from-port="2" to-layer="21" to-port="0" />
640
+ <edge from-layer="20" from-port="0" to-layer="21" to-port="1" />
641
+ <edge from-layer="21" from-port="2" to-layer="22" to-port="0" />
642
+ <edge from-layer="22" from-port="1" to-layer="24" to-port="0" />
643
+ <edge from-layer="23" from-port="0" to-layer="24" to-port="1" />
644
+ <edge from-layer="24" from-port="2" to-layer="26" to-port="0" />
645
+ <edge from-layer="25" from-port="0" to-layer="26" to-port="1" />
646
+ <edge from-layer="26" from-port="2" to-layer="27" to-port="0" />
647
+ <edge from-layer="27" from-port="1" to-layer="29" to-port="0" />
648
+ <edge from-layer="28" from-port="0" to-layer="29" to-port="1" />
649
+ <edge from-layer="29" from-port="2" to-layer="31" to-port="0" />
650
+ <edge from-layer="30" from-port="0" to-layer="31" to-port="1" />
651
+ <edge from-layer="31" from-port="2" to-layer="32" to-port="0" />
652
+ <edge from-layer="32" from-port="1" to-layer="34" to-port="0" />
653
+ <edge from-layer="33" from-port="0" to-layer="34" to-port="1" />
654
+ <edge from-layer="34" from-port="2" to-layer="36" to-port="0" />
655
+ <edge from-layer="35" from-port="0" to-layer="36" to-port="1" />
656
+ <edge from-layer="36" from-port="2" to-layer="38" to-port="0" />
657
+ <edge from-layer="37" from-port="0" to-layer="38" to-port="1" />
658
+ <edge from-layer="38" from-port="2" to-layer="39" to-port="1" />
659
+ <edge from-layer="39" from-port="2" to-layer="40" to-port="0" />
660
+ </edges>
661
+ <rt_info>
662
+ <Runtime_version value="2025.2.0-19140-c01cd93e24d-releases/2025/2" />
663
+ <conversion_parameters>
664
+ <framework value="pytorch" />
665
+ <is_python_object value="True" />
666
+ </conversion_parameters>
667
+ <optimum>
668
+ <optimum_intel_version value="1.24.0" />
669
+ <optimum_version value="1.26.1" />
670
+ <pytorch_version value="2.7.1" />
671
+ <transformers_version value="4.52.4" />
672
+ </optimum>
673
+ </rt_info>
674
+ </net>
openvino_tokenizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b50bf1d7b766607fa8b7ba49d1ff0712f1c11eb11308ca124a9d10dd7c6fe985
3
+ size 238556
openvino_tokenizer.xml ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="tokenizer" version="11">
3
+ <layers>
4
+ <layer id="0" name="string_input" type="Parameter" version="opset1">
5
+ <data shape="?" element_type="string" />
6
+ <output>
7
+ <port id="0" precision="STRING" names="string_input">
8
+ <dim>-1</dim>
9
+ </port>
10
+ </output>
11
+ </layer>
12
+ <layer id="1" name="Constant_67183" type="Const" version="opset1">
13
+ <data element_type="i32" shape="" offset="0" size="4" />
14
+ <output>
15
+ <port id="0" precision="I32" />
16
+ </output>
17
+ </layer>
18
+ <layer id="2" name="Constant_67179" type="Const" version="opset1">
19
+ <data element_type="u8" shape="238516" offset="4" size="238516" />
20
+ <output>
21
+ <port id="0" precision="U8">
22
+ <dim>238516</dim>
23
+ </port>
24
+ </output>
25
+ </layer>
26
+ <layer id="3" name="SentencepieceTokenizer_67182" type="SentencepieceTokenizer" version="extension">
27
+ <data nbest_size="1" alpha="1" add_bos="false" add_eos="true" reverse="false" />
28
+ <input>
29
+ <port id="0" precision="U8">
30
+ <dim>238516</dim>
31
+ </port>
32
+ <port id="1" precision="STRING">
33
+ <dim>-1</dim>
34
+ </port>
35
+ </input>
36
+ <output>
37
+ <port id="2" precision="I64">
38
+ <dim>-1</dim>
39
+ <dim>2</dim>
40
+ </port>
41
+ <port id="3" precision="I32">
42
+ <dim>-1</dim>
43
+ </port>
44
+ <port id="4" precision="I64">
45
+ <dim>2</dim>
46
+ </port>
47
+ </output>
48
+ </layer>
49
+ <layer id="4" name="Broadcast_67184" type="Broadcast" version="opset3">
50
+ <data mode="numpy" />
51
+ <input>
52
+ <port id="0" precision="I32" />
53
+ <port id="1" precision="I64">
54
+ <dim>2</dim>
55
+ </port>
56
+ </input>
57
+ <output>
58
+ <port id="2" precision="I32">
59
+ <dim>-1</dim>
60
+ <dim>-1</dim>
61
+ </port>
62
+ </output>
63
+ </layer>
64
+ <layer id="5" name="Constant_67185" type="Const" version="opset1">
65
+ <data element_type="i32" shape="" offset="238520" size="4" />
66
+ <output>
67
+ <port id="0" precision="I32" />
68
+ </output>
69
+ </layer>
70
+ <layer id="6" name="ShapeOf_67186" type="ShapeOf" version="opset3">
71
+ <data output_type="i64" />
72
+ <input>
73
+ <port id="0" precision="I32">
74
+ <dim>-1</dim>
75
+ </port>
76
+ </input>
77
+ <output>
78
+ <port id="1" precision="I64">
79
+ <dim>1</dim>
80
+ </port>
81
+ </output>
82
+ </layer>
83
+ <layer id="7" name="Broadcast_67187" type="Broadcast" version="opset3">
84
+ <data mode="numpy" />
85
+ <input>
86
+ <port id="0" precision="I32" />
87
+ <port id="1" precision="I64">
88
+ <dim>1</dim>
89
+ </port>
90
+ </input>
91
+ <output>
92
+ <port id="2" precision="I32">
93
+ <dim>-1</dim>
94
+ </port>
95
+ </output>
96
+ </layer>
97
+ <layer id="8" name="ScatterNDUpdate_67190" type="ScatterNDUpdate" version="opset4">
98
+ <input>
99
+ <port id="0" precision="I32">
100
+ <dim>-1</dim>
101
+ <dim>-1</dim>
102
+ </port>
103
+ <port id="1" precision="I64">
104
+ <dim>-1</dim>
105
+ <dim>2</dim>
106
+ </port>
107
+ <port id="2" precision="I32">
108
+ <dim>-1</dim>
109
+ </port>
110
+ </input>
111
+ <output>
112
+ <port id="3" precision="I32">
113
+ <dim>-1</dim>
114
+ <dim>-1</dim>
115
+ </port>
116
+ </output>
117
+ </layer>
118
+ <layer id="9" name="Constant_67199" type="Const" version="opset1">
119
+ <data element_type="i64" shape="1" offset="238524" size="8" />
120
+ <output>
121
+ <port id="0" precision="I64">
122
+ <dim>1</dim>
123
+ </port>
124
+ </output>
125
+ </layer>
126
+ <layer id="10" name="Constant_67200" type="Const" version="opset1">
127
+ <data element_type="i64" shape="1" offset="238532" size="8" />
128
+ <output>
129
+ <port id="0" precision="I64">
130
+ <dim>1</dim>
131
+ </port>
132
+ </output>
133
+ </layer>
134
+ <layer id="11" name="Constant_67201" type="Const" version="opset1">
135
+ <data element_type="i64" shape="1" offset="238540" size="8" />
136
+ <output>
137
+ <port id="0" precision="I64">
138
+ <dim>1</dim>
139
+ </port>
140
+ </output>
141
+ </layer>
142
+ <layer id="12" name="Constant_67202" type="Const" version="opset1">
143
+ <data element_type="i64" shape="1" offset="238548" size="8" />
144
+ <output>
145
+ <port id="0" precision="I64">
146
+ <dim>1</dim>
147
+ </port>
148
+ </output>
149
+ </layer>
150
+ <layer id="13" name="Slice_67203" type="Slice" version="opset8">
151
+ <input>
152
+ <port id="0" precision="I32">
153
+ <dim>-1</dim>
154
+ <dim>-1</dim>
155
+ </port>
156
+ <port id="1" precision="I64">
157
+ <dim>1</dim>
158
+ </port>
159
+ <port id="2" precision="I64">
160
+ <dim>1</dim>
161
+ </port>
162
+ <port id="3" precision="I64">
163
+ <dim>1</dim>
164
+ </port>
165
+ <port id="4" precision="I64">
166
+ <dim>1</dim>
167
+ </port>
168
+ </input>
169
+ <output>
170
+ <port id="5" precision="I32">
171
+ <dim>-1</dim>
172
+ <dim>-1</dim>
173
+ </port>
174
+ </output>
175
+ </layer>
176
+ <layer id="14" name="Slice_67203.0" type="Convert" version="opset1">
177
+ <data destination_type="i64" />
178
+ <input>
179
+ <port id="0" precision="I32">
180
+ <dim>-1</dim>
181
+ <dim>-1</dim>
182
+ </port>
183
+ </input>
184
+ <output>
185
+ <port id="1" precision="I64" names="attention_mask">
186
+ <dim>-1</dim>
187
+ <dim>-1</dim>
188
+ </port>
189
+ </output>
190
+ </layer>
191
+ <layer id="16" name="Constant_67191" type="Const" version="opset1">
192
+ <data element_type="i32" shape="" offset="238520" size="4" />
193
+ <output>
194
+ <port id="0" precision="I32" />
195
+ </output>
196
+ </layer>
197
+ <layer id="17" name="Broadcast_67192" type="Broadcast" version="opset3">
198
+ <data mode="bidirectional" />
199
+ <input>
200
+ <port id="0" precision="I32" />
201
+ <port id="1" precision="I64">
202
+ <dim>2</dim>
203
+ </port>
204
+ </input>
205
+ <output>
206
+ <port id="2" precision="I32">
207
+ <dim>-1</dim>
208
+ <dim>-1</dim>
209
+ </port>
210
+ </output>
211
+ </layer>
212
+ <layer id="18" name="ScatterNDUpdate_67193" type="ScatterNDUpdate" version="opset4">
213
+ <input>
214
+ <port id="0" precision="I32">
215
+ <dim>-1</dim>
216
+ <dim>-1</dim>
217
+ </port>
218
+ <port id="1" precision="I64">
219
+ <dim>-1</dim>
220
+ <dim>2</dim>
221
+ </port>
222
+ <port id="2" precision="I32">
223
+ <dim>-1</dim>
224
+ </port>
225
+ </input>
226
+ <output>
227
+ <port id="3" precision="I32">
228
+ <dim>-1</dim>
229
+ <dim>-1</dim>
230
+ </port>
231
+ </output>
232
+ </layer>
233
+ <layer id="19" name="Constant_67194" type="Const" version="opset1">
234
+ <data element_type="i64" shape="1" offset="238524" size="8" />
235
+ <output>
236
+ <port id="0" precision="I64">
237
+ <dim>1</dim>
238
+ </port>
239
+ </output>
240
+ </layer>
241
+ <layer id="20" name="Constant_67195" type="Const" version="opset1">
242
+ <data element_type="i64" shape="1" offset="238532" size="8" />
243
+ <output>
244
+ <port id="0" precision="I64">
245
+ <dim>1</dim>
246
+ </port>
247
+ </output>
248
+ </layer>
249
+ <layer id="21" name="Constant_67196" type="Const" version="opset1">
250
+ <data element_type="i64" shape="1" offset="238540" size="8" />
251
+ <output>
252
+ <port id="0" precision="I64">
253
+ <dim>1</dim>
254
+ </port>
255
+ </output>
256
+ </layer>
257
+ <layer id="22" name="Constant_67197" type="Const" version="opset1">
258
+ <data element_type="i64" shape="1" offset="238548" size="8" />
259
+ <output>
260
+ <port id="0" precision="I64">
261
+ <dim>1</dim>
262
+ </port>
263
+ </output>
264
+ </layer>
265
+ <layer id="23" name="Slice_67198" type="Slice" version="opset8">
266
+ <input>
267
+ <port id="0" precision="I32">
268
+ <dim>-1</dim>
269
+ <dim>-1</dim>
270
+ </port>
271
+ <port id="1" precision="I64">
272
+ <dim>1</dim>
273
+ </port>
274
+ <port id="2" precision="I64">
275
+ <dim>1</dim>
276
+ </port>
277
+ <port id="3" precision="I64">
278
+ <dim>1</dim>
279
+ </port>
280
+ <port id="4" precision="I64">
281
+ <dim>1</dim>
282
+ </port>
283
+ </input>
284
+ <output>
285
+ <port id="5" precision="I32">
286
+ <dim>-1</dim>
287
+ <dim>-1</dim>
288
+ </port>
289
+ </output>
290
+ </layer>
291
+ <layer id="24" name="Slice_67198.0" type="Convert" version="opset1">
292
+ <data destination_type="i64" />
293
+ <input>
294
+ <port id="0" precision="I32">
295
+ <dim>-1</dim>
296
+ <dim>-1</dim>
297
+ </port>
298
+ </input>
299
+ <output>
300
+ <port id="1" precision="I64" names="input_ids">
301
+ <dim>-1</dim>
302
+ <dim>-1</dim>
303
+ </port>
304
+ </output>
305
+ </layer>
306
+ <layer id="25" name="Result_67204" type="Result" version="opset1" output_names="input_ids">
307
+ <input>
308
+ <port id="0" precision="I64">
309
+ <dim>-1</dim>
310
+ <dim>-1</dim>
311
+ </port>
312
+ </input>
313
+ </layer>
314
+ <layer id="15" name="Result_67205" type="Result" version="opset1" output_names="attention_mask">
315
+ <input>
316
+ <port id="0" precision="I64">
317
+ <dim>-1</dim>
318
+ <dim>-1</dim>
319
+ </port>
320
+ </input>
321
+ </layer>
322
+ </layers>
323
+ <edges>
324
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="1" />
325
+ <edge from-layer="1" from-port="0" to-layer="4" to-port="0" />
326
+ <edge from-layer="2" from-port="0" to-layer="3" to-port="0" />
327
+ <edge from-layer="3" from-port="4" to-layer="4" to-port="1" />
328
+ <edge from-layer="3" from-port="3" to-layer="6" to-port="0" />
329
+ <edge from-layer="3" from-port="2" to-layer="8" to-port="1" />
330
+ <edge from-layer="3" from-port="4" to-layer="17" to-port="1" />
331
+ <edge from-layer="3" from-port="2" to-layer="18" to-port="1" />
332
+ <edge from-layer="3" from-port="3" to-layer="18" to-port="2" />
333
+ <edge from-layer="4" from-port="2" to-layer="8" to-port="0" />
334
+ <edge from-layer="5" from-port="0" to-layer="7" to-port="0" />
335
+ <edge from-layer="6" from-port="1" to-layer="7" to-port="1" />
336
+ <edge from-layer="7" from-port="2" to-layer="8" to-port="2" />
337
+ <edge from-layer="8" from-port="3" to-layer="13" to-port="0" />
338
+ <edge from-layer="9" from-port="0" to-layer="13" to-port="1" />
339
+ <edge from-layer="10" from-port="0" to-layer="13" to-port="2" />
340
+ <edge from-layer="11" from-port="0" to-layer="13" to-port="3" />
341
+ <edge from-layer="12" from-port="0" to-layer="13" to-port="4" />
342
+ <edge from-layer="13" from-port="5" to-layer="14" to-port="0" />
343
+ <edge from-layer="14" from-port="1" to-layer="15" to-port="0" />
344
+ <edge from-layer="16" from-port="0" to-layer="17" to-port="0" />
345
+ <edge from-layer="17" from-port="2" to-layer="18" to-port="0" />
346
+ <edge from-layer="18" from-port="3" to-layer="23" to-port="0" />
347
+ <edge from-layer="19" from-port="0" to-layer="23" to-port="1" />
348
+ <edge from-layer="20" from-port="0" to-layer="23" to-port="2" />
349
+ <edge from-layer="21" from-port="0" to-layer="23" to-port="3" />
350
+ <edge from-layer="22" from-port="0" to-layer="23" to-port="4" />
351
+ <edge from-layer="23" from-port="5" to-layer="24" to-port="0" />
352
+ <edge from-layer="24" from-port="1" to-layer="25" to-port="0" />
353
+ </edges>
354
+ <rt_info>
355
+ <add_attention_mask value="True" />
356
+ <add_prefix_space />
357
+ <add_special_tokens value="True" />
358
+ <bos_token_id value="0" />
359
+ <clean_up_tokenization_spaces value="False" />
360
+ <detokenizer_input_type value="i64" />
361
+ <eos_token_id value="2" />
362
+ <handle_special_tokens_with_re value="False" />
363
+ <max_length />
364
+ <number_of_inputs value="1" />
365
+ <openvino_tokenizers_version value="2025.2.0.1-567-7885335c24b" />
366
+ <openvino_version value="2025.2.0-19140-c01cd93e24d-releases/2025/2" />
367
+ <original_tokenizer_class value="&lt;class 'transformers.models.speecht5.tokenization_speecht5.SpeechT5Tokenizer'>" />
368
+ <pad_token_id value="1" />
369
+ <sentencepiece_version value="0.2.0" />
370
+ <skip_special_tokens value="True" />
371
+ <streaming_detokenizer value="False" />
372
+ <tokenizer_output_type value="i64" />
373
+ <tokenizers_version value="0.21.2" />
374
+ <transformers_version value="4.52.4" />
375
+ <use_max_padding value="False" />
376
+ <use_sentencepiece_backend value="False" />
377
+ <utf8_replace_mode value="replace" />
378
+ <with_detokenizer value="True" />
379
+ </rt_info>
380
+ </net>
openvino_vocoder.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47aa4c2ecc62f865967098906d99a250ac86aa443fcd84eb5dc785809059ace8
3
+ size 50625700
openvino_vocoder.xml ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }