daviddrzik committed
Commit cf9a7c2 · verified · 1 Parent(s): 35300ed

Upload 8 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+SKMT_lib_v2/dolezite_slova_MDBSNK filter=lfs diff=lfs merge=lfs -text
+SKMT_lib_v2/rootWordsOnlyInText filter=lfs diff=lfs merge=lfs -text
+SKMT_lib_v2/slova_MDBSNK filter=lfs diff=lfs merge=lfs -text
+SKMT_lib_v2/word_root_20231210_sorted filter=lfs diff=lfs merge=lfs -text
SKMT_lib_v2/SKMT_BPE.py ADDED
@@ -0,0 +1,358 @@
from collections import Counter, defaultdict
from tqdm import tqdm
from transformers import AutoTokenizer
from pathlib import Path
import json
import pickle
import os
import re
from transformers.tokenization_utils_base import BatchEncoding
import torch

class SKMorfoTokenizer:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.dictionary = None
        self.roots = None
        self.vocab_MDBSNK = None
        self.important_vocab_MDBSNK = None
        self.vocab = None
        self.merges = None
        self.reverse_vocab = None
        self.load_suplementary_files()

    def load_suplementary_files(self):
        current_dir = os.path.dirname(__file__)  # directory containing this file
        root_file = os.path.join(current_dir, 'word_root_20231210_sorted')
        vocab_file = os.path.join(current_dir, 'slova_MDBSNK')
        important_vocab_file = os.path.join(current_dir, 'dolezite_slova_MDBSNK')
        dictionary_file = os.path.join(current_dir, 'kodovanie.json')
        vocab_json_file = os.path.join(current_dir, 'tokenizers/SKMT_BPE/vocab.json')
        merges_txt_file = os.path.join(current_dir, 'tokenizers/SKMT_BPE/merges.txt')

        with open(root_file, 'rb') as f:
            self.roots = pickle.load(f)

        with open(vocab_file, 'rb') as f:
            self.vocab_MDBSNK = pickle.load(f)

        with open(important_vocab_file, 'rb') as f:
            self.important_vocab_MDBSNK = pickle.load(f)
            self.important_vocab_MDBSNK = set(self.important_vocab_MDBSNK)

        with open(dictionary_file, "r", encoding="utf-8") as f:
            self.dictionary = json.load(f)

        try:
            with open(vocab_json_file, "r", encoding="utf-8") as file:
                loaded_vocab = json.load(file)
                self.vocab = {prvok: index for prvok, index in loaded_vocab.items()}
                self.reverse_vocab = {v: k for k, v in self.vocab.items()}
        except FileNotFoundError:
            print("Vocab file does not exist.")

        try:
            with open(merges_txt_file, "r", encoding="utf-8") as file:
                loaded_merges = [tuple(line.split()) for line in file]
                self.merges = {pair: pair[0] + pair[1] for pair in loaded_merges}
        except FileNotFoundError:
            print("Merges file does not exist.")

    def decode(self, token):
        for k, v in self.dictionary.items():
            if k in token:
                token = token.replace(k, v)
        return token

    def split_word(self, text):
        """Splits a word into characters and roots, if roots exist for the given word."""
        pattern = re.compile(r'§{([^}]+)}§|([^§{}]+)')

        result = []
        for match in pattern.finditer(text):
            inside_brackets, outside_brackets = match.groups()
            if inside_brackets is not None:
                result.append((inside_brackets, 1))
            if outside_brackets is not None:
                result.append((outside_brackets, 0))

        def replace_letters(string):
            for key, value in self.dictionary.items():
                string = re.sub(re.escape(value), key, string)
            return string

        result = [(replace_letters(s), n) for s, n in result]

        new_list = []
        for text, flag in result:
            if flag == 0:
                new_list.extend(char for char in text)
            elif flag == 1:
                new_list.append(text)
        return new_list

    def valid_word(self, word):
        decoded = self.decode(word)
        if decoded.startswith("Ġ"):
            decoded = decoded[1:]
        if decoded[0].lower() in self.vocab_MDBSNK:
            if decoded in self.vocab_MDBSNK[decoded[0].lower()]:
                return True
        return False

    def all_words_spaces(self, word_freqs):
        def is_valid_word(word):
            special_chars = "jžxďqitürpľuknŕemfšřýťhzčäwáécóösyoĺěvôdlňabígú"
            pattern = f"^[a-z{special_chars}]+$"
            return re.search(pattern, word) is not None

        def decode(token):
            for k, v in self.dictionary.items():
                if k in token:
                    token = token.replace(k, v)
            return token

        unified_word_freqs = {}

        for word, freq in word_freqs.items():
            if word[0] == 'Ġ':
                if is_valid_word(decode(word[1:])):
                    if unified_word_freqs.get(word, 0) == 0:
                        pokus = word_freqs.get(word[1:], 0)
                        unified_word_freqs[word] = pokus + freq
                else:
                    unified_word_freqs[word] = freq
            else:
                if is_valid_word(decode(word)):
                    if unified_word_freqs.get("Ġ" + word, 0) == 0:
                        pokus = word_freqs.get("Ġ" + word, 0)
                        unified_word_freqs["Ġ" + word] = pokus + freq
                else:
                    unified_word_freqs[word] = freq

        return unified_word_freqs

    def all_words_spaces_tokenize(self, tokenized_text):
        def is_valid_word(word):
            special_chars = "jžxďqitürpľuknŕemfšřýťhzčäwáécóösyoĺěvôdlňabígú"
            pattern = f"^[a-z{special_chars}]+$"
            return re.search(pattern, word) is not None

        def decode(token):
            for k, v in self.dictionary.items():
                if k in token:
                    token = token.replace(k, v)
            return token

        unified_tokenized_text = []

        for word in tokenized_text:
            if word[0] == 'Ġ':
                unified_tokenized_text.append(word)
            else:
                if is_valid_word(decode(word)):
                    unified_tokenized_text.append("Ġ" + word)
                else:
                    unified_tokenized_text.append(word)

        return unified_tokenized_text

    def tokenize_half(self, text):

        pre_tokenize_result = self.tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
        pre_tokenized_text = [word for word, offset in pre_tokenize_result]
        pre_tokenized_text = self.all_words_spaces_tokenize(pre_tokenized_text)

        splits = {}

        # Split each pre-tokenized word into its root/character pieces
        for word in pre_tokenized_text:
            decoded = self.decode(word)
            try:
                if decoded.startswith("Ġ"):
                    decoded = decoded[1:]
                    rooted = self.roots[decoded]
                    splits[word] = ["Ġ"] + self.split_word(rooted)
                else:
                    rooted = self.roots[decoded]
                    splits[word] = self.split_word(rooted)
            except KeyError:
                # No root decomposition available; fall back to a character split
                splits[word] = list(word)

        for pair, merge in self.merges.items():
            for idx, split in splits.items():
                i = 0
                while i < len(split) - 1:
                    if split[i] == pair[0] and split[i + 1] == pair[1]:
                        split = split[:i] + [merge] + split[i + 2:]
                    else:
                        i += 1
                splits[idx] = split

        zoznam = []
        for slovo in pre_tokenized_text:
            if slovo in splits:
                zoznam.extend(splits[slovo])

        return zoznam

    def tokenize_additionally(self, word):
        split = list(word)

        for pair, merge in self.merges.items():
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2:]
                else:
                    i += 1
        return split


    def tokenize(self, text, max_length=None, return_tensors=None, return_subword=False):

        casti = text.lower().split("<mask>", 1)

        if len(casti) == 1:
            zoznam = self.tokenize_half(text)
        else:
            zoznam = self.tokenize_half(casti[0].strip()) + ["<mask>"] + self.tokenize_half(casti[1])

        # Adjust input_ids and attention_mask according to max_length
        if max_length is None:
            return [prvok if prvok in self.vocab else "<unk>" for prvok in zoznam]

        # If a token is not in the vocab, try to split it further; otherwise assign the <unk> id
        input_ids = []
        for prvok in zoznam:
            if prvok in self.vocab:
                input_ids.append(self.vocab[prvok])
            else:
                try:
                    prvky_add = self.tokenize_additionally(prvok)
                    for prvok_add in prvky_add:
                        if prvok_add in self.vocab:
                            input_ids.append(self.vocab[prvok_add])
                        else:
                            input_ids.append(self.vocab["<unk>"])
                except Exception:
                    input_ids.append(self.vocab["<unk>"])

        if len(input_ids) >= max_length - 2:
            input_ids = input_ids[:max_length - 2]
            attention_mask = [1] * (max_length - 2)
            input_ids = [self.vocab["<s>"]] + input_ids + [self.vocab["</s>"]]
            attention_mask = [1] + attention_mask + [1]
        else:
            padding_length = max_length - len(input_ids) - 2
            input_ids = [self.vocab["<s>"]] + input_ids + [self.vocab["</s>"]]
            attention_mask = [1] * len(input_ids)
            input_ids += [self.vocab["<pad>"]] * padding_length
            attention_mask += [0] * padding_length

        # Wrap the resulting lists as a batch of one example
        output = {"input_ids": [input_ids], "attention_mask": [attention_mask]}
        if return_tensors == "pt":
            output = {key: torch.tensor(val) for key, val in output.items()}

        if return_subword:
            tokens = [self.reverse_vocab[idx] for idx in input_ids]
            return tokens

        return BatchEncoding(output)

    def tokenizeQA(self, text1, text2, max_length=None, return_tensors=None, return_subword=False):

        zoznam1 = self.tokenize_half(text1.lower().strip())
        zoznam2 = self.tokenize_half(text2.lower().strip())

        # If a token is not in the vocab, try to split it further; otherwise assign the <unk> id
        input_ids1 = []
        for prvok in zoznam1:
            if prvok in self.vocab:
                input_ids1.append(self.vocab[prvok])
            else:
                # print(f"No token for: {prvok}")
                try:
                    prvky_add = self.tokenize_additionally(prvok)
                    for prvok_add in prvky_add:
                        if prvok_add in self.vocab:
                            input_ids1.append(self.vocab[prvok_add])
                        else:
                            input_ids1.append(self.vocab["<unk>"])
                except Exception as e:
                    print(f"Error while processing element {prvok}: {e}")
                    input_ids1.append(self.vocab["<unk>"])

        input_ids2 = []
        for prvok in zoznam2:
            if prvok in self.vocab:
                input_ids2.append(self.vocab[prvok])
            else:
                # print(f"No token for: {prvok}")
                try:
                    prvky_add = self.tokenize_additionally(prvok)
                    for prvok_add in prvky_add:
                        if prvok_add in self.vocab:
                            input_ids2.append(self.vocab[prvok_add])
                        else:
                            input_ids2.append(self.vocab["<unk>"])
                except Exception as e:
                    print(f"Error while processing element {prvok}: {e}")
                    input_ids2.append(self.vocab["<unk>"])

        total_length = len(input_ids1) + len(input_ids2)

        if total_length >= max_length - 4:
            excess_length = total_length - (max_length - 4)
            while excess_length > 0:
                if len(input_ids1) >= len(input_ids2):
                    input_ids1 = input_ids1[:-1]
                else:
                    input_ids2 = input_ids2[:-1]
                excess_length -= 1

        input_ids1 = [self.vocab["<s>"]] + input_ids1 + [self.vocab["</s>"]]
        input_ids2 = [self.vocab["</s>"]] + input_ids2 + [self.vocab["</s>"]]
        input_ids = input_ids1 + input_ids2

        if len(input_ids) >= max_length:
            input_ids = input_ids[:max_length]
            attention_mask = [1] * (max_length)
        else:
            padding_length = max_length - len(input_ids)
            attention_mask = [1] * len(input_ids)
            input_ids += [self.vocab["<pad>"]] * padding_length
            attention_mask += [0] * padding_length

        # Wrap the resulting lists as a batch of one example
        output = {"input_ids": [input_ids], "attention_mask": [attention_mask]}

        if return_tensors == "pt":
            output = {key: torch.tensor(val) for key, val in output.items()}

        if return_subword:
            tokens = [self.reverse_vocab[idx] for idx in input_ids]
            return tokens

        return BatchEncoding(output)

    def convert_ids_to_tokens(self, input_id):
        return self.decode(self.reverse_vocab[input_id])

    def convert_list_ids_to_tokens(self, input_ids):
        tokens = []
        for input_id in input_ids:
            tokens.append(self.decode(self.reverse_vocab[input_id.item() if isinstance(input_id, torch.Tensor) else input_id]))
        return tokens

    def convert_tokens_to_ids(self, token):
        return self.vocab[token]

    def convert_list_tokens_to_ids(self, tokens):
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        return ids
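
For orientation, a minimal usage sketch of the class above (not part of the committed files; the import path, example sentence, and max_length value are assumptions, and the supplementary pickle/JSON files plus tokenizers/SKMT_BPE/vocab.json and merges.txt must sit next to SKMT_BPE.py):

# Hypothetical usage example; illustrative only.
from SKMT_lib_v2.SKMT_BPE import SKMorfoTokenizer

skmt = SKMorfoTokenizer()

# Without max_length, tokenize() returns subword strings ("<unk>" for out-of-vocabulary pieces).
print(skmt.tokenize("Toto je ukážková veta."))

# With max_length, it returns a BatchEncoding padded/truncated to the requested length.
enc = skmt.tokenize("Toto je ukážková veta.", max_length=64, return_tensors="pt")
print(enc["input_ids"].shape)                              # torch.Size([1, 64])
print(skmt.convert_list_ids_to_tokens(enc["input_ids"][0]))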
SKMT_lib_v2/dolezite_slova_MDBSNK ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc119ce955ba3bf7a6b3ab450860106b7c919dd3a33d24fbdaa221a850985ffd
size 14771409
SKMT_lib_v2/kodovanie.json ADDED
@@ -0,0 +1,43 @@
{
  "\u00c3\u00a1": "\u00e1",
  "\u00c3\u00a4": "\u00e4",
  "\u00c3\u00a9": "\u00e9",
  "\u00c3\u00b3": "\u00f3",
  "\u00c3\u00b4": "\u00f4",
  "\u00c3\u00b6": "\u00f6",
  "\u00c3\u00ba": "\u00fa",
  "\u00c3\u00bc": "\u00fc",
  "\u00c3\u00bd": "\u00fd",
  "\u00c3\u0123": "\u00c1",
  "\u00c3\u0126": "\u00c4",
  "\u00c3\u012b": "\u00c9",
  "\u00c3\u012f": "\u00cd",
  "\u00c3\u0135": "\u00d3",
  "\u00c3\u0136": "\u00d4",
  "\u00c3\u0138": "\u00d6",
  "\u00c3\u013c": "\u00da",
  "\u00c3\u013e": "\u00dc",
  "\u00c3\u013f": "\u00dd",
  "\u00c3\u0143": "\u00ed",
  "\u00c4\u00b9": "\u0139",
  "\u00c4\u00ba": "\u013a",
  "\u00c4\u00bd": "\u013d",
  "\u00c4\u00be": "\u013e",
  "\u00c4\u012e": "\u010c",
  "\u00c4\u012f": "\u010d",
  "\u00c4\u0130": "\u010e",
  "\u00c4\u0131": "\u010f",
  "\u00c5\u00a1": "\u0161",
  "\u00c5\u00a4": "\u0164",
  "\u00c5\u00a5": "\u0165",
  "\u00c5\u00ae": "\u016e",
  "\u00c5\u00af": "\u016f",
  "\u00c5\u00bd": "\u017d",
  "\u00c5\u00be": "\u017e",
  "\u00c5\u0129": "\u0147",
  "\u00c5\u012a": "\u0148",
  "\u00c5\u0137": "\u0155",
  "\u00c5\u013a": "\u0158",
  "\u00c5\u013b": "\u0159",
  "\u00c5\u0142": "\u0160"
}
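
kodovanie.json appears to map GPT-2's byte-level rendering of UTF-8 byte pairs (e.g. the two characters "Ã¡") back to the original accented characters (here "á"); SKMorfoTokenizer.decode() simply applies these replacements in a loop. A small illustrative sketch, assuming the file is read from a relative SKMT_lib_v2/ path and using a made-up token:

# Illustrative only: restores diacritics in a byte-level token.
import json

with open("SKMT_lib_v2/kodovanie.json", encoding="utf-8") as f:   # assumed relative path
    kodovanie = json.load(f)

token = "Ġ\u00c3\u00a1no"          # byte-level form of "Ġáno"
for k, v in kodovanie.items():
    token = token.replace(k, v)
print(token)                       # prints "Ġáno"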
SKMT_lib_v2/rootWordsOnlyInText ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44d5ce33f0f9fdf85e6cdba9fc0e82275368a21ce0a427132f3c7a4341a88bea
size 6755728
SKMT_lib_v2/slova_MDBSNK ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f7f5099ccb47cfe00cc8024a291c49175cd6a3d4fd41d922709a8f5b7eb6a15
size 17177101
SKMT_lib_v2/tokenizers/SKMT_BPE/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
SKMT_lib_v2/tokenizers/SKMT_BPE/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
SKMT_lib_v2/word_root_20231210_sorted ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1e9aaf300e9fbdb2692b5d68a90a6911e4c9ae2c24f1b6da1369a974b95a5ee1
size 37023337