AlzbetaStrompova commited on
Commit
7e6964a
·
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .ruff_cache
2
+ .idea
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/#use-with-ide
112
+ .pdm.toml
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: NerRoB Czech
3
+ emoji: 🌖
4
+ colorFrom: purple
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from website_script import load, run
3
+
4
print("Loading model")
tokenizer, model, gazetteers_for_matching = load()
print("Loaded model")

# Sample inputs offered in the demo UI.
examples = ["Masarykova univerzita"]


def ner(text):
    """Run the loaded NER pipeline on ``text`` and return the result of ``run``."""
    return run(tokenizer, model, gazetteers_for_matching, text)


# TODO: switch the output component to gr.HighlightedText(), see
# https://www.gradio.app/guides/named-entity-recognition
demo = gr.Interface(
    ner,
    gr.Textbox(placeholder="Enter sentence here..."),
    "textbox",
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()
data_manipulation/create_gazetteers.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import itertools
4
+ import pandas as pd
5
+
6
+ from names_dataset import NameDataset
7
+
8
+
9
def load_gazetteers(path):
    """
    Read a pickled gazetteer object back from disk.

    NOTE: unpickling executes arbitrary code, so only load files you created yourself.

    :param path: path to the pickled gazetteer file
    :return: the unpickled object (a dict of gazetteers when written by save_gazetteers)
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)
18
+
19
def save_gazetteers(gazetteers, path):
    """
    Pickle the gazetteers to disk, overwriting any existing file.

    :param gazetteers: a dict of gazetteers
    :param path: destination file path
    """
    with open(path, 'wb') as handle:
        pickle.dump(gazetteers, handle)
27
+
28
def load_gazetteers_from_paper(path="/home/xstromp/dp/data/gazetteers_data/paper/Locations.Cities.Europe"):
    """
    Load a plain-text gazetteer list (one entry per line).

    The file is decoded as UTF-8 explicitly so that the result does not depend on the
    platform's default encoding, and blank lines are skipped so that the empty string
    never ends up as a gazetteer entry.

    :param path: path to the gazetteer file
    :return: a set of stripped, non-empty entries
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return {entry for entry in stripped if entry}
38
+
39
def merge_gazetteers(*gazetteers):
    """
    Union several gazetteer dicts key by key.

    :param gazetteers: any number of dicts mapping a key to a set of entries
    :return: a new dict whose sets are the unions of the inputs' sets; the input
             sets are left untouched
    """
    merged = {}
    for source in gazetteers:
        for label, entries in source.items():
            if label not in merged:
                # First occurrence: copy so later unions never mutate the caller's set.
                merged[label] = entries.copy()
                continue
            merged[label] |= entries
    return merged
53
+
54
+
55
+ ####################################################################################################
56
+ ### GENERATED LISTS ################################################################################
57
+ ####################################################################################################
58
+
59
# Czech demonyms, three per nation: masculine singular, feminine singular, plural.
# Fixed data typos: "Argentinc" -> "Argentinec" and the adjective "Syrští" -> "Syřané".
nationalities = [
    "Čech", "Češka", "Češi",
    "Slovák", "Slovenka", "Slováci",
    "Němec", "Němka", "Němci",
    "Polák", "Polka", "Poláci",
    "Maďar", "Maďarka", "Maďaři",
    "Rakušan", "Rakušanka", "Rakušané",
    "Ukrajinec", "Ukrajinka", "Ukrajinci",
    "Rus", "Ruska", "Rusové",
    "Angličan", "Angličanka", "Angličané",
    "Američan", "Američanka", "Američané",
    "Francouz", "Francouzka", "Francouzi",
    "Ital", "Italka", "Italové",
    "Španěl", "Španělka", "Španělé",
    "Portugalec", "Portugalka", "Portugalci",
    "Řek", "Řekyně", "Řekové",
    "Bulhar", "Bulharka", "Bulhaři",
    "Rumun", "Rumunka", "Rumuni",
    "Belgičan", "Belgičanka", "Belgičané",
    "Holanďan", "Holanďanka", "Holandci",
    "Švýcar", "Švýcarka", "Švýcaři",
    "Slovinec", "Slovinka", "Slovinci",
    "Chorvat", "Chorvatka", "Chorvaté",
    "Srb", "Srbka", "Srbové",
    "Bosňák", "Bosňačka", "Bosňáci",
    "Černohorec", "Černohorka", "Černohorci",
    "Makedonec", "Makedonka", "Makedonci",
    "Albánec", "Albánka", "Albánci",
    "Turek", "Turkyně", "Turci",
    "Kanaďan", "Kanaďanka", "Kanaďané",
    "Mexičan", "Mexičanka", "Mexičané",
    "Brazilec", "Brazilka", "Brazilci",
    "Argentinec", "Argentinka", "Argentinci",
    "Chilan", "Chilanka", "Chilané",
    "Australan", "Australanka", "Australané",
    "Novozélanďan", "Novozélanďanka", "Novozélanďané",
    "Číňan", "Číňanka", "Číňané",
    "Japonec", "Japonka", "Japonci",
    "Korejec", "Korejka", "Korejci",
    "Vietnamec", "Vietnamka", "Vietnamci",
    "Ind", "Indka", "Indové",
    "Pákistánec", "Pákistánka", "Pákistánci",
    "Iráčan", "Iráčanka", "Iráčané",
    "Íránec", "Íránka", "Íránci",
    "Syřan", "Syřanka", "Syřané",
    "Izraelan", "Izraelanka", "Izraelci",
    "Egyptan", "Egyptanka", "Egyptané",
    "Súdánec", "Súdánka", "Súdánci",
    "Maročan", "Maročanka", "Maročané",
    "Alžířan", "Alžírka", "Alžířané",
    "Libanonec", "Libanonka", "Libanonci",
    "Jordánec", "Jordánka", "Jordánci",
    "Kuvajťan", "Kuvajťanka", "Kuvajťané",
]
113
+
114
# Czech academic and professional titles. Lookups elsewhere are exact and
# case-sensitive, so the canonical spellings "JUDr.", "MUDr." and "CSc." are
# appended; every previously listed spelling is kept for backward compatibility.
titles = "Bc., BcA., Ing., Ing. arch., MgA., Mgr., MBA, Ph.D., JuDr., PhDr., Th.D., MuDr., RNDr., MVDr., PharmDr., DrSc., MVDR., MDDr., CSc, DRSc., doc., RNDr., prof., PhMr., Akad. Mal., Bc. et Bc., Mgr. et Mgr.".split(", ") + ["JUDr.", "MUDr.", "CSc."]
115
+
116
# Names of deities and religious or mythological figures.
# NOTE(review): not referenced by any code visible in this chunk — confirm where it is consumed.
relig_myth = ["Bůh", "Ježíš Kristus", "Mojžíš", "Muhammad", "Buddha", "Krishna", "Thor", "Zeus",
              "Odin", "Héraklés", "Anubis", "Osiris", "Izida", "Shiva", "Vishnu", "Ganesha",
              "Athena", "Apolón", "Héra", "Artemis", "Dionýsos", "Quetzalcoatl", "Tezcatlipoca",
              "Amaterasu", "Izanagi", "Izanami", "Freya", "Loki", "Baldur", "Saraswati", "Lakshmi",
              "Hanuman", "Rama", "Sita", "Parvati", "Durga", "Kali", "Tara", "Vajrapani",
              "Maitreya", "Avalokiteśvara"]
122
+
123
+ ####################################################################################################
124
+ ### WIKIANN GAZETTEERS #############################################################################
125
+ ####################################################################################################
126
def determine_category(line):
    """
    Return the first of "PER", "LOC", "ORG" (checked in that order) that occurs
    anywhere in ``line`` as a substring, or "" when none does.
    """
    return next((label for label in ("PER", "LOC", "ORG") if label in line), "")
132
+
133
def load_document(file_name):
    """
    Collect entity strings from a tab-separated ``lang:token<TAB>tag`` file.

    Consecutive ``B-X`` / ``I-X`` lines are joined with single spaces into one entity
    and stored under category ``X``.

    Fixes over the previous version:
    * the category is taken from the tag column only, so a token that merely contains
      "PER"/"LOC"/"ORG" (e.g. "SUPERMAN") can no longer decide the category;
    * only the language prefix before the first ":" is removed, so tokens that contain
      a colon themselves (e.g. "10:30") are kept whole;
    * the file is decoded as UTF-8 explicitly and lines without a tag column are
      skipped instead of raising IndexError.

    :param file_name: path to the annotated file
    :return: dict with keys "LOC", "PER", "ORG" mapping to sets of entity strings
    """
    categories = {"LOC": set(), "PER": set(), "ORG": set()}
    current_text, current_category = "", ""

    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue  # blank or malformed line
            tag = parts[1]
            category = next((name for name in categories if name in tag), "")
            if not category:
                continue  # e.g. "O" lines carry no entity information

            # Drop the "lang:" prefix; keep the token untouched if there is no prefix.
            _, separator, word = parts[0].partition(":")
            if not separator:
                word = parts[0]

            if tag.startswith("B-"):
                if current_category:
                    categories[current_category].add(current_text.strip())
                current_category = category
                current_text = word
            elif tag.startswith("I-") and current_category == category:
                current_text += " " + word
            else:
                # Continuation that does not fit the open entity: close it.
                if current_category:
                    categories[current_category].add(current_text.strip())
                current_category, current_text = "", ""

    # Flush the entity that was still open at end of file.
    if current_category:
        categories[current_category].add(current_text.strip())

    return categories
164
+
165
def load_gazetteers_from_wikiann(path="/home/xstromp/dp/data/wikiann/cs"):
    """
    Build LOC/PER/ORG gazetteers from the 'train', 'extra' and 'dev' files found
    directly under ``path``.

    :param path: directory containing the three annotated files
    :return: dict mapping "LOC", "PER", "ORG" to the union of entities over all files
    """
    merged = {label: set() for label in ("LOC", "PER", "ORG")}
    for split_name in ('train', 'extra', 'dev'):
        per_split = load_document(os.path.join(path, split_name))
        for label, found in per_split.items():
            merged[label].update(found)
    return merged
172
+
173
+ ####################################################################################################
174
+ ### GENERATION OF GAZETTEERS TO EXPAND TRAIN DATASET ###############################################
175
+ ####################################################################################################
176
+
177
def get_complex_person():
    """Placeholder: not implemented yet, always returns None."""
    return None
179
+
180
+ ####################################################################################################
181
+ ### GENERATION OF GAZETTEERS TO FIND MATCH FOR EXTENDED EMBEDDINGS #################################
182
+ ####################################################################################################
183
+
184
def get_persons():
    """
    Assemble the person gazetteer used for matching.

    Combines the top 10000 Czech first names (male and female) and surnames reported
    by NameDataset with the module-level ``titles`` and ``nationalities`` lists.

    :return: a set of strings
    """
    name_source = NameDataset()

    given = name_source.get_top_names(n=10000, country_alpha2='CZ')
    persons = {*given["CZ"]["M"], *given["CZ"]["F"]}

    family = name_source.get_top_names(n=10000, use_first_names=False, country_alpha2='CZ')
    persons.update(family["CZ"])

    persons.update(titles)
    persons.update(nationalities)
    return persons
199
+
200
def get_locations(countries_csv="/home/xstromp/dp/data/gazetteers_data/LOC/world-data-2023.csv",
                  places_pickle="/home/xstromp/dp/data/gazetteers_data/LOC/data.json"):
    """
    Assemble the location gazetteer used for matching.

    Sources: the 'Country' column of ``countries_csv``, a fixed list of Czech continent
    names, every value list stored in ``places_pickle`` and the default list returned
    by ``load_gazetteers_from_paper()``.

    The two input paths used to be hard-coded; they are now parameters whose defaults
    are the original locations, so existing calls behave as before.

    :param countries_csv: CSV file with a 'Country' column
    :param places_pickle: pickled dict whose values are iterables of place names
    :return: a set of location strings
    """
    df = pd.read_csv(countries_csv)
    # dropna(): an empty CSV cell would otherwise add a float NaN to a set of names.
    loc = set(df['Country'].dropna().tolist())
    loc.update(["Asie", "Afrika", "Severní Amerika", "Jižní Amerika", "Antarktida", "Evropa", "Austrálie"])
    # NOTE: despite the ".json" suffix of the default file it is read with pickle.
    # Unpickling executes arbitrary code - only use files from a trusted source.
    with open(places_pickle, 'rb') as handle:
        loaded_dict = pickle.load(handle)
    loc.update(itertools.chain.from_iterable(loaded_dict.values()))
    loc.update(load_gazetteers_from_paper())
    return loc
209
+
210
+
211
def get_organizations(inc5000_csv="/home/xstromp/dp/data/gazetteers_data/ORG/Inc5000Eu-full.csv",
                      brno_companies_csv="/home/xstromp/dp/data/gazetteers_data/ORG/FirmyBrno.csv",
                      paper_path="/home/xstromp/dp/data/gazetteers_data/paper/Organizations"):
    """
    Assemble the organization gazetteer used for matching.

    Sources: the 'Company' column of ``inc5000_csv``, the 'name' column of
    ``brno_companies_csv`` and the plain-text list at ``paper_path``.

    The three input paths used to be hard-coded; they are now parameters whose defaults
    are the original locations, so existing calls behave as before.

    :return: a set of organization names
    """
    # dropna(): an empty CSV cell would otherwise add a float NaN to a set of names.
    org = set(pd.read_csv(inc5000_csv)['Company'].dropna().tolist())
    org.update(pd.read_csv(brno_companies_csv)['name'].dropna().tolist())
    org.update(load_gazetteers_from_paper(paper_path))
    return org
data_manipulation/dataset_funcions.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ from tqdm import tqdm
5
+
6
+ from datasets import Dataset, DatasetDict
7
+
8
def load_gazetteers(path):
    """
    Load gazetteers from a JSON file.

    The file is decoded as UTF-8 explicitly (the JSON default) so that non-ASCII
    entries load identically on every platform.

    :param path: path to a JSON file holding an object that maps each key to a list
    :return: a dict with the same keys, each list converted to a set
    """
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    return {key: set(values) for key, values in raw.items()}
19
+
20
def create_dataset(label_mapper:dict, args):
    """
    Dispatch on ``args.dataset``: "cnec" builds the CNEC dataset, any other value
    loads the WikiANN testing dataset.
    """
    if args.dataset != "cnec":
        return load_wikiann_testing_dataset(args)
    return create_cnec_dataset(label_mapper, args)
24
+
25
+
26
+ ####################################################################################################
27
+ ### GAZETTEERS EMBEDDINGS ##########################################################################
28
+ ####################################################################################################
29
+
30
+
31
def find_multi_token_matches(tokens, looking_tokens, gazetteers, matches, max_length=5):
    """
    Record every gazetteer phrase that occurs as a run of consecutive tokens.

    For each start position all windows of ``max_length`` down to 1 tokens are joined
    with single spaces and looked up in every gazetteer. Each hit is appended, as a
    ``(phrase, match_type)`` tuple, to the list stored under every token of the window.

    :param tokens: tokens used as keys of ``matches``
    :param looking_tokens: tokens used to build the looked-up phrases; must have the
                           same length as ``tokens``
    :param gazetteers: iterable of mappings phrase -> match type
    :param matches: dict token -> list of hits; updated in place
    :param max_length: longest phrase, in tokens, that is tried (previously fixed at 5)
    :return: the same ``matches`` dict
    """
    n = len(tokens)
    assert n == len(looking_tokens)
    for start in range(n):
        for length in range(min(max_length, n - start), 0, -1):
            phrase = ' '.join(looking_tokens[start:start + length])
            for gazetteer in gazetteers:
                if phrase in gazetteer:
                    match_type = gazetteer[phrase]
                    # Hits are keyed by token text, so a repeated token shares one list.
                    for index in range(start, start + length):
                        matches.setdefault(tokens[index], []).append((phrase, match_type))
    return matches
45
+
46
def find_single_token_matches(tokens, looking_tokens, gazetteers, matches):
    """Placeholder: single-token matching is not implemented; ``matches`` is returned unchanged."""
    return matches
48
+
49
def find_combination_single_multi_token_matches(tokens, looking_tokens, gazetteers, matches):
    """Placeholder: combined matching is not implemented; ``matches`` is returned unchanged."""
    return matches
51
+
52
def gazetteer_matching(words, gazetteers_for_matching, args=None):
    """
    Compute a ``[per, org, loc]`` 0/1 feature triple for every word.

    For each word only the hits with the most spaces in their phrase (i.e. the longest
    phrases in tokens) are considered; a flag is set to 1 when one of them has the
    match type "per", "org" or "loc" respectively.

    :param words: list of tokens of one sentence
    :param gazetteers_for_matching: iterable of mappings phrase -> match type
    :param args: accepted and ignored; other functions in this module call this
                 function with a third positional argument, which used to raise
                 TypeError
    :return: list with one ``[per, org, loc]`` list per word
    """
    # Hard-wired switches for matching variants that are not finished yet.
    single_token_match = False
    ending_ova = False

    if single_token_match:
        matches = {}
    else:  # multi_token_match
        matches = find_multi_token_matches(words, words, gazetteers_for_matching, {})
        # TODO: optionally repeat the lookup on lemmatized tokens.

    result = []
    for word in words:
        candidates = matches.get(word, [])
        per = org = loc = 0
        if candidates:
            longest = max(phrase.count(" ") for phrase, _ in candidates)
            kinds = {kind for phrase, kind in candidates if phrase.count(" ") == longest}
            per = int("per" in kinds)
            org = int("org" in kinds)
            loc = int("loc" in kinds)
        if ending_ova and word.endswith("ová") and word[0].isupper():
            per = 1
        result.append([per, org, loc])
    return result
83
+
84
+
85
+ ####################################################################################################
86
+ ### GAZETTEERS EXPANSION TRAIN DATASET #############################################################
87
+ ####################################################################################################
88
+
89
def expand_train_dataset_with_gazetteers(train, args):
    """
    Augment the training examples by swapping each entity for a gazetteer entry.

    The result starts with copies of the original rows. Then, ``args.gazetteers_counter``
    times, every row that has at least one non-zero tag is rewritten: each entity (an odd
    tag followed by any number of ``tag + 1`` tags) is replaced by the next unused entry
    of ``gazetteers[tag]`` and appended as a new row with a fresh id.

    Fixes over the previous version:
    * each generated row is appended once (it used to be appended twice when extended
      embeddings were enabled, once with and once without "gazetteers");
    * the scan position is realigned after a replacement of different length, so the
      entities that follow are no longer skipped;
    * ``gazetteer_matching`` is called with the two arguments it is defined with, and it
      receives reverse dictionaries built the same way as in ``get_dataset_from_cnec``;
    * rows without a "gazetteers" key are copied instead of raising KeyError;
    * an empty ``train`` returns an empty list instead of raising IndexError.

    NOTE(review): ``load_gazetteers`` yields string keys and sets, while this function
    indexes by integer tag and position. Digit-only keys are therefore converted to int
    and each set to a sorted list - confirm this matches the real train gazetteer file.

    :param train: list of row dicts with "id", "tokens", "ner_tags" and optionally "gazetteers"
    :param args: options; reads apply_extended_embeddings, extended_embeddings_gazetteers_path,
                 train_gazetteers_path and gazetteers_counter
    :return: list of row dicts (originals first, generated rows after)
    """
    if not train:
        return []

    if args.apply_extended_embeddings:
        from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
        loaded = load_gazetteers(args.extended_embeddings_gazetteers_path)
        gazetteers_for_matching = [build_reverse_dictionary({key: loaded[key]}) for key in loaded]

    gazetteers = {}
    for key, values in load_gazetteers(args.train_gazetteers_path).items():
        tag_id = int(key) if isinstance(key, str) and key.isdigit() else key
        gazetteers[tag_id] = sorted(values)
    count_gazetteers = dict.fromkeys(gazetteers, 0)

    id_ = train[-1]["id"]
    dataset = []
    for row in train:
        copied = {"id": row['id'], 'tokens': row['tokens'].copy(), 'ner_tags': row['ner_tags'].copy()}
        if 'gazetteers' in row:
            copied['gazetteers'] = row['gazetteers'].copy()
        dataset.append(copied)

    for index in range(args.gazetteers_counter):
        for row in tqdm(train, desc=f"loop {index} from {args.gazetteers_counter}"):
            tags = row["ner_tags"].copy()
            tokens = row["tokens"].copy()
            if tags.count(0) == len(tags):
                continue  # nothing to replace in a sentence without entities
            i = 0
            while i < len(tags):
                tag = tags[i]
                if tag % 2 != 1:
                    i += 1
                    continue
                # Odd tag = beginning of an entity; find where its inside tags end.
                start = i
                i += 1
                while i < len(tags) and tags[i] == tag + 1:
                    i += 1
                assert len(gazetteers[tag]) > count_gazetteers[tag]
                new = gazetteers[tag][count_gazetteers[tag]].split(" ")
                count_gazetteers[tag] += 1
                tags = tags[:start] + [tag] + [tag + 1] * (len(new) - 1) + tags[i:]
                tokens = tokens[:start] + new + tokens[i:]
                # Continue right after the inserted entity in the rewritten lists.
                i = start + len(new)
            id_ += 1
            example = {"id": id_, 'tokens': tokens, 'ner_tags': tags}
            if args.apply_extended_embeddings:
                example["gazetteers"] = gazetteer_matching(tokens, gazetteers_for_matching)
            dataset.append(example)
    return dataset
137
+
138
+
139
+ ####################################################################################################
140
+ ### CNEC DATASET ###################################################################################
141
+ ####################################################################################################
142
def get_dataset_from_cnec(label_mapper:dict, xml_file_path, args):
    """
    Parse one CNEC XML file into a list of token-level NER examples.

    :param label_mapper: maps lower-case CNEC label prefixes to integer class ids
    :param xml_file_path: path to the XML file; asserted to exist
    :param args: options; reads apply_extended_embeddings,
                 extended_embeddings_gazetteers_path, division_to_BI_tags and
                 contain_only_label_sentences
    :return: list of dicts with "id", "tokens", "ner_tags", "sentence" and, when
             extended embeddings are enabled, "gazetteers"
    """
    # Open and read the XML file as plain text
    assert os.path.isfile(xml_file_path)
    id_ = 0
    with open(xml_file_path, "r", encoding="utf-8") as xml_file:
        plain_text = xml_file.read()
    plain_text = plain_text[5:-5] # remove unnecessary characters (presumably a wrapping tag - TODO confirm)
    # Make sure a tag glued to an ASCII letter or "." is separated by a space.
    plain_text = re.sub(r'([a-zA-Z.])<ne', r'\1 <ne', plain_text)
    plain_text = re.sub(r'</ne>([a-zA-Z.])', r'</ne> \1', plain_text)
    plain_text = re.sub(r'[ ]+', ' ', plain_text)
    sentences = plain_text.split("\n")
    # Matches only innermost entities: the content may not contain another tag.
    ne_pattern = r'<ne type="([a-zA-Z?_-]{1,5})">([^<]+)</ne>'
    data = []
    if args.apply_extended_embeddings:
        gazetteers_for_matching = load_gazetteers(args.extended_embeddings_gazetteers_path)
        from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
        # One reverse dictionary per gazetteer key.
        temp = []
        for i in gazetteers_for_matching.keys():
            temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
        gazetteers_for_matching = temp

    for sentence in tqdm(sentences):
        entity_mapping = []
        while "<ne type=" in sentence: # while because there are nested entities
            nes = re.findall(ne_pattern, sentence)
            for label, entity in nes:
                pattern = f'<ne type="{label}">{entity}</ne>'
                index = sentence.index(pattern)
                temp_index = index
                # Unwrap this entity, then correct its offset for the markup and the
                # spaces that still precede it.
                sentence = sentence.replace(pattern, entity, 1)
                temp_index -= sum([len(f'<ne type="{tag}">') for tag in re.findall(r'<ne type="([a-zA-Z?_-]{1,5})">', sentence[:index])])
                temp_index -= sentence[:index].count("</ne>") * len("</ne>")
                temp_index -= (re.sub(r'<ne type="([a-zA-Z?_-]{1,5})">', "", sentence[:index]).replace("</ne>", "")).count(" ")
                index = temp_index
                entity_mapping.append((entity, label, index, index + len(entity)))

        # Keep only entities whose label starts with a key of label_mapper.
        entities = []
        for entity, label, start, end in entity_mapping:
            for tag in label_mapper.keys():
                if label.lower().startswith(tag):
                    entities.append((label_mapper[tag], entity, start, end))
                    break
        # Longest entity text first, so it is tried first for every word below.
        entities.sort(key=lambda x: len(x[1]), reverse=True)

        words = re.split(r'\s+', sentence)
        tags_per_word = []
        sentence_counter = -1
        for word in words:
            sentence_counter += len(word) + 1
            if len(entities) == 0:
                tags_per_word.append(0) # tag representing no label for no word
            for index_entity in range(len(entities)):
                if not(sentence_counter - len(word) >= entities[index_entity][2] and
                       sentence_counter <= entities[index_entity][3] and
                       word in entities[index_entity][1]):
                    if index_entity == len(entities) - 1:
                        tags_per_word.append(0) # tag representing no label for word
                    continue

                if args.division_to_BI_tags:
                    if sentence_counter - len(word) == entities[index_entity][2]:
                        tags_per_word.append(entities[index_entity][0] * 2 - 1) # beginning of entity
                    else:
                        tags_per_word.append(entities[index_entity][0] * 2) # inside of entity
                else:
                    tags_per_word.append(entities[index_entity][0])
                break

        if args.contain_only_label_sentences and tags_per_word.count(0) == len(tags_per_word):
            continue
        # Skip empty sentences and single unlabelled tokens.
        if tags_per_word == [] or tags_per_word == [0]:
            continue
        if args.apply_extended_embeddings:
            matching = gazetteer_matching(words, gazetteers_for_matching)
            data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word,
                         "sentence": " ".join(words), "gazetteers": matching})
        else:
            data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word, "sentence": " ".join(words)})
        id_ += 1
    return data
225
+
226
+
227
def create_dataset2(label_mapper:dict, gazetteers_path, path="/nlp/projekty/gazetteer_ner/cnec2.0/data/xml"):
    """
    Build the CNEC DatasetDict with B/I tags and gazetteer features always enabled.

    The body used to be a copy of ``get_dataset_from_cnec`` with its options frozen by
    ``if True:`` branches. It now delegates to that function with the equivalent
    options, so both code paths stay in sync. The dataset directory, previously
    hard-coded, is a parameter whose default is the original location.

    NOTE: ``get_dataset_from_cnec`` asserts that each file exists, so a missing file now
    raises AssertionError rather than FileNotFoundError.

    :param label_mapper: maps lower-case CNEC label prefixes to integer class ids
    :param gazetteers_path: JSON gazetteer file used for the "gazetteers" features
    :param path: directory containing the three CNEC XML files
    :return: DatasetDict with "train", "validation" and "test" splits
    """
    from types import SimpleNamespace

    options = SimpleNamespace(
        apply_extended_embeddings=True,
        extended_embeddings_gazetteers_path=gazetteers_path,
        division_to_BI_tags=True,
        contain_only_label_sentences=False,
    )
    dataset = DatasetDict()
    for part, file_name in zip(["train", "validation", "test"],
                               ["named_ent_train.xml", "named_ent_etest.xml", "named_ent_dtest.xml"]):
        file_path = os.path.join(path, file_name)
        dataset[part] = Dataset.from_list(get_dataset_from_cnec(label_mapper, file_path, options))
    return dataset
312
+
313
+
314
def create_cnec_dataset(label_mapper:dict, args):
    """
    Build the CNEC DatasetDict from the three XML files in ``args.cnec_dataset_dir_path``.

    Gazetteer-based expansion (``args.expand_train_data``) is applied to the "train"
    split only; it used to be applied to the validation and test splits as well, which
    injected synthetic sentences into the evaluation data.

    NOTE(review): "validation" reads named_ent_etest.xml and "test" reads
    named_ent_dtest.xml - confirm this assignment is intended.

    :param label_mapper: maps lower-case CNEC label prefixes to integer class ids
    :param args: options; reads cnec_dataset_dir_path and expand_train_data here and is
                 passed on to the helpers
    :return: DatasetDict with "train", "validation" and "test" splits
    """
    assert os.path.isdir(args.cnec_dataset_dir_path)
    split_files = {
        "train": "named_ent_train.xml",
        "validation": "named_ent_etest.xml",
        "test": "named_ent_dtest.xml",
    }
    dataset = DatasetDict()
    for part, file_name in split_files.items():
        file_path = os.path.join(args.cnec_dataset_dir_path, file_name)
        assert os.path.isfile(file_path)
        examples = get_dataset_from_cnec(label_mapper, file_path, args)
        if part == "train" and args.expand_train_data:
            examples = expand_train_dataset_with_gazetteers(examples, args)
        dataset[part] = Dataset.from_list(examples)
    return dataset
326
+
327
+ ####################################################################################################
328
+ ### WIKIANN DATASET ################################################################################
329
+ ####################################################################################################
330
def load_wikiann_testing_dataset(args):
    """
    Load the WikiANN file at ``args.wikiann_dataset_path`` as a test-only DatasetDict.

    Fixes over the previous version:
    * ``gazetteer_matching`` is called with the two arguments it is defined with (the
      extra ``args`` argument raised TypeError);
    * the loaded gazetteers are converted to reverse dictionaries, one per key, the same
      way ``get_dataset_from_cnec`` does, because the matcher looks phrases up as
      mapping keys.

    :param args: options; reads wikiann_dataset_path, apply_gazetteers_info and
                 extended_embeddings_gazetteers_path
    :return: DatasetDict with a single "test" split
    """
    assert os.path.isfile(args.wikiann_dataset_path)

    gazetteers_for_matching = None
    if args.apply_gazetteers_info:
        from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
        loaded = load_gazetteers(args.extended_embeddings_gazetteers_path)
        gazetteers_for_matching = [build_reverse_dictionary({key: loaded[key]}) for key in loaded]

    examples = []
    for index, sentence in enumerate(load_tagged_sentences(args.wikiann_dataset_path)):
        words = [word for word, _ in sentence]
        tags = [tag for _, tag in sentence]
        example = {"id": index, 'tokens': words, 'ner_tags': tags}
        if args.apply_gazetteers_info:
            example["gazetteers"] = gazetteer_matching(words, gazetteers_for_matching)
        examples.append(example)

    return DatasetDict({"test": Dataset.from_list(examples)})
352
+
353
+
354
def load_tagged_sentences(file_path):
    """Read a whitespace-separated ``prefix:token tag`` file into sentences.

    Sentences are separated by blank lines. Every non-blank line is expected to
    hold exactly two whitespace-separated fields: a token carrying a
    ``<prefix>:`` marker and its tag. Lines with any other number of fields
    are skipped.

    Only the part up to the *first* colon is treated as the prefix, so tokens
    that contain colons themselves (``cs:10:30`` -> ``10:30``, ``cs::`` -> ``:``)
    are kept intact. A token without any colon is kept unchanged.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples.
    """
    sentences = []  # all completed sentences
    current_sentence = []  # (token, tag) pairs of the sentence being read

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # A blank line terminates the current sentence (if any).
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue
            if len(fields) != 2:
                # Malformed line: ignored.
                continue

            prefixed_token, tag = fields
            # Split on the first colon only, so colons inside the token survive.
            _, separator, token = prefixed_token.partition(':')
            if not separator:
                token = prefixed_token
            current_sentence.append((token, tag))

    # The file may not end with a blank line.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences
377
+
378
+
379
+ ####################################################################################################
380
+ ### TOKENIZE DATASET ###############################################################################
381
+ ####################################################################################################
382
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-word token.

    Args:
        labels: Integer label per word, indexed by word id.
        word_ids: Word id for every token; ``None`` marks a special token.

    Returns:
        A list with one label per entry of ``word_ids``: ``-100`` for special
        tokens, the word's label for its first token, and for the following
        tokens of the same word the label with odd values bumped by one
        (the B-XXX -> I-XXX shift).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label.
            aligned.append(labels[word_id])
        else:
            # Continuation token: an odd (B-XXX) label becomes the next (I-XXX) one.
            label = labels[word_id]
            aligned.append(label + 1 if label % 2 == 1 else label)
        previous_word = word_id
    return aligned
402
+
403
def align_gazetteers_with_tokens(gazetteers, word_ids):
    """Expand word-level gazetteer vectors to one vector per sub-word token.

    Args:
        gazetteers: Gazetteer vector per word, indexed by word id.
        word_ids: Word id for every token; ``None`` marks a special token.

    Returns:
        A list with one entry per element of ``word_ids``: ``[0, 0, 0]`` for
        special tokens, otherwise the gazetteer vector of the token's word
        (every token of a word gets the same vector).
    """
    return [
        [0, 0, 0] if word_id is None else gazetteers[word_id]
        for word_id in word_ids
    ]
423
+
424
+
425
def create_tokenized_dataset(raw_dataset, tokenizer, apply_extended_embeddings=True):
    """Tokenize every split of ``raw_dataset`` and align labels/gazetteers to tokens.

    The original columns are removed; each split ends up with the tokenizer
    outputs, a ``labels`` column and, when ``apply_extended_embeddings`` is
    true, the per-token gazetteer columns ``per``, ``org`` and ``loc``.

    Args:
        raw_dataset: ``DatasetDict`` whose splits provide ``tokens`` and
            ``ner_tags`` (and ``gazetteers`` when extended embeddings are used).
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            output must offer ``word_ids(batch_index)``.
        apply_extended_embeddings: Whether to emit the gazetteer columns.

    Returns:
        The mapped dataset.
    """
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        tokenized_inputs["labels"] = [
            align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
            for i, labels in enumerate(examples["ner_tags"])
        ]
        if apply_extended_embeddings:
            aligned_gazetteers = [
                align_gazetteers_with_tokens(sentence_gazetteers, tokenized_inputs.word_ids(i))
                for i, sentence_gazetteers in enumerate(examples["gazetteers"])
            ]
            # Split the per-token 3-vectors into one column per entity type.
            for column, position in (("per", 0), ("org", 1), ("loc", 2)):
                tokenized_inputs[column] = [
                    [token_vector[position] for token_vector in sentence]
                    for sentence in aligned_gazetteers
                ]
        return tokenized_inputs

    # Take the column names from "train" when it exists, otherwise from the
    # first available split: a test-only DatasetDict has no "train" key.
    if "train" in raw_dataset:
        reference_split = raw_dataset["train"]
    else:
        reference_split = next(iter(raw_dataset.values()), None)
    columns_to_remove = reference_split.column_names if reference_split is not None else None

    dataset = raw_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=columns_to_remove,
    )
    return dataset
data_manipulation/preprocess_gazetteers.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from simplemma import lemmatize
4
+
5
+
6
def flatten(xss):
    """Return a single list with the items of every iterable in ``xss``, in order."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat
8
+
9
+
10
def remove_all_brackets(text):
    """Delete every bracketed span from ``text``.

    A span starts at ``(``, ``{`` or ``[`` and ends at the nearest following
    ``)``, ``}`` or ``]`` on the same line; opener and closer types need not match.
    """
    bracketed_span = re.compile(r'[\(\{\[].*?[\)\}\]]')
    return bracketed_span.sub('', text)
12
+
13
+
14
def lemmatizing(x):
    """Return the Czech lemma of ``x`` via ``simplemma.lemmatize``; ``""`` maps to ``""``."""
    return lemmatize(x, lang="cs") if x != "" else ""
18
+
19
+
20
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    Args:
        dictionary: Mapping whose values are iterables of hashable entries.
        apply_lemmatizing: When true, the lemma of every entry (if it differs
            from the entry itself) is mapped to the same key as well.

    Returns:
        A dict from entry (and optionally its lemma) to key. When an entry
        occurs under several keys, the key processed last wins.
    """
    reverse_dictionary = {}
    for label, entries in dictionary.items():
        for entry in entries:
            reverse_dictionary[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                reverse_dictionary[lemma] = label
    return reverse_dictionary
30
+
31
+
32
def split_gazetteers_for_single_token_match(gazetteers):
    """Break every gazetteer entry into its space-separated tokens.

    Args:
        gazetteers: Mapping from a key to an iterable of entry strings.

    Returns:
        A new dict with the same keys; each value is the set of tokens
        (split on a single space) longer than two characters.
    """
    return {
        label: {
            token
            for token in flatten([entry.split(" ") for entry in entries])
            if len(token) > 2
        }
        for label, entries in gazetteers.items()
    }
38
+
39
+
40
def preprocess_gazetteers(gazetteers, config):
    """Apply the configured clean-up steps to the gazetteer sets.

    Steps run in this order, each only when its ``config`` flag is truthy:

    1. ``split_person``: add the space-separated parts (longer than two
       characters) of every ``"PER"`` entry to the ``"PER"`` set.
    2. ``lemmatize``: keep entries longer than two characters together with
       their lemma.
    3. ``remove_brackets``: strip bracketed spans and surrounding whitespace,
       keeping results longer than two characters.
    4. ``remove_numeric``: drop entries for which ``str.isnumeric()`` is true.

    Finally, unless ``config["techniq_for_matching"] == "single"``, the entries
    are split into single tokens.

    Args:
        gazetteers: Mapping from a key to a set of entry strings; it is
            modified in place by steps 1-4.
        config: Mapping with the keys ``split_person``, ``lemmatize``,
            ``remove_brackets``, ``remove_numeric`` and ``techniq_for_matching``.

    Returns:
        The processed gazetteers (the input object unless the final split ran).
    """
    if config["split_person"]:
        name_parts = flatten([entry.split(" ") for entry in gazetteers["PER"]])
        gazetteers["PER"].update({part for part in name_parts if len(part) > 2})

    if config["lemmatize"]:
        for label, entries in gazetteers.items():
            with_lemmas = set()
            for entry in entries:
                if len(entry) > 2:
                    with_lemmas.add(entry)
                    with_lemmas.add(lemmatizing(entry))
            gazetteers[label] = with_lemmas

    if config["remove_brackets"]:
        for label, entries in gazetteers.items():
            stripped = (remove_all_brackets(entry).strip() for entry in entries)
            gazetteers[label] = {entry for entry in stripped if len(entry) > 2}

    if config["remove_numeric"]:
        for label, entries in gazetteers.items():
            gazetteers[label] = {entry for entry in entries if not entry.isnumeric()}

    # NOTE(review): the split into single tokens runs for every technique
    # *except* "single" -- confirm the condition is not inverted.
    if config["techniq_for_matching"] != "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)
    return gazetteers
extended_embeddings/__init__.py ADDED
File without changes
extended_embeddings/extended_embeddings_model.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaEncoder, RobertaEmbeddings
2
+ from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
3
+ from typing import List, Optional, Tuple, Union
4
+ import torch
5
+ from torch.nn import functional as F
6
+ from torch import nn
7
+
8
+ # Copied from transformers.models.bert.modeling_bert.BertPooler
9
class ExtendedEmbeddigsRobertaPooler(nn.Module):
    """Pooler over hidden states that were widened by gazetteer features.

    The dense layer is square, with width
    ``config.hidden_size + (len(config.id2label) - 1) // 2``.
    """

    def __init__(self, config):
        super().__init__()
        gazetteer_width = (len(config.id2label) - 1) // 2
        pooled_width = config.hidden_size + gazetteer_width
        self.dense = nn.Linear(pooled_width, pooled_width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``tanh(dense(h))`` where ``h`` is the hidden state of the first token."""
        # "Pooling" here simply means picking the first token of every sequence.
        first_token_tensor = hidden_states[:, 0]
        return self.activation(self.dense(first_token_tensor))
23
+
24
class ExtendedEmbeddigsRobertaModel(RobertaModel):
    """
    RoBERTa model whose last hidden state is extended with gazetteer features.

    The forward pass runs the regular embeddings and encoder, then appends the
    three per-token features ``per``, ``org`` and ``loc`` to the encoder output
    along the feature axis, so ``last_hidden_state`` has three more features
    per token than the encoder output. The pooler is applied to that extended
    tensor.

    The decoder-related arguments (``encoder_hidden_states``,
    ``encoder_attention_mask``, ``past_key_values``, ``use_cache``) are passed
    through to the encoder exactly as in the code this class was copied from.
    """

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaEncoder(config)

        # NOTE(review): `add_pooling_layer` is accepted but ignored -- a pooler
        # is always created. Honoring the flag would remove the pooler weights
        # from the state dict, so check existing checkpoints before changing it.
        self.pooler = ExtendedEmbeddigsRobertaPooler(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        per: Optional[torch.Tensor] = None,
        org: Optional[torch.Tensor] = None,
        loc: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        per, org, loc (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Per-token gazetteer features. All three are required; each one is appended as one extra feature to the
            encoder output of the corresponding token.
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns:
            A `BaseModelOutputWithPoolingAndCrossAttentions` (or the equivalent tuple when `return_dict` is false)
            whose `last_hidden_state` is the encoder output with `per`, `org` and `loc` appended on the last axis.
            `hidden_states` are the encoder's own hidden states and are not extended.

        Raises:
            ValueError: If both or neither of `input_ids` / `inputs_embeds` are given, or if any of `per`, `org`,
                `loc` is missing.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Fail early and explicitly: the features are dereferenced after the
        # encoder pass, where a missing one would only surface as an
        # AttributeError on None.
        missing_features = [
            name for name, feature in (("per", per), ("org", org), ("loc", loc)) if feature is None
        ]
        if missing_features:
            raise ValueError(
                "You have to specify the gazetteer features per, org and loc; missing: "
                + ", ".join(missing_features)
            )

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        # Append the gazetteer features as three extra channels per token.
        # Casting to the encoder output's dtype/device keeps the concatenation
        # valid when the features arrive as integer tensors.
        gazetteer_channels = [
            feature.to(device=sequence_output.device, dtype=sequence_output.dtype).unsqueeze(2)
            for feature in (per, org, loc)
        ]
        sequence_output = torch.cat((sequence_output, *gazetteer_channels), dim=2)

        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
extended_embeddings/token_classification.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from transformers.modeling_outputs import TokenClassifierOutput
6
+ from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification
7
+ from transformers.models.roberta.modeling_roberta import ROBERTA_INPUTS_DOCSTRING, add_start_docstrings_to_model_forward, add_code_sample_docstrings
8
+
9
+ from extended_embeddings.extended_embeddings_model import ExtendedEmbeddigsRobertaModel
10
+
11
+ _CONFIG_FOR_DOC = "RobertaConfig"
12
+
13
+
14
class ExtendedEmbeddigsRobertaForTokenClassification(RobertaForTokenClassification):
    """Token classifier on top of ``ExtendedEmbeddigsRobertaModel``.

    The linear classifier takes ``config.hidden_size + 3`` input features per
    token and produces ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = ExtendedEmbeddigsRobertaModel(config, add_pooling_layer=False)
        # Fall back to the general hidden dropout when no classifier-specific
        # dropout is configured.
        if config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size + 3, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="Jean-Baptiste/roberta-large-ner-english",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
        expected_loss=0.01,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        per: Optional[torch.Tensor] = None,
        org: Optional[torch.Tensor] = None,
        loc: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        per, org, loc (`torch.Tensor`, *optional*):
            Gazetteer features forwarded unchanged to the underlying `ExtendedEmbeddigsRobertaModel`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            per=per,
            org=org,
            loc=loc,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: skip the sequence output (index 0) and pooled output (index 1).
        output = (logits,) + encoder_outputs[2:]
        return output if loss is None else (loss,) + output
gazz2.json ADDED
The diff for this file is too large to render. See raw diff
 
upload_model.ipynb ADDED
@@ -0,0 +1,3150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "65fea98bf7924f4fb4947d8e2dda2f4d",
12
+ "version_major": 2,
13
+ "version_minor": 0
14
+ },
15
+ "text/plain": [
16
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
17
+ ]
18
+ },
19
+ "metadata": {},
20
+ "output_type": "display_data"
21
+ }
22
+ ],
23
+ "source": [
24
+ "from huggingface_hub import notebook_login\n",
25
+ "\n",
26
+ "notebook_login()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 3,
32
+ "metadata": {},
33
+ "outputs": [
34
+ {
35
+ "name": "stderr",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "/home/betty/miniconda3/envs/DP/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
39
+ " warnings.warn(\n"
40
+ ]
41
+ }
42
+ ],
43
+ "source": [
44
+ "from website_script import load\n",
45
+ "tokenizer, model, gazetteers_for_matching = load()"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 4,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "data": {
55
+ "application/vnd.jupyter.widget-view+json": {
56
+ "model_id": "c1ee9dbfcb694b968d85152542356298",
57
+ "version_major": 2,
58
+ "version_minor": 0
59
+ },
60
+ "text/plain": [
61
+ "pytorch_model.bin: 0%| | 0.00/504M [00:00<?, ?B/s]"
62
+ ]
63
+ },
64
+ "metadata": {},
65
+ "output_type": "display_data"
66
+ },
67
+ {
68
+ "data": {
69
+ "text/plain": [
70
+ "CommitInfo(commit_url='https://huggingface.co/bettystr/NerRoB-czech/commit/7e120317ea2a9500929d09ef9f55b5eff8640d0b', commit_message='Upload ExtendedEmbeddigsRobertaForTokenClassification', commit_description='', oid='7e120317ea2a9500929d09ef9f55b5eff8640d0b', pr_url=None, pr_revision=None, pr_num=None)"
71
+ ]
72
+ },
73
+ "execution_count": 4,
74
+ "metadata": {},
75
+ "output_type": "execute_result"
76
+ }
77
+ ],
78
+ "source": [
79
+ "model.push_to_hub(\"bettystr/NerRoB-czech\")"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 1,
85
+ "metadata": {},
86
+ "outputs": [
87
+ {
88
+ "data": {
89
+ "text/plain": [
90
+ "{'PER': {'Peta',\n",
91
+ " 'Ind',\n",
92
+ " 'Ilona',\n",
93
+ " 'Blazenka',\n",
94
+ " 'Kotrbová',\n",
95
+ " 'Otradovec',\n",
96
+ " 'Chocholouš',\n",
97
+ " 'Kovář',\n",
98
+ " 'Galerie',\n",
99
+ " 'Váňa',\n",
100
+ " 'Krkoška',\n",
101
+ " 'Kvėta',\n",
102
+ " 'Braník',\n",
103
+ " 'Rýpar',\n",
104
+ " 'Vykydal',\n",
105
+ " 'Tomča',\n",
106
+ " 'Nína',\n",
107
+ " 'Evca',\n",
108
+ " 'Hozová',\n",
109
+ " 'Zacharova',\n",
110
+ " 'Vlasto',\n",
111
+ " 'Luckyn',\n",
112
+ " 'Andělka',\n",
113
+ " 'Srba',\n",
114
+ " 'Dýdžej',\n",
115
+ " 'Ířij',\n",
116
+ " 'Brutální',\n",
117
+ " 'Jackob',\n",
118
+ " 'Kozler',\n",
119
+ " 'Jerguš',\n",
120
+ " 'Krysa',\n",
121
+ " 'Helenie',\n",
122
+ " 'Sramek',\n",
123
+ " 'Simonova',\n",
124
+ " 'Bošková',\n",
125
+ " 'Macinka',\n",
126
+ " 'Šich',\n",
127
+ " 'Renata-Dita',\n",
128
+ " 'Holaňová',\n",
129
+ " 'Obermajerová',\n",
130
+ " 'Míšák',\n",
131
+ " 'Holický',\n",
132
+ " 'Pepina',\n",
133
+ " 'Debí',\n",
134
+ " 'Půlpán',\n",
135
+ " 'Faktorová',\n",
136
+ " 'Jirouch',\n",
137
+ " 'Rákos',\n",
138
+ " 'Andreyka',\n",
139
+ " 'Kiš',\n",
140
+ " 'Kuntová',\n",
141
+ " 'Papírníková',\n",
142
+ " 'Evza',\n",
143
+ " 'Vajdíková',\n",
144
+ " 'Agama',\n",
145
+ " 'Zralý',\n",
146
+ " 'Bubnová',\n",
147
+ " 'Keprt',\n",
148
+ " 'Leňa',\n",
149
+ " 'Sekyrová',\n",
150
+ " 'Miklíková',\n",
151
+ " 'Corso',\n",
152
+ " 'Vondruška',\n",
153
+ " 'Heluš',\n",
154
+ " 'Детелина',\n",
155
+ " 'Bartůněk',\n",
156
+ " 'Hajda',\n",
157
+ " 'Bartoníček',\n",
158
+ " 'Rumanová',\n",
159
+ " 'Izabela',\n",
160
+ " 'Kamasová',\n",
161
+ " 'Volenec',\n",
162
+ " 'Duchoslavová',\n",
163
+ " 'Tynulinka',\n",
164
+ " 'Skokan',\n",
165
+ " 'Стьопа',\n",
166
+ " 'Mandarinka',\n",
167
+ " 'Brázdová',\n",
168
+ " 'Haňula',\n",
169
+ " 'Bubela',\n",
170
+ " 'Anný',\n",
171
+ " 'Bohatý',\n",
172
+ " 'Lucinečka',\n",
173
+ " 'Švestková',\n",
174
+ " 'Vaďurová',\n",
175
+ " 'Ленка',\n",
176
+ " 'Docent',\n",
177
+ " 'Slaby',\n",
178
+ " 'Zuzička',\n",
179
+ " 'Ditunka',\n",
180
+ " 'Sjůů',\n",
181
+ " 'Nepovímová',\n",
182
+ " 'Raichlová',\n",
183
+ " 'Panuška',\n",
184
+ " 'Malinovský',\n",
185
+ " 'Szabo',\n",
186
+ " 'Zouhar',\n",
187
+ " 'Holanová',\n",
188
+ " 'Šťastňoučká',\n",
189
+ " 'Světlana',\n",
190
+ " 'Vaňousová',\n",
191
+ " 'Prucek',\n",
192
+ " 'Kopencová',\n",
193
+ " 'Zábojník',\n",
194
+ " 'Čepová',\n",
195
+ " 'Rydlo',\n",
196
+ " 'Koutná',\n",
197
+ " 'Snopek',\n",
198
+ " 'Majkl',\n",
199
+ " 'Kameníková',\n",
200
+ " 'Berenica',\n",
201
+ " 'Hlavatý',\n",
202
+ " 'Taekyo',\n",
203
+ " 'Tokar',\n",
204
+ " 'Balaš',\n",
205
+ " 'Lukasova',\n",
206
+ " 'Rokosová',\n",
207
+ " 'Smrckova',\n",
208
+ " 'Spáčilová',\n",
209
+ " 'Gruntová',\n",
210
+ " 'Haňulka',\n",
211
+ " 'Starek',\n",
212
+ " 'Cupalová',\n",
213
+ " 'Sabulina',\n",
214
+ " 'Celkem',\n",
215
+ " 'Hamplova',\n",
216
+ " 'Výmolová',\n",
217
+ " 'Polaskova',\n",
218
+ " 'Krizova',\n",
219
+ " 'Hanzl',\n",
220
+ " 'Vojtková',\n",
221
+ " 'Valja',\n",
222
+ " 'Pavelcová',\n",
223
+ " 'Ilianski',\n",
224
+ " 'Chromcová',\n",
225
+ " 'Strnadlová',\n",
226
+ " 'Takáč',\n",
227
+ " 'Macíček',\n",
228
+ " 'Burianek',\n",
229
+ " 'Ondík',\n",
230
+ " 'Mařen',\n",
231
+ " 'Denule',\n",
232
+ " 'Kroulík',\n",
233
+ " 'Hozman',\n",
234
+ " 'Honzas',\n",
235
+ " 'Přibáň',\n",
236
+ " 'Hladíková',\n",
237
+ " 'Štědroň',\n",
238
+ " 'Racing',\n",
239
+ " 'Lenla',\n",
240
+ " 'Trnkova',\n",
241
+ " 'Businský',\n",
242
+ " 'Šikulová',\n",
243
+ " 'Mišanek',\n",
244
+ " 'Ratajová',\n",
245
+ " 'Naďežda',\n",
246
+ " 'Šimonková',\n",
247
+ " 'Vard',\n",
248
+ " 'Paollo',\n",
249
+ " 'Jandera',\n",
250
+ " 'Lucias',\n",
251
+ " 'Páája',\n",
252
+ " 'Gemmax',\n",
253
+ " 'Matulu',\n",
254
+ " 'Rychly',\n",
255
+ " 'Ryslan',\n",
256
+ " 'Kristína',\n",
257
+ " 'Terézie',\n",
258
+ " 'Stibůrek',\n",
259
+ " 'Sherbek',\n",
260
+ " 'Makynka',\n",
261
+ " 'Rychnovská',\n",
262
+ " 'Bajger',\n",
263
+ " 'Dupaux',\n",
264
+ " 'Kovačíková',\n",
265
+ " 'Tomešková',\n",
266
+ " 'Culková',\n",
267
+ " 'Bušová',\n",
268
+ " 'Jabub',\n",
269
+ " 'Štechová',\n",
270
+ " 'Houdek',\n",
271
+ " 'Kreisinger',\n",
272
+ " 'Potměšilová',\n",
273
+ " 'Poskočilová',\n",
274
+ " 'Dugy',\n",
275
+ " 'Paťan',\n",
276
+ " 'Vágnerová',\n",
277
+ " 'Pacltová',\n",
278
+ " 'Blažej',\n",
279
+ " 'Jarino',\n",
280
+ " 'Pixa',\n",
281
+ " 'Strach',\n",
282
+ " 'Vyhnánek',\n",
283
+ " 'Semerád',\n",
284
+ " 'Pudilová',\n",
285
+ " 'Rozka',\n",
286
+ " 'Lýdie',\n",
287
+ " 'Vávrová',\n",
288
+ " 'Matrik',\n",
289
+ " 'Madbear',\n",
290
+ " 'Chrobáková',\n",
291
+ " 'Kotalíková',\n",
292
+ " 'Střílková',\n",
293
+ " 'Urbanx',\n",
294
+ " 'Zora',\n",
295
+ " 'Zavázalová',\n",
296
+ " 'Emilka',\n",
297
+ " 'Łukas',\n",
298
+ " 'Jindŕich',\n",
299
+ " 'Hanik',\n",
300
+ " 'Iwe',\n",
301
+ " 'Lasáková',\n",
302
+ " 'Mařáková',\n",
303
+ " 'Šťastný',\n",
304
+ " 'Sajmonka',\n",
305
+ " 'Kánský',\n",
306
+ " 'Dolfi',\n",
307
+ " 'Američané',\n",
308
+ " 'Urban',\n",
309
+ " 'Samik',\n",
310
+ " 'Ouředník',\n",
311
+ " 'Виорика',\n",
312
+ " 'Vyhul',\n",
313
+ " 'Madr',\n",
314
+ " 'Šilhová',\n",
315
+ " 'Stefaniya',\n",
316
+ " 'Radomír',\n",
317
+ " 'Taxibila',\n",
318
+ " 'Exnerová',\n",
319
+ " 'Jíříček',\n",
320
+ " 'Emik',\n",
321
+ " 'Vrtiška',\n",
322
+ " 'Stána',\n",
323
+ " 'Soxib',\n",
324
+ " 'Buresova',\n",
325
+ " 'Jarous',\n",
326
+ " 'Chmelik',\n",
327
+ " 'Ládin',\n",
328
+ " 'Absolon',\n",
329
+ " 'Bohuňková',\n",
330
+ " 'Rybníček',\n",
331
+ " 'Cairoli',\n",
332
+ " 'Kurečka',\n",
333
+ " 'Slabý',\n",
334
+ " 'Jevhenija',\n",
335
+ " 'Jajin',\n",
336
+ " 'Eego',\n",
337
+ " 'Bárka',\n",
338
+ " 'Lentilka',\n",
339
+ " 'Čert',\n",
340
+ " 'Teri',\n",
341
+ " 'Crhová',\n",
342
+ " 'Korcová',\n",
343
+ " 'Vlastníková',\n",
344
+ " 'Elča',\n",
345
+ " 'Koutecká',\n",
346
+ " 'Pavlicová',\n",
347
+ " 'Choze',\n",
348
+ " 'Bronča',\n",
349
+ " 'Burza',\n",
350
+ " 'Zemanec',\n",
351
+ " 'Anetqa',\n",
352
+ " 'Černíková',\n",
353
+ " 'Certice',\n",
354
+ " 'Mašus',\n",
355
+ " 'Šilerová',\n",
356
+ " 'Lesia',\n",
357
+ " 'Majkyna',\n",
358
+ " 'Адлы',\n",
359
+ " 'Trener',\n",
360
+ " 'Stara',\n",
361
+ " 'Zámečníková',\n",
362
+ " 'Rostja',\n",
363
+ " 'Szabó',\n",
364
+ " 'Mateej',\n",
365
+ " 'Wlada',\n",
366
+ " 'Pafča',\n",
367
+ " 'Stočková',\n",
368
+ " 'Šustová',\n",
369
+ " 'Frýdová',\n",
370
+ " 'Žofia',\n",
371
+ " 'Faltejsková',\n",
372
+ " 'Maruškaa',\n",
373
+ " 'Editka',\n",
374
+ " 'Otradovcová',\n",
375
+ " 'Vejvoda',\n",
376
+ " 'Neuwirthová',\n",
377
+ " 'Ráda',\n",
378
+ " 'Macháčková',\n",
379
+ " 'Vičarová',\n",
380
+ " 'Julinka',\n",
381
+ " 'Hranická',\n",
382
+ " 'Satanas',\n",
383
+ " 'Pfeffer',\n",
384
+ " 'Hádková',\n",
385
+ " 'Lianochka',\n",
386
+ " 'Ngoc',\n",
387
+ " 'Šáruš',\n",
388
+ " 'Terynka',\n",
389
+ " 'Євген',\n",
390
+ " 'Štefánek',\n",
391
+ " 'Kristinka',\n",
392
+ " 'Zittová',\n",
393
+ " 'Švub',\n",
394
+ " 'Pavlíková',\n",
395
+ " 'Nikiška',\n",
396
+ " 'Kováčik',\n",
397
+ " 'Sedli',\n",
398
+ " 'Honziik',\n",
399
+ " 'Barunečka',\n",
400
+ " 'Žatečková',\n",
401
+ " 'Zvoněnka',\n",
402
+ " 'Přibilová',\n",
403
+ " 'Mišel',\n",
404
+ " 'Adamčíková',\n",
405
+ " 'Jiricek',\n",
406
+ " 'Strnad',\n",
407
+ " 'Svrčina',\n",
408
+ " 'Horyna',\n",
409
+ " 'Claudinka',\n",
410
+ " 'Tisoň',\n",
411
+ " 'Kučová',\n",
412
+ " 'Ranch',\n",
413
+ " 'Obchod',\n",
414
+ " 'Romca',\n",
415
+ " 'Kalenský',\n",
416
+ " 'Nell',\n",
417
+ " 'Krumlová',\n",
418
+ " 'Kvasnicová',\n",
419
+ " 'Klementýna',\n",
420
+ " 'Drahokoupilová',\n",
421
+ " 'Míja',\n",
422
+ " 'Франта',\n",
423
+ " 'Alexandrova',\n",
424
+ " 'Cervenkova',\n",
425
+ " 'Rottová',\n",
426
+ " 'Radim',\n",
427
+ " 'Věkoslav',\n",
428
+ " 'Weissová',\n",
429
+ " 'Peťulíí',\n",
430
+ " 'Fiserova',\n",
431
+ " 'Juras',\n",
432
+ " 'Macík',\n",
433
+ " 'Pavluska',\n",
434
+ " 'Thi',\n",
435
+ " 'Adell',\n",
436
+ " 'Cvíčo',\n",
437
+ " 'Žílová',\n",
438
+ " 'Šimi',\n",
439
+ " 'Jsna',\n",
440
+ " 'Natalli',\n",
441
+ " 'Lenorka',\n",
442
+ " 'Rambod',\n",
443
+ " 'Stanislava',\n",
444
+ " 'Vencl',\n",
445
+ " 'Mudr',\n",
446
+ " 'Dámské',\n",
447
+ " 'Faktor',\n",
448
+ " 'Patrk',\n",
449
+ " 'Efik',\n",
450
+ " 'Tvaruzka',\n",
451
+ " 'Lukee',\n",
452
+ " 'Frsntisek',\n",
453
+ " 'Hofmannová',\n",
454
+ " 'Páv',\n",
455
+ " 'Jacha',\n",
456
+ " 'Martinaa',\n",
457
+ " 'Balda',\n",
458
+ " 'Mishelin',\n",
459
+ " 'Brouček',\n",
460
+ " 'Chloupková',\n",
461
+ " 'Divad',\n",
462
+ " 'Bubáček',\n",
463
+ " 'Stehno',\n",
464
+ " 'Holinka',\n",
465
+ " 'Ardeb',\n",
466
+ " 'Sovička',\n",
467
+ " 'Stavinoha',\n",
468
+ " 'Kvetiny',\n",
469
+ " 'Hrabulata',\n",
470
+ " 'Motyli',\n",
471
+ " 'Hubová',\n",
472
+ " 'Burianová',\n",
473
+ " 'Pluhařová',\n",
474
+ " 'Tauchman',\n",
475
+ " 'Petka',\n",
476
+ " 'Lubošek',\n",
477
+ " 'Havrilová',\n",
478
+ " 'Philippos',\n",
479
+ " 'Kaleja',\n",
480
+ " 'Dvorackova',\n",
481
+ " 'Šebíková',\n",
482
+ " 'Kulio',\n",
483
+ " 'Sýkorová',\n",
484
+ " 'Peřinka',\n",
485
+ " 'Lukyy',\n",
486
+ " 'Zprava',\n",
487
+ " 'Sviatlana',\n",
488
+ " 'Pawson',\n",
489
+ " 'Sláma',\n",
490
+ " 'Šubertová',\n",
491
+ " 'Kaločová',\n",
492
+ " 'Janáček',\n",
493
+ " 'Voltr',\n",
494
+ " 'Lubík',\n",
495
+ " 'Kosmetický',\n",
496
+ " 'Mícheál',\n",
497
+ " 'Šnoblová',\n",
498
+ " 'Janouš',\n",
499
+ " 'Ondrejka',\n",
500
+ " 'Romanka',\n",
501
+ " 'Picek',\n",
502
+ " 'Henychová',\n",
503
+ " 'Vondracek',\n",
504
+ " 'Verýsek',\n",
505
+ " 'Machovec',\n",
506
+ " 'Jeníková',\n",
507
+ " 'Jejda',\n",
508
+ " 'Luk',\n",
509
+ " 'Fousová',\n",
510
+ " 'Ený',\n",
511
+ " 'Jindriska',\n",
512
+ " 'Aknelka',\n",
513
+ " 'Dubnová',\n",
514
+ " 'Minařík',\n",
515
+ " 'Limetka',\n",
516
+ " 'Houmr',\n",
517
+ " 'Šedová',\n",
518
+ " 'Balounová',\n",
519
+ " 'Krakonoš',\n",
520
+ " 'Darča',\n",
521
+ " 'Snizhanna',\n",
522
+ " 'Kateřin',\n",
523
+ " 'Köhler',\n",
524
+ " 'Wilém',\n",
525
+ " 'Kubánková',\n",
526
+ " 'Petrak',\n",
527
+ " 'Weja',\n",
528
+ " 'Veronička',\n",
529
+ " 'Flieger',\n",
530
+ " 'Drozd',\n",
531
+ " 'Lení',\n",
532
+ " 'Bělohradská',\n",
533
+ " 'Accademia',\n",
534
+ " 'Lavicka',\n",
535
+ " 'Talinka',\n",
536
+ " 'Chudoba',\n",
537
+ " 'Brožíková',\n",
538
+ " 'Tomeš',\n",
539
+ " 'Hanushka',\n",
540
+ " 'Hradcová',\n",
541
+ " 'Heďa',\n",
542
+ " 'Lidunqa',\n",
543
+ " 'Holek',\n",
544
+ " 'Zelinkova',\n",
545
+ " 'Đavid',\n",
546
+ " 'Milfaitová',\n",
547
+ " 'Chci',\n",
548
+ " 'Jiříčková',\n",
549
+ " 'Buchar',\n",
550
+ " 'Luciána',\n",
551
+ " 'Łukyn',\n",
552
+ " 'Adriána',\n",
553
+ " 'Lešáková',\n",
554
+ " 'Kopová',\n",
555
+ " 'Ordinace',\n",
556
+ " 'Radislava',\n",
557
+ " 'Handlová',\n",
558
+ " 'Pečenková',\n",
559
+ " 'Gejbina',\n",
560
+ " 'Čenda',\n",
561
+ " 'Holušová',\n",
562
+ " 'Konečny',\n",
563
+ " 'Drechsler',\n",
564
+ " 'Pivoňková',\n",
565
+ " 'Markovič',\n",
566
+ " 'Ráchel',\n",
567
+ " 'Šimicová',\n",
568
+ " 'Raduška',\n",
569
+ " 'Hrdá',\n",
570
+ " 'Hron',\n",
571
+ " 'Atletka',\n",
572
+ " 'Véja',\n",
573
+ " 'Adelká',\n",
574
+ " 'Proky',\n",
575
+ " 'Hladiš',\n",
576
+ " 'Velek',\n",
577
+ " 'Barbora',\n",
578
+ " 'Glaserová',\n",
579
+ " 'Nesládková',\n",
580
+ " 'Lubomír',\n",
581
+ " 'Skluzan',\n",
582
+ " 'Jajda',\n",
583
+ " 'Komrska',\n",
584
+ " 'Minarčíková',\n",
585
+ " 'Podešva',\n",
586
+ " 'Necas',\n",
587
+ " 'Vacek',\n",
588
+ " 'Ifét',\n",
589
+ " 'Myshka',\n",
590
+ " 'Chrástek',\n",
591
+ " 'Brousilová',\n",
592
+ " 'Luciik',\n",
593
+ " 'Nehasilová',\n",
594
+ " 'Petrek',\n",
595
+ " 'Burianova',\n",
596
+ " 'Jindřiška',\n",
597
+ " 'Sehnal',\n",
598
+ " 'Танька',\n",
599
+ " 'Žaba',\n",
600
+ " 'Tyfus',\n",
601
+ " 'Tvrdík',\n",
602
+ " 'Lucin',\n",
603
+ " 'Domína',\n",
604
+ " 'Kropáč',\n",
605
+ " 'Masáková',\n",
606
+ " 'Cepková',\n",
607
+ " 'Bobik',\n",
608
+ " 'Jičínská',\n",
609
+ " 'Kubko',\n",
610
+ " 'Tihelka',\n",
611
+ " 'Janiina',\n",
612
+ " 'Viták',\n",
613
+ " 'Cze',\n",
614
+ " 'Karhanová',\n",
615
+ " 'Далибор',\n",
616
+ " 'Mexičanka',\n",
617
+ " 'Hronova',\n",
618
+ " 'Armini',\n",
619
+ " 'Ulč',\n",
620
+ " 'Hajnová',\n",
621
+ " 'Pevný',\n",
622
+ " 'Dycky',\n",
623
+ " 'Zdislava',\n",
624
+ " 'Bohus',\n",
625
+ " 'Иванна',\n",
626
+ " 'Chomát',\n",
627
+ " 'Grulich',\n",
628
+ " 'Hradecká',\n",
629
+ " 'Medunová',\n",
630
+ " 'Stehlík',\n",
631
+ " 'Juda',\n",
632
+ " 'Keclík',\n",
633
+ " 'Balšánková',\n",
634
+ " 'Liscová',\n",
635
+ " 'Pittner',\n",
636
+ " 'Smigl',\n",
637
+ " 'Jenky',\n",
638
+ " 'Sailerová',\n",
639
+ " 'Klausová',\n",
640
+ " 'Hercik',\n",
641
+ " 'Obst',\n",
642
+ " 'Iluška',\n",
643
+ " 'Janotová',\n",
644
+ " 'Mládková',\n",
645
+ " 'Brejcha',\n",
646
+ " 'Kutlák',\n",
647
+ " 'Janíí',\n",
648
+ " 'Viťezslav',\n",
649
+ " 'Michková',\n",
650
+ " 'Mattes',\n",
651
+ " 'Režný',\n",
652
+ " 'Mihalik',\n",
653
+ " 'Simir',\n",
654
+ " 'Vyhnal',\n",
655
+ " 'Tauchmanová',\n",
656
+ " 'Domčáá',\n",
657
+ " 'Paia',\n",
658
+ " 'Klapka',\n",
659
+ " 'Frantysek',\n",
660
+ " 'Kohútová',\n",
661
+ " 'Ilii',\n",
662
+ " 'Czesław',\n",
663
+ " 'Pastorová',\n",
664
+ " 'Autonečy',\n",
665
+ " 'Jurko',\n",
666
+ " 'Koordinátor',\n",
667
+ " 'Blazkova',\n",
668
+ " 'Kaštánek',\n",
669
+ " 'Kyso',\n",
670
+ " 'Bouchal',\n",
671
+ " 'Lýda',\n",
672
+ " 'Bourbon',\n",
673
+ " 'Radoslav',\n",
674
+ " 'Константин',\n",
675
+ " 'Valtr',\n",
676
+ " 'Jarek',\n",
677
+ " 'Barushe',\n",
678
+ " 'Zetocha',\n",
679
+ " 'Ferry',\n",
680
+ " 'Sońa',\n",
681
+ " 'Volf',\n",
682
+ " 'Profi',\n",
683
+ " 'Tomášš',\n",
684
+ " 'Doubková',\n",
685
+ " 'Adissek',\n",
686
+ " 'Voloďa',\n",
687
+ " 'Čížková',\n",
688
+ " 'Mišalka',\n",
689
+ " 'Tezz',\n",
690
+ " 'Uhlíková',\n",
691
+ " 'Lánský',\n",
692
+ " 'Pítrs',\n",
693
+ " 'Mocek',\n",
694
+ " 'Geryk',\n",
695
+ " 'Radecek',\n",
696
+ " 'Andr',\n",
697
+ " 'Ivush',\n",
698
+ " 'Pelikán',\n",
699
+ " 'Kutějová',\n",
700
+ " 'Šárkys',\n",
701
+ " 'Řeznictví',\n",
702
+ " 'Vencovská',\n",
703
+ " 'Hubálková',\n",
704
+ " 'Rákosová',\n",
705
+ " 'Sapex',\n",
706
+ " 'Moudrý',\n",
707
+ " 'Mikulec',\n",
708
+ " 'Valesova',\n",
709
+ " 'Wojta',\n",
710
+ " 'Květoslav',\n",
711
+ " 'Bubeník',\n",
712
+ " 'Robenek',\n",
713
+ " 'Kvetus',\n",
714
+ " 'Masaryk',\n",
715
+ " 'Vavrová',\n",
716
+ " 'Špalková',\n",
717
+ " 'Lapík',\n",
718
+ " 'Chačik',\n",
719
+ " 'Siegl',\n",
720
+ " 'Jarolímek',\n",
721
+ " 'Aulická',\n",
722
+ " 'Kostka',\n",
723
+ " 'Уляна',\n",
724
+ " 'Konrád',\n",
725
+ " 'Smutny',\n",
726
+ " 'Patchwork',\n",
727
+ " 'Klán',\n",
728
+ " 'Peťule',\n",
729
+ " 'Ottova',\n",
730
+ " 'Ptak',\n",
731
+ " 'Martíínka',\n",
732
+ " 'Nicolie',\n",
733
+ " 'Gročová',\n",
734
+ " 'Čepičková',\n",
735
+ " 'Kubex',\n",
736
+ " 'Zvonková',\n",
737
+ " 'Hofer',\n",
738
+ " 'Nekvapil',\n",
739
+ " 'Majerová',\n",
740
+ " 'Josef',\n",
741
+ " 'Matějovská',\n",
742
+ " 'Dablik',\n",
743
+ " 'Suková',\n",
744
+ " 'Thành',\n",
745
+ " 'Šefčík',\n",
746
+ " 'Sáva',\n",
747
+ " 'Balcarová',\n",
748
+ " 'Vaclav',\n",
749
+ " 'Monila',\n",
750
+ " 'Teréz',\n",
751
+ " 'Matějíček',\n",
752
+ " 'Ífka',\n",
753
+ " 'Rigo',\n",
754
+ " 'Drobek',\n",
755
+ " 'Harvánková',\n",
756
+ " 'Jozko',\n",
757
+ " 'Ihar',\n",
758
+ " 'Denča',\n",
759
+ " 'Julie',\n",
760
+ " 'Řehulka',\n",
761
+ " 'Kulicka',\n",
762
+ " 'Rákosník',\n",
763
+ " 'Ewík',\n",
764
+ " 'Trojáková',\n",
765
+ " 'Šlegrová',\n",
766
+ " 'Fritschová',\n",
767
+ " 'Tadeker',\n",
768
+ " 'Pelán',\n",
769
+ " 'Ivulka',\n",
770
+ " 'Doskočilová',\n",
771
+ " 'Klacková',\n",
772
+ " 'Dilina',\n",
773
+ " 'Kacula',\n",
774
+ " 'Dobrovolna',\n",
775
+ " 'Wláďa',\n",
776
+ " 'Juřička',\n",
777
+ " 'Kvardová',\n",
778
+ " 'Moonika',\n",
779
+ " 'Drahunka',\n",
780
+ " 'Terí',\n",
781
+ " 'Laduška',\n",
782
+ " 'Janků',\n",
783
+ " 'Ureš',\n",
784
+ " 'Štourač',\n",
785
+ " 'Sotona',\n",
786
+ " 'Kubes',\n",
787
+ " 'Černovská',\n",
788
+ " 'Strmiska',\n",
789
+ " 'Terenc',\n",
790
+ " 'Niki',\n",
791
+ " 'Vovsová',\n",
792
+ " 'Zubek',\n",
793
+ " 'Náčelník',\n",
794
+ " 'Nella',\n",
795
+ " 'Klaruše',\n",
796
+ " 'Wiesner',\n",
797
+ " 'Václava',\n",
798
+ " 'Tresky',\n",
799
+ " 'Čáslava',\n",
800
+ " 'Vojtaa',\n",
801
+ " 'Bicková',\n",
802
+ " 'Hanez',\n",
803
+ " 'Vejražka',\n",
804
+ " 'Karlička',\n",
805
+ " 'Duchoň',\n",
806
+ " 'Slivka',\n",
807
+ " 'Milovník',\n",
808
+ " 'Košátková',\n",
809
+ " 'Hurníková',\n",
810
+ " 'Slušný',\n",
811
+ " 'Holý',\n",
812
+ " 'Cikánová',\n",
813
+ " 'Smokehouse',\n",
814
+ " 'Zdislav',\n",
815
+ " 'Pazderova',\n",
816
+ " 'Šádková',\n",
817
+ " 'Taťka',\n",
818
+ " 'Zděnda',\n",
819
+ " 'Tynulka',\n",
820
+ " 'Kubíík',\n",
821
+ " 'Seterm',\n",
822
+ " 'Miloň',\n",
823
+ " 'Krupková',\n",
824
+ " 'Budíková',\n",
825
+ " 'Nika',\n",
826
+ " 'Korous',\n",
827
+ " 'Šmejkal',\n",
828
+ " 'Harazimová',\n",
829
+ " 'Марянка',\n",
830
+ " 'Štemberk',\n",
831
+ " 'Honzik',\n",
832
+ " 'Serza',\n",
833
+ " 'Nekola',\n",
834
+ " 'Bayerova',\n",
835
+ " 'Jardys',\n",
836
+ " 'Nikysek',\n",
837
+ " 'Pavl',\n",
838
+ " 'Mackova',\n",
839
+ " 'Wojnarová',\n",
840
+ " 'Markýz',\n",
841
+ " 'Zabak',\n",
842
+ " 'Vystrčil',\n",
843
+ " 'Hanusova',\n",
844
+ " 'Lejsek',\n",
845
+ " 'Brixi',\n",
846
+ " 'Katchka',\n",
847
+ " 'Řádková',\n",
848
+ " 'Mykhaylo',\n",
849
+ " 'Břetis',\n",
850
+ " 'Rccg',\n",
851
+ " 'Blaho',\n",
852
+ " 'Berka',\n",
853
+ " 'Fajtová',\n",
854
+ " 'Sanča',\n",
855
+ " 'Koudelova',\n",
856
+ " 'Blažkova',\n",
857
+ " 'Lukášková',\n",
858
+ " 'Janderová',\n",
859
+ " 'Treml',\n",
860
+ " 'Opravář',\n",
861
+ " 'Zikánová',\n",
862
+ " 'Kropáčková',\n",
863
+ " 'Pertlová',\n",
864
+ " 'Kalíšek',\n",
865
+ " 'Halda',\n",
866
+ " 'Stáza',\n",
867
+ " 'Vosmík',\n",
868
+ " 'Mullerová',\n",
869
+ " 'Ládyn',\n",
870
+ " 'Kracíková',\n",
871
+ " 'Andrej',\n",
872
+ " 'Pfeiferová',\n",
873
+ " 'Bulínová',\n",
874
+ " 'Hiếu',\n",
875
+ " 'Náplavová',\n",
876
+ " 'Dudl',\n",
877
+ " 'Džany',\n",
878
+ " 'Valasek',\n",
879
+ " 'Kubíno',\n",
880
+ " 'Samko',\n",
881
+ " 'Horák',\n",
882
+ " 'Břicháček',\n",
883
+ " 'Fotopasti',\n",
884
+ " 'Ulrichová',\n",
885
+ " 'Tonislav',\n",
886
+ " 'Wendys',\n",
887
+ " 'Evica',\n",
888
+ " 'Zuzule',\n",
889
+ " 'Konopová',\n",
890
+ " 'Pečený',\n",
891
+ " 'Dynda',\n",
892
+ " 'Vlaďula',\n",
893
+ " 'Terézia',\n",
894
+ " 'Zapletal',\n",
895
+ " 'Cicko',\n",
896
+ " 'Zrzka',\n",
897
+ " 'Nikoolka',\n",
898
+ " 'Kucmochtová',\n",
899
+ " 'Zavřel',\n",
900
+ " 'Krajíček',\n",
901
+ " 'Simína',\n",
902
+ " 'Grace',\n",
903
+ " 'Buchtová',\n",
904
+ " 'Pečinka',\n",
905
+ " 'Terca',\n",
906
+ " 'Miretchek',\n",
907
+ " 'Sarlota',\n",
908
+ " 'Čonkova',\n",
909
+ " 'Zoufalá',\n",
910
+ " 'Zdisa',\n",
911
+ " 'Tygřík',\n",
912
+ " 'Mirek',\n",
913
+ " 'Христина',\n",
914
+ " 'Pavlína',\n",
915
+ " 'Brunnerová',\n",
916
+ " 'Danielka',\n",
917
+ " 'Nasťa',\n",
918
+ " 'Baronka',\n",
919
+ " 'Baštová',\n",
920
+ " 'Prroky',\n",
921
+ " 'Gertruda',\n",
922
+ " 'Wenca',\n",
923
+ " 'Hanz',\n",
924
+ " 'Safranek',\n",
925
+ " 'Votava',\n",
926
+ " 'Matoulek',\n",
927
+ " 'Barbarka',\n",
928
+ " 'Vydra',\n",
929
+ " 'Černohlávek',\n",
930
+ " 'Jeřábek',\n",
931
+ " 'Pešáková',\n",
932
+ " 'Fuxová',\n",
933
+ " 'Kvarda',\n",
934
+ " 'Milius',\n",
935
+ " 'Sirůčková',\n",
936
+ " 'Zko',\n",
937
+ " 'Jiřína',\n",
938
+ " 'Grimová',\n",
939
+ " 'Fotografka',\n",
940
+ " 'Владислава',\n",
941
+ " 'Pavlikova',\n",
942
+ " 'Košnarová',\n",
943
+ " 'Miris',\n",
944
+ " 'Danč',\n",
945
+ " 'Ležáková',\n",
946
+ " 'Češi',\n",
947
+ " 'Kvasnica',\n",
948
+ " 'Barták',\n",
949
+ " 'Klaruska',\n",
950
+ " 'Abraka',\n",
951
+ " 'Vojín',\n",
952
+ " 'Jenda',\n",
953
+ " 'Lenka',\n",
954
+ " 'Bárbra',\n",
955
+ " 'Husaberg',\n",
956
+ " 'Ondro',\n",
957
+ " 'Stanouš',\n",
958
+ " 'Pytrs',\n",
959
+ " 'Lektorka',\n",
960
+ " 'Bělohradský',\n",
961
+ " 'Dvorska',\n",
962
+ " 'Svab',\n",
963
+ " 'Šanda',\n",
964
+ " 'Toufarová',\n",
965
+ " 'Čestmír',\n",
966
+ " 'Pavlínečka',\n",
967
+ " 'Dudis',\n",
968
+ " 'Dadka',\n",
969
+ " 'Rychlík',\n",
970
+ " 'Fajkus',\n",
971
+ " 'Tennisa',\n",
972
+ " 'Maixnerová',\n",
973
+ " 'Hejčová',\n",
974
+ " 'Molnárová',\n",
975
+ " 'Talašová',\n",
976
+ " 'Janickova',\n",
977
+ " 'Peterková',\n",
978
+ " 'Kalčík',\n",
979
+ " 'Simik',\n",
980
+ " 'Oksi',\n",
981
+ " 'Korandová',\n",
982
+ " 'Filus',\n",
983
+ " 'Šinágl',\n",
984
+ " 'Peroutka',\n",
985
+ " 'Kluci',\n",
986
+ " 'Røman',\n",
987
+ " 'Mischak',\n",
988
+ " 'Kavalír',\n",
989
+ " 'Qnko',\n",
990
+ " 'Vičar',\n",
991
+ " 'Ninuš',\n",
992
+ " 'Český',\n",
993
+ " 'Autolakovna',\n",
994
+ " 'Ehrenbergerová',\n",
995
+ " 'Kopáč',\n",
996
+ " 'Štěpis',\n",
997
+ " 'Nosál',\n",
998
+ " 'Hornof',\n",
999
+ " 'Draci',\n",
1000
+ " 'Lindička',\n",
1001
+ " 'Hrabáková',\n",
1002
+ " 'Zlatnictví',\n",
1003
+ " 'Oluša',\n",
1004
+ " 'Sbor',\n",
1005
+ " 'Pjotr',\n",
1006
+ " 'Kleri',\n",
1007
+ " 'Gerenzel',\n",
1008
+ " 'Justýna',\n",
1009
+ " 'Анастасія',\n",
1010
+ " 'Dostálková',\n",
1011
+ " 'Pestra',\n",
1012
+ " 'Penziony',\n",
1013
+ " 'Lenii',\n",
1014
+ " 'Štefinka',\n",
1015
+ " 'Vales',\n",
1016
+ " 'Pegyna',\n",
1017
+ " 'Иван',\n",
1018
+ " 'Dzon',\n",
1019
+ " 'Tessinka',\n",
1020
+ " 'Dandý',\n",
1021
+ " 'Radek',\n",
1022
+ " 'Vodak',\n",
1023
+ " 'Berinka',\n",
1024
+ " 'Matys',\n",
1025
+ " 'Mergl',\n",
1026
+ " 'Славка',\n",
1027
+ " 'Horkel',\n",
1028
+ " 'Biodanza',\n",
1029
+ " 'Kudelová',\n",
1030
+ " 'Matees',\n",
1031
+ " 'Viktora',\n",
1032
+ " 'Lepka',\n",
1033
+ " 'Moňásek',\n",
1034
+ " 'Léňa',\n",
1035
+ " 'Míšinka',\n",
1036
+ " 'Kapek',\n",
1037
+ " 'Zeithamlová',\n",
1038
+ " 'Juklová',\n",
1039
+ " 'Lipták',\n",
1040
+ " 'Lucci',\n",
1041
+ " 'Evelýna',\n",
1042
+ " 'Pouzar',\n",
1043
+ " 'Grisa',\n",
1044
+ " 'Trubač',\n",
1045
+ " 'Zítová',\n",
1046
+ " 'Hujová',\n",
1047
+ " 'Dědina',\n",
1048
+ " 'Běloušková',\n",
1049
+ " 'Lankaš',\n",
1050
+ " 'Kubr',\n",
1051
+ " 'Brodský',\n",
1052
+ " 'Hanys',\n",
1053
+ " 'Kohout',\n",
1054
+ " 'Spacek',\n",
1055
+ " 'Touš',\n",
1056
+ " 'Gejza',\n",
1057
+ " 'Bezděk',\n",
1058
+ " 'Stratil',\n",
1059
+ " 'Hruskova',\n",
1060
+ " 'Libus',\n",
1061
+ " 'Čalounictví',\n",
1062
+ " 'Leunka',\n",
1063
+ " 'Nehtů',\n",
1064
+ " 'Tymofij',\n",
1065
+ " 'Suchánek',\n",
1066
+ " 'Vojťech',\n",
1067
+ " 'Háša',\n",
1068
+ " 'Matusko',\n",
1069
+ " 'Jasanský',\n",
1070
+ " 'Kawulok',\n",
1071
+ " 'Boudová',\n",
1072
+ " 'Janíčková',\n",
1073
+ " 'Netušil',\n",
1074
+ " 'Jánský',\n",
1075
+ " 'Kutil',\n",
1076
+ " 'Zdenča',\n",
1077
+ " 'Šelmek',\n",
1078
+ " 'Kubi',\n",
1079
+ " 'Geržová',\n",
1080
+ " 'Mirdos',\n",
1081
+ " 'Vendelin',\n",
1082
+ " 'Lidunka',\n",
1083
+ " 'Pipik',\n",
1084
+ " 'Klarki',\n",
1085
+ " 'Vyoralová',\n",
1086
+ " 'Baťa',\n",
1087
+ " 'Vodicka',\n",
1088
+ " 'Trávník',\n",
1089
+ " 'Kolařík',\n",
1090
+ " ...},\n",
1091
+ " 'ORG': {'Goldwater',\n",
1092
+ " 'Madrone',\n",
1093
+ " 'Duryea',\n",
1094
+ " 'Lumière',\n",
1095
+ " 'Hønsvald',\n",
1096
+ " 'University-South',\n",
1097
+ " 'Sinhgad',\n",
1098
+ " 'Isak',\n",
1099
+ " 'Chyron',\n",
1100
+ " 'Itman',\n",
1101
+ " 'Safi',\n",
1102
+ " 'Backhouse',\n",
1103
+ " 'Liquid',\n",
1104
+ " 'Bucharest',\n",
1105
+ " 'Nautronix',\n",
1106
+ " 'PEQAB',\n",
1107
+ " 'Thessaloniki',\n",
1108
+ " 'OSHCA',\n",
1109
+ " 'reseller',\n",
1110
+ " 'Bernois',\n",
1111
+ " 'Metalurg',\n",
1112
+ " 'Avnet',\n",
1113
+ " 'Kimep',\n",
1114
+ " 'Muitalægje',\n",
1115
+ " 'Occupied',\n",
1116
+ " 'Nogometni',\n",
1117
+ " 'Diaraf',\n",
1118
+ " 'Tufik',\n",
1119
+ " 'Doukkali',\n",
1120
+ " 'Lighting,',\n",
1121
+ " 'AETNA,',\n",
1122
+ " 'Oundle',\n",
1123
+ " 'ShroudFilm',\n",
1124
+ " 'Vývoj',\n",
1125
+ " 'Moment',\n",
1126
+ " 'R.H.',\n",
1127
+ " 'Hightower',\n",
1128
+ " 'Barberton',\n",
1129
+ " 'Curtiss',\n",
1130
+ " 'Cardiff',\n",
1131
+ " '4-5-6',\n",
1132
+ " 'Sturm,',\n",
1133
+ " 'Hizb-an-Nusra',\n",
1134
+ " 'Ogranichennoi',\n",
1135
+ " 'Murray',\n",
1136
+ " 'Transformator',\n",
1137
+ " 'Views',\n",
1138
+ " 'Fighting',\n",
1139
+ " 'Ashton',\n",
1140
+ " \"'umeke\",\n",
1141
+ " 'Vale',\n",
1142
+ " 'Daang',\n",
1143
+ " 'Energomontaj',\n",
1144
+ " 'Sedbergh',\n",
1145
+ " 'Auchmuty',\n",
1146
+ " 'Ovrya',\n",
1147
+ " 'Christophers',\n",
1148
+ " 'Leister',\n",
1149
+ " 'Pai',\n",
1150
+ " 'Cordium',\n",
1151
+ " 'Guion',\n",
1152
+ " 'Agora',\n",
1153
+ " 'Nambassa',\n",
1154
+ " 'Buddha',\n",
1155
+ " 'Peacebuilding',\n",
1156
+ " 'Adesh',\n",
1157
+ " 'Blulita',\n",
1158
+ " 'WellSpan',\n",
1159
+ " 'Marini',\n",
1160
+ " '378',\n",
1161
+ " 'Sevigne',\n",
1162
+ " 'Biab',\n",
1163
+ " 'marine',\n",
1164
+ " 'Amerasian',\n",
1165
+ " 'TS&W/Claymore',\n",
1166
+ " 'Defensor',\n",
1167
+ " 'Trud',\n",
1168
+ " 'Flugstoðir',\n",
1169
+ " 'Flash',\n",
1170
+ " 'Asherton',\n",
1171
+ " '1990s',\n",
1172
+ " 'Suksa',\n",
1173
+ " 'KKS',\n",
1174
+ " 'Marchant',\n",
1175
+ " 'Assestment',\n",
1176
+ " 'Bougainville',\n",
1177
+ " 'RFID',\n",
1178
+ " 'Zhoghov',\n",
1179
+ " 'Bhoomaraddi',\n",
1180
+ " 'Fulham',\n",
1181
+ " 'Sch/Armstrong',\n",
1182
+ " 'Méhaignerie',\n",
1183
+ " 'Karakola',\n",
1184
+ " 'Century',\n",
1185
+ " 'Perfecto',\n",
1186
+ " 'Funny',\n",
1187
+ " 'Redbrick',\n",
1188
+ " 'LKiNG',\n",
1189
+ " 'Explosives',\n",
1190
+ " 'Winnipeg',\n",
1191
+ " 'Wallmann',\n",
1192
+ " 'Tapachula',\n",
1193
+ " 'Architel',\n",
1194
+ " 'Ahed',\n",
1195
+ " 'Strojírenský',\n",
1196
+ " 'Isbister',\n",
1197
+ " 'Tele',\n",
1198
+ " 'Bihać',\n",
1199
+ " 'Hosack',\n",
1200
+ " 'Use',\n",
1201
+ " 'Daeyeon',\n",
1202
+ " 'Silveyville',\n",
1203
+ " 'NEW',\n",
1204
+ " 'Barnegat',\n",
1205
+ " 'Goujon',\n",
1206
+ " 'Minjok',\n",
1207
+ " 'Druge',\n",
1208
+ " 'NORAZ',\n",
1209
+ " 'Kwik-Fit',\n",
1210
+ " 'Cameroons',\n",
1211
+ " 'Sybertooth',\n",
1212
+ " 'Adista',\n",
1213
+ " 'Olivero',\n",
1214
+ " 'Hangchow',\n",
1215
+ " 'JAM',\n",
1216
+ " 'Thulinverken',\n",
1217
+ " 'Myrkdalen',\n",
1218
+ " 'Glenstal',\n",
1219
+ " 'Envision',\n",
1220
+ " 'Tollcross',\n",
1221
+ " 'Kinnard',\n",
1222
+ " 'Yonggi',\n",
1223
+ " 'Tormod',\n",
1224
+ " 'Sopoćani',\n",
1225
+ " 'Template:Richmond',\n",
1226
+ " 'Bluebox',\n",
1227
+ " 'Mother-one',\n",
1228
+ " 'Kidapawan',\n",
1229
+ " 'Tacloban',\n",
1230
+ " 'Fosston',\n",
1231
+ " 'Marghiloman',\n",
1232
+ " 'School-Freshman',\n",
1233
+ " 'Superintendent',\n",
1234
+ " 'Flowertown',\n",
1235
+ " 'Nefesh',\n",
1236
+ " 'Haluxvill',\n",
1237
+ " 'Carrfield',\n",
1238
+ " 'Lucentum',\n",
1239
+ " 'Yearling',\n",
1240
+ " 'Javelinas',\n",
1241
+ " 'Assistants-McAllen',\n",
1242
+ " 'Trendmasters',\n",
1243
+ " 'Albritton',\n",
1244
+ " 'Zora',\n",
1245
+ " 'Merdeka',\n",
1246
+ " 'Bema',\n",
1247
+ " 'Momišići',\n",
1248
+ " 'NATICC',\n",
1249
+ " 'Holdheim',\n",
1250
+ " 'Moneybarn',\n",
1251
+ " 'Batten',\n",
1252
+ " 'Izol,',\n",
1253
+ " 'Karpaty-2',\n",
1254
+ " 'Clown',\n",
1255
+ " 'Rayne',\n",
1256
+ " 'Souls',\n",
1257
+ " 'Krag-Juel-Vind-Frijs',\n",
1258
+ " 'solidarity',\n",
1259
+ " 'Hines-Caldwell',\n",
1260
+ " 'Jonub',\n",
1261
+ " 'Blata',\n",
1262
+ " 'partneři,',\n",
1263
+ " 'ministry',\n",
1264
+ " 'agronomiques',\n",
1265
+ " 'affairs',\n",
1266
+ " 'Cancer,',\n",
1267
+ " 'Balıkesirspor',\n",
1268
+ " 'NYLXS',\n",
1269
+ " 'Ruhrah',\n",
1270
+ " 'Satchel',\n",
1271
+ " 'Eclectic',\n",
1272
+ " 'Qeren',\n",
1273
+ " 'Xiniya',\n",
1274
+ " 'Mendes-France',\n",
1275
+ " 'Jugoslovenska',\n",
1276
+ " 'Spellacy',\n",
1277
+ " 'Pixmania',\n",
1278
+ " 'Yea',\n",
1279
+ " 'Omaga',\n",
1280
+ " 'Glenlara',\n",
1281
+ " 'ASME',\n",
1282
+ " 'HUB',\n",
1283
+ " 'Faye',\n",
1284
+ " 'Toroa',\n",
1285
+ " 'School-Somersworth',\n",
1286
+ " 'Staines',\n",
1287
+ " 'MISC',\n",
1288
+ " 'Intermarket',\n",
1289
+ " 'Chevalier',\n",
1290
+ " 'Bahnpolizei',\n",
1291
+ " 'Marple',\n",
1292
+ " 'Portlethen',\n",
1293
+ " 'McDeere',\n",
1294
+ " \"Zita's\",\n",
1295
+ " 'Telesforo',\n",
1296
+ " 'Leptondale',\n",
1297
+ " 'Desmond',\n",
1298
+ " 'Asu',\n",
1299
+ " 'Slana',\n",
1300
+ " 'Stila',\n",
1301
+ " 'Stara',\n",
1302
+ " 'Newham',\n",
1303
+ " 'Zakynthos',\n",
1304
+ " 'Hospitallers',\n",
1305
+ " 'Besiana',\n",
1306
+ " 'VVV-Venlo',\n",
1307
+ " 'Wessobrunn',\n",
1308
+ " 'Clareification',\n",
1309
+ " 'Manitex',\n",
1310
+ " 'Macks',\n",
1311
+ " 'départements',\n",
1312
+ " 'Oluwatomisin',\n",
1313
+ " 'Gépgyár',\n",
1314
+ " 'Cardinale',\n",
1315
+ " 'Latil',\n",
1316
+ " 'Seasonings',\n",
1317
+ " 'Åhléns',\n",
1318
+ " 'BANRO',\n",
1319
+ " 'John/Endicott',\n",
1320
+ " 'Cuatiá',\n",
1321
+ " 'Margai',\n",
1322
+ " 'Seanergy',\n",
1323
+ " 'Hanho',\n",
1324
+ " 'Tory',\n",
1325
+ " 'Psychotherapy',\n",
1326
+ " 'Plymouth-Canton',\n",
1327
+ " 'Hoefler',\n",
1328
+ " 'Vinton',\n",
1329
+ " 'Rabel',\n",
1330
+ " 'Mandaue',\n",
1331
+ " 'Shengkai',\n",
1332
+ " 'Adrien',\n",
1333
+ " 'Prospective',\n",
1334
+ " 'FERN',\n",
1335
+ " 'Velyka',\n",
1336
+ " 'Roke',\n",
1337
+ " 'TARBS',\n",
1338
+ " 'Infirm',\n",
1339
+ " 'Stono',\n",
1340
+ " 'Matapeake',\n",
1341
+ " 'Radim',\n",
1342
+ " 'Point-Rtc',\n",
1343
+ " 'Sagesse',\n",
1344
+ " 'Dur-O-Lite',\n",
1345
+ " 'Patricians',\n",
1346
+ " 'Sagle',\n",
1347
+ " 'Juras',\n",
1348
+ " 'Hamed',\n",
1349
+ " 'Liberty-Eylau',\n",
1350
+ " 'Plans',\n",
1351
+ " 'Padmaja',\n",
1352
+ " 'Fredon',\n",
1353
+ " 'Reproductions',\n",
1354
+ " 'Gensler',\n",
1355
+ " 'Salthill',\n",
1356
+ " 'Faktor',\n",
1357
+ " 'Earley',\n",
1358
+ " 'Waresboro',\n",
1359
+ " 'Sanitas',\n",
1360
+ " 'Trufab',\n",
1361
+ " 'MGR-Janaki',\n",
1362
+ " 'Fiske',\n",
1363
+ " 'Mavis',\n",
1364
+ " 'Dřevoeuro,',\n",
1365
+ " 'Organizations/Format',\n",
1366
+ " 'Boulin',\n",
1367
+ " 'Netopia',\n",
1368
+ " 'Basie',\n",
1369
+ " 'Kuraray',\n",
1370
+ " 'Seguin',\n",
1371
+ " 'Mossley',\n",
1372
+ " 'Dyer',\n",
1373
+ " 'Lawsuit',\n",
1374
+ " 'Omer,',\n",
1375
+ " 'Zemun',\n",
1376
+ " 'AuthenTec',\n",
1377
+ " 'Miryang',\n",
1378
+ " 'Academy-Edison',\n",
1379
+ " 'Academy-Onalaska',\n",
1380
+ " 'Post-Crescent',\n",
1381
+ " 'Forestview',\n",
1382
+ " 'Beaty',\n",
1383
+ " 'Goodwood',\n",
1384
+ " 'Martuni',\n",
1385
+ " 'Kilpauk',\n",
1386
+ " 'Mazon-Verona-Kinsman',\n",
1387
+ " 'Litana',\n",
1388
+ " 'Capoeira',\n",
1389
+ " 'Srinivas',\n",
1390
+ " 'NCB',\n",
1391
+ " 'Stoychev',\n",
1392
+ " 'Metroline',\n",
1393
+ " 'Corsair',\n",
1394
+ " 'IDBM',\n",
1395
+ " 'Gibbsboro',\n",
1396
+ " 'Speiderforbund',\n",
1397
+ " 'Vamida,',\n",
1398
+ " \"Shoney's\",\n",
1399
+ " 'Grewenow',\n",
1400
+ " 'Kingswinford',\n",
1401
+ " 'Ammons',\n",
1402
+ " 'Aught',\n",
1403
+ " 'thermique',\n",
1404
+ " 'GoYin',\n",
1405
+ " '282',\n",
1406
+ " 'Fouad',\n",
1407
+ " 'Čerpání',\n",
1408
+ " 'Clip',\n",
1409
+ " 'Chechnya',\n",
1410
+ " 'Hansa',\n",
1411
+ " 'Campobasso',\n",
1412
+ " 'Dandenong',\n",
1413
+ " 'Morigeau',\n",
1414
+ " 'Todos',\n",
1415
+ " 'Cibibon',\n",
1416
+ " 'Ayre',\n",
1417
+ " 'Monon',\n",
1418
+ " 'Proizvodnja,',\n",
1419
+ " 'Ericsson',\n",
1420
+ " 'Holek',\n",
1421
+ " 'Bentonit',\n",
1422
+ " 'Thurrock',\n",
1423
+ " 'NEI',\n",
1424
+ " 'Grownupgreen',\n",
1425
+ " 'estate,',\n",
1426
+ " 'guards',\n",
1427
+ " 'Tydfil',\n",
1428
+ " 'Shechen',\n",
1429
+ " 'Adami',\n",
1430
+ " 'Oley',\n",
1431
+ " 'Lineville-Clio',\n",
1432
+ " 'Pascagoula',\n",
1433
+ " 'Supertec',\n",
1434
+ " 'JKT',\n",
1435
+ " 'Lemont',\n",
1436
+ " 'Kingsbury',\n",
1437
+ " 'GIVE',\n",
1438
+ " 'Stor-Elvdal',\n",
1439
+ " 'Greynium',\n",
1440
+ " 'Bundoora',\n",
1441
+ " 'Bodhi',\n",
1442
+ " 'Biocountry',\n",
1443
+ " 'Reklamugynokseg',\n",
1444
+ " 'Waqaea',\n",
1445
+ " 'Hewan',\n",
1446
+ " 'Bodin',\n",
1447
+ " 'Ferndale',\n",
1448
+ " 'Eckhart',\n",
1449
+ " 'Electronique',\n",
1450
+ " 'Lumea',\n",
1451
+ " 'Nizamia',\n",
1452
+ " 'McNally',\n",
1453
+ " 'Hadnot',\n",
1454
+ " 'Possum',\n",
1455
+ " 'Ovarense',\n",
1456
+ " 'CIBC',\n",
1457
+ " 'AFA',\n",
1458
+ " 'Ungana-Afrika',\n",
1459
+ " 'Manatau',\n",
1460
+ " 'Cherokees',\n",
1461
+ " 'Graf-Munster-Gymnasium',\n",
1462
+ " 'Hogges',\n",
1463
+ " 'MacEachen',\n",
1464
+ " 'Weighted',\n",
1465
+ " 'Kennerly',\n",
1466
+ " 'Renner',\n",
1467
+ " 'V.I.S.A.',\n",
1468
+ " 'Distillaries',\n",
1469
+ " 'Transmountain',\n",
1470
+ " 'Tapia',\n",
1471
+ " 'Feromat',\n",
1472
+ " 'Hutsonville',\n",
1473
+ " 'Steinadler',\n",
1474
+ " 'Rehan',\n",
1475
+ " \"Macy's,\",\n",
1476
+ " 'Debartolo',\n",
1477
+ " 'Scotts',\n",
1478
+ " 'TRANSEARCH',\n",
1479
+ " 'Sagebrush',\n",
1480
+ " 'GASAG',\n",
1481
+ " 'Nicktoons',\n",
1482
+ " 'Maudrey',\n",
1483
+ " 'Frontrunners',\n",
1484
+ " 'Butterley',\n",
1485
+ " 'SAIC',\n",
1486
+ " 'Ruhani',\n",
1487
+ " 'Celerity',\n",
1488
+ " 'Soddo',\n",
1489
+ " 'Odee',\n",
1490
+ " 'Oklee',\n",
1491
+ " 'Maidstone',\n",
1492
+ " '46-Charles',\n",
1493
+ " 'Face',\n",
1494
+ " 'Macatawa',\n",
1495
+ " 'Ritar',\n",
1496
+ " 'Pleşu',\n",
1497
+ " 'Harrisonburg',\n",
1498
+ " 'Jayewardenepura',\n",
1499
+ " 'Cramer',\n",
1500
+ " 'Družstvo',\n",
1501
+ " 'Estância',\n",
1502
+ " 'Job',\n",
1503
+ " 'Zanardelli',\n",
1504
+ " 'Hevia',\n",
1505
+ " 'Manhasset',\n",
1506
+ " 'Cyberjaya',\n",
1507
+ " 'Elphick',\n",
1508
+ " 'Kratzert',\n",
1509
+ " 'Paso-Gridley',\n",
1510
+ " 'Harrassowitz',\n",
1511
+ " 'Záchranáři,',\n",
1512
+ " 'Brunton,',\n",
1513
+ " 'Semiconductor',\n",
1514
+ " 'Saddlers',\n",
1515
+ " 'Marcia',\n",
1516
+ " 'Template:Atlanta',\n",
1517
+ " 'Gend',\n",
1518
+ " 'Smit',\n",
1519
+ " 'Agogo',\n",
1520
+ " 'Kasthuri',\n",
1521
+ " 'Brewood',\n",
1522
+ " 'Lampre',\n",
1523
+ " 'Luffenham',\n",
1524
+ " 'Tshogdu',\n",
1525
+ " 'Brisson',\n",
1526
+ " 'Hepburn',\n",
1527
+ " 'ABTI-American',\n",
1528
+ " 'Daventry',\n",
1529
+ " 'Quantex',\n",
1530
+ " 'Bullants',\n",
1531
+ " 'Drinking',\n",
1532
+ " 'Enclave',\n",
1533
+ " 'Hankamer',\n",
1534
+ " 'Oliva',\n",
1535
+ " 'Cosmopolitan',\n",
1536
+ " 'Sint-Truidense',\n",
1537
+ " 'Draytonville',\n",
1538
+ " 'HES,',\n",
1539
+ " 'Westrans',\n",
1540
+ " 'Politico',\n",
1541
+ " 'Lincroft',\n",
1542
+ " 'Deltras',\n",
1543
+ " 'Hiraben',\n",
1544
+ " 'Mayan',\n",
1545
+ " 'Reactrix',\n",
1546
+ " 'Zeferino',\n",
1547
+ " 'Chashama',\n",
1548
+ " 'Mahilyou',\n",
1549
+ " 'Stoph',\n",
1550
+ " 'Dabugaon',\n",
1551
+ " 'Kuttichal',\n",
1552
+ " 'Thompsen',\n",
1553
+ " 'Bharali',\n",
1554
+ " 'Asda',\n",
1555
+ " 'Tikveš',\n",
1556
+ " 'Eschen/Mauren',\n",
1557
+ " 'Collinswood',\n",
1558
+ " 'Cry',\n",
1559
+ " 'Youngs',\n",
1560
+ " \"Malley's\",\n",
1561
+ " 'Renwood',\n",
1562
+ " 'Rajarshi',\n",
1563
+ " 'Graveraet',\n",
1564
+ " 'Evaluation',\n",
1565
+ " 'Kutir',\n",
1566
+ " \"d'opinion\",\n",
1567
+ " 'Havras,',\n",
1568
+ " 'Tuerto',\n",
1569
+ " 'Malbaza',\n",
1570
+ " 'Michael-Albertville',\n",
1571
+ " 'Mohanlal',\n",
1572
+ " 'Rotem',\n",
1573
+ " 'Borman',\n",
1574
+ " 'Josef',\n",
1575
+ " 'Montague',\n",
1576
+ " \"Nature's\",\n",
1577
+ " 'Révolutionnaire',\n",
1578
+ " 'Nobuyuki',\n",
1579
+ " 'Lucama',\n",
1580
+ " 'Gainesville,inc.',\n",
1581
+ " 'Gigurtu',\n",
1582
+ " 'Tozer',\n",
1583
+ " 'Lexus',\n",
1584
+ " 'Endo',\n",
1585
+ " 'Venkateshwara',\n",
1586
+ " 'PLANSEE',\n",
1587
+ " 'Hansabank',\n",
1588
+ " 'E-15',\n",
1589
+ " 'Saint-Louis',\n",
1590
+ " 'Istmo',\n",
1591
+ " 'Herber',\n",
1592
+ " 'Magnet',\n",
1593
+ " 'ECRYPT',\n",
1594
+ " 'Delfield',\n",
1595
+ " 'Southbound',\n",
1596
+ " 'Metallurgy',\n",
1597
+ " 'Bahir',\n",
1598
+ " 'vihar',\n",
1599
+ " 'Taipans',\n",
1600
+ " 'Fisler',\n",
1601
+ " 'Viking',\n",
1602
+ " 'Skog',\n",
1603
+ " 'Faucon',\n",
1604
+ " 'Wawaloam',\n",
1605
+ " 'PMC',\n",
1606
+ " 'Weeping',\n",
1607
+ " 'names',\n",
1608
+ " 'JONELTA',\n",
1609
+ " 'Missouri-Columbia',\n",
1610
+ " 'Potts',\n",
1611
+ " 'Certified',\n",
1612
+ " 'Azania',\n",
1613
+ " 'Arrowsmith',\n",
1614
+ " 'Karbalaa',\n",
1615
+ " 'Apostle',\n",
1616
+ " 'Stunners',\n",
1617
+ " 'Clydesdale',\n",
1618
+ " 'Kasuri',\n",
1619
+ " 'articles/Cooperatives',\n",
1620
+ " 'CDs',\n",
1621
+ " 'Arapahoe',\n",
1622
+ " 'Russian-Armenian',\n",
1623
+ " 'Wannaque',\n",
1624
+ " 'Harder',\n",
1625
+ " 'Flames',\n",
1626
+ " 'Warren-Alvarado-Oslo',\n",
1627
+ " 'Mirpur',\n",
1628
+ " 'Izard',\n",
1629
+ " 'Stipula',\n",
1630
+ " 'Diversifed',\n",
1631
+ " 'S-Mid',\n",
1632
+ " \"Lot's\",\n",
1633
+ " 'Sorbara',\n",
1634
+ " 'Gregorio',\n",
1635
+ " 'Antech',\n",
1636
+ " 'Garndiffaith',\n",
1637
+ " 'Illustrated',\n",
1638
+ " 'Halawa',\n",
1639
+ " 'planet',\n",
1640
+ " 'Aigieas',\n",
1641
+ " 'Peake',\n",
1642
+ " 'Emmalena',\n",
1643
+ " 'Sexsmith',\n",
1644
+ " 'Aftec-Palisades',\n",
1645
+ " 'Clonfert',\n",
1646
+ " 'Gebrüder',\n",
1647
+ " 'Defenses',\n",
1648
+ " 'Sonicare',\n",
1649
+ " 'Boström',\n",
1650
+ " 'Vetenskap',\n",
1651
+ " 'Gulfport',\n",
1652
+ " 'Piccola',\n",
1653
+ " 'LeZion',\n",
1654
+ " 'Polirom',\n",
1655
+ " 'Bandalag',\n",
1656
+ " 'Hartsfield',\n",
1657
+ " 'Schoenly',\n",
1658
+ " 'Gollancz',\n",
1659
+ " 'Dryburgh',\n",
1660
+ " 'Datastorm',\n",
1661
+ " 'REXCAPITAL',\n",
1662
+ " 'Apocalypse',\n",
1663
+ " 'Steczkowski',\n",
1664
+ " 'Tursib',\n",
1665
+ " 'Mouchel',\n",
1666
+ " 'Es-Company',\n",
1667
+ " 'Badshot',\n",
1668
+ " 'Kerasotes',\n",
1669
+ " 'Menzies',\n",
1670
+ " 'Denia',\n",
1671
+ " 'Clarins',\n",
1672
+ " 'Maricourt',\n",
1673
+ " 'Rooks',\n",
1674
+ " 'Yard',\n",
1675
+ " 'Sohar',\n",
1676
+ " 'ERGOPRAXIS',\n",
1677
+ " 'Muerte',\n",
1678
+ " '2006-07',\n",
1679
+ " 'Annual',\n",
1680
+ " '477th',\n",
1681
+ " 'DIRP,',\n",
1682
+ " 'CO-OP',\n",
1683
+ " 'Hillingdon',\n",
1684
+ " 'Baykal',\n",
1685
+ " 'HannStar',\n",
1686
+ " 'Knickerbocker',\n",
1687
+ " 'Covadonga',\n",
1688
+ " 'Positron!',\n",
1689
+ " 'Impuzamugambi',\n",
1690
+ " 'Bodydonnas',\n",
1691
+ " 'Bachoco',\n",
1692
+ " 'Recreation',\n",
1693
+ " 'Mqabba',\n",
1694
+ " 'Guinyard',\n",
1695
+ " 'MPPJ',\n",
1696
+ " 'Telstar',\n",
1697
+ " 'Rubery',\n",
1698
+ " 'Priština',\n",
1699
+ " 'Infrastructur',\n",
1700
+ " 'Apotex',\n",
1701
+ " 'Redstone',\n",
1702
+ " 'Timisoara',\n",
1703
+ " 'Climsland',\n",
1704
+ " 'VirnetX',\n",
1705
+ " 'Bowl',\n",
1706
+ " 'Iiro',\n",
1707
+ " 'Minneapolis',\n",
1708
+ " \"d'Ingénieurs\",\n",
1709
+ " 'UMS-Wright',\n",
1710
+ " 'Bourg-Péronnas',\n",
1711
+ " 'Daňka,',\n",
1712
+ " 'Miner',\n",
1713
+ " 'Kordcarbon,',\n",
1714
+ " 'DEKA',\n",
1715
+ " 'Institute-Cordova',\n",
1716
+ " 'Ybarra',\n",
1717
+ " 'Mirek',\n",
1718
+ " 'Sambo',\n",
1719
+ " 'Clairton',\n",
1720
+ " 'SAIPA',\n",
1721
+ " 'Incognito',\n",
1722
+ " 'HMSI',\n",
1723
+ " 'Zip.ca',\n",
1724
+ " 'Sucessores,',\n",
1725
+ " 'ONFEM',\n",
1726
+ " 'City-As-School',\n",
1727
+ " 'Huntington-Surrey',\n",
1728
+ " 'Tolleson',\n",
1729
+ " 'Hamilton-Maineville',\n",
1730
+ " 'Swint',\n",
1731
+ " 'Karabükspor',\n",
1732
+ " 'Demolice,',\n",
1733
+ " 'Ravailler',\n",
1734
+ " 'Latson',\n",
1735
+ " 'Primus',\n",
1736
+ " 'Jwp',\n",
1737
+ " 'Základní',\n",
1738
+ " 'Vaudreuil',\n",
1739
+ " 'HITEC',\n",
1740
+ " 'Netherhall',\n",
1741
+ " 'Corridor',\n",
1742
+ " 'Jabalain',\n",
1743
+ " 'Corymore',\n",
1744
+ " 'Fortissimo',\n",
1745
+ " 'Liliam',\n",
1746
+ " 'Nilsson',\n",
1747
+ " 'Etnyre',\n",
1748
+ " 'Maranhao',\n",
1749
+ " 'Hainan',\n",
1750
+ " 'Domo',\n",
1751
+ " 'Rychlík',\n",
1752
+ " 'Reproservis',\n",
1753
+ " 'MTA',\n",
1754
+ " 'Konekta',\n",
1755
+ " 'Permafrost',\n",
1756
+ " 'Aktivit',\n",
1757
+ " 'Stjarnan',\n",
1758
+ " 'Peck',\n",
1759
+ " 'Epoch',\n",
1760
+ " 'Eurohypo',\n",
1761
+ " 'Cardiel',\n",
1762
+ " 'Metaflow',\n",
1763
+ " 'Fairey',\n",
1764
+ " 'Mondial',\n",
1765
+ " 'Katayama',\n",
1766
+ " 'Hempel',\n",
1767
+ " \"d'Aix\",\n",
1768
+ " 'Lateko',\n",
1769
+ " 'Tommie',\n",
1770
+ " 'Marrs',\n",
1771
+ " 'Soriano-Pedroso',\n",
1772
+ " 'Cavanaugh',\n",
1773
+ " 'ICPR',\n",
1774
+ " 'Finn',\n",
1775
+ " 'Mozarteum',\n",
1776
+ " 'Tarleton',\n",
1777
+ " 'CAFE,',\n",
1778
+ " 'Ashtarak',\n",
1779
+ " 'Ohlange',\n",
1780
+ " 'Kosan',\n",
1781
+ " 'delo',\n",
1782
+ " 'Crvenkovski',\n",
1783
+ " 'Celaya',\n",
1784
+ " 'Yasufumi',\n",
1785
+ " 'Goel',\n",
1786
+ " 'développement',\n",
1787
+ " 'Mukhtar',\n",
1788
+ " 'Institute-Westminster',\n",
1789
+ " 'Tenor',\n",
1790
+ " 'Kickapoo',\n",
1791
+ " 'Sullins',\n",
1792
+ " 'Paulson',\n",
1793
+ " 'Vales',\n",
1794
+ " 'Gerwani',\n",
1795
+ " 'Arusha',\n",
1796
+ " 'Sontag',\n",
1797
+ " 'Mussa',\n",
1798
+ " 'Rivière-Pilote',\n",
1799
+ " 'XING',\n",
1800
+ " 'Schott',\n",
1801
+ " 'NASCAR',\n",
1802
+ " 'Psychology,',\n",
1803
+ " 'Bahujan',\n",
1804
+ " 'UNIX',\n",
1805
+ " 'Bif',\n",
1806
+ " 'Mountz',\n",
1807
+ " 'Naresuan',\n",
1808
+ " 'Centrair',\n",
1809
+ " 'School-Harrisburg',\n",
1810
+ " 'Akella',\n",
1811
+ " 'Gizz,',\n",
1812
+ " 'Gzira',\n",
1813
+ " 'Historiska',\n",
1814
+ " 'Cathkin',\n",
1815
+ " 'Nationaliste',\n",
1816
+ " 'Zimmermann',\n",
1817
+ " 'Germaniawerft',\n",
1818
+ " 'Tampico',\n",
1819
+ " 'Mbujimayi',\n",
1820
+ " 'Aidano,',\n",
1821
+ " 'Ponferradina',\n",
1822
+ " 'Sunbirds',\n",
1823
+ " 'Stottlemyer',\n",
1824
+ " 'Glorian',\n",
1825
+ " 'Taylor/Kirklane',\n",
1826
+ " 'Canine',\n",
1827
+ " 'Landers',\n",
1828
+ " 'Serikat',\n",
1829
+ " 'Logistics',\n",
1830
+ " 'Elemetary',\n",
1831
+ " 'EUROPE,',\n",
1832
+ " 'Northolt',\n",
1833
+ " 'Glanc',\n",
1834
+ " 'Clary',\n",
1835
+ " 'Alperton',\n",
1836
+ " 'Mcgaheysville',\n",
1837
+ " 'Holyrood',\n",
1838
+ " 'FirstCity',\n",
1839
+ " 'Impe',\n",
1840
+ " 'Harpenden',\n",
1841
+ " 'Suchánek',\n",
1842
+ " 'Subex',\n",
1843
+ " 'GrowHow',\n",
1844
+ " 'Dartmoor',\n",
1845
+ " 'Fifty-Sixth',\n",
1846
+ " 'Palee',\n",
1847
+ " 'Disappeared',\n",
1848
+ " 'Grierson',\n",
1849
+ " 'Revap,',\n",
1850
+ " 'al-Attar',\n",
1851
+ " 'Unite',\n",
1852
+ " 'Nevin',\n",
1853
+ " 'Yverdon-Sport',\n",
1854
+ " 'GalGael',\n",
1855
+ " 'Vendelin',\n",
1856
+ " 'Coola',\n",
1857
+ " 'Kepong',\n",
1858
+ " 'Gen¹³',\n",
1859
+ " 'Ryans',\n",
1860
+ " 'Maxwelltown',\n",
1861
+ " 'Impak',\n",
1862
+ " 'pro.',\n",
1863
+ " 'planning',\n",
1864
+ " 'Brent',\n",
1865
+ " 'Academic',\n",
1866
+ " 'Hillerich',\n",
1867
+ " 'Zentral-Dombauverein',\n",
1868
+ " 'Marr',\n",
1869
+ " 'Harp',\n",
1870
+ " 'Pulmuone',\n",
1871
+ " 'Tziona',\n",
1872
+ " 'Mccarter',\n",
1873
+ " 'Lyonpo',\n",
1874
+ " 'I.T.',\n",
1875
+ " 'Sivagiri',\n",
1876
+ " 'Aptidon',\n",
1877
+ " 'Malibu',\n",
1878
+ " 'Limarko',\n",
1879
+ " 'Pinero',\n",
1880
+ " 'Novator',\n",
1881
+ " 'Stephens',\n",
1882
+ " \"Colt's\",\n",
1883
+ " 'Fovu',\n",
1884
+ " 'SIPoL',\n",
1885
+ " 'Montreux',\n",
1886
+ " 'Minne',\n",
1887
+ " 'DSN,',\n",
1888
+ " 'Farband',\n",
1889
+ " 'Resistances',\n",
1890
+ " 'Mance',\n",
1891
+ " 'Nestucca',\n",
1892
+ " 'Bremond',\n",
1893
+ " 'Precis',\n",
1894
+ " 'Harsanyi',\n",
1895
+ " 'Ryders',\n",
1896
+ " 'Cinenova',\n",
1897
+ " 'Maths',\n",
1898
+ " 'RSPORTS',\n",
1899
+ " 'Damelin',\n",
1900
+ " 'Omoljica',\n",
1901
+ " 'Cushing',\n",
1902
+ " 'Bergfrid',\n",
1903
+ " 'Swagelok',\n",
1904
+ " 'netball',\n",
1905
+ " 'Myślenice',\n",
1906
+ " \"Hook's\",\n",
1907
+ " 'Valves',\n",
1908
+ " 'Wilson,',\n",
1909
+ " 'Wilbraham',\n",
1910
+ " 'Eurocom',\n",
1911
+ " 'Pabriks',\n",
1912
+ " 'Sequah',\n",
1913
+ " 'Vorachith',\n",
1914
+ " 'Benjamin',\n",
1915
+ " 'Poplin',\n",
1916
+ " 'Arrant-Light',\n",
1917
+ " 'Ostružnica',\n",
1918
+ " 'Calyon',\n",
1919
+ " 'KLH',\n",
1920
+ " 'BofI',\n",
1921
+ " 'M-League',\n",
1922
+ " 'chaparratique',\n",
1923
+ " 'Kauno',\n",
1924
+ " 'Delphic',\n",
1925
+ " 'Shishi',\n",
1926
+ " 'Hornmed,',\n",
1927
+ " 'Marmot',\n",
1928
+ " 'Okolona',\n",
1929
+ " 'Tele-Network',\n",
1930
+ " 'Cowlishaw',\n",
1931
+ " 'Ghazl',\n",
1932
+ " 'Inuksuk',\n",
1933
+ " 'Greens/Green',\n",
1934
+ " 'Optech',\n",
1935
+ " 'Equitable',\n",
1936
+ " 'Datenschlag',\n",
1937
+ " 'Derkach',\n",
1938
+ " 'Summ',\n",
1939
+ " 'BroadVoice',\n",
1940
+ " 'Langton',\n",
1941
+ " 'Arcelor',\n",
1942
+ " 'Yosemite',\n",
1943
+ " 'Vaillante',\n",
1944
+ " 'Mussellman',\n",
1945
+ " 'Frulact',\n",
1946
+ " 'mont,',\n",
1947
+ " 'Middelthon',\n",
1948
+ " 'Template:Rutgers',\n",
1949
+ " 'Nanzan',\n",
1950
+ " 'Bashford',\n",
1951
+ " 'Tampa,',\n",
1952
+ " 'Illusion',\n",
1953
+ " 'Metrovile',\n",
1954
+ " 'J-Link',\n",
1955
+ " 'Talmadge',\n",
1956
+ " 'ESR',\n",
1957
+ " 'Catamount',\n",
1958
+ " 'Fairland',\n",
1959
+ " 'wal-Jihad',\n",
1960
+ " 'Tickson',\n",
1961
+ " 'Bimbo',\n",
1962
+ " 'Toward',\n",
1963
+ " 'Mauston',\n",
1964
+ " 'Wasserstein',\n",
1965
+ " 'Royce',\n",
1966
+ " 'Strathcona',\n",
1967
+ " 'Wide',\n",
1968
+ " 'Hairgrove',\n",
1969
+ " 'BG-TURK',\n",
1970
+ " 'Ahasanullah',\n",
1971
+ " 'Tryggvi',\n",
1972
+ " 'Altern',\n",
1973
+ " 'Mathey',\n",
1974
+ " 'Photronics',\n",
1975
+ " 'Dualit',\n",
1976
+ " 'Nullsoft',\n",
1977
+ " 'Marcher',\n",
1978
+ " 'Talas',\n",
1979
+ " 'Boals',\n",
1980
+ " 'Polytechnická',\n",
1981
+ " 'Sante',\n",
1982
+ " 'Acreage',\n",
1983
+ " 'Ecotricity',\n",
1984
+ " 'Guotai',\n",
1985
+ " 'Compass',\n",
1986
+ " 'Ceferino',\n",
1987
+ " 'Image:NVCClogo.gif',\n",
1988
+ " 'Proto-Cathedral',\n",
1989
+ " 'Rensselaer',\n",
1990
+ " \"Sant'Antonio\",\n",
1991
+ " 'Lenca',\n",
1992
+ " 'Allgood',\n",
1993
+ " 'Chidhood',\n",
1994
+ " 'Daewon',\n",
1995
+ " 'Ichabod',\n",
1996
+ " 'Starvation',\n",
1997
+ " 'Oppigards',\n",
1998
+ " 'PostEurop',\n",
1999
+ " 'Enron',\n",
2000
+ " 'Heure',\n",
2001
+ " 'Extreme-Park',\n",
2002
+ " 'MIS',\n",
2003
+ " 'Doe',\n",
2004
+ " 'Hanburys',\n",
2005
+ " 'Měřičkova',\n",
2006
+ " 'Siege',\n",
2007
+ " 'Schoollaire',\n",
2008
+ " 'Caledonians',\n",
2009
+ " 'Bargalló',\n",
2010
+ " 'Drumsurn',\n",
2011
+ " 'Mellbye',\n",
2012
+ " 'Coyote',\n",
2013
+ " 'Issaries,',\n",
2014
+ " 'Reiffton',\n",
2015
+ " 'Hung',\n",
2016
+ " 'Mukka',\n",
2017
+ " 'Makedonski',\n",
2018
+ " 'Consolation',\n",
2019
+ " 'Friends',\n",
2020
+ " 'F.X.',\n",
2021
+ " 'Churchill',\n",
2022
+ " 'Metalock',\n",
2023
+ " 'Fellowships',\n",
2024
+ " 'Dalen',\n",
2025
+ " 'Pro-Am',\n",
2026
+ " 'Oddfellows',\n",
2027
+ " 'Haglöfs',\n",
2028
+ " 'Folkestad',\n",
2029
+ " 'NeoPac',\n",
2030
+ " 'Rajhans',\n",
2031
+ " 'Castleblayney',\n",
2032
+ " 'Shawnigan',\n",
2033
+ " 'Pristis',\n",
2034
+ " 'Conservators',\n",
2035
+ " 'Scoil',\n",
2036
+ " 'Draude',\n",
2037
+ " 'Peat',\n",
2038
+ " 'MBM',\n",
2039
+ " 'Capistrano-Laguna',\n",
2040
+ " 'Yabra',\n",
2041
+ " 'Dateland',\n",
2042
+ " 'Literarcy',\n",
2043
+ " 'Chem-Mod',\n",
2044
+ " 'MphasiS',\n",
2045
+ " 'Landeck',\n",
2046
+ " 'Ventures,',\n",
2047
+ " 'Shanker',\n",
2048
+ " 'Knocklyon',\n",
2049
+ " 'NEP',\n",
2050
+ " 'Ana',\n",
2051
+ " 'Freiheit',\n",
2052
+ " 'Vitoria',\n",
2053
+ " 'Understanding',\n",
2054
+ " 'Ingrad',\n",
2055
+ " 'Nesom',\n",
2056
+ " 'AmbA',\n",
2057
+ " 'E.W.',\n",
2058
+ " 'Actioneer,',\n",
2059
+ " 'Lenoir-Rhyne',\n",
2060
+ " 'Datakonsult',\n",
2061
+ " 'Glimcher',\n",
2062
+ " 'VIA,',\n",
2063
+ " 'Wikipedia:Forum',\n",
2064
+ " 'Institute-Oklahoma',\n",
2065
+ " 'Isd-Local',\n",
2066
+ " 'Interboro',\n",
2067
+ " 'Diagonal',\n",
2068
+ " 'Acrylic',\n",
2069
+ " 'Castleson',\n",
2070
+ " 'Jippi',\n",
2071
+ " 'Purpose',\n",
2072
+ " 'Kinabalu',\n",
2073
+ " 'Takapuna',\n",
2074
+ " 'Fil',\n",
2075
+ " 'Osawatomie',\n",
2076
+ " 'Accident',\n",
2077
+ " \"O'Dempseys\",\n",
2078
+ " 'Stategies',\n",
2079
+ " 'Recepcao',\n",
2080
+ " 'Kaptol',\n",
2081
+ " 'Monell',\n",
2082
+ " 'Melters',\n",
2083
+ " 'Garbadale',\n",
2084
+ " 'Melanesia',\n",
2085
+ " 'Haub',\n",
2086
+ " 'Lázár',\n",
2087
+ " 'Ash',\n",
2088
+ " 'RMD',\n",
2089
+ " 'Vernon-Verona-Sherrill',\n",
2090
+ " \"Pearson's\",\n",
2091
+ " ...},\n",
2092
+ " 'LOC': {'Ningerova',\n",
2093
+ " 'Stolzové',\n",
2094
+ " 'Domcích',\n",
2095
+ " 'Sedlická',\n",
2096
+ " 'Konradova',\n",
2097
+ " 'Dělová',\n",
2098
+ " 'Bucharest',\n",
2099
+ " 'Lipoltov',\n",
2100
+ " 'Andělka',\n",
2101
+ " 'Náplavní',\n",
2102
+ " 'Hlupenov',\n",
2103
+ " 'Radětínská',\n",
2104
+ " 'Kozojedy',\n",
2105
+ " 'Hrádecký',\n",
2106
+ " 'Koledníku',\n",
2107
+ " 'Čajkovského',\n",
2108
+ " 'háječku',\n",
2109
+ " 'Prstná',\n",
2110
+ " 'Dluhoště',\n",
2111
+ " 'Jejkov',\n",
2112
+ " 'Korábu',\n",
2113
+ " 'Cardiff',\n",
2114
+ " 'Lipinka',\n",
2115
+ " 'Judytky',\n",
2116
+ " 'Baizy',\n",
2117
+ " 'Dlažební',\n",
2118
+ " 'Malířská',\n",
2119
+ " 'Pešatova',\n",
2120
+ " 'Lesotho',\n",
2121
+ " 'Kosmá',\n",
2122
+ " 'Förstrova',\n",
2123
+ " 'Větruši',\n",
2124
+ " 'Vodičků',\n",
2125
+ " 'Čumpelíkova',\n",
2126
+ " 'Klavarská',\n",
2127
+ " 'Pramene',\n",
2128
+ " 'Lidušky',\n",
2129
+ " 'Vilémovská',\n",
2130
+ " 'Pospíšilovo',\n",
2131
+ " 'Kaštanu',\n",
2132
+ " 'Bělidla',\n",
2133
+ " 'Ratměřice',\n",
2134
+ " 'Radoušova',\n",
2135
+ " 'Traxlerova',\n",
2136
+ " 'tůních',\n",
2137
+ " 'Přečkova',\n",
2138
+ " 'Zvěřínská',\n",
2139
+ " 'Netlukám',\n",
2140
+ " 'Potštát',\n",
2141
+ " 'Osobovy',\n",
2142
+ " 'Farkám',\n",
2143
+ " 'Lassallova',\n",
2144
+ " 'Vantrokách',\n",
2145
+ " 'Desátém',\n",
2146
+ " 'Struhaře',\n",
2147
+ " 'Vítova',\n",
2148
+ " 'Lohrera',\n",
2149
+ " 'Pavlišovská',\n",
2150
+ " 'Milošice',\n",
2151
+ " 'Závisti',\n",
2152
+ " 'Pelikána',\n",
2153
+ " 'Rajhrad',\n",
2154
+ " 'Valenty',\n",
2155
+ " 'Rotherham',\n",
2156
+ " 'Zbába',\n",
2157
+ " 'Sborová',\n",
2158
+ " 'Senotín',\n",
2159
+ " 'Otmíče',\n",
2160
+ " 'Olivy',\n",
2161
+ " 'Zdymadlu',\n",
2162
+ " 'Lubě',\n",
2163
+ " 'Blata',\n",
2164
+ " 'Donín',\n",
2165
+ " 'roklí',\n",
2166
+ " 'Pernštejnské',\n",
2167
+ " 'Gustava',\n",
2168
+ " 'Malešická',\n",
2169
+ " 'Ronzovy',\n",
2170
+ " 'Bílka',\n",
2171
+ " 'Fišpance',\n",
2172
+ " 'skladům',\n",
2173
+ " 'Vleku',\n",
2174
+ " 'Pokutická',\n",
2175
+ " 'Koněspřežky',\n",
2176
+ " 'Koutecká',\n",
2177
+ " 'Malínky',\n",
2178
+ " 'Kulhavého',\n",
2179
+ " 'Ledenice',\n",
2180
+ " 'Rozvadovská',\n",
2181
+ " 'Kostrbova',\n",
2182
+ " 'Přeložky',\n",
2183
+ " 'vysočině',\n",
2184
+ " 'Plavínová',\n",
2185
+ " 'Štúrova',\n",
2186
+ " 'Špičníku',\n",
2187
+ " 'Grunty',\n",
2188
+ " 'Postřelmov',\n",
2189
+ " 'Při',\n",
2190
+ " 'Masných',\n",
2191
+ " 'Kusého',\n",
2192
+ " 'Jérez',\n",
2193
+ " 'Trstenická',\n",
2194
+ " 'Deštná',\n",
2195
+ " 'Spálenky',\n",
2196
+ " 'Paříž',\n",
2197
+ " 'Nebovazy',\n",
2198
+ " 'Bzdince',\n",
2199
+ " 'Prokopov',\n",
2200
+ " 'Plk.',\n",
2201
+ " 'Zbenice',\n",
2202
+ " 'Sněmovní',\n",
2203
+ " 'Hrázek',\n",
2204
+ " 'Kondrac',\n",
2205
+ " 'Vodnická',\n",
2206
+ " 'Kanada',\n",
2207
+ " 'Nýrsko',\n",
2208
+ " 'Radim',\n",
2209
+ " 'Lermontovova',\n",
2210
+ " 'Ploiesti',\n",
2211
+ " 'Knín',\n",
2212
+ " 'Slov.',\n",
2213
+ " 'Blanice',\n",
2214
+ " 'Pražákova',\n",
2215
+ " 'Moutnice',\n",
2216
+ " 'Bartoňův',\n",
2217
+ " 'Teslova',\n",
2218
+ " 'Rozsedly',\n",
2219
+ " 'Kolné',\n",
2220
+ " 'Kestřan',\n",
2221
+ " 'Bylany',\n",
2222
+ " 'Novinová',\n",
2223
+ " 'Chleby',\n",
2224
+ " 'Tocháčkův',\n",
2225
+ " 'Cibulky',\n",
2226
+ " 'Vizině',\n",
2227
+ " 'Veletiny',\n",
2228
+ " 'Podhorní',\n",
2229
+ " 'Kirilovova',\n",
2230
+ " 'mezí',\n",
2231
+ " 'Pánvích',\n",
2232
+ " 'Hliníky',\n",
2233
+ " 'Čajkova',\n",
2234
+ " 'Rýza',\n",
2235
+ " 'Šenovská',\n",
2236
+ " 'Ramzová',\n",
2237
+ " 'Vilémovec',\n",
2238
+ " 'Synalov',\n",
2239
+ " 'Táborského',\n",
2240
+ " 'Chlumčany',\n",
2241
+ " 'Salonika',\n",
2242
+ " 'Bělohradská',\n",
2243
+ " 'Pecihrádku',\n",
2244
+ " 'Británce',\n",
2245
+ " 'Herolda',\n",
2246
+ " 'Oznice',\n",
2247
+ " 'Mincovní',\n",
2248
+ " 'Žebětínská',\n",
2249
+ " 'Alejíčku',\n",
2250
+ " 'Vrbičany',\n",
2251
+ " 'Sobí',\n",
2252
+ " 'Oblekovická',\n",
2253
+ " 'úvozem',\n",
2254
+ " 'Hájčí',\n",
2255
+ " 'Brázdimská',\n",
2256
+ " 'Týneckého',\n",
2257
+ " 'Pamferova',\n",
2258
+ " 'Želinská',\n",
2259
+ " 'Koštíře',\n",
2260
+ " 'Količín',\n",
2261
+ " 'Jilmu',\n",
2262
+ " 'rybníkům',\n",
2263
+ " 'Souška',\n",
2264
+ " 'Korábě',\n",
2265
+ " 'Němčice',\n",
2266
+ " 'Všechromy',\n",
2267
+ " 'váhy',\n",
2268
+ " 'Podvlčí',\n",
2269
+ " 'hranic',\n",
2270
+ " 'Deutschova',\n",
2271
+ " 'Řádkách',\n",
2272
+ " 'Kochánky',\n",
2273
+ " 'Benešova',\n",
2274
+ " 'Olšávkou',\n",
2275
+ " 'hnízda',\n",
2276
+ " 'Stupešice',\n",
2277
+ " 'Rachvalská',\n",
2278
+ " 'Družnosti',\n",
2279
+ " 'Rozvadovice',\n",
2280
+ " 'Padařov',\n",
2281
+ " 'Souhrada',\n",
2282
+ " 'Sudoměř',\n",
2283
+ " 'Žitětín',\n",
2284
+ " 'Bartolomějská',\n",
2285
+ " 'Rtýňská',\n",
2286
+ " 'Jakobiho',\n",
2287
+ " 'Ungeltem',\n",
2288
+ " 'Božec',\n",
2289
+ " 'Michovka',\n",
2290
+ " 'Hospříz',\n",
2291
+ " 'Litomyšlské',\n",
2292
+ " 'honu',\n",
2293
+ " 'Borečkova',\n",
2294
+ " 'Tehovičkách',\n",
2295
+ " 'Stypova',\n",
2296
+ " 'sportovců',\n",
2297
+ " 'Čupy',\n",
2298
+ " 'Pecím',\n",
2299
+ " 'Podstádlí',\n",
2300
+ " 'Msgre',\n",
2301
+ " 'Ryšánce',\n",
2302
+ " 'Zárybnice',\n",
2303
+ " 'Labi',\n",
2304
+ " 'Výmolem',\n",
2305
+ " 'Klacovská',\n",
2306
+ " 'Janštejn',\n",
2307
+ " 'Lukavského',\n",
2308
+ " 'Přímské',\n",
2309
+ " 'Dožice',\n",
2310
+ " 'Palcary',\n",
2311
+ " 'Paneláku',\n",
2312
+ " 'Rákosová',\n",
2313
+ " 'Bulhara',\n",
2314
+ " 'Krondlova',\n",
2315
+ " 'Mlýnský',\n",
2316
+ " 'Bernáčkova',\n",
2317
+ " 'Mirošovická',\n",
2318
+ " 'Jevišovka',\n",
2319
+ " 'Valinkevičova',\n",
2320
+ " 'Častonín',\n",
2321
+ " 'Důlku',\n",
2322
+ " 'Poledníkova',\n",
2323
+ " 'Enkláva',\n",
2324
+ " 'Ottova',\n",
2325
+ " 'Kralovice',\n",
2326
+ " 'Češov',\n",
2327
+ " 'Zvonková',\n",
2328
+ " 'Rohozenská',\n",
2329
+ " 'Ferrariho',\n",
2330
+ " 'Josef',\n",
2331
+ " 'Čertousy',\n",
2332
+ " 'Špýcharem',\n",
2333
+ " 'Czestochowa',\n",
2334
+ " 'Hrbov',\n",
2335
+ " 'Bukačov',\n",
2336
+ " 'Milovicům',\n",
2337
+ " 'Smetánkou',\n",
2338
+ " 'Příkopy',\n",
2339
+ " 'Kanín',\n",
2340
+ " 'Příči',\n",
2341
+ " 'Čakovická',\n",
2342
+ " 'Závěrce',\n",
2343
+ " 'Lahovskou',\n",
2344
+ " 'Znojmo',\n",
2345
+ " 'Kypy',\n",
2346
+ " 'Záhlinice',\n",
2347
+ " 'rukavičkárně',\n",
2348
+ " 'Slubice',\n",
2349
+ " 'kulturním',\n",
2350
+ " 'Folmava',\n",
2351
+ " 'Maninách',\n",
2352
+ " 'Blahobytu',\n",
2353
+ " 'Lichtenštejnská',\n",
2354
+ " 'Fortny',\n",
2355
+ " 'Kaštanová',\n",
2356
+ " 'Šípkova',\n",
2357
+ " 'Voňavá',\n",
2358
+ " 'Jezerce',\n",
2359
+ " 'Hořany',\n",
2360
+ " 'Vilémovice',\n",
2361
+ " 'Dolánecké',\n",
2362
+ " 'Práchovně',\n",
2363
+ " 'Rukáveč',\n",
2364
+ " 'Bubovická',\n",
2365
+ " 'Fortenská',\n",
2366
+ " 'Mackova',\n",
2367
+ " 'Doublovičky',\n",
2368
+ " 'Votroubkova',\n",
2369
+ " 'středu',\n",
2370
+ " 'Dómská',\n",
2371
+ " 'Křečkov',\n",
2372
+ " 'Blaho',\n",
2373
+ " 'Kytlická',\n",
2374
+ " 'Drozdická',\n",
2375
+ " 'habru',\n",
2376
+ " 'Blažkova',\n",
2377
+ " 'Hospodou',\n",
2378
+ " 'Vráží',\n",
2379
+ " 'Mošnice',\n",
2380
+ " 'Cest',\n",
2381
+ " 'Hackerova',\n",
2382
+ " 'Nantes',\n",
2383
+ " 'Čakovice',\n",
2384
+ " 'Kuchynky',\n",
2385
+ " 'Harantova',\n",
2386
+ " 'Drátenická',\n",
2387
+ " 'Vítězství',\n",
2388
+ " 'Vraní',\n",
2389
+ " 'Maredova',\n",
2390
+ " 'koutku',\n",
2391
+ " 'kultury',\n",
2392
+ " 'Habrkovice',\n",
2393
+ " 'vodárně',\n",
2394
+ " 'Hřivno',\n",
2395
+ " 'Timisoara',\n",
2396
+ " 'lomy',\n",
2397
+ " 'Řeháčkova',\n",
2398
+ " 'Najmanské',\n",
2399
+ " 'Uhřínovice',\n",
2400
+ " 'Přítkov',\n",
2401
+ " 'Mikulovická',\n",
2402
+ " 'Zábeštní',\n",
2403
+ " 'struskách',\n",
2404
+ " 'Haldou',\n",
2405
+ " 'Škardou',\n",
2406
+ " 'Durďákova',\n",
2407
+ " 'Kelčická',\n",
2408
+ " 'Babolky',\n",
2409
+ " 'Jaroslavu',\n",
2410
+ " 'Rožmitálova',\n",
2411
+ " 'cesty',\n",
2412
+ " 'Uhlířov',\n",
2413
+ " 'Radhošť',\n",
2414
+ " 'Novosedlické',\n",
2415
+ " 'Blatin',\n",
2416
+ " 'Kotlanova',\n",
2417
+ " 'Veselé',\n",
2418
+ " 'Krásněves',\n",
2419
+ " 'Emance',\n",
2420
+ " 'Pragovka',\n",
2421
+ " 'Reinišova',\n",
2422
+ " 'Kyje',\n",
2423
+ " 'Kadolec',\n",
2424
+ " 'Brdská',\n",
2425
+ " 'Štáblovice',\n",
2426
+ " 'Žíchovec',\n",
2427
+ " 'Dvorska',\n",
2428
+ " 'Kamenem',\n",
2429
+ " 'hájovny',\n",
2430
+ " 'Rozvodna',\n",
2431
+ " 'Sušárny',\n",
2432
+ " 'zálomu',\n",
2433
+ " 'Záhumenská',\n",
2434
+ " 'Rácovice',\n",
2435
+ " 'Vítkově',\n",
2436
+ " 'Peroutka',\n",
2437
+ " 'Fanty',\n",
2438
+ " 'Jandy',\n",
2439
+ " 'Slunečný',\n",
2440
+ " 'Křimice',\n",
2441
+ " 'Žebnice',\n",
2442
+ " 'Prvomájová',\n",
2443
+ " 'Ohře',\n",
2444
+ " 'Krsice',\n",
2445
+ " 'Karmelitská',\n",
2446
+ " 'Stivínové',\n",
2447
+ " 'Bradlec',\n",
2448
+ " 'Všehrdovo',\n",
2449
+ " 'Kokrdy',\n",
2450
+ " 'Ptáčnická',\n",
2451
+ " 'Cikánky',\n",
2452
+ " 'Komárovské',\n",
2453
+ " 'Palpostě',\n",
2454
+ " 'Kurnického',\n",
2455
+ " 'Zakopaná',\n",
2456
+ " 'Melč',\n",
2457
+ " 'Lichnická',\n",
2458
+ " 'Dubinská',\n",
2459
+ " 'Krautgartnerova',\n",
2460
+ " 'Kravínu',\n",
2461
+ " 'Chilská',\n",
2462
+ " 'Stříbrného',\n",
2463
+ " 'losách',\n",
2464
+ " 'Macháčka',\n",
2465
+ " 'Schodišťová',\n",
2466
+ " 'Sýpka',\n",
2467
+ " 'Nebřich',\n",
2468
+ " 'Klabalská',\n",
2469
+ " 'Vatinám',\n",
2470
+ " 'Školičkou',\n",
2471
+ " 'Tkalcovská',\n",
2472
+ " 'Hliněný',\n",
2473
+ " 'Kašovická',\n",
2474
+ " 'krámy',\n",
2475
+ " 'silnici',\n",
2476
+ " 'Pokojná',\n",
2477
+ " 'Ohrobecká',\n",
2478
+ " 'Tlumačov',\n",
2479
+ " 'Záříčí',\n",
2480
+ " 'Tománkova',\n",
2481
+ " 'Poděvousy',\n",
2482
+ " 'Záhorského',\n",
2483
+ " 'Jindry',\n",
2484
+ " 'Výtuňská',\n",
2485
+ " 'Jelínkova',\n",
2486
+ " 'Pokratická',\n",
2487
+ " 'Břestek',\n",
2488
+ " 'Eškova',\n",
2489
+ " 'tratě',\n",
2490
+ " 'Závodišti',\n",
2491
+ " 'Křížkový',\n",
2492
+ " 'Hustopeče',\n",
2493
+ " 'Domova',\n",
2494
+ " 'Šánovická',\n",
2495
+ " 'Houpačkách',\n",
2496
+ " 'vokovické',\n",
2497
+ " 'Novopetrovická',\n",
2498
+ " 'Trojana',\n",
2499
+ " 'Provazníkova',\n",
2500
+ " 'Hlohová',\n",
2501
+ " 'Tramvajní',\n",
2502
+ " 'Pávla',\n",
2503
+ " 'Hvožďanská',\n",
2504
+ " 'Borise',\n",
2505
+ " 'Záseka',\n",
2506
+ " 'Bítov',\n",
2507
+ " 'Vítkovu',\n",
2508
+ " 'Ludinou',\n",
2509
+ " 'Matek',\n",
2510
+ " 'Čejov',\n",
2511
+ " 'Třískolupy',\n",
2512
+ " '24.',\n",
2513
+ " 'Kole',\n",
2514
+ " 'Křemenná',\n",
2515
+ " 'Zapova',\n",
2516
+ " 'Vejrostova',\n",
2517
+ " 'Březovická',\n",
2518
+ " 'Knížáku',\n",
2519
+ " 'Urešova',\n",
2520
+ " 'Vyhnálov',\n",
2521
+ " 'Krosenská',\n",
2522
+ " 'Podkozí',\n",
2523
+ " 'Hrutkov',\n",
2524
+ " 'Opolenec',\n",
2525
+ " 'Vidov',\n",
2526
+ " 'Nezbavětice',\n",
2527
+ " 'bytovkách',\n",
2528
+ " 'Kocbeře',\n",
2529
+ " 'Zderaze',\n",
2530
+ " 'Sandwell',\n",
2531
+ " 'Baldové',\n",
2532
+ " 'Jeslí',\n",
2533
+ " 'Polom',\n",
2534
+ " 'Mokerské',\n",
2535
+ " 'Kujavy',\n",
2536
+ " 'čokoládoven',\n",
2537
+ " 'Řebří',\n",
2538
+ " 'Bezprašná',\n",
2539
+ " 'Chovatelská',\n",
2540
+ " 'Suchanovova',\n",
2541
+ " 'Lešanská',\n",
2542
+ " 'Krňovská',\n",
2543
+ " 'Högrova',\n",
2544
+ " 'Jakubská',\n",
2545
+ " 'Láskov',\n",
2546
+ " 'Lámař',\n",
2547
+ " 'Stejskalova',\n",
2548
+ " 'Kolbena',\n",
2549
+ " 'Mikuleč',\n",
2550
+ " 'Mysliboř',\n",
2551
+ " 'Morkovice',\n",
2552
+ " 'Harfou',\n",
2553
+ " 'Lety',\n",
2554
+ " 'Koziny',\n",
2555
+ " 'Smržice',\n",
2556
+ " 'Hejmy',\n",
2557
+ " 'pomníkem',\n",
2558
+ " 'Libice',\n",
2559
+ " 'Měřičkova',\n",
2560
+ " 'Betlémská',\n",
2561
+ " 'Káranská',\n",
2562
+ " 'zvonici',\n",
2563
+ " 'Zahraničních',\n",
2564
+ " 'Felixova',\n",
2565
+ " 'Floriana',\n",
2566
+ " 'Dvouramenná',\n",
2567
+ " 'Sovoluská',\n",
2568
+ " 'Lobodice',\n",
2569
+ " 'Jiřičkum',\n",
2570
+ " 'Mýtu',\n",
2571
+ " 'Průplavu',\n",
2572
+ " 'Kalužní',\n",
2573
+ " 'Vítězov',\n",
2574
+ " 'Pflegrova',\n",
2575
+ " 'Juřinka',\n",
2576
+ " 'Hrabek',\n",
2577
+ " 'Chatami',\n",
2578
+ " 'Žlebu',\n",
2579
+ " 'Tuřanská',\n",
2580
+ " 'Kúty',\n",
2581
+ " 'Zátyní',\n",
2582
+ " 'Vymazalova',\n",
2583
+ " 'Sedlářská',\n",
2584
+ " 'Zbraslav',\n",
2585
+ " 'Skryjská',\n",
2586
+ " 'Žlábka',\n",
2587
+ " 'rybníkem',\n",
2588
+ " 'Přilehlá',\n",
2589
+ " 'Plácek',\n",
2590
+ " 'Stálky',\n",
2591
+ " 'Bolešiny',\n",
2592
+ " 'Krčínovo',\n",
2593
+ " 'Březhrad',\n",
2594
+ " 'Hrubé',\n",
2595
+ " 'Kratochvílova',\n",
2596
+ " 'Stezka',\n",
2597
+ " 'Farách',\n",
2598
+ " 'Měrovice',\n",
2599
+ " 'Roprachtice',\n",
2600
+ " 'Křemenáčová',\n",
2601
+ " 'Příčky',\n",
2602
+ " 'Pravěká',\n",
2603
+ " 'Mannerova',\n",
2604
+ " 'Bahně',\n",
2605
+ " 'Lesinka',\n",
2606
+ " 'Hovorčovická',\n",
2607
+ " 'Ovesné',\n",
2608
+ " 'Cítolibská',\n",
2609
+ " 'Blatec',\n",
2610
+ " 'Myšlínu',\n",
2611
+ " 'Voletinská',\n",
2612
+ " 'vápence',\n",
2613
+ " 'Otaslavice',\n",
2614
+ " 'garáže-Černé',\n",
2615
+ " 'Pflegerova',\n",
2616
+ " 'Hrdoňovice',\n",
2617
+ " 'jirchářích',\n",
2618
+ " 'Šlapetova',\n",
2619
+ " 'Březhradská',\n",
2620
+ " 'Zlatého',\n",
2621
+ " 'Rapotina',\n",
2622
+ " 'Hostějov',\n",
2623
+ " 'Pittsburská',\n",
2624
+ " 'Kopanice',\n",
2625
+ " 'schůdkách',\n",
2626
+ " 'Ladova',\n",
2627
+ " 'Mikulovice',\n",
2628
+ " 'Stáj',\n",
2629
+ " 'Házů',\n",
2630
+ " 'Glazkovova',\n",
2631
+ " 'Žerotínova',\n",
2632
+ " 'Žárovná',\n",
2633
+ " 'Fojtská',\n",
2634
+ " 'Buč',\n",
2635
+ " 'Žižkovec',\n",
2636
+ " 'Jesenice',\n",
2637
+ " 'Třebízského',\n",
2638
+ " 'Pertoltická',\n",
2639
+ " 'Vlkovice',\n",
2640
+ " 'Zahradám',\n",
2641
+ " 'Václavíka',\n",
2642
+ " 'Preslova',\n",
2643
+ " 'Těsná',\n",
2644
+ " 'Ještědem',\n",
2645
+ " 'Kyjov',\n",
2646
+ " 'Fary',\n",
2647
+ " 'Krvavá',\n",
2648
+ " 'Šebora',\n",
2649
+ " 'Poličské',\n",
2650
+ " 'Ratinky',\n",
2651
+ " 'Kotyzy',\n",
2652
+ " 'Chudolazy',\n",
2653
+ " 'Věteřov',\n",
2654
+ " 'Ol.',\n",
2655
+ " 'Cvičištěm',\n",
2656
+ " 'Rusek',\n",
2657
+ " 'Nezabudická',\n",
2658
+ " 'Vlastišov',\n",
2659
+ " 'Podsychrovská',\n",
2660
+ " 'Včelenská',\n",
2661
+ " 'Pacltova',\n",
2662
+ " 'Spořická',\n",
2663
+ " 'Ceplechova',\n",
2664
+ " 'Štidla',\n",
2665
+ " 'Strusky',\n",
2666
+ " 'Chotovice',\n",
2667
+ " 'Čejky',\n",
2668
+ " 'Ženskými',\n",
2669
+ " 'Sweden',\n",
2670
+ " 'Herinku',\n",
2671
+ " 'Podbořan',\n",
2672
+ " 'Volanovská',\n",
2673
+ " 'Čečkovice',\n",
2674
+ " 'Minaret',\n",
2675
+ " 'Klusáčka',\n",
2676
+ " 'Porážková',\n",
2677
+ " 'Radomyšl',\n",
2678
+ " 'Šerkov',\n",
2679
+ " 'Jilská',\n",
2680
+ " 'Leskovice',\n",
2681
+ " 'Deštnice',\n",
2682
+ " 'Hlinikách',\n",
2683
+ " 'strouhy',\n",
2684
+ " 'Budilovo',\n",
2685
+ " 'Pořešín',\n",
2686
+ " 'Budy',\n",
2687
+ " 'Svatogothardská',\n",
2688
+ " 'Koňským',\n",
2689
+ " 'Beladova',\n",
2690
+ " 'Gruši',\n",
2691
+ " 'Čachnov',\n",
2692
+ " 'Mánesovo',\n",
2693
+ " 'Lublinská',\n",
2694
+ " 'Hostěrádky-Rešov',\n",
2695
+ " 'Tasovská',\n",
2696
+ " 'Budiměřice',\n",
2697
+ " 'Řeřichová',\n",
2698
+ " 'Žlebech',\n",
2699
+ " 'Kováříkova',\n",
2700
+ " 'Pazderce',\n",
2701
+ " 'Bukovanského',\n",
2702
+ " 'Samechov',\n",
2703
+ " 'Zbůch',\n",
2704
+ " 'Bukovická',\n",
2705
+ " 'Xaverovu',\n",
2706
+ " 'Veličkou',\n",
2707
+ " 'lis',\n",
2708
+ " 'Nahořanská',\n",
2709
+ " 'Hrobce',\n",
2710
+ " 'Kozlůvka',\n",
2711
+ " 'Višňovky',\n",
2712
+ " 'Rochdale',\n",
2713
+ " 'náplavkou',\n",
2714
+ " 'Jílovecká',\n",
2715
+ " 'Orlovice',\n",
2716
+ " 'Jedová',\n",
2717
+ " 'Želatovská',\n",
2718
+ " 'Činírna',\n",
2719
+ " 'Šrůtkova',\n",
2720
+ " 'Žalkovice',\n",
2721
+ " 'Chořelice',\n",
2722
+ " 'Cvičišti',\n",
2723
+ " 'Kachlířkou',\n",
2724
+ " 'podkovy',\n",
2725
+ " 'Plánkova',\n",
2726
+ " 'Havranická',\n",
2727
+ " 'Roháčových',\n",
2728
+ " 'Vrše',\n",
2729
+ " 'háji',\n",
2730
+ " 'Václavka',\n",
2731
+ " 'Malostranské',\n",
2732
+ " 'Iceland',\n",
2733
+ " 'Rozstání',\n",
2734
+ " 'Marciho',\n",
2735
+ " 'Straškov',\n",
2736
+ " 'Nezvalova',\n",
2737
+ " 'Hoře',\n",
2738
+ " 'Rubíka',\n",
2739
+ " 'Balkánem',\n",
2740
+ " 'Branaldova',\n",
2741
+ " 'Šváby',\n",
2742
+ " 'Porhajmova',\n",
2743
+ " 'Osma',\n",
2744
+ " 'Mostov',\n",
2745
+ " 'Bříšťanská',\n",
2746
+ " 'Buďárkova',\n",
2747
+ " 'Herecká',\n",
2748
+ " 'Dědinou',\n",
2749
+ " 'Krčkovice',\n",
2750
+ " 'Švábovská',\n",
2751
+ " 'Vonoklaská',\n",
2752
+ " 'Lipanům',\n",
2753
+ " 'zastávce',\n",
2754
+ " 'Hubera',\n",
2755
+ " 'Miličín',\n",
2756
+ " 'Alberta',\n",
2757
+ " 'vojenským',\n",
2758
+ " 'Jeřábu',\n",
2759
+ " 'Renoty',\n",
2760
+ " 'Liberia',\n",
2761
+ " 'Lindrách',\n",
2762
+ " 'Bryansk',\n",
2763
+ " 'Turská',\n",
2764
+ " 'Záhořanského',\n",
2765
+ " 'Zámělská',\n",
2766
+ " 'Hlízovská',\n",
2767
+ " 'Kličky',\n",
2768
+ " 'Souhradská',\n",
2769
+ " 'Libínky',\n",
2770
+ " 'Todická',\n",
2771
+ " 'Serpentina',\n",
2772
+ " 'Ručičce',\n",
2773
+ " 'Kožíkova',\n",
2774
+ " 'Turbíny',\n",
2775
+ " 'Drásov',\n",
2776
+ " 'Topičská',\n",
2777
+ " 'Dlážděnce',\n",
2778
+ " 'Třešňovce',\n",
2779
+ " 'Důlní',\n",
2780
+ " 'Aldova',\n",
2781
+ " 'von',\n",
2782
+ " 'Chaloupka',\n",
2783
+ " 'Jánu',\n",
2784
+ " 'Prahy',\n",
2785
+ " 'Boletice',\n",
2786
+ " 'Jerevanská',\n",
2787
+ " 'Nesyt',\n",
2788
+ " 'Fechtnera',\n",
2789
+ " 'Raťkov',\n",
2790
+ " 'Malovců',\n",
2791
+ " 'Boskovicova',\n",
2792
+ " 'Jirovcová',\n",
2793
+ " 'Žichov',\n",
2794
+ " 'Obrátice',\n",
2795
+ " 'Němětice',\n",
2796
+ " 'Rohanovem',\n",
2797
+ " 'Písečská',\n",
2798
+ " 'Vějíři',\n",
2799
+ " 'Vyšehoří',\n",
2800
+ " 'Hrázky',\n",
2801
+ " 'Pazderáku',\n",
2802
+ " 'Hodkovská',\n",
2803
+ " 'Antonínov',\n",
2804
+ " 'Jabloňov',\n",
2805
+ " 'Hlincová',\n",
2806
+ " 'Křížkem',\n",
2807
+ " 'Marklovice',\n",
2808
+ " 'Máchovo',\n",
2809
+ " 'Lucia',\n",
2810
+ " 'Machníkova',\n",
2811
+ " 'Borkem',\n",
2812
+ " 'Tištín',\n",
2813
+ " 'Šubertovo',\n",
2814
+ " 'Předvoje',\n",
2815
+ " 'Popovec',\n",
2816
+ " 'Horníků',\n",
2817
+ " 'Ponětovice',\n",
2818
+ " 'krovem',\n",
2819
+ " 'Odborů',\n",
2820
+ " 'Feřtekova',\n",
2821
+ " 'Metelkou',\n",
2822
+ " 'Červeném',\n",
2823
+ " 'Švejcarovo',\n",
2824
+ " 'Nadhumení',\n",
2825
+ " 'Kylešovská',\n",
2826
+ " 'Zlatoust',\n",
2827
+ " 'Mažice',\n",
2828
+ " 'Moklině',\n",
2829
+ " 'Radešínská',\n",
2830
+ " 'Pečírkova',\n",
2831
+ " 'Řeheč',\n",
2832
+ " 'Tererova',\n",
2833
+ " 'Vojanky',\n",
2834
+ " 'Pelhřimov',\n",
2835
+ " 'Basse',\n",
2836
+ " 'Olšinkách',\n",
2837
+ " 'Zatáčkách',\n",
2838
+ " 'Helenín',\n",
2839
+ " 'úvozu',\n",
2840
+ " 'Čertův',\n",
2841
+ " 'Drinopolem',\n",
2842
+ " 'Gerasimovova',\n",
2843
+ " 'Dolnocholupická',\n",
2844
+ " 'Brandla',\n",
2845
+ " 'lipách',\n",
2846
+ " 'Jetelová',\n",
2847
+ " 'Stráňka',\n",
2848
+ " 'Vraty',\n",
2849
+ " 'Chramiště',\n",
2850
+ " 'Čeňku',\n",
2851
+ " 'Pančava',\n",
2852
+ " 'Kolešovce',\n",
2853
+ " 'Dagmar',\n",
2854
+ " 'Kyjovická',\n",
2855
+ " 'Sklepům',\n",
2856
+ " 'Ressela',\n",
2857
+ " 'Krinitova',\n",
2858
+ " 'Jeneč',\n",
2859
+ " 'Neveklovská',\n",
2860
+ " 'Kučovanská',\n",
2861
+ " 'Robotou',\n",
2862
+ " 'Markvartovická',\n",
2863
+ " 'Přívozní',\n",
2864
+ " 'Derridova',\n",
2865
+ " 'Stromky',\n",
2866
+ " 'Borovém',\n",
2867
+ " 'Křižanovice',\n",
2868
+ " 'Baborčici',\n",
2869
+ " 'Sport',\n",
2870
+ " 'Štíhlická',\n",
2871
+ " 'Fajtlova',\n",
2872
+ " 'Bezručova',\n",
2873
+ " 'Pouzdřanská',\n",
2874
+ " 'Radonická',\n",
2875
+ " 'Kbelce',\n",
2876
+ " 'Tkadlecova',\n",
2877
+ " 'Cholupice',\n",
2878
+ " 'Jánem',\n",
2879
+ " 'Abácie',\n",
2880
+ " 'dětského',\n",
2881
+ " 'Nazdice',\n",
2882
+ " 'Vítězslavy',\n",
2883
+ " 'Příšovice',\n",
2884
+ " 'Vnitřní',\n",
2885
+ " 'Hradec',\n",
2886
+ " 'Granitova',\n",
2887
+ " 'Studeňská',\n",
2888
+ " 'Tarase',\n",
2889
+ " 'Urbaníkova',\n",
2890
+ " 'Vlčkova',\n",
2891
+ " 'Dříteč',\n",
2892
+ " 'Chuderov',\n",
2893
+ " 'Hostouňská',\n",
2894
+ " 'Terézy',\n",
2895
+ " 'Flédlova',\n",
2896
+ " 'Curie',\n",
2897
+ " 'Brothánkova',\n",
2898
+ " 'Suzdalské',\n",
2899
+ " 'Chalupy',\n",
2900
+ " 'Baštou',\n",
2901
+ " 'Kocouráku',\n",
2902
+ " 'Koškova',\n",
2903
+ " 'Bardinové',\n",
2904
+ " 'Otročice',\n",
2905
+ " 'Taranzova',\n",
2906
+ " 'Vratěnín',\n",
2907
+ " 'van',\n",
2908
+ " 'Petrůvky',\n",
2909
+ " 'Třebčická',\n",
2910
+ " 'Kulíšek',\n",
2911
+ " 'Wollerů',\n",
2912
+ " 'Peklem',\n",
2913
+ " 'Křepelka',\n",
2914
+ " 'Wihanova',\n",
2915
+ " 'Šlejnická',\n",
2916
+ " 'polím',\n",
2917
+ " 'Hřištěm',\n",
2918
+ " 'Dubovinou',\n",
2919
+ " 'Touchovice',\n",
2920
+ " 'Dobřeň',\n",
2921
+ " 'Výtopnou',\n",
2922
+ " 'Holešova',\n",
2923
+ " 'ATHINAI',\n",
2924
+ " 'Spojeneckých',\n",
2925
+ " 'Tuchoměřická',\n",
2926
+ " 'křižovatky',\n",
2927
+ " 'Pannou',\n",
2928
+ " 'Orlovy',\n",
2929
+ " 'Parkerova',\n",
2930
+ " 'Opařanská',\n",
2931
+ " 'Třebíz',\n",
2932
+ " 'Jaroňkova',\n",
2933
+ " 'Prašivce',\n",
2934
+ " \"Rosh-Ha'ayin\",\n",
2935
+ " 'Řepín',\n",
2936
+ " 'Ploukonice',\n",
2937
+ " 'Bukovsko',\n",
2938
+ " 'Nebeské',\n",
2939
+ " 'seřadiště',\n",
2940
+ " 'Bártlova',\n",
2941
+ " 'Rachtě',\n",
2942
+ " 'Calderdale',\n",
2943
+ " 'Kosí',\n",
2944
+ " 'Brněnky',\n",
2945
+ " 'Vykuku',\n",
2946
+ " 'Cimbále',\n",
2947
+ " 'Kopist',\n",
2948
+ " 'Náves',\n",
2949
+ " 'Hajnovkách',\n",
2950
+ " 'Branický',\n",
2951
+ " 'Radíkovy',\n",
2952
+ " 'stupni',\n",
2953
+ " 'Klimentce',\n",
2954
+ " 'Jokla',\n",
2955
+ " 'Macedonia',\n",
2956
+ " 'Ochozská',\n",
2957
+ " 'Šibeníkem',\n",
2958
+ " 'Zóna',\n",
2959
+ " 'Školská',\n",
2960
+ " 'Hážovice',\n",
2961
+ " 'Přelovice',\n",
2962
+ " 'Sejkorova',\n",
2963
+ " 'Moravě',\n",
2964
+ " 'Vlněna',\n",
2965
+ " 'Zaryjach',\n",
2966
+ " 'Skržice',\n",
2967
+ " 'Příjemky',\n",
2968
+ " 'Častavina',\n",
2969
+ " 'Chocholí',\n",
2970
+ " 'Malšovický',\n",
2971
+ " 'Žlubincem',\n",
2972
+ " 'Flašky',\n",
2973
+ " 'Sněžnická',\n",
2974
+ " 'Lískovecká',\n",
2975
+ " 'Radobyčice',\n",
2976
+ " 'Skalice',\n",
2977
+ " 'Bunzla',\n",
2978
+ " 'Malíkov',\n",
2979
+ " 'Lithuania',\n",
2980
+ " 'Limnická',\n",
2981
+ " 'Chuchel',\n",
2982
+ " 'Hlavňovice',\n",
2983
+ " 'Chytiličky',\n",
2984
+ " 'Loudova',\n",
2985
+ " 'Warrington',\n",
2986
+ " 'Příčce',\n",
2987
+ " 'Hnojnice',\n",
2988
+ " 'Diouse',\n",
2989
+ " 'Stanová',\n",
2990
+ " 'Lalůvkou',\n",
2991
+ " 'Todně',\n",
2992
+ " 'Bezruče',\n",
2993
+ " 'Okluková',\n",
2994
+ " 'Torino',\n",
2995
+ " 'Jandové',\n",
2996
+ " 'Ďáblicům',\n",
2997
+ " 'selském',\n",
2998
+ " 'Benešovice',\n",
2999
+ " 'dobré',\n",
3000
+ " 'Rynholec',\n",
3001
+ " 'Puchárnou',\n",
3002
+ " 'Losenická',\n",
3003
+ " 'Valšovská',\n",
3004
+ " 'Pohořelice',\n",
3005
+ " 'Chatách',\n",
3006
+ " 'Grónská',\n",
3007
+ " 'Šebířov',\n",
3008
+ " 'Fejfarova',\n",
3009
+ " 'přechodu',\n",
3010
+ " 'Ličná',\n",
3011
+ " 'Mláka',\n",
3012
+ " 'Lázeňské',\n",
3013
+ " 'vesničky',\n",
3014
+ " 'Tovačovská',\n",
3015
+ " 'Ústaleč',\n",
3016
+ " 'Velkopavlovická',\n",
3017
+ " 'Trávníkách',\n",
3018
+ " 'Zlosyň',\n",
3019
+ " 'Lávkou',\n",
3020
+ " 'Ostašova',\n",
3021
+ " 'Žufanova',\n",
3022
+ " 'Mat.',\n",
3023
+ " 'd��lkách',\n",
3024
+ " 'Klementice',\n",
3025
+ " 'Jamkám',\n",
3026
+ " 'Valech',\n",
3027
+ " 'Křtomil',\n",
3028
+ " 'Moravcem',\n",
3029
+ " 'Milenov',\n",
3030
+ " 'Chvalatice',\n",
3031
+ " 'Pulice',\n",
3032
+ " 'Lišticí',\n",
3033
+ " 'Čechách',\n",
3034
+ " 'průčelí',\n",
3035
+ " 'Bítešská',\n",
3036
+ " 'Žabinci',\n",
3037
+ " 'Rychtáře',\n",
3038
+ " 'Zňátky',\n",
3039
+ " 'Nehasice',\n",
3040
+ " 'Hlízov',\n",
3041
+ " 'Babčice',\n",
3042
+ " 'štěrkovně',\n",
3043
+ " 'Tří',\n",
3044
+ " 'Čáslavky',\n",
3045
+ " 'Říjnová',\n",
3046
+ " 'humny',\n",
3047
+ " 'jezem',\n",
3048
+ " 'Tyršové',\n",
3049
+ " 'Korbářova',\n",
3050
+ " 'Pasekách',\n",
3051
+ " 'Vochlířská',\n",
3052
+ " 'Skorotice',\n",
3053
+ " 'Rubanisko',\n",
3054
+ " 'Bertrámová',\n",
3055
+ " 'Lutov',\n",
3056
+ " 'Herálecká',\n",
3057
+ " 'Wassermannova',\n",
3058
+ " 'Turbovická',\n",
3059
+ " 'Kocourovec',\n",
3060
+ " 'Maxovkou',\n",
3061
+ " 'Brusinková',\n",
3062
+ " 'Wericha',\n",
3063
+ " 'Pokorova',\n",
3064
+ " 'Javorská',\n",
3065
+ " 'Zámecký',\n",
3066
+ " 'Vyskočila',\n",
3067
+ " 'Lachovice',\n",
3068
+ " 'Kozlerova',\n",
3069
+ " 'most',\n",
3070
+ " 'Kaštila',\n",
3071
+ " 'Bání',\n",
3072
+ " 'Gočárova',\n",
3073
+ " 'Hampla',\n",
3074
+ " 'Loketská',\n",
3075
+ " 'výstavby',\n",
3076
+ " 'Krk',\n",
3077
+ " 'flošna',\n",
3078
+ " 'Rantířovská',\n",
3079
+ " 'zákopě',\n",
3080
+ " 'Hornohradební',\n",
3081
+ " 'Saveljevova',\n",
3082
+ " 'Kateřinské',\n",
3083
+ " 'Buděšínského',\n",
3084
+ " 'Jordana',\n",
3085
+ " 'Radiměř',\n",
3086
+ " 'Plástky',\n",
3087
+ " 'Jedlová',\n",
3088
+ " 'Petrovská',\n",
3089
+ " 'Podhořany',\n",
3090
+ " 'Čechy',\n",
3091
+ " 'Konšelská',\n",
3092
+ " ...}}"
3093
+ ]
3094
+ },
3095
+ "execution_count": 1,
3096
+ "metadata": {},
3097
+ "output_type": "execute_result"
3098
+ }
3099
+ ],
3100
+ "source": [
3101
+ "import pickle \n",
3102
+ "with open(\"gazz.json\", 'rb') as f:\n",
3103
+ " gazetteers = pickle.load(f)\n",
3104
+ "gazetteers"
3105
+ ]
3106
+ },
3107
+ {
3108
+ "cell_type": "code",
3109
+ "execution_count": 3,
3110
+ "metadata": {},
3111
+ "outputs": [],
3112
+ "source": [
3113
+ "for k, v in gazetteers.items():\n",
3114
+ " gazetteers[k] = list(v)"
3115
+ ]
3116
+ },
3117
+ {
3118
+ "cell_type": "code",
3119
+ "execution_count": 4,
3120
+ "metadata": {},
3121
+ "outputs": [],
3122
+ "source": [
3123
+ "import json\n",
3124
+ "with open(\"gazz2.json\", \"w\") as file:\n",
3125
+ " json.dump(gazetteers, file, ensure_ascii=False, indent=4)"
3126
+ ]
3127
+ }
3128
+ ],
3129
+ "metadata": {
3130
+ "kernelspec": {
3131
+ "display_name": "DP",
3132
+ "language": "python",
3133
+ "name": "python3"
3134
+ },
3135
+ "language_info": {
3136
+ "codemirror_mode": {
3137
+ "name": "ipython",
3138
+ "version": 3
3139
+ },
3140
+ "file_extension": ".py",
3141
+ "mimetype": "text/x-python",
3142
+ "name": "python",
3143
+ "nbconvert_exporter": "python",
3144
+ "pygments_lexer": "ipython3",
3145
+ "version": "3.10.14"
3146
+ }
3147
+ },
3148
+ "nbformat": 4,
3149
+ "nbformat_minor": 2
3150
+ }
website_script.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer
3
+
4
+ from extended_embeddings.token_classification import ExtendedEmbeddigsRobertaForTokenClassification
5
+ from data_manipulation.dataset_funcions import load_gazetteers, gazetteer_matching, align_gazetteers_with_tokens
6
+ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
7
+
8
+
9
def load():
    """Load the tokenizer, the NER model and the gazetteers used for matching.

    The model is moved to the CPU and switched to evaluation mode. The
    gazetteers are read from ``gazz2.json`` and every top-level key is turned
    into its own reverse dictionary via ``build_reverse_dictionary``.

    Returns:
        A ``(tokenizer, model, gazetteers_for_matching)`` tuple, where
        ``gazetteers_for_matching`` is a list holding one
        ``build_reverse_dictionary`` result per gazetteer key.
    """
    model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(
        "bettystr/NerRoB-czech"
    ).to("cpu")
    tokenizer = AutoTokenizer.from_pretrained("ufal/robeczech-base")
    model.eval()

    gazetteers = load_gazetteers("gazz2.json")
    # One single-key reverse dictionary per gazetteer, in key order.
    reverse_dictionaries = [
        build_reverse_dictionary({name: entries})
        for name, entries in gazetteers.items()
    ]
    return tokenizer, model, reverse_dictionaries
22
+
23
+
24
def run(tokenizer, model, gazetteers_for_matching, text):
    """Run the NER model on ``text`` and return the predicted tags.

    Args:
        tokenizer: Callable tokenizer; its result must provide ``word_ids()``
            and the ``"input_ids"`` / ``"attention_mask"`` entries.
        model: Model called with ``input_ids``, ``attention_mask``, ``per``,
            ``org`` and ``loc``; its output must expose ``logits`` and
            ``model.config.id2label`` must map class ids to tag names.
        gazetteers_for_matching: Gazetteer structure passed unchanged to
            ``gazetteer_matching``.
        text: Raw input string (tokenized with ``is_split_into_words=False``
            and ``truncation=True``).

    Returns:
        The predicted tags for the single input sequence, one per position
        of the model's logits, joined by single spaces.
    """
    tokenized_inputs = tokenizer(
        text, truncation=True, is_split_into_words=False
    )
    matches = gazetteer_matching(text, gazetteers_for_matching)
    aligned = align_gazetteers_with_tokens(matches, tokenized_inputs.word_ids())

    # Each aligned entry is indexed as (per, org, loc). The extra outer list
    # adds the batch dimension of size 1 that the unsqueezed inputs also have.
    per = torch.tensor([[entry[0] for entry in aligned]], device="cpu")
    org = torch.tensor([[entry[1] for entry in aligned]], device="cpu")
    loc = torch.tensor([[entry[2] for entry in aligned]], device="cpu")

    input_ids = torch.tensor(
        tokenized_inputs["input_ids"], device="cpu"
    ).unsqueeze(0)
    attention_mask = torch.tensor(
        tokenized_inputs["attention_mask"], device="cpu"
    ).unsqueeze(0)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            per=per,
            org=org,
            loc=loc,
        ).logits

    predictions = torch.argmax(logits, dim=2).tolist()
    # Only one sequence is in the batch, so only the first row is decoded.
    predicted_tags = [model.config.id2label[idx] for idx in predictions[0]]
    return " ".join(predicted_tags)