boda committed on
Commit
0e55bc2
1 Parent(s): 11fe992
README.md ADDED
@@ -0,0 +1,70 @@
+ ---
+ language:
+ - ar
+
+ thumbnail: "url to a thumbnail used in social sharing"
+ tags:
+ - ner
+ - token-classification
+ - Arabic-NER
+
+ metrics:
+ - accuracy
+ - f1
+ - precision
+ - recall
+
+ widget:
+ - text: "النجم محمد صلاح لاعب المنتخب المصري يعيش في مصر بالتحديد من نجريج, الشرقية"
+   example_title: "Mohamed Salah"
+ - text: "انا ساكن في حدايق الزتون و بدرس في جامعه عين شمس"
+   example_title: "Egyptian Dialect"
+ - text: "يقع نهر الأمازون في قارة أمريكا الجنوبية"
+   example_title: "Standard Arabic"
+
+ datasets:
+ - Fine-grained-Arabic-Named-Entity-Corpora
+ ---
+
+ # Arabic Named Entity Recognition
+
+ This project aims to enrich Arabic Named Entity Recognition (ANER). Arabic is a difficult language to work with and poses many challenges for NER.
+ We built a model based on AraBERT that supports 50 entity types.
+
+ ## Paper
+
+ Here is the paper with the details of our model, our approach, and the training results:
+
+ - [ANER Paper](https://drive.google.com/file/d/1jJn3iWqOeLzaNvO-6aKfgidzJlWOtvti/view?usp=sharing)
+
+ ## Usage
+
+ The model is available on the Hugging Face Hub under the name [boda/ANER](https://huggingface.co/boda/ANER). Checkpoints are currently available only for PyTorch.
+
+ ### Use in Python
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+ tokenizer = AutoTokenizer.from_pretrained("boda/ANER")
+
+ model = AutoModelForTokenClassification.from_pretrained("boda/ANER")
+ ```
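+
+ For quick predictions, the model can also be wrapped in a token-classification pipeline. The snippet below is a minimal sketch (the example sentence is illustrative and not part of our evaluation setup):
+
+ ```python
+ from transformers import pipeline
+
+ # Token-level predictions; each item carries the token, its label, and a confidence score.
+ ner = pipeline("ner", model="boda/ANER", tokenizer="boda/ANER")
+
+ for prediction in ner("يقع نهر الأمازون في قارة أمريكا الجنوبية"):
+     print(prediction["word"], prediction["entity"], round(prediction["score"], 3))
+ ```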
+
+ ## Dataset
+
+ - [Fine-grained Arabic Named Entity Corpora](https://fsalotaibi.kau.edu.sa/Pages-Arabic-NE-Corpora.aspx)
+
+ ## Acknowledgments
+
+ Thanks to [AraBERT](https://github.com/aub-mind/arabert) for providing the Arabic BERT model that we used as the base model for our work.
+
+ We would also like to thank [Prof. Fahd Saleh S. Alotaibi](https://fsalotaibi.kau.edu.sa/Pages-Arabic-NE-Corpora.aspx) of the Faculty of Computing and Information Technology, King Abdulaziz University, for providing the dataset we used to train our model.
+
+ ## Contacts
+
+ **Abdelrahman Atef**
+
+ - [LinkedIn](https://linkedin.com/in/boda-sadalla)
+ - [GitHub](https://github.com/BodaSadalla98)
70
config.json ADDED
@@ -0,0 +1,182 @@
+ {
+   "_name_or_path": ".",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "O",
+     "1": " \u0641\u0646\u0627\u0646 ",
+     "2": " \u0641\u0646\u0627\u0646 ",
+     "3": " \u0635\u0648\u062a ",
+     "4": " \u0635\u0648\u062a ",
+     "5": " \u062a\u0639\u0644\u064a\u0645\u064a ",
+     "6": " \u062a\u0639\u0644\u064a\u0645\u064a ",
+     "7": " \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ",
+     "8": " \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ",
+     "9": " \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ",
+     "10": " \u0634\u0639\u0628(\u0623\u0645\u0629) ",
+     "11": " \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ",
+     "12": " \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ",
+     "13": " \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ",
+     "14": " \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ",
+     "15": " \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ",
+     "16": " \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ",
+     "17": " \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ",
+     "18": " \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ",
+     "19": " \u0639\u0627\u0644\u0645 ",
+     "20": " \u0643\u062a\u0627\u0628 ",
+     "21": " \u0643\u062a\u0627\u0628 ",
+     "22": " \u0639\u0627\u0644\u0645 ",
+     "23": " \u0645\u062c\u0645\u0648\u0639\u0629 ",
+     "24": " \u0633\u0645\u0627\u0648\u064a ",
+     "25": " \u0634\u0631\u0637\u0629 ",
+     "26": " \u0634\u0631\u0637\u0629 ",
+     "27": " \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ",
+     "28": " \u0633\u0645\u0627\u0648\u064a ",
+     "29": " \u0645\u0647\u0646\u062f\u0633 ",
+     "30": " \u0645\u0647\u0646\u062f\u0633 ",
+     "31": " \u0642\u0630\u064a\u0641\u0629 ",
+     "32": " \u062d\u0643\u0648\u0645\u0629 ",
+     "33": " \u062d\u0643\u0648\u0645\u0629 ",
+     "34": " \u062a\u062c\u0627\u0631\u064a ",
+     "35": " \u062a\u062c\u0627\u0631\u064a ",
+     "36": " \u0642\u0627\u0631\u0629 ",
+     "37": " \u0647\u0648\u0627\u0621 ",
+     "38": " \u0647\u0648\u0627\u0621 ",
+     "39": " \u0634\u062e\u0635 ",
+     "40": " \u0634\u062e\u0635 ",
+     "41": " \u0645\u062c\u0645\u0648\u0639\u0629 ",
+     "42": " \u0633\u064a\u0627\u0633\u064a ",
+     "43": " \u0633\u064a\u0627\u0633\u064a ",
+     "44": " \u0631\u064a\u0627\u0636\u064a ",
+     "45": " \u0631\u064a\u0627\u0636\u064a ",
+     "46": " \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ",
+     "47": " \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ",
+     "48": " \u0637\u0631\u064a\u0642 ",
+     "49": " \u0637\u0631\u064a\u0642 ",
+     "50": " \u0625\u0639\u0644\u0627\u0645 ",
+     "51": " \u0625\u0639\u0644\u0627\u0645 ",
+     "52": " \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ",
+     "53": " \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ",
+     "54": " \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ",
+     "55": " \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ",
+     "56": " \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ",
+     "57": " \u0645\u062d\u0627\u0645\u064a ",
+     "58": " \u0645\u062d\u0627\u0645\u064a ",
+     "59": " ",
+     "60": " ",
+     "61": " \u0634\u0639\u0628(\u0623\u0645\u0629) ",
+     "62": " \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ",
+     "63": " \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ",
+     "64": " \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ",
+     "65": " \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ",
+     "66": " \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ",
+     "67": " \u0641\u064a\u0644\u0645 ",
+     "68": " \u0641\u064a\u0644\u0645 ",
+     "69": " \u0645\u0627\u0621 ",
+     "70": " \u0645\u0627\u0621 ",
+     "71": " \u062f\u0648\u0627\u0621 ",
+     "72": " \u0639\u062a\u0627\u062f ",
+     "73": " \u0639\u062a\u0627\u062f ",
+     "74": " \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ",
+     "75": " \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ",
+     "76": " \u0641\u0638 ",
+     "77": " \u0645\u0637\u0627\u0631 ",
+     "78": " \u0641\u0638 ",
+     "79": " \u062f\u0648\u0627\u0621 ",
+     "80": " \u0631\u064a\u0627\u0636\u0629 ",
+     "81": " \u0631\u064a\u0627\u0636\u0629 ",
+     "82": " \u0631\u0645\u0627\u064a\u0629 ",
+     "83": " \u0631\u0645\u0627\u064a\u0629 ",
+     "84": " \u0637\u0639\u0627\u0645 ",
+     "85": " \u0637\u0639\u0627\u0645 ",
+     "86": " \u0642\u0627\u0631\u0629 ",
+     "87": " \u0646\u0648\u0648\u064a ",
+     "88": " \u0646\u0648\u0648\u064a ",
+     "89": " \u062a\u0631\u0641\u064a\u0647 ",
+     "90": " \u062a\u0631\u0641\u064a\u0647 ",
+     "91": " \u0642\u0630\u064a\u0641\u0629 ",
+     "92": " \u0623\u0631\u0636 ",
+     "93": " \u062d\u0627\u062f ",
+     "94": " \u0645\u0637\u0627\u0631 ",
+     "95": " \u0623\u0631\u0636 ",
+     "96": " \u0646\u0628\u0627\u062a ",
+     "97": " \u0646\u0628\u0627\u062a ",
+     "98": " \u0645\u0646\u0641\u062c\u0631 ",
+     "99": " \u0645\u0646\u0641\u062c\u0631 ",
+     "100": " \u0643\u064a\u0645\u064a\u0627\u0626\u064a ",
+     "101": " \u0643\u064a\u0645\u064a\u0627\u0626\u064a "
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     " ": 60,
+     " \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ": 8,
+     " \u0623\u0631\u0636 ": 95,
+     " \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ": 16,
+     " \u0625\u0639\u0644\u0627\u0645 ": 51,
+     " \u062a\u062c\u0627\u0631\u064a ": 35,
+     " \u062a\u0631\u0641\u064a\u0647 ": 90,
+     " \u062a\u0639\u0644\u064a\u0645\u064a ": 6,
+     " \u062d\u0627\u062f ": 93,
+     " \u062d\u0643\u0648\u0645\u0629 ": 33,
+     " \u062f\u0648\u0627\u0621 ": 79,
+     " \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ": 64,
+     " \u0631\u0645\u0627\u064a\u0629 ": 83,
+     " \u0631\u064a\u0627\u0636\u0629 ": 81,
+     " \u0631\u064a\u0627\u0636\u064a ": 45,
+     " \u0633\u0645\u0627\u0648\u064a ": 28,
+     " \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ": 18,
+     " \u0633\u064a\u0627\u0633\u064a ": 43,
+     " \u0634\u062e\u0635 ": 40,
+     " \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ": 63,
+     " \u0634\u0631\u0637\u0629 ": 26,
+     " \u0634\u0639\u0628(\u0623\u0645\u0629) ": 61,
+     " \u0635\u0648\u062a ": 4,
+     " \u0637\u0631\u064a\u0642 ": 49,
+     " \u0637\u0639\u0627\u0645 ": 85,
+     " \u0639\u0627\u0644\u0645 ": 22,
+     " \u0639\u062a\u0627\u062f ": 73,
+     " \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ": 66,
+     " \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ": 53,
+     " \u0641\u0638 ": 78,
+     " \u0641\u0646\u0627\u0646 ": 2,
+     " \u0641\u064a\u0644\u0645 ": 68,
+     " \u0642\u0627\u0631\u0629 ": 86,
+     " \u0642\u0630\u064a\u0641\u0629 ": 91,
+     " \u0643\u062a\u0627\u0628 ": 21,
+     " \u0643\u064a\u0645\u064a\u0627\u0626\u064a ": 101,
+     " \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ": 47,
+     " \u0645\u0627\u0621 ": 70,
+     " \u0645\u062c\u0645\u0648\u0639\u0629 ": 41,
+     " \u0645\u062d\u0627\u0645\u064a ": 58,
+     " \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ": 55,
+     " \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ": 27,
+     " \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ": 14,
+     " \u0645\u0637\u0627\u0631 ": 94,
+     " \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ": 75,
+     " \u0645\u0646\u0641\u062c\u0631 ": 99,
+     " \u0645\u0647\u0646\u062f\u0633 ": 30,
+     " \u0646\u0628\u0627\u062a ": 97,
+     " \u0646\u0648\u0648\u064a ": 88,
+     " \u0647\u0648\u0627\u0621 ": 38,
+     " \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ": 12,
+     "O": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.5.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 64000
+ }
helpers/__pycache__/helper.cpython-39.pyc ADDED
Binary file (3.76 kB).
helpers/download_model.py ADDED
@@ -0,0 +1,32 @@
+ import requests
+
+
+ def download_file_from_google_drive(id, destination):
+     """Download a (possibly large) file from Google Drive and save it to `destination`."""
+     URL = "https://docs.google.com/uc?export=download"
+
+     session = requests.Session()
+
+     response = session.get(URL, params={'id': id}, stream=True)
+     token = get_confirm_token(response)
+
+     if token:
+         # Large files require confirming Google Drive's download warning.
+         params = {'id': id, 'confirm': token}
+         response = session.get(URL, params=params, stream=True)
+
+     save_response_content(response, destination)
+
+
+ def get_confirm_token(response):
+     for key, value in response.cookies.items():
+         if key.startswith('download_warning'):
+             return value
+
+     return None
+
+
+ def save_response_content(response, destination):
+     CHUNK_SIZE = 32768
+
+     with open(destination, "wb") as f:
+         for chunk in response.iter_content(CHUNK_SIZE):
+             if chunk:  # filter out keep-alive new chunks
+                 f.write(chunk)
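+
+
+ # --- Hypothetical usage sketch (not part of the original script). The file id below is a
+ # placeholder; replace it with the Google Drive id of the checkpoint you want to fetch.
+ if __name__ == "__main__":
+     example_file_id = "YOUR_GOOGLE_DRIVE_FILE_ID"  # placeholder, not a real id
+     download_file_from_google_drive(example_file_id, "pytorch_model.bin")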
helpers/helper.py ADDED
@@ -0,0 +1,180 @@
+ import re
+
+ # Coarse-grained tag set (LOC/ORG/PERS/MISC) mapped to Arabic display names.
+ en_to_ar_camel = {
+     'B-LOC': 'مكان',
+     'B-ORG': 'مؤسسة',
+     'B-PERS': 'شخص',
+     'B-MISC': 'معنى بموضوعات متنوعة',
+     'I-LOC': 'مكان',
+     'I-ORG': 'مؤسسة',
+     'I-PERS': 'شخص',
+     'I-MISC': 'معنى بموضوعات متنوعة',
+ }
+
+ # Fine-grained tag set mapped to Arabic display names (matches the labels in config.json).
+ en_to_ar = {
+     "B-Artist": "فنان",
+     "I-Artist": "فنان",
+     "B-Sound": "صوت",
+     "I-Sound": "صوت",
+     "B-Educational": "تعليمي",
+     "I-Educational": "تعليمي",
+     "B-Building-Grounds": "أراضي البناء",
+     "I-Building-Grounds": "أراضي البناء",
+     "B-Population-Center": "مركز سكني",
+     "B-Nation": "شعب(أمة)",
+     "B-State-or-Province": "ولاية أو مقاطعة",
+     "I-State-or-Province": "ولاية أو مقاطعة",
+     "B-Water-Body": "مسطح مائي",
+     "I-Water-Body": "مسطح مائي",
+     "B-Land-Region-Natural": "أرض طبيعية",
+     "I-Land-Region-Natural": "أرض طبيعية",
+     "B-Software": "سوفتوير(برمجيات)",
+     "I-Software": "سوفتوير(برمجيات)",
+     "B-Scientist": "عالم",
+     "B-Book": "كتاب",
+     "I-Book": "كتاب",
+     "I-Scientist": "عالم",
+     "B-Group": "مجموعة",
+     "B-Celestial": "سماوي",
+     "B-Police": "شرطة",
+     "I-Police": "شرطة",
+     "I-Population-Center": "مركز سكني",
+     "I-Celestial": "سماوي",
+     "B-Engineer": "مهندس",
+     "I-Engineer": "مهندس",
+     "B-Projectile": "قذيفة",
+     "B-Government": "حكومة",
+     "I-Government": "حكومة",
+     "B-Commercial": "تجاري",
+     "I-Commercial": "تجاري",
+     "B-Continent": "قارة",
+     "B-Air": "هواء",
+     "I-Air": "هواء",
+     "B-Other_PER": "شخص",
+     "I-Other_PER": "شخص",
+     "I-Group": "مجموعة",
+     "B-Politician": "سياسي",
+     "I-Politician": "سياسي",
+     "B-Athlete": "رياضي",
+     "I-Athlete": "رياضي",
+     "B-Religious_ORG": "مؤسسة دينية",
+     "I-Religious_ORG": "مؤسسة دينية",
+     "B-Path": "طريق",
+     "I-Path": "طريق",
+     "B-Media": "إعلام",
+     "I-Media": "إعلام",
+     "B-Non-Governmental": "غير حكومي",
+     "I-Non-Governmental": "غير حكومي",
+     "B-County-or-District": "مدينة أو ضاحية",
+     "I-County-or-District": "مدينة أو ضاحية",
+     "B-Businessperson": "رجل أعمال",
+     "B-Lawyer": "محامي",
+     "I-Lawyer": "محامي",
+     "B-GPE-Cluster": "",
+     "I-GPE-Cluster": "",
+     "I-Nation": "شعب(أمة)",
+     "B-Religious_PER": "شخص ديني",
+     "I-Religious_PER": "شخص ديني",
+     "I-Businessperson": "رجل أعمال",
+     "B-Medical-Science": "علوم طبية",
+     "I-Medical-Science": "علوم طبية",
+     "B-Movie": "فيلم",
+     "I-Movie": "فيلم",
+     "B-Water": "ماء",
+     "I-Water": "ماء",
+     "B-Drug": "دواء",
+     "B-Hardware": "عتاد",
+     "I-Hardware": "عتاد",
+     "B-Subarea-Facility": "منشأة منطقة فرعية",
+     "I-Subarea-Facility": "منشأة منطقة فرعية",
+     "B-Blunt": "فظ",
+     "B-Airport": "مطار",
+     "I-Blunt": "فظ",
+     "I-Drug": "دواء",
+     "B-Sports": "رياضة",
+     "I-Sports": "رياضة",
+     "B-Shooting": "رماية",
+     "I-Shooting": "رماية",
+     "B-Food": "طعام",
+     "I-Food": "طعام",
+     "I-Continent": "قارة",
+     "B-Nuclear": "نووي",
+     "I-Nuclear": "نووي",
+     "B-Entertainment": "ترفيه",
+     "I-Entertainment": "ترفيه",
+     "I-Projectile": "قذيفة",
+     "B-Land": "أرض",
+     "B-Sharp": "حاد",
+     "I-Airport": "مطار",
+     "I-Land": "أرض",
+     "B-Plant": "نبات",
+     "I-Plant": "نبات",
+     "B-Exploding": "منفجر",
+     "I-Exploding": "منفجر",
+     "B-Chemical": "كيميائي",
+     "I-Chemical": "كيميائي",
+ }
+
+
+ def get_separate_entities(labels, tokens):
+     """
+     Takes labels and tokens and merges multi-token entities into full names,
+     e.g. ("mohamed", "salah") -> "mohamed salah".
+     Returns a list of (text, flag) tuples, where flag is 1 for named entities
+     and 0 for other tokens. The merged names are later used to search Wikipedia.
+     """
+     res = []
+     b_before = False
+     temp = ""
+     key_value = ()
+     for i in range(len(labels)):
+         curr = labels[i]
+
+         if "B-" in curr:
+             if b_before:
+                 # A new entity starts: flush the one accumulated so far.
+                 key_value = (temp[:-1], 1)
+                 res.append(key_value)
+                 temp = tokens[i] + ' '
+             else:
+                 b_before = True
+                 temp += tokens[i] + ' '
+             if i == len(labels) - 1:
+                 key_value = (temp[:-1], 1)
+                 res.append(key_value)
+
+         elif "I-" in curr:
+             # Continuation of the current entity.
+             temp += tokens[i] + ' '
+             if i == len(labels) - 1:
+                 key_value = (temp[:-1], 1)
+                 res.append(key_value)
+
+         else:
+             if temp == "":
+                 key_value = (tokens[i], 0)
+                 res.append(key_value)
+             else:
+                 # Flush the accumulated entity, then emit the non-entity token.
+                 key_value = (temp[:-1], 1)
+                 res.append(key_value)
+                 key_value = (tokens[i], 0)
+                 res.append(key_value)
+                 temp = ""
+                 b_before = False
+
+     return res
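+
+
+ # --- Hypothetical usage sketch (not part of the original module): illustrate how
+ # get_separate_entities merges B-/I- spans. The token and label lists are made up.
+ if __name__ == "__main__":
+     tokens = ["محمد", "صلاح", "يعيش", "في", "مصر"]
+     labels = ["B-Athlete", "I-Athlete", "O", "O", "B-Population-Center"]
+     # Expected output: [("محمد صلاح", 1), ("يعيش", 0), ("في", 0), ("مصر", 1)]
+     print(get_separate_entities(labels, tokens))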
main.py ADDED
@@ -0,0 +1,22 @@
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
+ from helpers import helper
+
+
+ MODEL_NAME = 'boda/ANER'
+
+
+ # Load the model and tokenizer from the current directory
+ # (use MODEL_NAME instead of '.' to load them from the Hugging Face Hub).
+ model = AutoModelForTokenClassification.from_pretrained('.')
+ tokenizer = AutoTokenizer.from_pretrained('.')
+
+ # One-off rewrite of the model labels (kept commented out):
+ # model.config.id2label = {i: ' '+v+' ' for i, v in model.config.id2label.items() if i != 0 }
+ # model.config.id2label[0] = 'O'
+ # model.config.label2id = {label: i for i, label in model.config.id2label.items()}
+
+ # Save the model when finished:
+ # model.save_pretrained('.')
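+
+
+ # --- Hypothetical usage sketch (not part of the original script): tag one sentence and
+ # map the predicted ids back to label strings via model.config.id2label.
+ import torch
+
+ sentence = "يقع نهر الأمازون في قارة أمريكا الجنوبية"
+ inputs = tokenizer(sentence, return_tensors="pt")
+ with torch.no_grad():
+     logits = model(**inputs).logits
+ predicted_ids = logits.argmax(dim=-1)[0].tolist()
+ tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+ for token, label_id in zip(tokens, predicted_ids):
+     print(token, model.config.id2label[label_id])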
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "max_len": 512, "do_basic_tokenize": true, "never_split": ["[بريد]", "[مستخدم]", "[رابط]"], "special_tokens_map_file": null, "name_or_path": "aubmindlab/bert-base-arabertv02"}
vocab.txt ADDED
The diff for this file is too large to render.