CesarLeblanc committed on
Commit
aa09a05
•
1 Parent(s): 8da738a
app.py CHANGED
@@ -65,7 +65,7 @@ def gbif_normalization(text):
65
  def classification(text, k):
66
  text = gbif_normalization(text)
67
  result = classification_model(text)
68
- habitat_labels = [res['label'] for res in result[:k]]
69
  if k == 1:
70
  text = f"This vegetation plot belongs to the habitat {habitat_labels[0]}."
71
  else:
@@ -75,70 +75,36 @@ def classification(text, k):
75
 
76
  def masking(text):
77
  text = gbif_normalization(text)
 
78
 
79
  max_score = 0
80
  best_prediction = None
81
  best_position = None
82
  best_sentence = None
83
 
84
- # Case for the first position
85
- masked_text = "[MASK], " + ', '.join(text.split(', '))
86
- i = 0
87
- while True:
88
- prediction = mask_model(masked_text)[i]
89
- species = prediction['token_str']
90
- if species in text.split(', '):
91
- i+=1
92
- else:
93
- break
94
- score = prediction['score']
95
- sentence = prediction['sequence']
96
-
97
- if score > max_score:
98
- max_score = score
99
- best_prediction = species
100
- best_position = 0
101
- best_sentence = sentence
102
-
103
- # Loop through each position in the middle of the sentence
104
- for i in range(1, len(text.split(', '))):
105
- masked_text = ', '.join(text.split(', ')[:i]) + ', [MASK], ' + ', '.join(text.split(', ')[i:])
106
- i = 0
107
  while True:
108
- prediction = mask_model(masked_text)[i]
109
  species = prediction['token_str']
110
- if species in text.split(', '):
111
- i+=1
112
  else:
113
  break
 
114
  score = prediction['score']
115
  sentence = prediction['sequence']
116
-
117
  # Update best prediction and position if score is higher
118
  if score > max_score:
119
  max_score = score
120
  best_prediction = species
121
  best_position = i
122
  best_sentence = sentence
123
-
124
- # Case for the last position
125
- masked_text = ', '.join(text.split(', ')) + ', [MASK]'
126
- i = 0
127
- while True:
128
- prediction = mask_model(masked_text)[i]
129
- species = prediction['token_str']
130
- if species in text.split(', '):
131
- i+=1
132
- else:
133
- break
134
- score = prediction['score']
135
- sentence = prediction['sequence']
136
-
137
- if score > max_score:
138
- max_score = score
139
- best_prediction = species
140
- best_position = len(text.split(', '))
141
- best_sentence = sentence
142
 
143
  text = f"The most likely missing species is {best_prediction} (position {best_position}).\nThe new vegetation plot is {best_sentence}."
144
  image = return_species_image(best_prediction)
 
65
  def classification(text, k):
66
  text = gbif_normalization(text)
67
  result = classification_model(text)
68
+ habitat_labels = [res['label'] for res in result[0][:k]]
69
  if k == 1:
70
  text = f"This vegetation plot belongs to the habitat {habitat_labels[0]}."
71
  else:
 
75
 
76
  def masking(text):
77
  text = gbif_normalization(text)
78
+ text_split = text.split(', ')
79
 
80
  max_score = 0
81
  best_prediction = None
82
  best_position = None
83
  best_sentence = None
84
 
85
+ # Loop through each position in the sentence
86
+ for i in range(len(text_split) + 1):
87
+ # Create masked text
88
+ masked_text = ', '.join(text_split[:i] + ['[MASK]'] + text_split[i:])
89
+
90
+ j = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  while True:
92
+ prediction = mask_model(masked_text)[j]
93
  species = prediction['token_str']
94
+ if species in text_split:
95
+ j += 1
96
  else:
97
  break
98
+
99
  score = prediction['score']
100
  sentence = prediction['sequence']
101
+
102
  # Update best prediction and position if score is higher
103
  if score > max_score:
104
  max_score = score
105
  best_prediction = species
106
  best_position = i
107
  best_sentence = sentence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  text = f"The most likely missing species is {best_prediction} (position {best_position}).\nThe new vegetation plot is {best_sentence}."
110
  image = return_species_image(best_prediction)
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/added_tokens.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/config.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/generation_config.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/model.safetensors RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/special_tokens_map.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer_config.json RENAMED
File without changes
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/vocab.txt RENAMED
File without changes
models/text_classification_model/config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "_name_or_path": "../Models/bert-large-uncased/",
3
  "architectures": [
4
- "BertForMaskedLM"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "classifier_dropout": null,
@@ -9,8 +9,466 @@
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 1024,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "initializer_range": 0.02,
13
  "intermediate_size": 4096,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "layer_norm_eps": 1e-12,
15
  "max_position_embeddings": 512,
16
  "model_type": "bert",
@@ -18,6 +476,7 @@
18
  "num_hidden_layers": 24,
19
  "pad_token_id": 0,
20
  "position_embedding_type": "absolute",
 
21
  "torch_dtype": "float32",
22
  "transformers_version": "4.36.2",
23
  "type_vocab_size": 2,
 
1
  {
2
+ "_name_or_path": "../Models/plantbert_fill_mask_model_large-species_32_2e-05/",
3
  "architectures": [
4
+ "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "classifier_dropout": null,
 
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "MA211",
14
+ "1": "MA221",
15
+ "2": "MA222",
16
+ "3": "MA223",
17
+ "4": "MA224",
18
+ "5": "MA225",
19
+ "6": "MA232",
20
+ "7": "MA241",
21
+ "8": "MA251",
22
+ "9": "MA252",
23
+ "10": "MA253",
24
+ "11": "N11",
25
+ "12": "N12",
26
+ "13": "N13",
27
+ "14": "N14",
28
+ "15": "N15",
29
+ "16": "N16",
30
+ "17": "N17",
31
+ "18": "N18",
32
+ "19": "N19",
33
+ "20": "N1A",
34
+ "21": "N1B",
35
+ "22": "N1C",
36
+ "23": "N1D",
37
+ "24": "N1E",
38
+ "25": "N1F",
39
+ "26": "N1G",
40
+ "27": "N1H",
41
+ "28": "N1J",
42
+ "29": "N21",
43
+ "30": "N22",
44
+ "31": "N31",
45
+ "32": "N32",
46
+ "33": "N33",
47
+ "34": "N34",
48
+ "35": "N35",
49
+ "36": "Q11",
50
+ "37": "Q12",
51
+ "38": "Q21",
52
+ "39": "Q22",
53
+ "40": "Q23",
54
+ "41": "Q24",
55
+ "42": "Q25",
56
+ "43": "Q41",
57
+ "44": "Q42",
58
+ "45": "Q43",
59
+ "46": "Q44",
60
+ "47": "Q45",
61
+ "48": "Q46",
62
+ "49": "Q51",
63
+ "50": "Q52",
64
+ "51": "Q53",
65
+ "52": "Q54",
66
+ "53": "R11",
67
+ "54": "R12",
68
+ "55": "R13",
69
+ "56": "R14",
70
+ "57": "R15",
71
+ "58": "R16",
72
+ "59": "R17",
73
+ "60": "R18",
74
+ "61": "R19",
75
+ "62": "R1A",
76
+ "63": "R1B",
77
+ "64": "R1C",
78
+ "65": "R1D",
79
+ "66": "R1E",
80
+ "67": "R1F",
81
+ "68": "R1G",
82
+ "69": "R1H",
83
+ "70": "R1J",
84
+ "71": "R1K",
85
+ "72": "R1M",
86
+ "73": "R1P",
87
+ "74": "R1Q",
88
+ "75": "R1R",
89
+ "76": "R1S",
90
+ "77": "R21",
91
+ "78": "R22",
92
+ "79": "R23",
93
+ "80": "R24",
94
+ "81": "R31",
95
+ "82": "R32",
96
+ "83": "R33",
97
+ "84": "R34",
98
+ "85": "R35",
99
+ "86": "R36",
100
+ "87": "R37",
101
+ "88": "R41",
102
+ "89": "R42",
103
+ "90": "R43",
104
+ "91": "R44",
105
+ "92": "R45",
106
+ "93": "R51",
107
+ "94": "R52",
108
+ "95": "R53",
109
+ "96": "R54",
110
+ "97": "R55",
111
+ "98": "R56",
112
+ "99": "R57",
113
+ "100": "R61",
114
+ "101": "R62",
115
+ "102": "R63",
116
+ "103": "R64",
117
+ "104": "R65",
118
+ "105": "S11",
119
+ "106": "S12",
120
+ "107": "S21",
121
+ "108": "S22",
122
+ "109": "S23",
123
+ "110": "S24",
124
+ "111": "S25",
125
+ "112": "S26",
126
+ "113": "S31",
127
+ "114": "S32",
128
+ "115": "S33",
129
+ "116": "S34",
130
+ "117": "S35",
131
+ "118": "S36",
132
+ "119": "S37",
133
+ "120": "S38",
134
+ "121": "S41",
135
+ "122": "S42",
136
+ "123": "S51",
137
+ "124": "S52",
138
+ "125": "S53",
139
+ "126": "S54",
140
+ "127": "S61",
141
+ "128": "S62",
142
+ "129": "S63",
143
+ "130": "S64",
144
+ "131": "S65",
145
+ "132": "S66",
146
+ "133": "S67",
147
+ "134": "S68",
148
+ "135": "S71",
149
+ "136": "S72",
150
+ "137": "S73",
151
+ "138": "S74",
152
+ "139": "S75",
153
+ "140": "S76",
154
+ "141": "S81",
155
+ "142": "S82",
156
+ "143": "S91",
157
+ "144": "S92",
158
+ "145": "S93",
159
+ "146": "S94",
160
+ "147": "T11",
161
+ "148": "T12",
162
+ "149": "T13",
163
+ "150": "T14",
164
+ "151": "T15",
165
+ "152": "T16",
166
+ "153": "T17",
167
+ "154": "T18",
168
+ "155": "T19",
169
+ "156": "T1A",
170
+ "157": "T1B",
171
+ "158": "T1C",
172
+ "159": "T1D",
173
+ "160": "T1E",
174
+ "161": "T1F",
175
+ "162": "T1G",
176
+ "163": "T1H",
177
+ "164": "T21",
178
+ "165": "T22",
179
+ "166": "T23",
180
+ "167": "T24",
181
+ "168": "T25",
182
+ "169": "T27",
183
+ "170": "T28",
184
+ "171": "T29",
185
+ "172": "T31",
186
+ "173": "T32",
187
+ "174": "T33",
188
+ "175": "T34",
189
+ "176": "T35",
190
+ "177": "T36",
191
+ "178": "T37",
192
+ "179": "T38",
193
+ "180": "T39",
194
+ "181": "T3A",
195
+ "182": "T3B",
196
+ "183": "T3C",
197
+ "184": "T3D",
198
+ "185": "T3E",
199
+ "186": "T3F",
200
+ "187": "T3G",
201
+ "188": "T3H",
202
+ "189": "T3J",
203
+ "190": "T3K",
204
+ "191": "T3M",
205
+ "192": "U21",
206
+ "193": "U22",
207
+ "194": "U23",
208
+ "195": "U24",
209
+ "196": "U25",
210
+ "197": "U26",
211
+ "198": "U27",
212
+ "199": "U28",
213
+ "200": "U29",
214
+ "201": "U2A",
215
+ "202": "U32",
216
+ "203": "U33",
217
+ "204": "U34",
218
+ "205": "U35",
219
+ "206": "U36",
220
+ "207": "U37",
221
+ "208": "U38",
222
+ "209": "U3A",
223
+ "210": "U3B",
224
+ "211": "U3C",
225
+ "212": "U3D",
226
+ "213": "U61",
227
+ "214": "U62",
228
+ "215": "V11",
229
+ "216": "V12",
230
+ "217": "V13",
231
+ "218": "V14",
232
+ "219": "V15",
233
+ "220": "V32",
234
+ "221": "V33",
235
+ "222": "V34",
236
+ "223": "V35",
237
+ "224": "V37",
238
+ "225": "V38",
239
+ "226": "V39"
240
+ },
241
  "initializer_range": 0.02,
242
  "intermediate_size": 4096,
243
+ "label2id": {
244
+ "MA211": 0,
245
+ "MA221": 1,
246
+ "MA222": 2,
247
+ "MA223": 3,
248
+ "MA224": 4,
249
+ "MA225": 5,
250
+ "MA232": 6,
251
+ "MA241": 7,
252
+ "MA251": 8,
253
+ "MA252": 9,
254
+ "MA253": 10,
255
+ "N11": 11,
256
+ "N12": 12,
257
+ "N13": 13,
258
+ "N14": 14,
259
+ "N15": 15,
260
+ "N16": 16,
261
+ "N17": 17,
262
+ "N18": 18,
263
+ "N19": 19,
264
+ "N1A": 20,
265
+ "N1B": 21,
266
+ "N1C": 22,
267
+ "N1D": 23,
268
+ "N1E": 24,
269
+ "N1F": 25,
270
+ "N1G": 26,
271
+ "N1H": 27,
272
+ "N1J": 28,
273
+ "N21": 29,
274
+ "N22": 30,
275
+ "N31": 31,
276
+ "N32": 32,
277
+ "N33": 33,
278
+ "N34": 34,
279
+ "N35": 35,
280
+ "Q11": 36,
281
+ "Q12": 37,
282
+ "Q21": 38,
283
+ "Q22": 39,
284
+ "Q23": 40,
285
+ "Q24": 41,
286
+ "Q25": 42,
287
+ "Q41": 43,
288
+ "Q42": 44,
289
+ "Q43": 45,
290
+ "Q44": 46,
291
+ "Q45": 47,
292
+ "Q46": 48,
293
+ "Q51": 49,
294
+ "Q52": 50,
295
+ "Q53": 51,
296
+ "Q54": 52,
297
+ "R11": 53,
298
+ "R12": 54,
299
+ "R13": 55,
300
+ "R14": 56,
301
+ "R15": 57,
302
+ "R16": 58,
303
+ "R17": 59,
304
+ "R18": 60,
305
+ "R19": 61,
306
+ "R1A": 62,
307
+ "R1B": 63,
308
+ "R1C": 64,
309
+ "R1D": 65,
310
+ "R1E": 66,
311
+ "R1F": 67,
312
+ "R1G": 68,
313
+ "R1H": 69,
314
+ "R1J": 70,
315
+ "R1K": 71,
316
+ "R1M": 72,
317
+ "R1P": 73,
318
+ "R1Q": 74,
319
+ "R1R": 75,
320
+ "R1S": 76,
321
+ "R21": 77,
322
+ "R22": 78,
323
+ "R23": 79,
324
+ "R24": 80,
325
+ "R31": 81,
326
+ "R32": 82,
327
+ "R33": 83,
328
+ "R34": 84,
329
+ "R35": 85,
330
+ "R36": 86,
331
+ "R37": 87,
332
+ "R41": 88,
333
+ "R42": 89,
334
+ "R43": 90,
335
+ "R44": 91,
336
+ "R45": 92,
337
+ "R51": 93,
338
+ "R52": 94,
339
+ "R53": 95,
340
+ "R54": 96,
341
+ "R55": 97,
342
+ "R56": 98,
343
+ "R57": 99,
344
+ "R61": 100,
345
+ "R62": 101,
346
+ "R63": 102,
347
+ "R64": 103,
348
+ "R65": 104,
349
+ "S11": 105,
350
+ "S12": 106,
351
+ "S21": 107,
352
+ "S22": 108,
353
+ "S23": 109,
354
+ "S24": 110,
355
+ "S25": 111,
356
+ "S26": 112,
357
+ "S31": 113,
358
+ "S32": 114,
359
+ "S33": 115,
360
+ "S34": 116,
361
+ "S35": 117,
362
+ "S36": 118,
363
+ "S37": 119,
364
+ "S38": 120,
365
+ "S41": 121,
366
+ "S42": 122,
367
+ "S51": 123,
368
+ "S52": 124,
369
+ "S53": 125,
370
+ "S54": 126,
371
+ "S61": 127,
372
+ "S62": 128,
373
+ "S63": 129,
374
+ "S64": 130,
375
+ "S65": 131,
376
+ "S66": 132,
377
+ "S67": 133,
378
+ "S68": 134,
379
+ "S71": 135,
380
+ "S72": 136,
381
+ "S73": 137,
382
+ "S74": 138,
383
+ "S75": 139,
384
+ "S76": 140,
385
+ "S81": 141,
386
+ "S82": 142,
387
+ "S91": 143,
388
+ "S92": 144,
389
+ "S93": 145,
390
+ "S94": 146,
391
+ "T11": 147,
392
+ "T12": 148,
393
+ "T13": 149,
394
+ "T14": 150,
395
+ "T15": 151,
396
+ "T16": 152,
397
+ "T17": 153,
398
+ "T18": 154,
399
+ "T19": 155,
400
+ "T1A": 156,
401
+ "T1B": 157,
402
+ "T1C": 158,
403
+ "T1D": 159,
404
+ "T1E": 160,
405
+ "T1F": 161,
406
+ "T1G": 162,
407
+ "T1H": 163,
408
+ "T21": 164,
409
+ "T22": 165,
410
+ "T23": 166,
411
+ "T24": 167,
412
+ "T25": 168,
413
+ "T27": 169,
414
+ "T28": 170,
415
+ "T29": 171,
416
+ "T31": 172,
417
+ "T32": 173,
418
+ "T33": 174,
419
+ "T34": 175,
420
+ "T35": 176,
421
+ "T36": 177,
422
+ "T37": 178,
423
+ "T38": 179,
424
+ "T39": 180,
425
+ "T3A": 181,
426
+ "T3B": 182,
427
+ "T3C": 183,
428
+ "T3D": 184,
429
+ "T3E": 185,
430
+ "T3F": 186,
431
+ "T3G": 187,
432
+ "T3H": 188,
433
+ "T3J": 189,
434
+ "T3K": 190,
435
+ "T3M": 191,
436
+ "U21": 192,
437
+ "U22": 193,
438
+ "U23": 194,
439
+ "U24": 195,
440
+ "U25": 196,
441
+ "U26": 197,
442
+ "U27": 198,
443
+ "U28": 199,
444
+ "U29": 200,
445
+ "U2A": 201,
446
+ "U32": 202,
447
+ "U33": 203,
448
+ "U34": 204,
449
+ "U35": 205,
450
+ "U36": 206,
451
+ "U37": 207,
452
+ "U38": 208,
453
+ "U3A": 209,
454
+ "U3B": 210,
455
+ "U3C": 211,
456
+ "U3D": 212,
457
+ "U61": 213,
458
+ "U62": 214,
459
+ "V11": 215,
460
+ "V12": 216,
461
+ "V13": 217,
462
+ "V14": 218,
463
+ "V15": 219,
464
+ "V32": 220,
465
+ "V33": 221,
466
+ "V34": 222,
467
+ "V35": 223,
468
+ "V37": 224,
469
+ "V38": 225,
470
+ "V39": 226
471
+ },
472
  "layer_norm_eps": 1e-12,
473
  "max_position_embeddings": 512,
474
  "model_type": "bert",
 
476
  "num_hidden_layers": 24,
477
  "pad_token_id": 0,
478
  "position_embedding_type": "absolute",
479
+ "problem_type": "single_label_classification",
480
  "torch_dtype": "float32",
481
  "transformers_version": "4.36.2",
482
  "type_vocab_size": 2,
models/text_classification_model/generation_config.json DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "pad_token_id": 0,
4
- "transformers_version": "4.36.2"
5
- }
 
 
 
 
 
 
models/text_classification_model/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:962e8280624491e04f2d68af04c50ccaeb07d0bb9218652a0e7f2db6263c0fec
3
- size 1398907656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7568c87fa274d42e4920c5fba1ea7d86877494e4b0c3e306319973cd6af535fd
3
+ size 1399651156
models/text_classification_model/tokenizer.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
  "padding": null,
10
  "added_tokens": [
11
  {