Spaces:

CesarLeblanc
/

plantbert_space

Running

App Files Community

CesarLeblanc committed on Aug 26, 2024

Commit

aa09a05

1 Parent(s): 8da738a

Browse files

Files changed (13) hide show

app.py +13 -47
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/added_tokens.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/config.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/generation_config.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/model.safetensors +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/special_tokens_map.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer_config.json +0 -0
models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/vocab.txt +0 -0
models/text_classification_model/config.json +461 -2
models/text_classification_model/generation_config.json +0 -5
models/text_classification_model/model.safetensors +2 -2
models/text_classification_model/tokenizer.json +6 -1

app.py CHANGED Viewed

@@ -65,7 +65,7 @@ def gbif_normalization(text):
 def classification(text, k):
     text = gbif_normalization(text)
     result = classification_model(text)
-    habitat_labels = [res['label'] for res in result[:k]]
     if k == 1:
         text = f"This vegetation plot belongs to the habitat {habitat_labels[0]}."
     else:
@@ -75,70 +75,36 @@ def classification(text, k):
 def masking(text):
     text = gbif_normalization(text)
     max_score = 0
     best_prediction = None
     best_position = None
     best_sentence = None
-    # Case for the first position
-    masked_text = "[MASK], " + ', '.join(text.split(', '))
-    i = 0
-    while True:
-        prediction = mask_model(masked_text)[i]
-        species = prediction['token_str']
-        if species in text.split(', '):
-            i+=1
-        else:
-            break
-    score = prediction['score']
-    sentence = prediction['sequence']
-    if score > max_score:
-        max_score = score
-        best_prediction = species
-        best_position = 0
-        best_sentence = sentence
-    # Loop through each position in the middle of the sentence
-    for i in range(1, len(text.split(', '))):
-        masked_text = ', '.join(text.split(', ')[:i]) + ', [MASK], ' + ', '.join(text.split(', ')[i:])
-        i = 0
         while True:
-            prediction = mask_model(masked_text)[i]
             species = prediction['token_str']
-            if species in text.split(', '):
-                i+=1
             else:
                 break
         score = prediction['score']
         sentence = prediction['sequence']
         # Update best prediction and position if score is higher
         if score > max_score:
             max_score = score
             best_prediction = species
             best_position = i
             best_sentence = sentence
-    # Case for the last position
-    masked_text = ', '.join(text.split(', ')) + ', [MASK]'
-    i = 0
-    while True:
-        prediction = mask_model(masked_text)[i]
-        species = prediction['token_str']
-        if species in text.split(', '):
-            i+=1
-        else:
-            break
-    score = prediction['score']
-    sentence = prediction['sequence']
-    if score > max_score:
-        max_score = score
-        best_prediction = species
-        best_position = len(text.split(', '))
-        best_sentence = sentence
     text = f"The most likely missing species is {best_prediction} (position {best_position}).\nThe new vegetation plot is {best_sentence}."
     image = return_species_image(best_prediction)

 def classification(text, k):
     text = gbif_normalization(text)
     result = classification_model(text)
+    habitat_labels = [res['label'] for res in result[0][:k]]
     if k == 1:
         text = f"This vegetation plot belongs to the habitat {habitat_labels[0]}."
     else:
 def masking(text):
     text = gbif_normalization(text)
+    text_split = text.split(', ')
     max_score = 0
     best_prediction = None
     best_position = None
     best_sentence = None
+    # Loop through each position in the sentence
+    for i in range(len(text_split) + 1):
+        # Create masked text
+        masked_text = ', '.join(text_split[:i] + ['[MASK]'] + text_split[i:])
+        j = 0
         while True:
+            prediction = mask_model(masked_text)[j]
             species = prediction['token_str']
+            if species in text_split:
+                j += 1
             else:
                 break
         score = prediction['score']
         sentence = prediction['sequence']
         # Update best prediction and position if score is higher
         if score > max_score:
             max_score = score
             best_prediction = species
             best_position = i
             best_sentence = sentence
     text = f"The most likely missing species is {best_prediction} (position {best_position}).\nThe new vegetation plot is {best_sentence}."
     image = return_species_image(best_prediction)

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/added_tokens.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/config.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/generation_config.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/model.safetensors RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/special_tokens_map.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/tokenizer_config.json RENAMED Viewed

File without changes

models/{fill_mask_model → plantbert_fill_mask_model_large-species_32_2e-05}/vocab.txt RENAMED Viewed

File without changes

models/text_classification_model/config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "_name_or_path": "../Models/bert-large-uncased/",
   "architectures": [
-    "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
@@ -9,8 +9,466 @@
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
@@ -18,6 +476,7 @@
   "num_hidden_layers": 24,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.36.2",
   "type_vocab_size": 2,

 {
+  "_name_or_path": "../Models/plantbert_fill_mask_model_large-species_32_2e-05/",
   "architectures": [
+    "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 1024,
+  "id2label": {
+    "0": "MA211",
+    "1": "MA221",
+    "2": "MA222",
+    "3": "MA223",
+    "4": "MA224",
+    "5": "MA225",
+    "6": "MA232",
+    "7": "MA241",
+    "8": "MA251",
+    "9": "MA252",
+    "10": "MA253",
+    "11": "N11",
+    "12": "N12",
+    "13": "N13",
+    "14": "N14",
+    "15": "N15",
+    "16": "N16",
+    "17": "N17",
+    "18": "N18",
+    "19": "N19",
+    "20": "N1A",
+    "21": "N1B",
+    "22": "N1C",
+    "23": "N1D",
+    "24": "N1E",
+    "25": "N1F",
+    "26": "N1G",
+    "27": "N1H",
+    "28": "N1J",
+    "29": "N21",
+    "30": "N22",
+    "31": "N31",
+    "32": "N32",
+    "33": "N33",
+    "34": "N34",
+    "35": "N35",
+    "36": "Q11",
+    "37": "Q12",
+    "38": "Q21",
+    "39": "Q22",
+    "40": "Q23",
+    "41": "Q24",
+    "42": "Q25",
+    "43": "Q41",
+    "44": "Q42",
+    "45": "Q43",
+    "46": "Q44",
+    "47": "Q45",
+    "48": "Q46",
+    "49": "Q51",
+    "50": "Q52",
+    "51": "Q53",
+    "52": "Q54",
+    "53": "R11",
+    "54": "R12",
+    "55": "R13",
+    "56": "R14",
+    "57": "R15",
+    "58": "R16",
+    "59": "R17",
+    "60": "R18",
+    "61": "R19",
+    "62": "R1A",
+    "63": "R1B",
+    "64": "R1C",
+    "65": "R1D",
+    "66": "R1E",
+    "67": "R1F",
+    "68": "R1G",
+    "69": "R1H",
+    "70": "R1J",
+    "71": "R1K",
+    "72": "R1M",
+    "73": "R1P",
+    "74": "R1Q",
+    "75": "R1R",
+    "76": "R1S",
+    "77": "R21",
+    "78": "R22",
+    "79": "R23",
+    "80": "R24",
+    "81": "R31",
+    "82": "R32",
+    "83": "R33",
+    "84": "R34",
+    "85": "R35",
+    "86": "R36",
+    "87": "R37",
+    "88": "R41",
+    "89": "R42",
+    "90": "R43",
+    "91": "R44",
+    "92": "R45",
+    "93": "R51",
+    "94": "R52",
+    "95": "R53",
+    "96": "R54",
+    "97": "R55",
+    "98": "R56",
+    "99": "R57",
+    "100": "R61",
+    "101": "R62",
+    "102": "R63",
+    "103": "R64",
+    "104": "R65",
+    "105": "S11",
+    "106": "S12",
+    "107": "S21",
+    "108": "S22",
+    "109": "S23",
+    "110": "S24",
+    "111": "S25",
+    "112": "S26",
+    "113": "S31",
+    "114": "S32",
+    "115": "S33",
+    "116": "S34",
+    "117": "S35",
+    "118": "S36",
+    "119": "S37",
+    "120": "S38",
+    "121": "S41",
+    "122": "S42",
+    "123": "S51",
+    "124": "S52",
+    "125": "S53",
+    "126": "S54",
+    "127": "S61",
+    "128": "S62",
+    "129": "S63",
+    "130": "S64",
+    "131": "S65",
+    "132": "S66",
+    "133": "S67",
+    "134": "S68",
+    "135": "S71",
+    "136": "S72",
+    "137": "S73",
+    "138": "S74",
+    "139": "S75",
+    "140": "S76",
+    "141": "S81",
+    "142": "S82",
+    "143": "S91",
+    "144": "S92",
+    "145": "S93",
+    "146": "S94",
+    "147": "T11",
+    "148": "T12",
+    "149": "T13",
+    "150": "T14",
+    "151": "T15",
+    "152": "T16",
+    "153": "T17",
+    "154": "T18",
+    "155": "T19",
+    "156": "T1A",
+    "157": "T1B",
+    "158": "T1C",
+    "159": "T1D",
+    "160": "T1E",
+    "161": "T1F",
+    "162": "T1G",
+    "163": "T1H",
+    "164": "T21",
+    "165": "T22",
+    "166": "T23",
+    "167": "T24",
+    "168": "T25",
+    "169": "T27",
+    "170": "T28",
+    "171": "T29",
+    "172": "T31",
+    "173": "T32",
+    "174": "T33",
+    "175": "T34",
+    "176": "T35",
+    "177": "T36",
+    "178": "T37",
+    "179": "T38",
+    "180": "T39",
+    "181": "T3A",
+    "182": "T3B",
+    "183": "T3C",
+    "184": "T3D",
+    "185": "T3E",
+    "186": "T3F",
+    "187": "T3G",
+    "188": "T3H",
+    "189": "T3J",
+    "190": "T3K",
+    "191": "T3M",
+    "192": "U21",
+    "193": "U22",
+    "194": "U23",
+    "195": "U24",
+    "196": "U25",
+    "197": "U26",
+    "198": "U27",
+    "199": "U28",
+    "200": "U29",
+    "201": "U2A",
+    "202": "U32",
+    "203": "U33",
+    "204": "U34",
+    "205": "U35",
+    "206": "U36",
+    "207": "U37",
+    "208": "U38",
+    "209": "U3A",
+    "210": "U3B",
+    "211": "U3C",
+    "212": "U3D",
+    "213": "U61",
+    "214": "U62",
+    "215": "V11",
+    "216": "V12",
+    "217": "V13",
+    "218": "V14",
+    "219": "V15",
+    "220": "V32",
+    "221": "V33",
+    "222": "V34",
+    "223": "V35",
+    "224": "V37",
+    "225": "V38",
+    "226": "V39"
+  },
   "initializer_range": 0.02,
   "intermediate_size": 4096,
+  "label2id": {
+    "MA211": 0,
+    "MA221": 1,
+    "MA222": 2,
+    "MA223": 3,
+    "MA224": 4,
+    "MA225": 5,
+    "MA232": 6,
+    "MA241": 7,
+    "MA251": 8,
+    "MA252": 9,
+    "MA253": 10,
+    "N11": 11,
+    "N12": 12,
+    "N13": 13,
+    "N14": 14,
+    "N15": 15,
+    "N16": 16,
+    "N17": 17,
+    "N18": 18,
+    "N19": 19,
+    "N1A": 20,
+    "N1B": 21,
+    "N1C": 22,
+    "N1D": 23,
+    "N1E": 24,
+    "N1F": 25,
+    "N1G": 26,
+    "N1H": 27,
+    "N1J": 28,
+    "N21": 29,
+    "N22": 30,
+    "N31": 31,
+    "N32": 32,
+    "N33": 33,
+    "N34": 34,
+    "N35": 35,
+    "Q11": 36,
+    "Q12": 37,
+    "Q21": 38,
+    "Q22": 39,
+    "Q23": 40,
+    "Q24": 41,
+    "Q25": 42,
+    "Q41": 43,
+    "Q42": 44,
+    "Q43": 45,
+    "Q44": 46,
+    "Q45": 47,
+    "Q46": 48,
+    "Q51": 49,
+    "Q52": 50,
+    "Q53": 51,
+    "Q54": 52,
+    "R11": 53,
+    "R12": 54,
+    "R13": 55,
+    "R14": 56,
+    "R15": 57,
+    "R16": 58,
+    "R17": 59,
+    "R18": 60,
+    "R19": 61,
+    "R1A": 62,
+    "R1B": 63,
+    "R1C": 64,
+    "R1D": 65,
+    "R1E": 66,
+    "R1F": 67,
+    "R1G": 68,
+    "R1H": 69,
+    "R1J": 70,
+    "R1K": 71,
+    "R1M": 72,
+    "R1P": 73,
+    "R1Q": 74,
+    "R1R": 75,
+    "R1S": 76,
+    "R21": 77,
+    "R22": 78,
+    "R23": 79,
+    "R24": 80,
+    "R31": 81,
+    "R32": 82,
+    "R33": 83,
+    "R34": 84,
+    "R35": 85,
+    "R36": 86,
+    "R37": 87,
+    "R41": 88,
+    "R42": 89,
+    "R43": 90,
+    "R44": 91,
+    "R45": 92,
+    "R51": 93,
+    "R52": 94,
+    "R53": 95,
+    "R54": 96,
+    "R55": 97,
+    "R56": 98,
+    "R57": 99,
+    "R61": 100,
+    "R62": 101,
+    "R63": 102,
+    "R64": 103,
+    "R65": 104,
+    "S11": 105,
+    "S12": 106,
+    "S21": 107,
+    "S22": 108,
+    "S23": 109,
+    "S24": 110,
+    "S25": 111,
+    "S26": 112,
+    "S31": 113,
+    "S32": 114,
+    "S33": 115,
+    "S34": 116,
+    "S35": 117,
+    "S36": 118,
+    "S37": 119,
+    "S38": 120,
+    "S41": 121,
+    "S42": 122,
+    "S51": 123,
+    "S52": 124,
+    "S53": 125,
+    "S54": 126,
+    "S61": 127,
+    "S62": 128,
+    "S63": 129,
+    "S64": 130,
+    "S65": 131,
+    "S66": 132,
+    "S67": 133,
+    "S68": 134,
+    "S71": 135,
+    "S72": 136,
+    "S73": 137,
+    "S74": 138,
+    "S75": 139,
+    "S76": 140,
+    "S81": 141,
+    "S82": 142,
+    "S91": 143,
+    "S92": 144,
+    "S93": 145,
+    "S94": 146,
+    "T11": 147,
+    "T12": 148,
+    "T13": 149,
+    "T14": 150,
+    "T15": 151,
+    "T16": 152,
+    "T17": 153,
+    "T18": 154,
+    "T19": 155,
+    "T1A": 156,
+    "T1B": 157,
+    "T1C": 158,
+    "T1D": 159,
+    "T1E": 160,
+    "T1F": 161,
+    "T1G": 162,
+    "T1H": 163,
+    "T21": 164,
+    "T22": 165,
+    "T23": 166,
+    "T24": 167,
+    "T25": 168,
+    "T27": 169,
+    "T28": 170,
+    "T29": 171,
+    "T31": 172,
+    "T32": 173,
+    "T33": 174,
+    "T34": 175,
+    "T35": 176,
+    "T36": 177,
+    "T37": 178,
+    "T38": 179,
+    "T39": 180,
+    "T3A": 181,
+    "T3B": 182,
+    "T3C": 183,
+    "T3D": 184,
+    "T3E": 185,
+    "T3F": 186,
+    "T3G": 187,
+    "T3H": 188,
+    "T3J": 189,
+    "T3K": 190,
+    "T3M": 191,
+    "U21": 192,
+    "U22": 193,
+    "U23": 194,
+    "U24": 195,
+    "U25": 196,
+    "U26": 197,
+    "U27": 198,
+    "U28": 199,
+    "U29": 200,
+    "U2A": 201,
+    "U32": 202,
+    "U33": 203,
+    "U34": 204,
+    "U35": 205,
+    "U36": 206,
+    "U37": 207,
+    "U38": 208,
+    "U3A": 209,
+    "U3B": 210,
+    "U3C": 211,
+    "U3D": 212,
+    "U61": 213,
+    "U62": 214,
+    "V11": 215,
+    "V12": 216,
+    "V13": 217,
+    "V14": 218,
+    "V15": 219,
+    "V32": 220,
+    "V33": 221,
+    "V34": 222,
+    "V35": 223,
+    "V37": 224,
+    "V38": 225,
+    "V39": 226
+  },
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
   "num_hidden_layers": 24,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
   "transformers_version": "4.36.2",
   "type_vocab_size": 2,

models/text_classification_model/generation_config.json DELETED Viewed

@@ -1,5 +0,0 @@
-{
-  "_from_model_config": true,
-  "pad_token_id": 0,
-  "transformers_version": "4.36.2"
-}

models/text_classification_model/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:962e8280624491e04f2d68af04c50ccaeb07d0bb9218652a0e7f2db6263c0fec
-size 1398907656

 version https://git-lfs.github.com/spec/v1
+oid sha256:7568c87fa274d42e4920c5fba1ea7d86877494e4b0c3e306319973cd6af535fd
+size 1399651156

models/text_classification_model/tokenizer.json CHANGED Viewed

@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
   "padding": null,
   "added_tokens": [
     {

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {