Aureliano committed on
Commit
e98aaeb
·
1 Parent(s): 32192ff
Files changed (3) hide show
  1. README.md +91 -28
  2. pytorch_model.bin +1 -1
  3. tf_model.h5 +1 -1
README.md CHANGED
@@ -17,68 +17,131 @@ This repository contains a small ELECTRA discriminator finetuned on a corpus of
17
 
18
  ```python
19
  import math
 
20
 
21
  import tensorflow as tf
22
- from datasets import Dataset, ClassLabel, Features, Value
23
- from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
 
24
 
25
  # This example shows how this model can be used:
26
- # you should finetune the model of your specific corpus if commands, bogger than this
27
  dict_train = {
28
- "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"],
29
- "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop pen", "x book", "inventory",
30
- "n", "get book", "drop paper", "examine Pen", "inv", "w"],
31
- "label": ["v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496", "v01214265", "v01977701",
32
- "v02131279", "v02472495", "v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496"]
 
 
 
 
 
 
 
 
33
  }
34
 
35
- num_labels = len(set(dict_train["label"]))
36
- features = Features({'idx': Value('uint32'), 'sentence': Value('string'),
37
- 'label': ClassLabel(names=list(set(dict_train["label"])))})
38
-
39
- raw_train_dataset = Dataset.from_dict(dict_train, features=features)
40
-
41
- discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/electra-if", num_labels=num_labels)
 
 
 
 
 
 
 
 
 
 
 
 
42
  tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")
43
 
44
  tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)
45
 
46
- pre_tokenizer_columns = set(raw_train_dataset.features)
47
- train_dataset = raw_train_dataset.map(tokenize_function, batched=True)
48
- tokenizer_columns = list(set(train_dataset.features) - pre_tokenizer_columns)
49
 
50
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
51
 
52
- batch_size = 16
53
- tf_train_dataset = train_dataset.to_tf_dataset(
54
  columns=tokenizer_columns,
55
  label_cols=["labels"],
56
  shuffle=True,
57
  batch_size=batch_size,
58
  collate_fn=data_collator
59
  )
 
 
 
 
 
 
 
60
 
61
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
62
- num_epochs = 100
63
- batches_per_epoch = math.ceil(len(train_dataset) / batch_size)
64
  total_train_steps = int(batches_per_epoch * num_epochs)
65
 
66
  optimizer, schedule = create_optimizer(
67
- init_lr=1e-5, num_warmup_steps=1, num_train_steps=total_train_steps
68
  )
69
 
70
- discriminator.compile(optimizer=optimizer, loss=loss)
 
 
 
 
 
 
 
 
 
 
 
 
71
  discriminator.fit(
72
  tf_train_dataset,
73
- epochs=num_epochs
 
 
74
  )
75
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  text = "get lamp"
77
  encoded_input = tokenizer(text, return_tensors='tf')
78
  output = discriminator(encoded_input)
79
  prediction = tf.nn.softmax(output["logits"][0], -1)
80
- label = dict_train["label"][tf.math.argmax(prediction)]
81
- print(text, ":", label)
82
- # ideally [v01214265 -> take.v.04 -> "get into one's hands, take physically"], but probably only with a better dataset
 
 
 
 
 
 
 
 
83
 
84
  ```
 
17
 
18
  ```python
19
  import math
20
+ import numpy as np
21
 
22
  import tensorflow as tf
23
+ from datasets import load_metric, Dataset, DatasetDict
24
+ from transformers import TFAutoModel, TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
25
+ from transformers.keras_callbacks import KerasMetricCallback
26
 
27
  # This example shows how this model can be used:
28
+ # you should finetune the model on your specific corpus of commands, bigger than this
29
  dict_train = {
30
+ "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
31
+ "19", "20"],
32
+ "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop the pen", "x book",
33
+ "inventory", "n", "get the book", "drop paper", "look at Pen", "inv", "g", "s", "get sandwich",
34
+ "drop sandwich", "x sandwich", "agin"],
35
+ "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04",
36
+ "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02",
37
+ "inventory.v.01", "repeat.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "repeat.v.01"]
38
+ }
39
+ dict_val = {
40
+ "idx": ["0", "1", "2", "3", "4", "5"],
41
+ "sentence": ["w", "get shield", "drop sword", "x spikes", "i", "repeat"],
42
+ "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "repeat.v.01"]
43
  }
44
 
45
+ raw_train_dataset = Dataset.from_dict(dict_train)
46
+ raw_val_dataset = Dataset.from_dict(dict_val)
47
+ raw_dataset = DatasetDict()
48
+ raw_dataset["train"] = raw_train_dataset
49
+ raw_dataset["val"] = raw_val_dataset
50
+ raw_dataset = raw_dataset.class_encode_column("label")
51
+ print(raw_dataset)
52
+ print(raw_dataset["train"].features)
53
+ print(raw_dataset["val"].features)
54
+ print(raw_dataset["train"][1])
55
+ label2id = {}
56
+ id2label = {}
57
+ for i, l in enumerate(raw_dataset["train"].features["label"].names):
58
+ label2id[l] = i
59
+ id2label[i] = l
60
+
61
+ discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/electra-if",
62
+ label2id=label2id,
63
+ id2label=id2label)
64
  tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")
65
 
66
  tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)
67
 
68
+ pre_tokenizer_columns = set(raw_dataset["train"].features)
69
+ encoded_dataset = raw_dataset.map(tokenize_function, batched=True)
70
+ tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
71
 
72
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
73
 
74
+ batch_size = len(encoded_dataset["train"])
75
+ tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
76
  columns=tokenizer_columns,
77
  label_cols=["labels"],
78
  shuffle=True,
79
  batch_size=batch_size,
80
  collate_fn=data_collator
81
  )
82
+ tf_validation_dataset = encoded_dataset["val"].to_tf_dataset(
83
+ columns=tokenizer_columns,
84
+ label_cols=["labels"],
85
+ shuffle=False,
86
+ batch_size=batch_size,
87
+ collate_fn=data_collator
88
+ )
89
 
90
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
91
+ num_epochs = 20
92
+ batches_per_epoch = math.ceil(len(encoded_dataset["train"]) / batch_size)
93
  total_train_steps = int(batches_per_epoch * num_epochs)
94
 
95
  optimizer, schedule = create_optimizer(
96
+ init_lr=5e-5, num_warmup_steps=total_train_steps // 5, num_train_steps=total_train_steps
97
  )
98
 
99
+ metric = load_metric("accuracy")
100
+
101
+
102
+ def compute_metrics(eval_predictions):
103
+ logits, labels = eval_predictions
104
+ predictions = np.argmax(logits, axis=-1)
105
+ return metric.compute(predictions=predictions, references=labels)
106
+
107
+
108
+ metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_dataset)
109
+ callbacks = [metric_callback]
110
+
111
+ discriminator.compile(optimizer=optimizer, loss=loss, metrics=["sparse_categorical_accuracy"])
112
  discriminator.fit(
113
  tf_train_dataset,
114
+ epochs=num_epochs,
115
+ validation_data=tf_validation_dataset,
116
+ callbacks=callbacks
117
  )
118
 
119
+ print("Evaluate on test data")
120
+ results = discriminator.evaluate(tf_validation_dataset)
121
+ print("test loss, test acc:", results)
122
+
123
+ text = "i"
124
+ encoded_input = tokenizer(text, return_tensors='tf')
125
+ output = discriminator(encoded_input)
126
+ prediction = tf.nn.softmax(output["logits"][0], -1)
127
+ label = id2label[tf.math.argmax(prediction).numpy()]
128
+ print("\n", text, ":", label,
129
+ "\n") # ideally 'inventory.v.01' (-> "make or include in an itemized record or report"), but probably only with a better finetuning dataset
130
+
131
  text = "get lamp"
132
  encoded_input = tokenizer(text, return_tensors='tf')
133
  output = discriminator(encoded_input)
134
  prediction = tf.nn.softmax(output["logits"][0], -1)
135
+ label = id2label[tf.math.argmax(prediction).numpy()]
136
+ print("\n", text, ":", label,
137
+ "\n") # ideally 'take.v.04' (-> "get into one's hands, take physically"), but probably only with a better finetuning dataset
138
+
139
+ text = "w"
140
+ encoded_input = tokenizer(text, return_tensors='tf')
141
+ output = discriminator(encoded_input)
142
+ prediction = tf.nn.softmax(output["logits"][0], -1)
143
+ label = id2label[tf.math.argmax(prediction).numpy()]
144
+ print("\n", text, ":", label,
145
+ "\n") # ideally 'travel.v.01' (-> "change location; move, travel, or proceed, also metaphorically"), but probably only with a better finetuning dataset
146
 
147
  ```
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a613fb2aa3569bac1a4a5cb2be88706cdccd33a72dddfc78051353e6ec07cb46
3
  size 54011377
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b585bc34677a9d35a1ef60bf1a531067fad58e9f4259dda25d63b4ae54632f93
3
  size 54011377
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56a0bfbd98a2fb0c1a57ad0c8a4dd6234cc17b2e27a21b5c33971a5045d5ecd7
3
  size 54198792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf1f25a9684bfe9bb90e65a365b9b10bdd3b413841c98dba94b900139af7c90
3
  size 54198792