Update

- README.md +91 -28
- pytorch_model.bin +1 -1
- tf_model.h5 +1 -1

README.md
CHANGED
@@ -17,68 +17,131 @@ This repository contains a small ELECTRA discriminator finetuned on a corpus of
```python
import math

import tensorflow as tf
-from datasets import
-from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer

# This example shows how this model can be used:
-# you should finetune the model of your specific corpus if commands,
dict_train = {
-    "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"
}

tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")

tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

-pre_tokenizer_columns = set(
-tokenizer_columns = list(set(

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

-batch_size =
-tf_train_dataset =
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-num_epochs =
-batches_per_epoch = math.ceil(len(
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
-    init_lr=
)

discriminator.fit(
    tf_train_dataset,
-    epochs=num_epochs
)

text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
-label =
-print(text, ":", label
-# ideally

```
```python
import math
+import numpy as np

import tensorflow as tf
+from datasets import load_metric, Dataset, DatasetDict
+from transformers import TFAutoModel, TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
+from transformers.keras_callbacks import KerasMetricCallback

# This example shows how this model can be used:
+# you should finetune the model on your specific corpus of commands, bigger than this one
dict_train = {
+    "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
+            "19", "20"],
+    "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop the pen", "x book",
+                 "inventory", "n", "get the book", "drop paper", "look at Pen", "inv", "g", "s", "get sandwich",
+                 "drop sandwich", "x sandwich", "agin"],
+    "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04",
+              "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02",
+              "inventory.v.01", "repeat.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "repeat.v.01"]
+}
+dict_val = {
+    "idx": ["0", "1", "2", "3", "4", "5"],
+    "sentence": ["w", "get shield", "drop sword", "x spikes", "i", "repeat"],
+    "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "repeat.v.01"]
}

+raw_train_dataset = Dataset.from_dict(dict_train)
+raw_val_dataset = Dataset.from_dict(dict_val)
+raw_dataset = DatasetDict()
+raw_dataset["train"] = raw_train_dataset
+raw_dataset["val"] = raw_val_dataset
+raw_dataset = raw_dataset.class_encode_column("label")
+print(raw_dataset)
+print(raw_dataset["train"].features)
+print(raw_dataset["val"].features)
+print(raw_dataset["train"][1])
+label2id = {}
+id2label = {}
+for i, l in enumerate(raw_dataset["train"].features["label"].names):
+    label2id[l] = i
+    id2label[i] = l
+
+discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/electra-if",
+                                                                     label2id=label2id,
+                                                                     id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")

tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

+pre_tokenizer_columns = set(raw_dataset["train"].features)
+encoded_dataset = raw_dataset.map(tokenize_function, batched=True)
+tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

+batch_size = len(encoded_dataset["train"])
+tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)
+tf_validation_dataset = encoded_dataset["val"].to_tf_dataset(
+    columns=tokenizer_columns,
+    label_cols=["labels"],
+    shuffle=False,
+    batch_size=batch_size,
+    collate_fn=data_collator
+)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+num_epochs = 20
+batches_per_epoch = math.ceil(len(encoded_dataset["train"]) / batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
+    init_lr=5e-5, num_warmup_steps=total_train_steps // 5, num_train_steps=total_train_steps
)

+metric = load_metric("accuracy")
+
+
+def compute_metrics(eval_predictions):
+    logits, labels = eval_predictions
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_dataset)
+callbacks = [metric_callback]
+
+discriminator.compile(optimizer=optimizer, loss=loss, metrics=["sparse_categorical_accuracy"])
discriminator.fit(
    tf_train_dataset,
+    epochs=num_epochs,
+    validation_data=tf_validation_dataset,
+    callbacks=callbacks
)

+print("Evaluate on test data")
+results = discriminator.evaluate(tf_validation_dataset)
+print("test loss, test acc:", results)
+
+text = "i"
+encoded_input = tokenizer(text, return_tensors='tf')
+output = discriminator(encoded_input)
+prediction = tf.nn.softmax(output["logits"][0], -1)
+label = id2label[tf.math.argmax(prediction).numpy()]
+print("\n", text, ":", label,
+      "\n")  # ideally 'inventory.v.01' (-> "make or include in an itemized record or report"), but probably only with a better finetuning dataset

text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
+label = id2label[tf.math.argmax(prediction).numpy()]
+print("\n", text, ":", label,
+      "\n")  # ideally 'take.v.04' (-> "get into one's hands, take physically"), but probably only with a better finetuning dataset
+
+text = "w"
+encoded_input = tokenizer(text, return_tensors='tf')
+output = discriminator(encoded_input)
+prediction = tf.nn.softmax(output["logits"][0], -1)
+label = id2label[tf.math.argmax(prediction).numpy()]
+print("\n", text, ":", label,
+      "\n")  # ideally 'travel.v.01' (-> "change location; move, travel, or proceed, also metaphorically"), but probably only with a better finetuning dataset

```
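For reference, the updated example imports `TFAutoModel` but never calls it. Below is a minimal sketch of how the bare encoder behind `Aureliano/electra-if` could be used on its own, for instance to embed commands and compare them by cosine similarity; the first-token pooling and the sample commands are illustrative assumptions, not something the model card prescribes.

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

# Load the tokenizer and the bare ELECTRA encoder (no classification head).
tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")
encoder = TFAutoModel.from_pretrained("Aureliano/electra-if")

# Hypothetical commands to embed; any interactive-fiction input would do.
commands = ["get lamp", "drop sword", "x mailbox"]
batch = tokenizer(commands, padding=True, return_tensors="tf")

# last_hidden_state has shape (batch, sequence_length, hidden_size);
# pooling on the first token is one simple (assumed) choice.
hidden = encoder(**batch).last_hidden_state
embeddings = tf.math.l2_normalize(hidden[:, 0, :], axis=-1)

# Pairwise cosine similarities between the embedded commands.
print(tf.matmul(embeddings, embeddings, transpose_b=True).numpy())
```

This leaves the pretrained weights untouched, whereas the README example above attaches a classification head and finetunes it.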
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b585bc34677a9d35a1ef60bf1a531067fad58e9f4259dda25d63b4ae54632f93
size 54011377
tf_model.h5
CHANGED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4bf1f25a9684bfe9bb90e65a365b9b10bdd3b413841c98dba94b900139af7c90
size 54198792
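Since the LFS pointers above record the expected SHA-256 digest of each weight file, a local download can be checked against this commit by hashing it. A minimal sketch, assuming both files sit in the current directory:

```python
import hashlib

# Expected digests taken from the LFS pointer files in this commit.
expected = {
    "pytorch_model.bin": "b585bc34677a9d35a1ef60bf1a531067fad58e9f4259dda25d63b4ae54632f93",
    "tf_model.h5": "4bf1f25a9684bfe9bb90e65a365b9b10bdd3b413841c98dba94b900139af7c90",
}

for name, want in expected.items():
    h = hashlib.sha256()
    with open(name, "rb") as f:
        # Hash in chunks so the weight files do not need to fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    got = h.hexdigest()
    print(name, "OK" if got == want else f"MISMATCH ({got})")
```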