Aureliano committed
Commit 32192ff · 1 parent: 506ab66

Usage example added.

Files changed (1): README.md (+61 -12)
README.md CHANGED
@@ -13,23 +13,72 @@ For a detailed description and experimental results, please refer to the origina
  This repository contains a small ELECTRA discriminator fine-tuned on a corpus of interactive fiction commands labelled with the WordNet synset offset of the verb in the sentence. The original dataset was collected from the lists of actions in the walkthroughs of the games included in the [Jericho](https://github.com/microsoft/jericho) framework and manually annotated. For more information visit https://github.com/aporporato/electra and https://github.com/aporporato/jericho-corpora.

  ## How to use the discriminator in `transformers`
+ (Heavily based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)

  ```python
- from transformers import ElectraForPreTraining, ElectraTokenizerFast
- import torch
-
- discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
- tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
-
- sentence = "The quick brown fox jumps over the lazy dog"
- fake_sentence = "The quick brown fox fake over the lazy dog"
-
- fake_tokens = tokenizer.tokenize(fake_sentence)
- fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
- discriminator_outputs = discriminator(fake_inputs)
- predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)
-
- [print("%7s" % token, end="") for token in fake_tokens]
- [print("%7s" % int(prediction), end="") for prediction in predictions.squeeze().tolist()]
+ import math
+
+ import tensorflow as tf
+ from datasets import Dataset, ClassLabel, Features, Value
+ from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
+
+ # This example shows how this model can be used:
+ # you should fine-tune the model on your specific corpus of commands, bigger than this one
+ dict_train = {
+     "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"],
+     "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop pen", "x book", "inventory",
+                  "n", "get book", "drop paper", "examine Pen", "inv", "w"],
+     "label": ["v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496", "v01214265", "v01977701",
+               "v02131279", "v02472495", "v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496"]
+ }
+
+ num_labels = len(set(dict_train["label"]))
+ features = Features({'idx': Value('uint32'), 'sentence': Value('string'),
+                      'label': ClassLabel(names=list(set(dict_train["label"])))})
+
+ raw_train_dataset = Dataset.from_dict(dict_train, features=features)
+
+ discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/electra-if", num_labels=num_labels)
+ tokenizer = AutoTokenizer.from_pretrained("Aureliano/electra-if")
+
+ tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)
+
+ pre_tokenizer_columns = set(raw_train_dataset.features)
+ train_dataset = raw_train_dataset.map(tokenize_function, batched=True)
+ tokenizer_columns = list(set(train_dataset.features) - pre_tokenizer_columns)
+
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
+
+ batch_size = 16
+ tf_train_dataset = train_dataset.to_tf_dataset(
+     columns=tokenizer_columns,
+     label_cols=["labels"],
+     shuffle=True,
+     batch_size=batch_size,
+     collate_fn=data_collator
+ )
+
+ loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+ num_epochs = 100
+ batches_per_epoch = math.ceil(len(train_dataset) / batch_size)
+ total_train_steps = int(batches_per_epoch * num_epochs)
+
+ optimizer, schedule = create_optimizer(
+     init_lr=1e-5, num_warmup_steps=1, num_train_steps=total_train_steps
+ )
+
+ discriminator.compile(optimizer=optimizer, loss=loss)
+ discriminator.fit(
+     tf_train_dataset,
+     epochs=num_epochs
+ )
+
+ text = "get lamp"
+ encoded_input = tokenizer(text, return_tensors='tf')
+ output = discriminator(encoded_input)
+ prediction = tf.nn.softmax(output["logits"][0], -1)
+ # map the predicted class id back to its label string through the ClassLabel feature
+ label = features["label"].int2str(int(tf.math.argmax(prediction)))
+ print(text, ":", label)
+ # ideally [v01214265 -> take.v.04 -> "get into one's hands, take physically"], but probably only with a better dataset
+
  ```
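
For reference, here is a minimal sketch (not part of the committed README; it assumes NLTK with the WordNet corpus downloaded) of how a predicted label such as `v01214265` could be mapped back to a human-readable WordNet synset:

```python
# Hypothetical helper, assuming NLTK (>= 3.5) with the WordNet data available
# via nltk.download("wordnet").
from nltk.corpus import wordnet as wn

label = "v01214265"                     # model label: POS letter + 8-digit synset offset
pos, offset = label[0], int(label[1:])  # -> "v", 1214265
synset = wn.synset_from_pos_and_offset(pos, offset)
print(synset.name(), "->", synset.definition())
# expected: take.v.04 -> "get into one's hands, take physically"
```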