Spaces:

lunadebruyne
/

EmotioNL

Running

App Files Files Community

lunadebruyne committed on Feb 17, 2023

Commit

f625e51

1 Parent(s): a1955a4

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -15

app.py CHANGED Viewed

@@ -1,12 +1,81 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
 description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
 description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
 inference_modelpath = "model/checkpoint-128"
 def what_happened(text, file_object, option_list):
     if file_object:
@@ -24,21 +93,14 @@ def what_happened1(text):
     return output
 def what_happened2(file_object, option_list):
-    """"
-    if file_object:
-        output1 = "You uploaded a file."
-        if len(option_list) > 0:
-            output1 = output1 + "\nYou selected these options:\n- " + "\n- ".join(option_list)
-    else:
-        output1 = "You should upload a file."
-    """
-    input_file = open(file_object.name, 'r')
-    lines = input_file.read()
-    input_file.close()
-    output_file = open('output.txt', 'w')
-    output_file.write(lines)
-    output_file.close()
-    output1 = 'output.txt'
     output2 = output3 = output4 = output5 = "This option was not selected."
     if "emotion frequencies" in option_list:
         output2 = "This option was selected."

 import gradio as gr
 import torch
+import numpy as np
 from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
+from transformers import TrainingArguments, Trainer
+from datasets import load_dataset
 description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
 description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
 inference_modelpath = "model/checkpoint-128"
+# Directory passed to TrainingArguments as its output_dir.
+output_dir = "model"
+# Settings shared by the tokenizer and the encoding step.
+model_config = {
+    # Hub identifier the tokenizer is loaded from.
+    "model_weights": "pdelobelle/robbert-v2-dutch-base",
+    # NOTE(review): not referenced in the visible code; presumably the number of emotion classes - confirm.
+    "num_labels": 6,
+    # Length every input is padded/truncated to in encode_data.
+    "max_length": 128,
+    # NOTE(review): not referenced in the visible code - confirm it is still needed.
+    "device": "cpu"
+}
+## Tokenizer and model
+# The tokenizer comes from the base weights named above, while the
+# classification model is loaded from the local checkpoint path.
+tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"])
+model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
+# Function for encoding (tokenizing) data
+def encode_data(data):
+  text = data["text"]
+  label = data["label"]
+  encoded_input = tokenizer(
+                text,
+                add_special_tokens=True,
+                max_length= model_config["max_length"],
+                padding= "max_length",
+                return_overflowing_tokens=True,
+                truncation=True
+            )
+  encoded_input["labels"] = label
+  return encoded_input
+# Test arguments for Trainer
+# The Trainer is configured for prediction only: training is switched off,
+# prediction is switched on, and dataloader_drop_last is False so that a
+# final incomplete batch is still evaluated.
+test_args = TrainingArguments(
+    output_dir = output_dir,
+    do_train = False,
+    do_predict = True,
+    per_device_eval_batch_size = 64,
+    dataloader_drop_last = False
+)
+# Wraps the loaded model; inference_dataset calls trainer.predict on it.
+trainer = Trainer(
+              model = model,
+              args = test_args)
+def inference_dataset(file_object):
+  #input_file = open(file_object.name, 'r')
+  input_file = file_object
+  data_paths = {"train": input_file, "inference": input_file}
+  dataset = load_dataset('csv', skiprows=1, data_files=data_paths, column_names = ['id', 'text', 'label'], delimiter='\t')
+  encoded_dataset = dataset.map(encode_data, batched=True)
+  encoded_dataset.set_format("torch")
+  encoded_dataset["inference"] = encoded_dataset["inference"].remove_columns("label")
+  # Run trainer in prediction mode
+  prediction_output = trainer.predict(encoded_dataset["inference"])
+  predictions = prediction_output[0]
+  ids = dataset["inference"]["id"]
+  texts = dataset["inference"]["text"]
+  preds = np.argmax(predictions, axis=1)
+  preds = [model.config.id2label[pred] for pred in preds]
+  predictions_content = list(zip(ids, texts, preds))
+  # write predictions to file
+  output = "output.txt"
+  f = open(output, 'w')
+  f.write("id\ttext\tprediction\n")
+  for line in predictions_content:
+      f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
+  f.close()
+  return output
 def what_happened(text, file_object, option_list):
     if file_object:
     return output
 def what_happened2(file_object, option_list):
+    #input_file = open(file_object.name, 'r')
+    #lines = input_file.read()
+    #input_file.close()
+    #output_file = open('output.txt', 'w')
+    #output_file.write(lines)
+    #output_file.close()
+    #output1 = 'output.txt'
+    output1 = inference_dataset(file_object.name)
     output2 = output3 = output4 = output5 = "This option was not selected."
     if "emotion frequencies" in option_list:
         output2 = "This option was selected."