lunadebruyne committed on
Commit
f625e51
·
1 Parent(s): a1955a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -15
app.py CHANGED
@@ -1,12 +1,81 @@
1
  import gradio as gr
2
  import torch
 
 
3
  from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
 
 
 
4
 
5
 
6
  description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
7
  description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
8
 
9
  inference_modelpath = "model/checkpoint-128"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def what_happened(text, file_object, option_list):
12
  if file_object:
@@ -24,21 +93,14 @@ def what_happened1(text):
24
  return output
25
 
26
  def what_happened2(file_object, option_list):
27
- """"
28
- if file_object:
29
- output1 = "You uploaded a file."
30
- if len(option_list) > 0:
31
- output1 = output1 + "\nYou selected these options:\n- " + "\n- ".join(option_list)
32
- else:
33
- output1 = "You should upload a file."
34
- """
35
- input_file = open(file_object.name, 'r')
36
- lines = input_file.read()
37
- input_file.close()
38
- output_file = open('output.txt', 'w')
39
- output_file.write(lines)
40
- output_file.close()
41
- output1 = 'output.txt'
42
  output2 = output3 = output4 = output5 = "This option was not selected."
43
  if "emotion frequencies" in option_list:
44
  output2 = "This option was selected."
 
1
  import gradio as gr
2
  import torch
3
+ import numpy as np
4
+
5
  from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
6
+ from transformers import TrainingArguments, Trainer
7
+
8
+ from datasets import load_dataset
9
 
10
 
11
  description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
12
  description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
13
 
14
  inference_modelpath = "model/checkpoint-128"
15
# Directory handed to TrainingArguments as its output_dir.
output_dir = "model"

# Settings shared by the tokenizer and the encoding step.
model_config = dict(
    model_weights="pdelobelle/robbert-v2-dutch-base",
    num_labels=6,
    max_length=128,
    device="cpu",
)
22
+
23
## Tokenizer and model
# The tokenizer is loaded from the name stored under model_config["model_weights"];
# the sequence-classification model is loaded from inference_modelpath.
tokenizer = AutoTokenizer.from_pretrained(
    model_config["model_weights"],
)
model = AutoModelForSequenceClassification.from_pretrained(
    inference_modelpath,
)
26
+
27
+ # Function for encoding (tokenizing) data
28
def encode_data(data):
    """Tokenize a batch of examples and attach their labels.

    Used as the mapping function of ``dataset.map(encode_data, batched=True)``.

    Args:
        data: Mapping with a "text" entry (the texts to tokenize) and a
            "label" entry (copied unchanged into the result).

    Returns:
        The tokenizer output with an extra "labels" entry set to
        ``data["label"]``.
    """
    # Overflowing tokens are deliberately not requested: combined with
    # truncation they make the tokenizer emit additional rows for texts longer
    # than max_length, so the number of encoded rows would no longer match the
    # number of labels (or the ids/texts later zipped with the predictions).
    # Each input now yields exactly one row, truncated to max_length.
    encoded_input = tokenizer(
        data["text"],
        add_special_tokens=True,
        max_length=model_config["max_length"],
        padding="max_length",
        truncation=True,
    )
    encoded_input["labels"] = data["label"]
    return encoded_input
41
+
42
+
43
+ # Test arguments for Trainer
44
# Arguments for a Trainer that is only used for prediction (no training).
test_args = TrainingArguments(
    output_dir=output_dir,
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=64,
    dataloader_drop_last=False,
)

# Trainer wrapping the loaded classification model with the arguments above.
trainer = Trainer(args=test_args, model=model)
54
+
55
def inference_dataset(file_object):
    """Predict a label for every row of a TSV file and write the results.

    The input is read with a tab delimiter; its first row is skipped and the
    columns are named "id", "text" and "label". Predictions are written to
    "output.txt" as tab-separated lines under the header
    "id<TAB>text<TAB>prediction".

    Args:
        file_object: Location of the input file as accepted by
            ``load_dataset(data_files=...)``; the caller in this file passes
            ``file_object.name``.

    Returns:
        The name of the file the predictions were written to ("output.txt").
    """
    # Only one split is loaded: the previous additional "train" split pointed
    # at the same file and was never used, so the data was read and tokenized
    # twice for no benefit.
    dataset = load_dataset(
        'csv',
        skiprows=1,
        data_files={"inference": file_object},
        column_names=['id', 'text', 'label'],
        delimiter='\t',
    )
    encoded_dataset = dataset.map(encode_data, batched=True)
    encoded_dataset.set_format("torch")
    # encode_data stores the labels under "labels"; the raw "label" column is
    # dropped before the split is handed to the trainer.
    encoded_dataset["inference"] = encoded_dataset["inference"].remove_columns("label")

    # Run trainer in prediction mode
    prediction_output = trainer.predict(encoded_dataset["inference"])
    predictions = prediction_output[0]

    ids = dataset["inference"]["id"]
    texts = dataset["inference"]["text"]
    # Highest-scoring class index per row, mapped to its label name.
    preds = np.argmax(predictions, axis=1)
    preds = [model.config.id2label[pred] for pred in preds]

    # Write predictions to file. The context manager closes the handle even
    # if a write fails, and the explicit encoding keeps non-ASCII text from
    # depending on the platform's default encoding.
    output = "output.txt"
    with open(output, 'w', encoding='utf-8') as f:
        f.write("id\ttext\tprediction\n")
        f.writelines(
            f"{row_id}\t{text}\t{pred}\n"
            for row_id, text, pred in zip(ids, texts, preds)
        )
    return output
79
 
80
  def what_happened(text, file_object, option_list):
81
  if file_object:
 
93
  return output
94
 
95
  def what_happened2(file_object, option_list):
96
+ #input_file = open(file_object.name, 'r')
97
+ #lines = input_file.read()
98
+ #input_file.close()
99
+ #output_file = open('output.txt', 'w')
100
+ #output_file.write(lines)
101
+ #output_file.close()
102
+ #output1 = 'output.txt'
103
+ output1 = inference_dataset(file_object.name)
 
 
 
 
 
 
 
104
  output2 = output3 = output4 = output5 = "This option was not selected."
105
  if "emotion frequencies" in option_list:
106
  output2 = "This option was selected."