daniel-de-leon committed
Commit bfa693b · 1 Parent(s): 46074bc

added more metrics

Files changed (1)
  1. test-suite.py +11 -4
test-suite.py CHANGED
@@ -1,6 +1,13 @@
 import evaluate
 from evaluate.evaluation_suite import SubTask
 
+# This is odd because the first dataset is multi-class and
+# the second dataset is binary. The model I'm using has 4 labels
+# and is finetuned to the first dataset.
+# So what does it mean for this model to be evaluated on the second
+# dataset?
+
+metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
 
 class Suite(evaluate.EvaluationSuite):
 
@@ -11,9 +18,9 @@ class Suite(evaluate.EvaluationSuite):
             SubTask(
                 task_type="text-classification",
                 data="hate_speech18",
-                split="train[:10]",
+                split="train[:1000]",
                 args_for_task={
-                    "metric": "accuracy",
+                    "metric": metric,
                     "input_column": "text",
                     "label_column": "label",
                     "label_mapping": {
@@ -27,9 +34,9 @@ class Suite(evaluate.EvaluationSuite):
             SubTask(
                 task_type="text-classification",
                 data="mteb/toxic_conversations_50k",
-                split="test[:10]",
+                split="test[:1000]",
                 args_for_task={
-                    "metric": "accuracy",
+                    "metric": metric,
                     "input_column": "text",
                     "label_column": "label",
                     "label_mapping": {