Spaces:
Running
Running
Update Space (evaluate main: 0b7ed95a)
Browse files
README.md
CHANGED
@@ -38,12 +38,8 @@ At minimum, this metric takes as input a list of predictions and a list of refer
|
|
38 |
>>> references = ["hello there", "general kenobi"]
|
39 |
>>> results = rouge.compute(predictions=predictions,
|
40 |
... references=references)
|
41 |
-
>>> print(
|
42 |
-
|
43 |
-
>>> print(results["rouge1"])
|
44 |
-
AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
|
45 |
-
>>> print(results["rouge1"].mid.fmeasure)
|
46 |
-
1.0
|
47 |
```
|
48 |
|
49 |
### Inputs
|
@@ -62,18 +58,18 @@ AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(pre
|
|
62 |
- **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
|
63 |
|
64 |
### Output Values
|
65 |
-
The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of
|
66 |
|
67 |
```python
|
68 |
-
{'rouge1': [
|
69 |
```
|
70 |
|
71 |
If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
|
72 |
```python
|
73 |
-
{'rouge1':
|
74 |
```
|
75 |
|
76 |
-
The
|
77 |
|
78 |
|
79 |
#### Values from Popular Papers
|
@@ -86,11 +82,12 @@ An example without aggregation:
|
|
86 |
>>> predictions = ["hello goodbye", "ankh morpork"]
|
87 |
>>> references = ["goodbye", "general kenobi"]
|
88 |
>>> results = rouge.compute(predictions=predictions,
|
89 |
-
... references=references
|
|
|
90 |
>>> print(list(results.keys()))
|
91 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
92 |
>>> print(results["rouge1"])
|
93 |
-
[
|
94 |
```
|
95 |
|
96 |
The same example, but with aggregation:
|
@@ -104,7 +101,7 @@ The same example, but with aggregation:
|
|
104 |
>>> print(list(results.keys()))
|
105 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
106 |
>>> print(results["rouge1"])
|
107 |
-
|
108 |
```
|
109 |
|
110 |
The same example, but only calculating `rouge1`:
|
@@ -119,7 +116,7 @@ The same example, but only calculating `rouge_1`:
|
|
119 |
>>> print(list(results.keys()))
|
120 |
['rouge1']
|
121 |
>>> print(results["rouge1"])
|
122 |
-
|
123 |
```
|
124 |
|
125 |
## Limitations and Bias
|
|
|
38 |
>>> references = ["hello there", "general kenobi"]
|
39 |
>>> results = rouge.compute(predictions=predictions,
|
40 |
... references=references)
|
41 |
+
>>> print(results)
|
42 |
+
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
|
|
|
|
|
|
|
|
|
43 |
```
|
44 |
|
45 |
### Inputs
|
|
|
58 |
- **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
|
59 |
|
60 |
### Output Values
|
61 |
+
The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of scores, with one score for each prediction/reference pair. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is:
|
62 |
|
63 |
```python
|
64 |
+
{'rouge1': [0.6666666666666666, 1.0], 'rouge2': [0.0, 1.0]}
|
65 |
```
|
66 |
|
67 |
If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
|
68 |
```python
|
69 |
+
{'rouge1': 1.0, 'rouge2': 1.0}
|
70 |
```
|
71 |
|
72 |
+
The ROUGE values are in the range of 0 to 1.
|
73 |
|
74 |
|
75 |
#### Values from Popular Papers
|
|
|
82 |
>>> predictions = ["hello goodbye", "ankh morpork"]
|
83 |
>>> references = ["goodbye", "general kenobi"]
|
84 |
>>> results = rouge.compute(predictions=predictions,
|
85 |
+
... references=references,
|
86 |
+
... use_aggregator=False)
|
87 |
>>> print(list(results.keys()))
|
88 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
89 |
>>> print(results["rouge1"])
|
90 |
+
[0.5, 0.0]
|
91 |
```
|
92 |
|
93 |
The same example, but with aggregation:
|
|
|
101 |
>>> print(list(results.keys()))
|
102 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
103 |
>>> print(results["rouge1"])
|
104 |
+
0.25
|
105 |
```
|
106 |
|
107 |
The same example, but only calculating `rouge1`:
|
|
|
116 |
>>> print(list(results.keys()))
|
117 |
['rouge1']
|
118 |
>>> print(results["rouge1"])
|
119 |
+
0.25
|
120 |
```
|
121 |
|
122 |
## Limitations and Bias
|
rouge.py
CHANGED
@@ -65,22 +65,18 @@ Args:
|
|
65 |
use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
|
66 |
use_aggregator: Return aggregates if this is set to True
|
67 |
Returns:
|
68 |
-
rouge1: rouge_1 (
|
69 |
-
rouge2: rouge_2 (
|
70 |
-
rougeL: rouge_l (
|
71 |
-
rougeLsum: rouge_lsum (
|
72 |
Examples:
|
73 |
|
74 |
>>> rouge = evaluate.load('rouge')
|
75 |
>>> predictions = ["hello there", "general kenobi"]
|
76 |
>>> references = ["hello there", "general kenobi"]
|
77 |
>>> results = rouge.compute(predictions=predictions, references=references)
|
78 |
-
>>> print(
|
79 |
-
|
80 |
-
>>> print(results["rouge1"])
|
81 |
-
AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
|
82 |
-
>>> print(results["rouge1"].mid.fmeasure)
|
83 |
-
1.0
|
84 |
"""
|
85 |
|
86 |
|
@@ -123,9 +119,12 @@ class Rouge(evaluate.EvaluationModule):
|
|
123 |
|
124 |
if use_aggregator:
|
125 |
result = aggregator.aggregate()
|
|
|
|
|
|
|
126 |
else:
|
127 |
result = {}
|
128 |
for key in scores[0]:
|
129 |
-
result[key] = list(score[key] for score in scores)
|
130 |
|
131 |
return result
|
|
|
65 |
use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
|
66 |
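use_aggregator: Bool indicating whether to return one aggregated score per rouge type (True) or a list with one score per prediction/reference pair (False).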
|
67 |
Returns:
|
68 |
+
rouge1: rouge_1 (f1),
|
69 |
+
rouge2: rouge_2 (f1),
|
70 |
+
rougeL: rouge_l (f1),
|
71 |
+
rougeLsum: rouge_lsum (f1)
|
72 |
Examples:
|
73 |
|
74 |
>>> rouge = evaluate.load('rouge')
|
75 |
>>> predictions = ["hello there", "general kenobi"]
|
76 |
>>> references = ["hello there", "general kenobi"]
|
77 |
>>> results = rouge.compute(predictions=predictions, references=references)
|
78 |
+
>>> print(results)
|
79 |
+
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
|
|
|
|
|
|
|
|
|
80 |
"""
|
81 |
|
82 |
|
|
|
119 |
|
120 |
if use_aggregator:
|
121 |
result = aggregator.aggregate()
|
122 |
+
for key in result:
|
123 |
+
result[key] = result[key].mid.fmeasure
|
124 |
+
|
125 |
else:
|
126 |
result = {}
|
127 |
for key in scores[0]:
|
128 |
+
result[key] = list(score[key].fmeasure for score in scores)
|
129 |
|
130 |
return result
|