|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
import pdb |
|
|
|
def my_lcs(string, sub): |
|
""" |
|
Calculates longest common subsequence for a pair of tokenized strings |
|
:param string : list of str : tokens from a string split using whitespace |
|
:param sub : list of str : shorter string, also split using whitespace |
|
:returns: length (list of int): length of the longest common subsequence between the two strings |
|
|
|
Note: my_lcs only gives length of the longest common subsequence, not the actual LCS |
|
""" |
|
if(len(string)< len(sub)): |
|
sub, string = string, sub |
|
|
|
lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] |
|
|
|
for j in range(1,len(sub)+1): |
|
for i in range(1,len(string)+1): |
|
if(string[i-1] == sub[j-1]): |
|
lengths[i][j] = lengths[i-1][j-1] + 1 |
|
else: |
|
lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) |
|
|
|
return lengths[len(string)][len(sub)] |
|
|
|
class Rouge(): |
|
''' |
|
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set |
|
|
|
''' |
|
def __init__(self): |
|
|
|
self.beta = 1.2 |
|
|
|
def calc_score(self, candidate, refs): |
|
""" |
|
Compute ROUGE-L score given one candidate and references for an image |
|
:param candidate: str : candidate sentence to be evaluated |
|
:param refs: list of str : COCO reference sentences for the particular image to be evaluated |
|
:returns score: int (ROUGE-L score for the candidate evaluated against references) |
|
""" |
|
assert(len(candidate)==1) |
|
assert(len(refs)>0) |
|
prec = [] |
|
rec = [] |
|
|
|
|
|
token_c = candidate[0].split(" ") |
|
|
|
for reference in refs: |
|
|
|
token_r = reference.split(" ") |
|
|
|
lcs = my_lcs(token_r, token_c) |
|
prec.append(lcs/float(len(token_c))) |
|
rec.append(lcs/float(len(token_r))) |
|
|
|
prec_max = max(prec) |
|
rec_max = max(rec) |
|
|
|
if(prec_max!=0 and rec_max !=0): |
|
score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) |
|
else: |
|
score = 0.0 |
|
return score |
|
|
|
def compute_score(self, gts, res): |
|
""" |
|
Computes Rouge-L score given a set of reference and candidate sentences for the dataset |
|
Invoked by evaluate_captions.py |
|
:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values |
|
:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values |
|
:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) |
|
""" |
|
assert(gts.keys() == res.keys()) |
|
imgIds = gts.keys() |
|
|
|
score = [] |
|
for id in imgIds: |
|
hypo = res[id] |
|
ref = gts[id] |
|
|
|
score.append(self.calc_score(hypo, ref)) |
|
|
|
|
|
assert(type(hypo) is list) |
|
assert(len(hypo) == 1) |
|
assert(type(ref) is list) |
|
assert(len(ref) > 0) |
|
|
|
average_score = np.mean(np.array(score)) |
|
return average_score, np.array(score) |
|
|
|
def method(self): |
|
return "Rouge" |
|
|