File size: 2,249 Bytes
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def _precision_recall_at(ranked, gold_positions, gold_count, cutoff):
    """Return (precision, recall) for the first ``cutoff`` items of ``ranked``.

    An empty selection yields precision 0.0 and an empty gold standard yields
    recall 0.0, so the caller never hits a division by zero.
    """
    top = ranked[:cutoff]
    num_correct = sum(1 for item in top if item[0] in gold_positions)
    precision = num_correct / len(top) if top else 0.0
    recall = num_correct / gold_count if gold_count else 0.0
    return precision, recall


def evaluate_pr(system, gold, system_score_cutoff=0, k=5, verbosity=0):
    """
    Return the precision, recall and f1 score of a ranked system output.

    The system items are ranked by descending score and compared against the
    gold positions. When ``verbosity`` is positive, precision and recall are
    printed for every cutoff from 1 to ``k``, followed by the f1 score.

    Parameters
    ----------
    system : list of tuples
        System output for sentence in form (position, score).
        Positions must be hashable.
    gold : list of tuple
        Gold standard for sentence in form (position, score). Only entries
        with a score strictly greater than 0 count as gold.
    system_score_cutoff : float
        Threshold of importance score for system output, default to 0.
        Only items scoring strictly above the threshold are kept.
    k : int
        Top k recommendations to be evaluated on.
    verbosity : int
        If greater than 0, print the intermediate and final scores.

    Returns
    -------
    tuple
        A tuple (precision, recall, f1). If the gold standard holds more than
        ``k`` entries, the scores are measured at cutoff ``len(gold)`` (the
        system recommends as many sentences as gold); otherwise at cutoff
        ``k``. Any score whose denominator would be zero (no system item
        above the threshold, empty gold standard, no correct item) is 0.0.
    """
    # Keep only the items actually recommended by system and gold. New lists
    # are built, so the caller's inputs are never reordered.
    ranked = sorted(
        (item for item in system if item[1] > system_score_cutoff),
        key=lambda item: -item[1],
    )
    gold_items = [item for item in gold if item[1] > 0]
    gold_count = len(gold_items)
    gold_positions = {item[0] for item in gold_items}  # O(1) membership tests

    # Evaluate at the gold size only when gold has more entries than k.
    n = gold_count if gold_count > k else 0

    # Defaults cover the case where the loop below never runs (k < 1).
    precision = recall = 0.0

    for cutoff in range(1, k + 1):  # show how precision and recall change at different k
        precision, recall = _precision_recall_at(
            ranked, gold_positions, gold_count, cutoff
        )
        if verbosity > 0:
            print("k=", cutoff, "\nprecision=", precision, "\nrecall=", recall)

    if n:
        precision, recall = _precision_recall_at(
            ranked, gold_positions, gold_count, n
        )
        if verbosity > 0:
            # Report the cutoff actually used here (n), not the last loop index.
            print("k=", n, "\nprecision=", precision, "\nrecall=", recall)

    # F1 is undefined when precision and recall are both zero; report 0.0.
    denominator = precision + recall
    f_score = 2 * precision * recall / denominator if denominator else 0.0

    if verbosity > 0:
        print("f1 score=", f_score)

    # Scores at cutoff n when gold has more than k entries, else at cutoff k.
    return (precision, recall, f_score)