"""Analyze a coreference log file for the largest entirely missed gold cluster
and the largest entirely spurious predicted cluster.

Each line of the log file is expected to be a JSON object with the keys
"doc_key", "clusters" (gold clusters), and "predicted_clusters".
"""
import argparse
import json
import logging
import os

from coref_utils.utils import get_mention_to_cluster
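
# NOTE (assumption): `get_mention_to_cluster` is taken to return a
# `(clusters, mention_to_cluster)` pair, where `clusters` is a list of
# clusters (each a tuple of mention spans) and `mention_to_cluster` maps
# each mention span to its containing cluster. This is inferred from how
# the two return values are consumed below, not from its definition.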


# Silence the HuggingFace tokenizers fork warning (harmless if tokenizers
# is never loaded by this script).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger()


def process_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Analyze missed and spurious coreference clusters in a log file."
    )
    parser.add_argument(
        "log_file",
        type=str,
        help="JSON Lines log file with gold and predicted clusters",
    )

    args = parser.parse_args()
    return args


def singleton_analysis(data):
    """Find the largest gold cluster with no mention in any predicted cluster.

    Such clusters are entirely missed by the model; reporting the largest one
    highlights the worst recall error in the log.
    """
    max_length = 0
    max_doc_id = ""
    max_cluster = []

    for instance in data:
        gold_clusters, _ = get_mention_to_cluster(instance["clusters"])
        _, pred_mentions_to_cluster = get_mention_to_cluster(
            instance["predicted_clusters"]
        )

        for cluster in gold_clusters:
            # The cluster is entirely missed if none of its mentions is part
            # of any predicted cluster.
            all_mentions_unseen = all(
                mention not in pred_mentions_to_cluster for mention in cluster
            )
            if all_mentions_unseen and len(cluster) > max_length:
                max_length = len(cluster)
                max_doc_id = instance["doc_key"]
                max_cluster = cluster

    logger.info(max_doc_id)
    logger.info("%d %s", max_length, max_cluster)


def reverse_analysis(data):
    """Find the largest predicted cluster with no mention in any gold cluster.

    Such clusters are entirely spurious; reporting the largest one highlights
    the worst precision error in the log.
    """
    max_length = 0
    max_doc_id = ""
    max_cluster = []

    for instance in data:
        _, gold_mentions_to_cluster = get_mention_to_cluster(instance["clusters"])
        pred_clusters, _ = get_mention_to_cluster(instance["predicted_clusters"])

        for cluster in pred_clusters:
            # The cluster is entirely spurious if none of its mentions is
            # part of any gold cluster.
            all_mentions_unseen = all(
                mention not in gold_mentions_to_cluster for mention in cluster
            )
            if all_mentions_unseen and len(cluster) > max_length:
                max_length = len(cluster)
                max_doc_id = instance["doc_key"]
                max_cluster = cluster

    logger.info(max_doc_id)
    logger.info("%d %s", max_length, max_cluster)


def main():
    args = process_args()

    # The log file is read as JSON Lines: one serialized document per line.
    data = []
    with open(args.log_file) as f:
        for line in f:
            data.append(json.loads(line))

    singleton_analysis(data)
    reverse_analysis(data)


if __name__ == "__main__":
    main()
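
# Example invocation (the script and log file names are illustrative, not
# taken from the repo):
#   python cluster_analysis.py /path/to/inference_log.jsonl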