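"""Span-level evaluation of answer.txt files.

Compares a gold answer.txt against a predicted answer.txt (given as a path or
an open file object) and reports precision, recall and F1 per label type plus
micro- and macro-averaged scores.
"""
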
from collections import Counter
import json


class Tag:
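    """One annotation span parsed from a single tab-separated answer.txt line."""
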
    def __init__(self, txt_line: str):
        # | file_name | label_type | label_start | label_end | label_text |
        try:
            sep = txt_line.strip().split('\t')
            self.file_id = sep[0]
            self.type = sep[1]
            self.start = sep[2]  # kept as str; spans are compared textually in __eq__
            self.end = sep[3]
            self.text = sep[4]
        except IndexError as e:
            raise ValueError(
                f'Malformed answer line (expected 5 tab-separated fields): {txt_line!r}'
            ) from e

    def get_type(self):
        return self.type

    def get_file_id(self):
        return self.file_id
    
    def __eq__(self, other: 'Tag'):
        # Two tags match if file_id, type, start and end are all equal;
        # the surface text is deliberately ignored.
        return (self.file_id == other.file_id
                and self.type == other.type
                and self.start == other.start
                and self.end == other.end)

    def __repr__(self):
        return f'<{self.__class__.__name__} {self.file_id:10} {self.type:10} s:{self.start:5} e:{self.end:5} {self.text}>\n'

    def __hash__(self):
        return hash((self.file_id, self.type, self.start, self.end))
    
class Evaluation_answer_txt:
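    """Compare a gold answer.txt with a predicted one and compute span-level scores."""
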
    def __init__(self, gold_answer, pred_answer):
        self.gold_answer = gold_answer
        self.pred_answer = pred_answer

        self.gold_set = set() # set of Tag
        self.pred_set = set() # set of Tag

        self.type_set = set() # set of label type str
        self.gold_label_counter = Counter() # Counter of gold label type

        self.resault_score = {}

    def _lines_to_tag_set(self, lines, set_type):  # set_type: 'gold' or 'pred'
        tags = []
        for i, line in enumerate(lines, start=1):
            try:
                tags.append(Tag(line))
            except ValueError:
                print(f'Error at {set_type} answer line: {i}, {line}')
        return set(tags)
    
    
    def _set_filter(self, tag_set, label_type):
        # keep only the tags whose label type matches
        return {tag for tag in tag_set if tag.get_type() == label_type}
    
    
    def _division(self, a, b):
        # safe division: a zero denominator yields 0.0 instead of raising
        try:
            return a / b
        except ZeroDivisionError:
            return 0.0

    def _f1_score(self, TP=None, FP=None, FN=None):
        if TP is None or FP is None or FN is None:
            raise ValueError('TP, FP, FN should be given.')

        precision = self._division(TP, TP + FP)
        recall = self._division(TP, TP + FN)
        f1 = self._division(2 * precision * recall, precision + recall)

        return {'precision': precision, 'recall': recall, 'f1': f1}

    def eval(self, ignore_no_gold_tag_file=True):
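        """Compute per-label and micro/macro averaged precision/recall/F1.

        Returns a dict keyed by label type plus 'MICRO_AVERAGE' and
        'MACRO_AVERAGE'; each value holds precision, recall, f1 and support.
        """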
        with open(self.gold_answer, 'r') as f:
            gold_line = f.readlines()

        # pred_answer may be a path or an already-open file object (e.g. an upload)
        if isinstance(self.pred_answer, str):
            with open(self.pred_answer, 'r') as f:
                pred_line = f.readlines()
        else:
            pred_line = self.pred_answer.readlines()
            # binary file objects yield bytes; decode those, pass str lines through
            pred_line = [line.decode('utf-8') if isinstance(line, bytes) else line
                         for line in pred_line]

        self.gold_set = self._lines_to_tag_set(gold_line, 'gold')
        self.pred_set = self._lines_to_tag_set(pred_line, 'pred')

        # The official islab AICUP scorer ignores files that have no gold tags:
        # only files listed in the gold answer.txt are scored.
        if ignore_no_gold_tag_file:
            # filter the files that have no gold tags
            gold_files = {tag.get_file_id() for tag in self.gold_set}
            self.pred_set = {tag for tag in self.pred_set if tag.get_file_id() in gold_files}

        # collect the label types present and count gold tags per type
        for tag in self.gold_set:
            self.type_set.add(tag.get_type())
            self.gold_label_counter[tag.get_type()] += 1
        for tag in self.pred_set:
            self.type_set.add(tag.get_type())

        TP_set = self.gold_set & self.pred_set
        FP_set = self.pred_set - self.gold_set
        FN_set = self.gold_set - self.pred_set

        # compute precision/recall/F1 for each label type
        for label in self.type_set:
            filter_TP = self._set_filter(TP_set, label)
            filter_FP = self._set_filter(FP_set, label)
            filter_FN = self._set_filter(FN_set, label)
            score = self._f1_score(len(filter_TP), len(filter_FP), len(filter_FN))
            self.resault_score[label] = score
       
        # MICRO_AVERAGE
        self.resault_score['MICRO_AVERAGE'] = self._f1_score(len(TP_set), len(FP_set), len(FN_set))

        # MACRO_AVERAGE
        precision_sum = 0
        recall_sum = 0
        for label in self.type_set:
            precision_sum += self.resault_score[label]['precision']
            recall_sum += self.resault_score[label]['recall']

        precision = self._division(precision_sum, len(self.type_set))
        recall = self._division(recall_sum, len(self.type_set))
        # As in the AICUP scorer, macro-F1 is derived from the macro-averaged
        # precision and recall rather than from averaging per-label F1 scores.
        f1 = self._division(2 * precision * recall, precision + recall)

        self.resault_score['MACRO_AVERAGE'] = {'precision': precision, 'recall': recall, 'f1': f1}

        # attach support (the number of gold tags) to each score entry
        for label in self.type_set:
            self.resault_score[label]['support'] = self.gold_label_counter[label]
        self.resault_score['MICRO_AVERAGE']['support'] = len(self.gold_set)
        self.resault_score['MACRO_AVERAGE']['support'] = len(self.gold_set)

        # return json.dumps(self.resault_score, indent=4)
        return self.resault_score


if __name__ == "__main__":
    gold_path = 'dataset/Setting3_test_answer.txt'
    pred_path = '.output/[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt'

    evaluator = Evaluation_answer_txt(gold_path, pred_path)
    res = evaluator.eval()
    print(res)
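
    # For a more readable view, the same result can be dumped with the json
    # module imported above (mirroring the commented-out json.dumps call in eval()).
    print(json.dumps(res, indent=4))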