zhao rui committed on
Commit e1ad072 · 1 Parent(s): cd0544b

first commit

Files changed (5)
  1. .gitignore +9 -0
  2. README.md +32 -2
  3. app.py +224 -0
  4. utils/Evaluation_answer_txt.py +180 -0
  5. utils/upload_hub.py +56 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
+
+ secrets.toml
+ __pycache__
+
+ # *.txt
+ *.tsv
+ *.csv
+ *.json
+
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
  title: De Identification Leaderboard
- emoji: 📊
- colorFrom: purple
+ emoji: 🏃
+ colorFrom: pink
  colorTo: yellow
  sdk: streamlit
  sdk_version: 1.35.0
@@ -11,3 +11,33 @@ license: mit
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+
+ # de-identification-leaderboard
+
+ ## leaderboard data
+ scores are saved to the Hugging Face dataset
+ [zhaorui-nb/leaderboard-score](https://huggingface.co/datasets/zhaorui-nb/leaderboard-score)
+
+
+ ## submit
+ ### filename format
+ replace '/' in the model name with '@'
+ ```
+ [{Organization@Model}][{Dataset}][{Method}]{Filename}.txt
+ ```
+
+ ### line in answer txt (tsv)
+ ```
+ {file_name}\t{label_type}\t{label_start}\t{label_end}\t{label_text}\n
+ ```
+
+ ## Supported datasets
+ ```
+ Setting1
+ Setting2
+ Setting3
+ ```
+
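To make the submission format concrete, here is a small, hypothetical example of building a filename and one answer line in the formats the README describes; the label type and character offsets are placeholders, not values from the real Setting datasets.

```python
# Hypothetical illustration of the submission formats described in the README.
# The label type and offsets below are made-up placeholder values.

# Filename: '/' in the model id is replaced with '@'.
model_id = "meta-llama/Llama-2-7b-hf"
submission_name = f"[{model_id.replace('/', '@')}][Setting3][icl]answer.txt"
print(submission_name)  # [meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt

# One line of the answer file: five tab-separated fields ending with a newline.
fields = ["file_001", "NAME", "120", "130", "John Smith"]
answer_line = "\t".join(fields) + "\n"
print(repr(answer_line))  # 'file_001\tNAME\t120\t130\tJohn Smith\n'
```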
app.py ADDED
@@ -0,0 +1,224 @@
+ import streamlit as st
+ import pandas as pd
+ import os
+ from utils.Evaluation_answer_txt import Evaluation_answer_txt
+ from utils.upload_hub import upload_scores_to_hub, file_name_decode
+ import time
+ import json
+ import datasets
+ from huggingface_hub import HfApi
+ from huggingface_hub import hf_hub_download
+ # st.set_page_config(layout="wide")
+ st.set_page_config(layout="centered")
+ st.markdown(
+     f"""
+     <style>
+     .appview-container .main .block-container{{
+         max-width: 80%;
+         padding: 50px;
+     }}
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ @st.cache_data
+ def download_gold_answer(repo, filename, token, force_download=False):
+     ret = hf_hub_download(repo_id=repo, repo_type='dataset', filename=filename, token=token, force_download=force_download)
+     return ret
+
+
+ HUB_TOKEN = st.secrets['hf']
+ HUB_API = HfApi(token=HUB_TOKEN)
+
+ LEADERBOARD_DATASET_REPO = 'zhaorui-nb/leaderboard-score'
+ # Setting1 Setting2 Setting3
+
+ ANSWER_REPO = 'zhaorui-nb/leaderboard-answer'
+ GET_GOLD_ANSWER_PATH = {
+     'Setting1': download_gold_answer(ANSWER_REPO, 'dataset/Setting1_test_answer.txt', HUB_TOKEN),
+     'Setting2': download_gold_answer(ANSWER_REPO, 'dataset/Setting2_test_answer.txt', HUB_TOKEN),
+     'Setting3': download_gold_answer(ANSWER_REPO, 'dataset/Setting3_test_answer.txt', HUB_TOKEN)
+ }
+
+
+ # cache the dataset in the session state
+ def get_leaderboard_df():
+     with st.spinner('Loading leaderboard data...'):
+         if st.session_state.get('leaderboard_df') is None:
+             dataset = datasets.load_dataset(LEADERBOARD_DATASET_REPO)
+             df = pd.DataFrame(dataset['train'])
+             st.session_state['leaderboard_df'] = df
+             return df
+         else:
+             return st.session_state['leaderboard_df']
+
+
+ st.title('De-identification Model Leaderboard')
+
+ try:
+     with st.container():
+         # columns
+         # ['model name', 'dataset', 'method', 'file name', 'submitter',
+         #  'MICRO precision', 'MICRO recall', 'MICRO f1', 'MACRO precision',
+         #  'MACRO recall', 'MACRO f1', 'detail result']
+
+         df = get_leaderboard_df()
+         # replace '@' with '/' in the model name column
+         df['model name'] = df['model name'].str.replace('@', '/')
+
+         # hide the detail result column by default
+         default_columns = [c for c in df.columns if c not in ['detail result']]
+         selected_columns = st.multiselect('Select columns to display', df.columns, default=default_columns)
+
+         leaderboard_df = st.dataframe(df[selected_columns], selection_mode='multi-row', on_select='rerun', key='leaderboard')
+
+         st.subheader("Detail Result")
+         det_ind = st.session_state.leaderboard['selection']['rows']
+         if len(det_ind) == 0:
+             st.write('Please check the boxes to view the detailed results.')
+         else:
+             col_detial = st.columns(len(det_ind))
+             for i, dind in enumerate(det_ind):
+                 with col_detial[i]:
+                     dis = f"{df.iloc[dind]['model name']}___{df.iloc[dind]['dataset']}___{df.iloc[dind]['method']}"
+                     color = [st.success, st.info, st.warning, st.error]
+                     color[i % 4](dis)
+
+                     dic = df.iloc[dind]['detail result']
+                     dt_df = pd.DataFrame(dic).T
+                     st.dataframe(dt_df)
+
+ except Exception as e:
+     st.error(f"Error: {e}")
+
+ st.markdown("---")
+
+ # ############################################################################################################
+ # ############################################### Evaluation_answer_txt
+ # ############################################################################################################
+
+ model_name_input = ''
+ dataset_input = ''
+ method_input = ''
+ file_name = ''
+ submitter_input = ''
+
+ if 'score_json' not in st.session_state:
+     st.session_state['score_json'] = None
+
+ @st.cache_data()
+ def get_file_info(uploaded_file):
+     filename_info = file_name_decode(uploaded_file.name)
+     return filename_info
+
+ @st.cache_data()
+ def eval_answer_txt(set_name, uploaded_file):
+     print(f"eval_answer_txt: {time.time()}", set_name)
+
+     if set_name not in GET_GOLD_ANSWER_PATH:
+         return None
+     gold_answer_txt = GET_GOLD_ANSWER_PATH[set_name]
+     eval = Evaluation_answer_txt(gold_answer_txt, uploaded_file)
+     score_json = eval.eval()
+     return score_json
+
+ def clear_score_json():
+     st.session_state['score_json'] = None
+
+ st.title("Model Evaluation")
+ st.write("Supported file naming: [{Organization@Model}][{Dataset}][{Method}]{Filename}.txt")
+
+ col_upload = st.columns([3,1])
+ with col_upload[0]:
+     uploaded_file = st.file_uploader("Please upload the answer.txt file", type=["txt"], key="uploaded_file", on_change=clear_score_json)
+ with col_upload[1]:
+     if not uploaded_file:
+         st.warning("please upload file")
+         st.session_state['score_json'] = None
+     else:
+         st.success("file uploaded successfully")
+
+         filename_info = get_file_info(uploaded_file)
+         if filename_info:
+             model_name_input = filename_info['model_name']
+             dataset_input = filename_info['dataset']
+             method_input = filename_info['method']
+             file_name = filename_info['file_name']
+
+ col_score = st.columns([7,5])
+ if uploaded_file:
+     with col_score[1], st.container(border=True):
+         model_name_input = st.text_input("model name", model_name_input)
+         dataset_input = st.text_input("dataset", dataset_input)
+         method_input = st.text_input("method", method_input)
+         file_name = st.text_input("file name", file_name)
+         submitter_input = st.text_input("submitter", submitter_input)
+         check_all_fill_in = model_name_input and dataset_input and method_input and file_name and submitter_input
+
+         col_sumit_and_recalculate = st.columns(2)
+         with col_sumit_and_recalculate[0]:
+             calculate_btn = st.button("calculate", type='secondary', use_container_width=True)
+         with col_sumit_and_recalculate[1]:
+             submit_btn = st.button("SUBMIT", type='primary', use_container_width=True, disabled=not check_all_fill_in)
+
+         if calculate_btn or st.session_state['score_json'] is None:
+             set_name = dataset_input
+             st.session_state['score_json'] = eval_answer_txt(set_name, uploaded_file)
+             if st.session_state['score_json']:
+                 st.success("evaluation success")
+             else:
+                 st.error("evaluation failed, please check the file content or set the correct dataset name.")
+
+     if st.session_state['score_json']:
+         with col_score[0], st.container(border=True):
+             df = pd.DataFrame(st.session_state['score_json']).T
+             # split the MICRO_AVERAGE and MACRO_AVERAGE rows into a separate dataframe
+             tag_df = df.drop(["MICRO_AVERAGE", "MACRO_AVERAGE"], axis=0)
+             avg_df = df.loc[["MICRO_AVERAGE", "MACRO_AVERAGE"]]
+
+             col_sort_func = st.columns(2)
+
+             with col_sort_func[0]:
+                 sorted_column = st.selectbox("Select sort column", df.columns)
+
+             with col_sort_func[1]:
+                 ascending = st.radio("Sort Order", ["Ascending", "Descending"])
+
+             tag_df = tag_df.sort_values(by=sorted_column, ascending=ascending=="Ascending")
+
+             st.dataframe(pd.concat([tag_df, avg_df]), use_container_width=True)
+
+
+     if not check_all_fill_in:
+         st.warning("Please fill in the complete information.")
+
+     if submit_btn:
+         if st.session_state['score_json']:
+             score_json = st.session_state['score_json']
+
+             leaderboard_dict = {
+                 "model name": model_name_input,
+                 "dataset": dataset_input,
+                 "method": method_input,
+                 "file name": file_name,
+                 "submitter": submitter_input,
+
+                 "MICRO precision": score_json["MICRO_AVERAGE"]["precision"],
+                 "MICRO recall": score_json["MICRO_AVERAGE"]["recall"],
+                 "MICRO f1": score_json["MICRO_AVERAGE"]["f1"],
+                 "MACRO precision": score_json["MACRO_AVERAGE"]["precision"],
+                 "MACRO recall": score_json["MACRO_AVERAGE"]["recall"],
+                 "MACRO f1": score_json["MACRO_AVERAGE"]["f1"],
+                 "detail result": score_json
+             }
+
+             repo_file_path = f'data/train-[{model_name_input}][{dataset_input}][{method_input}][{file_name}].json'
+             upload_res = upload_scores_to_hub(HUB_API, leaderboard_dict, repo_file_path, hub_repo=LEADERBOARD_DATASET_REPO)
+             if upload_res:
+                 st.success("submit success")
+                 st.success(f"your score is available here: {upload_res}")
+             else:
+                 st.error("submit failed")
+
+
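Since the leaderboard scores live in a public Hugging Face dataset, the table can also be inspected outside the Streamlit app. A minimal sketch, assuming the 'train' split and the column names referenced in app.py:

```python
# A minimal sketch of reading the leaderboard data outside the Streamlit app,
# assuming the 'train' split and the column names used in app.py.
import datasets
import pandas as pd

dataset = datasets.load_dataset("zhaorui-nb/leaderboard-score")
df = pd.DataFrame(dataset["train"])

# Model names are stored with '@' instead of '/', as described in the README.
df["model name"] = df["model name"].str.replace("@", "/")
print(df[["model name", "dataset", "method", "MICRO f1", "MACRO f1"]].head())
```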
utils/Evaluation_answer_txt.py ADDED
@@ -0,0 +1,180 @@
+ import re
+ import os
+ from collections import Counter
+ import json
+
+
+ class Tag:
+     def __init__(self, txt_line: str):
+         # | file_name | label_type | label_start | label_end | label_text |
+         # match = re.match(r'(.+)\t(\w+)\t(\d+)\t(\d+)\t(.+)', txt_line)
+         try:
+             sep = txt_line.strip().split('\t')
+             self.file_id = sep[0]
+             self.type = sep[1]
+             self.start = sep[2]  # int(sep[2])
+             self.end = sep[3]  # int(sep[3])
+             self.text = sep[4]
+         except Exception:
+             raise ValueError('The format of the input line is not correct. Please check the input line format.')
+
+     def get_type(self):
+         return self.type
+
+     def get_file_id(self):
+         return self.file_id
+
+     def __eq__(self, other: 'Tag'):
+         # return True if file_id, type, start, and end are all the same;
+         # the text is not considered in the comparison
+         ck_file_id = self.file_id == other.file_id
+         ck_type = self.type == other.type
+         ck_start = self.start == other.start
+         ck_end = self.end == other.end
+         # ck_text = self.text == other.text
+         if ck_file_id and ck_type and ck_start and ck_end:
+             return True
+         else:
+             return False
+     def __repr__(self):
+         return f'<{self.__class__.__name__} {self.file_id:10} {self.type:10} s:{self.start:5} e:{self.end:5} {self.text}>\n'
+
+     def __hash__(self):
+         return hash((self.file_id, self.type, self.start, self.end))
+
+ class Evaluation_answer_txt:
+     def __init__(self, gold_answer, pred_answer):
+         self.gold_answer = gold_answer
+         self.pred_answer = pred_answer
+
+         self.gold_set = set()  # set of Tag
+         self.pred_set = set()  # set of Tag
+
+         self.type_set = set()  # set of label type str
+         self.gold_label_counter = Counter()  # Counter of gold label types
+
+         self.resault_score = {}
+
+     def _lines_to_tag_set(self, lines, set_type):  # set_type: 'gold' or 'pred'
+         tags = []
+         for i in range(len(lines)):
+             try:
+                 tag = Tag(lines[i])
+                 tags.append(tag)
+             except Exception:
+                 print(f'Error at {set_type} answer line: {i+1}, {lines[i]}')
+         return set(tags)
+
+     def _set_filter(self, tag_set, type):
+         # filter the tag set by label type
+         return {tag for tag in tag_set if tag.get_type() == type}
+
+     def _division(self, a, b):
+         try:
+             return a / b
+         except ZeroDivisionError:
+             return 0.0
+
+     def _f1_score(self, TP=None, FP=None, FN=None):
+         if TP is None or FP is None or FN is None:
+             raise ValueError('TP, FP, FN should be given.')
+
+         precision = self._division(TP, TP + FP)
+         recall = self._division(TP, TP + FN)
+         f1 = self._division(2 * precision * recall, precision + recall)
+
+         return {'precision': precision, 'recall': recall, 'f1': f1}
+
+
+     def eval(self, ignore_no_gold_tag_file=True):
+         with open(self.gold_answer, 'r') as f:
+             gold_line = f.readlines()
+         # with open(self.pred_answer, 'r') as f:
+         #     pred_line = f.readlines()
+         ########## added to support a file object as input ##########
+         if isinstance(self.pred_answer, str):
+             with open(self.pred_answer, 'r') as f:
+                 pred_line = f.readlines()
+
+
+         else:
+             pred_line = self.pred_answer.readlines()
+             # pred_line contains bytes and needs to be decoded
+             pred_line = [line.decode('utf-8') for line in pred_line]
+
+         self.gold_set = self._lines_to_tag_set(gold_line, 'gold')
+         self.pred_set = self._lines_to_tag_set(pred_line, 'pred')
+
+         # the ISLab AICUP scoring program ignores files that have no gold tags;
+         # it only considers the files listed in the gold answer.txt
+         if ignore_no_gold_tag_file:
+             # filter out predictions for files that have no gold tags
+             gold_files = {tag.get_file_id() for tag in self.gold_set}
+             self.pred_set = {tag for tag in self.pred_set if tag.get_file_id() in gold_files}
+
+         # collect label types and count gold labels
+         for tag in self.gold_set:
+             self.type_set.add(tag.get_type())
+             self.gold_label_counter[tag.get_type()] += 1
+         for tag in self.pred_set:
+             self.type_set.add(tag.get_type())
+
+         TP_set = self.gold_set & self.pred_set
+         FP_set = self.pred_set - self.gold_set
+         FN_set = self.gold_set - self.pred_set
+
+         # score each label type
+         for label in self.type_set:
+             filter_TP = self._set_filter(TP_set, label)
+             filter_FP = self._set_filter(FP_set, label)
+             filter_FN = self._set_filter(FN_set, label)
+             score = self._f1_score(len(filter_TP), len(filter_FP), len(filter_FN))
+             self.resault_score[label] = score
+
+         # MICRO_AVERAGE
+         self.resault_score['MICRO_AVERAGE'] = self._f1_score(len(TP_set), len(FP_set), len(FN_set))
+
+         # MACRO_AVERAGE
+         precision_sum = 0
+         recall_sum = 0
+         # f1_sum = 0  # in AICUP, macro F1 is computed from the macro-averaged precision and recall
+         for label in self.type_set:
+             precision_sum += self.resault_score[label]['precision']
+             recall_sum += self.resault_score[label]['recall']
+             # f1_sum += self.resault_score[label]['f1']
+
+         precision = self._division(precision_sum, len(self.type_set))
+         recall = self._division(recall_sum, len(self.type_set))
+         # f1 = 2 * precision * recall / (precision + recall)
+         f1 = self._division(2 * precision * recall, (precision + recall))
+
+         self.resault_score['MACRO_AVERAGE'] = {'precision': precision, 'recall': recall, 'f1': f1}
+
+         # add the support count for each label type
+         for label in self.type_set:
+             self.resault_score[label]['support'] = self.gold_label_counter[label]
+         self.resault_score['MICRO_AVERAGE']['support'] = len(self.gold_set)
+         self.resault_score['MACRO_AVERAGE']['support'] = len(self.gold_set)
+
+         # return json.dumps(self.resault_score, indent=4)
+         return self.resault_score
+
+
+ if __name__ == "__main__":
+     # with open('.output/[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt', 'r', encoding='utf-8') as f:
+     #     lines = [line.strip() for line in f.readlines() if line.strip() != '']
+
+     # gold_path = 'dataset/Setting3_test_answer.txt'
+     # pred_path = '.output/EleutherAI-pythia-1b-Setting3_answer.txt'
+
+
+     # gold_path = './.output/test_eval/gold_answer.txt'
+     # pred_path = './.output/test_eval/pred_answer.txt'
+
+     gold_path = 'dataset/Setting3_test_answer.txt'
+     pred_path = '.output/[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt'
+
+
+     eval = Evaluation_answer_txt(gold_path, pred_path)
+     res = eval.eval()
+     print(res)
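For reference, here is a small, self-contained sketch of driving the evaluator above with tiny made-up gold and prediction files; the file ids, label types, and offsets are hypothetical, and the point is only to show the micro-average arithmetic (1 TP, 1 FP, 1 FN).

```python
# A minimal, hypothetical sketch of calling the evaluator on tiny answer files.
# The file ids, label types, and offsets below are made up.
from utils.Evaluation_answer_txt import Evaluation_answer_txt

gold_lines = [
    "doc1\tNAME\t0\t4\tJohn\n",
    "doc1\tDATE\t10\t20\t2020-01-01\n",
]
pred_lines = [
    "doc1\tNAME\t0\t4\tJohn\n",          # exact match -> true positive
    "doc1\tDATE\t11\t20\t020-01-01\n",   # wrong span  -> false positive + false negative
]

with open("gold_answer.txt", "w") as f:
    f.writelines(gold_lines)
with open("pred_answer.txt", "w") as f:
    f.writelines(pred_lines)

# scores is a dict keyed by label type plus MICRO_AVERAGE / MACRO_AVERAGE,
# each holding precision, recall, f1, and support.
scores = Evaluation_answer_txt("gold_answer.txt", "pred_answer.txt").eval()
print(scores["MICRO_AVERAGE"])  # expected: precision 0.5, recall 0.5, f1 0.5
```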
utils/upload_hub.py ADDED
@@ -0,0 +1,56 @@
+ import json
+ import uuid
+ import os
+ import re
+ from huggingface_hub import HfApi
+ from huggingface_hub import hf_hub_download
+
+
+
+ def file_name_decode(file_name):
+     # model_name, dataset, method, answer.txt
+     # input file name example: [meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt
+
+     match = re.match(r'\[([^\[\]]+)\]\[([^\[\]]+)\]\[([^\[\]]+)\]([^\[\]]+)', file_name)
+
+     if match:
+         model_name, dataset, method, file_name = match.groups()
+         ret_dict = {
+             'model_name': model_name,
+             'dataset': dataset,
+             'method': method,
+             'file_name': file_name
+         }
+         return ret_dict
+     return None
+
+ def upload_scores_to_hub(api, scores_dict, path_in_repo, hub_repo='zhaorui-nb/test_json'):
+     # id = str(uuid.uuid4())
+     save_json_path = '.output/upload.json'
+     os.makedirs(os.path.dirname(save_json_path), exist_ok=True)
+     with open(save_json_path, 'w') as f:
+         json.dump(scores_dict, f, indent=4)
+
+     # save the JSON to the hub
+     res = api.upload_file(
+         path_or_fileobj=save_json_path,
+         path_in_repo=path_in_repo,  # f'data/train,{os.path.basename(save_json_path)}',
+         repo_id=hub_repo,
+         repo_type="dataset",
+     )
+
+     return res
+
+
+
+
+
+ if __name__ == "__main__":
+
+     pass
+
+
+
+
+
+
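And a quick sketch of file_name_decode in use, with the same example filename the repository's own comments reference:

```python
# Decoding a submission filename with file_name_decode.
from utils.upload_hub import file_name_decode

info = file_name_decode("[meta-llama@Llama-2-7b-hf][Setting3][icl]answer.txt")
print(info)
# {'model_name': 'meta-llama@Llama-2-7b-hf', 'dataset': 'Setting3',
#  'method': 'icl', 'file_name': 'answer.txt'}
```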