Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -64,17 +64,22 @@ if not os.path.exists(data_path):
|
|
64 |
exit()
|
65 |
|
66 |
|
67 |
-
# Loading
|
68 |
data = pd.read_csv(data_path, dtype={"id": "Int64"}) # Parse "id" as pandas' nullable Int64 dtype (not a built-in int); callers cast single values with int(...) where a plain int is needed
|
69 |
|
70 |
current_index = 0
|
71 |
-
|
72 |
-
|
|
|
|
|
73 |
# ---------------------------------------------------------------------------------------------------------------------
|
74 |
|
75 |
-
annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
|
76 |
-
score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
|
77 |
|
|
|
|
|
|
|
|
|
|
|
78 |
def get_current_text():
|
79 |
global current_index, data
|
80 |
source = data.loc[current_index, "source"]
|
@@ -91,80 +96,156 @@ def save_to_json(entry: dict, json_file: Path):
|
|
91 |
f.write("\n")
|
92 |
# scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
|
95 |
-
global current_index, data,
|
|
|
|
|
|
|
96 |
|
97 |
system = data.loc[current_index, "system"]
|
98 |
lp = data.loc[current_index, "lp"]
|
99 |
doc = data.loc[current_index, "doc"]
|
100 |
id = int(data.loc[current_index, "id"])
|
101 |
reference = data.loc[current_index, "reference"]
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
category_value = f"{category}/{subcategory}"
|
106 |
else:
|
107 |
category_value = category
|
108 |
|
109 |
if error_span and error_span in target:
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
else:
|
114 |
-
|
115 |
|
116 |
-
|
117 |
-
"
|
118 |
-
"
|
119 |
-
"
|
120 |
-
"
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
"category": category_value,
|
125 |
-
"
|
126 |
-
|
127 |
-
"rater": rater_selector,
|
128 |
-
}
|
129 |
-
save_to_json(new_entry, annotations_file)
|
130 |
|
131 |
# [error_span, status]
|
132 |
-
return "", f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
def save_and_next(source, target, score, rater_selector):
|
135 |
-
global current_index, data,
|
|
|
136 |
system = data.loc[current_index, "system"]
|
137 |
lp = data.loc[current_index, "lp"]
|
138 |
doc = data.loc[current_index, "doc"]
|
139 |
id = int(data.loc[current_index, "id"])
|
140 |
reference = data.loc[current_index, "reference"]
|
141 |
|
142 |
-
|
143 |
-
max_id = int(id_list[-1]) # 取得最大的 ID
|
144 |
-
|
145 |
-
new_entry = {
|
146 |
"system": system,
|
147 |
"lp": lp,
|
148 |
"doc": doc,
|
149 |
"id": id,
|
|
|
150 |
"src": source,
|
151 |
"mt": target,
|
152 |
"ref": reference,
|
153 |
-
"
|
154 |
-
"
|
|
|
155 |
}
|
156 |
-
save_to_json(
|
|
|
|
|
|
|
|
|
157 |
|
158 |
current_index += 1
|
159 |
if current_index >= len(data):
|
160 |
-
return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {
|
161 |
-
|
162 |
next_source, next_target = get_current_text()
|
163 |
-
|
164 |
-
|
165 |
|
166 |
def update_file_selection(selected_file):
|
167 |
-
global data_path, data, current_index, annotations_file
|
168 |
data_path = os.path.join(current_dir, selected_file)
|
169 |
data = pd.read_csv(data_path)
|
170 |
|
@@ -175,7 +256,7 @@ def update_file_selection(selected_file):
|
|
175 |
|
176 |
file_base_name = os.path.splitext(selected_file)[0]
|
177 |
annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
|
178 |
-
score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
|
179 |
|
180 |
# [source, target, error_span, index_selector, current_index_display, status]
|
181 |
return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案:{selected_file}")
|
|
|
64 |
exit()
|
65 |
|
66 |
|
67 |
+
# Loading & Setting --------------------------------------------------------------------------------------------------
|
68 |
data = pd.read_csv(data_path, dtype={"id": "Int64"}) # Parse "id" as pandas' nullable Int64 dtype (not a built-in int); callers cast single values with int(...) where a plain int is needed
|
69 |
|
70 |
current_index = 0
|
71 |
+
current_errors = []
|
72 |
+
current_others = []
|
73 |
+
|
74 |
+
annotations_file = DATASET_DIR / f"test-{uuid4()}.json"
|
75 |
# ---------------------------------------------------------------------------------------------------------------------
|
76 |
|
|
|
|
|
77 |
|
78 |
+
# score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
|
79 |
+
|
80 |
+
def get_all_ids():
    """Return every value of the loaded data's "id" column as a string, in row order."""
    # The loop variable is deliberately not called `id`, so the builtin stays usable.
    return [str(row_id) for row_id in data["id"].tolist()]
|
82 |
+
|
83 |
def get_current_text():
|
84 |
global current_index, data
|
85 |
source = data.loc[current_index, "source"]
|
|
|
96 |
f.write("\n")
|
97 |
# scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
|
98 |
|
99 |
+
# def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
|
100 |
+
# global current_index, data, annotations_file
|
101 |
+
|
102 |
+
# system = data.loc[current_index, "system"]
|
103 |
+
# lp = data.loc[current_index, "lp"]
|
104 |
+
# doc = data.loc[current_index, "doc"]
|
105 |
+
# id = int(data.loc[current_index, "id"])
|
106 |
+
# reference = data.loc[current_index, "reference"]
|
107 |
+
|
108 |
+
|
109 |
+
# if category != "Non-translation" and category != "No-error":
|
110 |
+
# category_value = f"{category}/{subcategory}"
|
111 |
+
# else:
|
112 |
+
# category_value = category
|
113 |
+
|
114 |
+
# if error_span and error_span in target:
|
115 |
+
# highlighted_error_span = target.replace(error_span, f"<v>{error_span}</v>")
|
116 |
+
# elif not error_span:
|
117 |
+
# highlighted_error_span = target
|
118 |
+
# else:
|
119 |
+
# highlighted_error_span = error_span # 若 error_span 不存在於 target,則保持原樣
|
120 |
+
|
121 |
+
# new_entry = {
|
122 |
+
# "system": system,
|
123 |
+
# "lp": lp,
|
124 |
+
# "doc": doc,
|
125 |
+
# "id": id,
|
126 |
+
# "source": source,
|
127 |
+
# "mt": target,
|
128 |
+
# "target": highlighted_error_span,
|
129 |
+
# "category": category_value,
|
130 |
+
# "severity": severity,
|
131 |
+
# "other": other if other else "",
|
132 |
+
# "rater": rater_selector,
|
133 |
+
# }
|
134 |
+
# save_to_json(new_entry, annotations_file)
|
135 |
+
|
136 |
+
# # [error_span, status]
|
137 |
+
# return "", f"當前資料已保存到 {annotations_file.name},請繼續標記!"
|
138 |
+
|
139 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Buffer one error-span annotation for the text currently being rated.

    Nothing is written to disk here. The span is appended to the module-level
    ``current_errors`` list and its category/note to ``current_others``;
    ``save_and_next`` is the function that writes those lists out.

    Args:
        source: Source text. Kept for the existing call signature; not used.
        target: Translated text in which ``error_span`` is searched.
        rater_selector: Rater identifier. Kept for the call signature; not used.
        error_span: Substring of ``target`` marked as erroneous. Only the first
            occurrence is located.
        category: Error category. "No-error" records nothing; "Non-translation"
            is stored without a subcategory.
        subcategory: Optional subcategory, stored as "category/subcategory".
        severity: Severity label, stored lower-cased ("" when none is given).
        other: Optional free-text note, stored as "" when empty.

    Returns:
        An ``(error_span, status)`` pair. The first element is always "" and
        the second is a user-facing status message.
    """
    global current_errors, current_others

    if category == "No-error":
        return "", "無錯誤,不需要保存錯誤區間。"

    # str.find returns -1 when the span is absent; an empty or missing span is
    # rejected through the same path.
    start = target.find(error_span) if error_span else -1
    if start < 0:
        return "", "錯誤區間不存在於翻譯文本中,請檢查!"
    end = start + len(error_span)

    if category != "Non-translation" and subcategory:
        category_value = f"{category}/{subcategory}"
    else:
        category_value = category

    current_errors.append({
        "text": error_span,
        # No selected severity arrives as None/""; store "" instead of raising.
        "severity": severity.lower() if severity else "",
        "start": start,
        "end": end,
    })

    current_others.append({
        "category": category_value,
        "others": other if other else "",
    })

    # [error_span, status]
    return "", f"已記錄錯誤區間: {error_span},範圍 {start}-{end}。"
|
177 |
+
|
178 |
+
|
179 |
+
# def save_and_next(source, target, score, rater_selector):
|
180 |
+
# global current_index, data, score_file
|
181 |
+
# system = data.loc[current_index, "system"]
|
182 |
+
# lp = data.loc[current_index, "lp"]
|
183 |
+
# doc = data.loc[current_index, "doc"]
|
184 |
+
# id = int(data.loc[current_index, "id"])
|
185 |
+
# reference = data.loc[current_index, "reference"]
|
186 |
+
|
187 |
+
# id_list = [str(id) for id in sorted(data["id"].unique())]
|
188 |
+
# max_id = int(id_list[-1]) # 取得最大的 ID
|
189 |
+
|
190 |
+
# new_entry = {
|
191 |
+
# "system": system,
|
192 |
+
# "lp": lp,
|
193 |
+
# "doc": doc,
|
194 |
+
# "id": id,
|
195 |
+
# "src": source,
|
196 |
+
# "mt": target,
|
197 |
+
# "ref": reference,
|
198 |
+
# "score": score,
|
199 |
+
# "rater": rater_selector,
|
200 |
+
# }
|
201 |
+
# save_to_json(new_entry, score_file)
|
202 |
+
|
203 |
+
# current_index += 1
|
204 |
+
# if current_index >= len(data):
|
205 |
+
# return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {score_file.name}!"
|
206 |
+
|
207 |
+
# next_source, next_target = get_current_text()
|
208 |
+
# # [source, target, error_span, current_index_display, status]
|
209 |
+
# return next_source, next_target, "", str(current_index), f"分數已保存到 {score_file.name},請繼續下一筆!"
|
210 |
|
211 |
def save_and_next(source, target, score, rater_selector):
    """Write the current row's score and buffered errors, then advance one row.

    Builds one record from the current row of ``data`` plus the buffered
    ``current_errors`` / ``current_others`` lists, passes it to
    ``save_to_json`` with ``annotations_file``, clears both buffers and
    increments ``current_index``.

    Args:
        source: Source text, stored under "src".
        target: Translated text, stored under "mt".
        score: Sentence score; divided by 100 before being stored, so a
            0-100 input lands in [0, 1].
        rater_selector: Rater identifier, stored under "rater".

    Returns:
        A 5-tuple ``(source, target, error_span, current_index_display,
        status)``. When there is no next row, the first two elements are the
        completion text and the index display is "".
    """
    global current_index, current_errors, current_others

    finished_text = "已完成所有文本標記"

    # Calls made after the last row was already saved have no row to read;
    # report completion again instead of raising a KeyError on data.loc.
    if current_index >= len(data):
        return finished_text, finished_text, "", "", f"所有標記已完成並保存到 {annotations_file.name}!"

    annotations_entry = {
        "system": data.loc[current_index, "system"],
        "lp": data.loc[current_index, "lp"],
        "doc": data.loc[current_index, "doc"],
        # Cast so the stored id is a built-in int.
        "id": int(data.loc[current_index, "id"]),
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": data.loc[current_index, "reference"],
        "sentence_score": score / 100.0,  # normalise to [0, 1]
        "errors": current_errors,
        "others": current_others,
    }
    save_to_json(annotations_entry, annotations_file)

    # Start the next row with empty buffers. Rebinding (not .clear()) leaves
    # the lists referenced by the entry just saved untouched.
    current_errors = []
    current_others = []

    current_index += 1
    if current_index >= len(data):
        return finished_text, finished_text, "", "", f"所有標記已完成並保存到 {annotations_file.name}!"

    next_source, next_target = get_current_text()
    return next_source, next_target, "", str(current_index), f"分數與錯誤已保存到 {annotations_file.name},請繼續下一筆!"
|
245 |
+
|
246 |
|
247 |
def update_file_selection(selected_file):
|
248 |
+
global data_path, data, current_index, annotations_file
|
249 |
data_path = os.path.join(current_dir, selected_file)
|
250 |
data = pd.read_csv(data_path)
|
251 |
|
|
|
256 |
|
257 |
file_base_name = os.path.splitext(selected_file)[0]
|
258 |
annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
|
259 |
+
# score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
|
260 |
|
261 |
# [source, target, error_span, index_selector, current_index_display, status]
|
262 |
return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案:{selected_file}")
|