Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,8 @@ from huggingface_hub import CommitScheduler, snapshot_download
|
|
8 |
from uuid import uuid4
|
9 |
import shutil
|
10 |
|
11 |
-
#
|
|
|
12 |
category_map = {
|
13 |
"正確性": "Accuracy",
|
14 |
"流暢度": "Fluency",
|
@@ -16,7 +17,6 @@ category_map = {
|
|
16 |
"風格": "Style",
|
17 |
"在地化": "Locale"
|
18 |
}
|
19 |
-
|
20 |
subcategory_map = {
|
21 |
("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
|
22 |
("正確性", "新增"): ("Accuracy", "Addition"),
|
@@ -44,7 +44,6 @@ subcategory_map = {
|
|
44 |
("在地化", "地址格式"): ("Locale", "Address format"),
|
45 |
("在地化", "其他"): ("Locale", "Other"),
|
46 |
}
|
47 |
-
|
48 |
categories_display = {
|
49 |
"正確性": ["誤譯", "新增", "漏譯", "其他"],
|
50 |
"流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
|
@@ -53,35 +52,50 @@ categories_display = {
|
|
53 |
"在地化": ["貨幣格式", "時間格式", "人名格式", "日期格式", "地址格式", "其他"]
|
54 |
}
|
55 |
|
56 |
-
severity_choices_display = ["輕微 (Minor)", "嚴重 (Major)"]
|
57 |
severity_map = {
|
58 |
"輕微 (Minor)": "Minor",
|
59 |
"嚴重 (Major)": "Major"
|
60 |
}
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
DATASET_DIR = Path("json_dataset")
|
63 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
64 |
|
65 |
scheduler = CommitScheduler(
|
66 |
-
repo_id="350016z/TaiwanCOMET_dataset",
|
67 |
repo_type="dataset",
|
68 |
folder_path=DATASET_DIR,
|
69 |
path_in_repo="data"
|
70 |
)
|
71 |
|
72 |
-
# ---------------------------下載CSV資料檔--------------------------------
|
73 |
def download_dataset_file(dataset_id, local_dir):
|
74 |
snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
|
75 |
contents = os.listdir(snapshot_path)
|
76 |
|
77 |
for file_name in contents:
|
78 |
-
print("Checking file: ", file_name)
|
79 |
if file_name.endswith(".csv"):
|
80 |
source_file_path = os.path.join(snapshot_path, file_name)
|
81 |
local_file_path = os.path.join(local_dir, file_name)
|
82 |
shutil.copy(source_file_path, local_file_path)
|
83 |
-
print(f"Copied {file_name} to {local_file_path}")
|
84 |
-
print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
|
85 |
time.sleep(1)
|
86 |
return local_dir
|
87 |
|
@@ -95,8 +109,6 @@ if not csv_files:
|
|
95 |
exit()
|
96 |
|
97 |
data_path = os.path.join(current_dir, 'test.csv') if 'test.csv' in csv_files else os.path.join(current_dir, csv_files[0])
|
98 |
-
print(f"Data path: {data_path}")
|
99 |
-
|
100 |
if not os.path.exists(data_path):
|
101 |
print(f"Error: {data_path} does not exist. Please check the file path.")
|
102 |
exit()
|
@@ -106,9 +118,7 @@ current_index = 0
|
|
106 |
current_errors = []
|
107 |
|
108 |
annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
|
109 |
-
|
110 |
-
# 存放所有已提交標註(方便後續查看歷史),此範例主要顯示當前紀錄
|
111 |
-
annotation_history = []
|
112 |
|
113 |
def get_all_ids():
|
114 |
"""
|
@@ -137,24 +147,12 @@ def save_to_json(entry: dict, json_file: Path):
|
|
137 |
json.dump(entry, f, ensure_ascii=False)
|
138 |
f.write("\n")
|
139 |
|
140 |
-
def get_error_dataframe():
|
141 |
-
"""
|
142 |
-
只顯示「text」「severity」「category」三個欄位,不顯示 start/end。
|
143 |
-
"""
|
144 |
-
df = pd.DataFrame(current_errors)
|
145 |
-
if df.empty:
|
146 |
-
return pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"])
|
147 |
-
# 轉成中文欄位
|
148 |
-
display_df = pd.DataFrame()
|
149 |
-
display_df["錯誤文字"] = df["text"]
|
150 |
-
display_df["嚴重度"] = df["severity"]
|
151 |
-
display_df["分類"] = df["category"]
|
152 |
-
return display_df
|
153 |
-
|
154 |
def highlight_errors_in_text(text, errors):
|
155 |
"""
|
156 |
-
在文本中以 <span style="background-color:yellow;">...</span>
|
157 |
"""
|
|
|
|
|
158 |
highlighted = ""
|
159 |
last_end = 0
|
160 |
for err in sorted(errors, key=lambda e: e["start"]):
|
@@ -168,45 +166,139 @@ def highlight_errors_in_text(text, errors):
|
|
168 |
highlighted += text[last_end:]
|
169 |
return highlighted
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
|
172 |
global current_index, data, current_errors
|
173 |
if len(current_errors) >= 5:
|
174 |
-
return "", "您已標記超過 5
|
175 |
|
176 |
if error_span and error_span not in target:
|
177 |
return "", "錯誤區間不存在於翻譯文本,請檢查!"
|
178 |
|
179 |
-
#
|
180 |
cat_val, subcat_val = subcategory_map.get((category, subcategory), (category_map.get(category, "Other"), "Other"))
|
181 |
severity_val = severity_map.get(severity, "Minor")
|
182 |
|
183 |
if error_span:
|
184 |
start = target.find(error_span)
|
185 |
end = start + len(error_span)
|
|
|
186 |
for err in current_errors:
|
187 |
if err["start"] == start and err["end"] == end:
|
188 |
-
return "", "
|
189 |
-
|
190 |
-
if subcat_val == "Other" and other:
|
191 |
-
|
|
|
192 |
|
193 |
current_errors.append({
|
194 |
"text": error_span,
|
195 |
"severity": severity_val,
|
196 |
"start": start,
|
197 |
"end": end,
|
198 |
-
"category": f"{cat_val}/{subcat_val}"
|
199 |
})
|
200 |
-
return "", f"
|
201 |
else:
|
202 |
-
return "", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
def save_and_next(source, target, score, rater_selector, alternative_translation):
|
205 |
global current_index, data, annotations_file, current_errors, annotation_history
|
206 |
|
207 |
if not rater_selector:
|
208 |
return (
|
209 |
-
source, target, "",
|
210 |
str(data.loc[current_index, "id"]),
|
211 |
"請先選擇標註人員!",
|
212 |
get_error_dataframe(),
|
@@ -248,17 +340,17 @@ def save_and_next(source, target, score, rater_selector, alternative_translation
|
|
248 |
|
249 |
if current_index >= len(data):
|
250 |
return (
|
251 |
-
"已完成所有文本標記",
|
252 |
-
"已完成所有文本標記",
|
253 |
-
"",
|
254 |
-
"",
|
255 |
-
f"
|
256 |
pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
|
257 |
""
|
258 |
)
|
259 |
|
260 |
next_source, next_target = get_current_text()
|
261 |
-
status_msg = f"
|
262 |
return (
|
263 |
next_source,
|
264 |
next_target,
|
@@ -298,45 +390,34 @@ def update_index_selection(selected_display):
|
|
298 |
selected_id = parse_id_from_display(selected_display)
|
299 |
row_list = data.index[data["id"] == selected_id].tolist()
|
300 |
if not row_list:
|
301 |
-
return
|
|
|
|
|
|
|
|
|
|
|
302 |
current_index = row_list[0]
|
303 |
src, tgt = get_current_text()
|
304 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
-
|
307 |
-
|
308 |
-
current_errors.append({
|
309 |
-
"text": "",
|
310 |
-
"severity": "No-error",
|
311 |
-
"start": 0,
|
312 |
-
"end": 0,
|
313 |
-
"category": "No-error"
|
314 |
-
})
|
315 |
-
return "", "標註為完全正確,無錯誤!", get_error_dataframe()
|
316 |
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
"severity": "Major",
|
322 |
-
"start": 0,
|
323 |
-
"end": 0,
|
324 |
-
"category": "Non-translation"
|
325 |
-
})
|
326 |
-
return "", "已標註為過多錯誤!", get_error_dataframe()
|
327 |
|
328 |
-
DEMO_EXPLANATION = """
|
329 |
-
## 翻譯標記工具
|
330 |
-
1. 選擇檔案、標註人員、以及想要檢視的索引(句子)。
|
331 |
-
2. 檢查「翻譯文本」是否有錯誤,如有,請選擇「錯誤類別」、「子類別」、「嚴重度」,並在「錯誤區間」貼上有問題的翻譯文字。
|
332 |
-
3. 按「保存並繼續標記當前資料」,錯誤會暫時列在右方的「當前句子錯誤紀錄」中。
|
333 |
-
4. 全部錯誤標記完後,可給分(0-100),並可在「建議翻譯」中填寫更好的譯文。
|
334 |
-
5. 按「保存並顯示下一筆」,會提交當前這筆紀錄並跳至下一筆。
|
335 |
-
6. 若整句都正確,可按「完全正確」。若錯誤超過五處,可按「過多錯誤」。
|
336 |
"""
|
337 |
|
338 |
with gr.Blocks(css="""
|
339 |
-
/*
|
340 |
* {
|
341 |
font-size: 15px;
|
342 |
line-height: 1.4;
|
@@ -346,24 +427,30 @@ with gr.Blocks(css="""
|
|
346 |
padding: 10px;
|
347 |
margin-bottom: 10px;
|
348 |
}
|
349 |
-
/*
|
350 |
#correct_button {
|
351 |
-
background-color: #4CAF50;
|
352 |
color: white;
|
353 |
font-size: 14px;
|
354 |
margin-bottom: 5px;
|
355 |
}
|
356 |
#too_many_errors_button {
|
357 |
-
background-color: #f44336;
|
358 |
color: white;
|
359 |
font-size: 14px;
|
360 |
margin-bottom: 5px;
|
361 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
362 |
""") as demo:
|
363 |
gr.Markdown(DEMO_EXPLANATION)
|
364 |
|
365 |
with gr.Tab("標記工具"):
|
366 |
-
# -------------------
|
367 |
with gr.Row():
|
368 |
with gr.Column(scale=1):
|
369 |
rater_selector = gr.Dropdown(
|
@@ -377,7 +464,7 @@ with gr.Blocks(css="""
|
|
377 |
value="test.csv"
|
378 |
)
|
379 |
index_selector = gr.Dropdown(
|
380 |
-
label="選擇索引(id-原文前10字)",
|
381 |
choices=get_all_ids(),
|
382 |
value=f"{data.loc[current_index, 'id']}-{str(data.loc[current_index, 'source'])[:10]}"
|
383 |
)
|
@@ -387,34 +474,33 @@ with gr.Blocks(css="""
|
|
387 |
interactive=False
|
388 |
)
|
389 |
|
390 |
-
#
|
391 |
with gr.Column(scale=4):
|
392 |
-
source = gr.Textbox(label="原始文本", lines=
|
393 |
-
|
394 |
-
# ----------------- 右側:翻譯文本 -----------------
|
395 |
with gr.Column(scale=4):
|
396 |
-
target = gr.Textbox(label="翻譯文本", lines=
|
397 |
|
398 |
-
# -------------------
|
399 |
with gr.Row():
|
400 |
with gr.Column(scale=5):
|
401 |
-
|
|
|
402 |
with gr.Column(scale=5):
|
403 |
error_table = gr.Dataframe(
|
404 |
headers=["錯誤文字", "嚴重度", "分類"],
|
405 |
-
label="當前句子錯誤紀錄",
|
406 |
datatype=["str", "str", "str"],
|
407 |
interactive=False
|
408 |
)
|
409 |
|
410 |
-
# -------------------
|
411 |
-
with gr.
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
)
|
417 |
-
|
418 |
category = gr.Dropdown(
|
419 |
label="錯誤類別",
|
420 |
choices=list(categories_display.keys()),
|
@@ -425,89 +511,91 @@ with gr.Blocks(css="""
|
|
425 |
choices=categories_display["正確性"],
|
426 |
value="誤譯"
|
427 |
)
|
428 |
-
|
429 |
-
|
430 |
severity = gr.Dropdown(
|
431 |
label="嚴重度",
|
432 |
choices=severity_choices_display,
|
433 |
value="輕微 (Minor)"
|
434 |
)
|
435 |
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
|
|
|
|
455 |
save_next_button = gr.Button("保存並顯示下一筆")
|
456 |
|
457 |
-
#
|
458 |
status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
|
459 |
|
460 |
-
# -------------------
|
461 |
-
def
|
462 |
subcats = categories_display[selected_category]
|
463 |
return gr.update(choices=subcats, value=subcats[0])
|
464 |
|
465 |
file_selector.change(
|
466 |
-
update_file_selection,
|
467 |
-
inputs=[file_selector],
|
468 |
outputs=[
|
469 |
-
source, target, error_span,
|
470 |
-
index_selector, current_index_display,
|
471 |
status, error_table, highlighted_target
|
472 |
]
|
473 |
)
|
474 |
index_selector.change(
|
475 |
-
update_index_selection,
|
476 |
-
inputs=[index_selector],
|
477 |
outputs=[
|
478 |
-
source, target, current_index_display,
|
479 |
status, error_table, highlighted_target
|
480 |
]
|
481 |
)
|
482 |
category.change(
|
483 |
-
|
484 |
-
inputs=[category],
|
485 |
outputs=[subcategory]
|
486 |
)
|
487 |
|
488 |
correct_button.click(
|
489 |
-
mark_as_correct,
|
490 |
outputs=[error_span, status, error_table]
|
491 |
)
|
492 |
too_many_errors_button.click(
|
493 |
-
mark_as_too_many_errors,
|
494 |
outputs=[error_span, status, error_table]
|
495 |
)
|
496 |
|
497 |
-
#
|
498 |
save_current_button.click(
|
499 |
-
save_current,
|
500 |
inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
|
501 |
outputs=[error_span, status]
|
502 |
)
|
503 |
-
#
|
504 |
save_current_button.click(
|
505 |
fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
|
506 |
inputs=[target],
|
507 |
outputs=[error_table, highlighted_target]
|
508 |
)
|
509 |
|
510 |
-
#
|
511 |
save_next_button.click(
|
512 |
save_and_next,
|
513 |
inputs=[source, target, score, rater_selector, alternative_translation],
|
@@ -518,7 +606,7 @@ with gr.Blocks(css="""
|
|
518 |
]
|
519 |
)
|
520 |
|
521 |
-
#
|
522 |
init_src, init_tgt = get_current_text()
|
523 |
source.value = init_src
|
524 |
target.value = init_tgt
|
|
|
8 |
from uuid import uuid4
|
9 |
import shutil
|
10 |
|
11 |
+
# --------------------------- 中英對照的字典 ---------------------------
|
12 |
+
# 後端儲存(English),前端顯示(中文)
|
13 |
category_map = {
|
14 |
"正確性": "Accuracy",
|
15 |
"流暢度": "Fluency",
|
|
|
17 |
"風格": "Style",
|
18 |
"在地化": "Locale"
|
19 |
}
|
|
|
20 |
subcategory_map = {
|
21 |
("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
|
22 |
("正確性", "新增"): ("Accuracy", "Addition"),
|
|
|
44 |
("在地化", "地址格式"): ("Locale", "Address format"),
|
45 |
("在地化", "其他"): ("Locale", "Other"),
|
46 |
}
|
|
|
47 |
categories_display = {
|
48 |
"正確性": ["誤譯", "新增", "漏譯", "其他"],
|
49 |
"流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
|
|
|
52 |
"在地化": ["貨幣格式", "時間格式", "人名格式", "日期格式", "地址格式", "其他"]
|
53 |
}
|
54 |
|
55 |
+
# Severity: the UI shows the Chinese label, the saved annotation keeps the
# English value.
severity_map = {
    "輕微 (Minor)": "Minor",
    "嚴重 (Major)": "Major",
}
# Dropdown choices are the display labels, in declaration order.
severity_choices_display = list(severity_map)

# Reverse tables used when rendering the error table: stored English value ->
# Chinese display label. The two sentence-level markers ("No-error",
# "Non-translation") are present in both tables.
severity_display_map = {stored: label for label, stored in severity_map.items()}
severity_display_map.update(
    [
        ("No-error", "無錯誤"),
        ("Non-translation", "過多錯誤"),
    ]
)
category_display_map = dict(
    [
        ("Accuracy", "正確性"),
        ("Fluency", "流暢度"),
        ("Terminology", "專有名詞"),
        ("Style", "風格"),
        ("Locale", "在地化"),
        ("Other", "其他"),
        ("No-error", "無錯誤"),
        ("Non-translation", "過多錯誤"),
    ]
)
|
78 |
+
|
79 |
+
# ---------------------------下載CSV資料檔--------------------------------
|
80 |
# Local folder handed to CommitScheduler as the folder to upload; annotation
# files are written under it.
DATASET_DIR = Path("json_dataset")
DATASET_DIR.mkdir(exist_ok=True, parents=True)

# Uploads DATASET_DIR into the "data" folder of the dataset repo.
scheduler = CommitScheduler(
    folder_path=DATASET_DIR,
    path_in_repo="data",
    repo_id="350016z/TaiwanCOMET_dataset",
    repo_type="dataset",
)
|
89 |
|
|
|
90 |
def download_dataset_file(dataset_id, local_dir):
    """Copy the top-level ``.csv`` files of a Hub dataset snapshot into ``local_dir``.

    Args:
        dataset_id: Repo id forwarded to ``snapshot_download`` with
            ``repo_type="dataset"``.
        local_dir: Destination directory. It is created when missing so the
            copy below cannot fail on a non-existent folder.

    Returns:
        ``local_dir``, unchanged.
    """
    snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")

    # shutil.copy raises when the destination directory does not exist.
    # An empty path means "current directory" and needs no creation.
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    # Only direct children of the snapshot folder are considered.
    for file_name in os.listdir(snapshot_path):
        if not file_name.endswith(".csv"):
            continue
        shutil.copy(
            os.path.join(snapshot_path, file_name),
            os.path.join(local_dir, file_name),
        )

    # NOTE(review): the original pauses one second before returning; the
    # reason is not visible here, and the scraped source has lost the
    # indentation that would show whether the pause was per file or once.
    # Kept as a single pause — confirm.
    time.sleep(1)
    return local_dir
|
101 |
|
|
|
109 |
exit()
|
110 |
|
111 |
data_path = os.path.join(current_dir, 'test.csv') if 'test.csv' in csv_files else os.path.join(current_dir, csv_files[0])
|
|
|
|
|
112 |
if not os.path.exists(data_path):
|
113 |
print(f"Error: {data_path} does not exist. Please check the file path.")
|
114 |
exit()
|
|
|
118 |
current_errors = []
|
119 |
|
120 |
annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
|
121 |
+
annotation_history = [] # 若需顯示歷史可擴充
|
|
|
|
|
122 |
|
123 |
def get_all_ids():
|
124 |
"""
|
|
|
147 |
json.dump(entry, f, ensure_ascii=False)
|
148 |
f.write("\n")
|
149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
def highlight_errors_in_text(text, errors):
|
151 |
"""
|
152 |
+
在文本中以 <span style="background-color:yellow;">...</span> 方式高亮。
|
153 |
"""
|
154 |
+
if not text:
|
155 |
+
return ""
|
156 |
highlighted = ""
|
157 |
last_end = 0
|
158 |
for err in sorted(errors, key=lambda e: e["start"]):
|
|
|
166 |
highlighted += text[last_end:]
|
167 |
return highlighted
|
168 |
|
169 |
+
def get_error_dataframe():
    """Build the three-column table of errors recorded for the current sentence.

    Reads the module-level ``current_errors`` list. The stored records keep
    their English values; only the returned frame is localized, with the
    columns "錯誤文字" (error text), "嚴重度" (severity) and "分類" (category).

    Returns:
        A ``pandas.DataFrame`` with those three columns; it has no rows when
        nothing has been recorded.
    """
    columns = ["錯誤文字", "嚴重度", "分類"]
    raw = pd.DataFrame(current_errors)
    if raw.empty:
        return pd.DataFrame(columns=columns)

    # English sub-category -> Chinese label. Anything not listed (for example
    # "Other" or a free-text sub-category) is displayed unchanged.
    sub_labels = {
        "Mistranslation": "誤譯",
        "Addition": "新增",
        "Omission": "漏譯",
        "Grammar": "文法",
        "Spelling": "拼字",
        "Punctuation": "標點符號",
        "Inconsistency": "前後不一致",
        "Register": "語域",
        "Inappropriate": "使用不當",
        "Inconsistent": "不一致",
        "Awkward": "用字笨拙",
        "Currency format": "貨幣格式",
        "Time format": "時間格式",
        "Name format": "人名格式",
        "Date format": "日期格式",
        "Address format": "地址格式",
    }

    def map_category(cat_str):
        # Stored form is "Main/Sub", or a bare marker such as "No-error".
        if cat_str in ("No-error", "Non-translation"):
            return severity_display_map.get(cat_str, cat_str)
        main_cat, slash, sub_cat = cat_str.partition("/")
        if not slash:
            # Single-part category, e.g. "Accuracy" or "Other".
            return category_display_map.get(cat_str, cat_str)
        main_cat_zh = category_display_map.get(main_cat, main_cat)
        sub_cat_zh = sub_labels.get(sub_cat, sub_cat)
        return f"{main_cat_zh}/{sub_cat_zh}"

    return pd.DataFrame(
        {
            columns[0]: raw["text"],
            columns[1]: raw["severity"].map(lambda s: severity_display_map.get(s, s)),
            columns[2]: raw["category"].map(map_category),
        }
    )
|
238 |
+
|
239 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span for the sentence currently being annotated.

    On success a record with the keys ``text``, ``severity``, ``start``,
    ``end`` and ``category`` is appended to the module-level
    ``current_errors`` list. Category, sub-category and severity are
    converted from their Chinese display labels to the stored English values.

    Args:
        source: Source text (not used here; part of the event signature).
        target: Translated text the span must occur in. ``None`` is treated
            as an empty string.
        rater_selector: Selected rater (not used here).
        error_span: Text copied from ``target`` that marks the error.
        category: Chinese category label.
        subcategory: Chinese sub-category label.
        severity: Chinese severity label; unknown labels fall back to "Minor".
        other: Free-text sub-category, used only when the resolved
            sub-category is "Other". ``None`` is treated as empty.

    Returns:
        A ``(error_span, status)`` pair: an empty string to clear the span
        box and a status message.
    """
    # A None textbox value would otherwise crash `in` / `.strip()` below.
    target = target or ""
    custom_subcategory = (other or "").strip()

    if len(current_errors) >= 5:
        return "", "您已標記超過 5 處錯誤,可直接按『過多錯誤』或繼續。"

    if not error_span:
        return "", "尚未輸入錯誤區間,如無錯誤請按『完全正確』"

    if error_span not in target:
        return "", "錯誤區間不存在於翻譯文本,請檢查!"

    # Convert display labels to stored English values; an unknown pair falls
    # back to (<mapped category or "Other">, "Other").
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")

    # Only the first occurrence of the span in the target is located.
    start = target.find(error_span)
    end = start + len(error_span)

    if any(err["start"] == start and err["end"] == end for err in current_errors):
        return "", "此錯誤區間已標記過,請勿重複。"

    # "Other" plus a custom description stores the description instead.
    if subcat_val == "Other" and custom_subcategory:
        subcat_val = custom_subcategory

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": f"{cat_val}/{subcat_val}",
    })
    return "", f"已標記錯誤: {error_span} (範圍 {start}-{end})"
|
273 |
+
|
274 |
+
def mark_as_correct():
    """Flag the current sentence as fully correct.

    Appends a "No-error" marker record (empty text, zero-length span) to the
    module-level ``current_errors`` list.

    Returns:
        A triple of an empty string, a status message, and the table from
        ``get_error_dataframe()``.
    """
    marker = dict(
        text="",
        severity="No-error",
        start=0,
        end=0,
        category="No-error",
    )
    current_errors.append(marker)
    status_text = "標註為完全正確!"
    return "", status_text, get_error_dataframe()
|
284 |
+
|
285 |
+
def mark_as_too_many_errors():
    """Flag the current sentence as having too many errors to list.

    Appends a "Non-translation" marker record with severity "Major" (empty
    text, zero-length span) to the module-level ``current_errors`` list.

    Returns:
        A triple of an empty string, a status message, and the table from
        ``get_error_dataframe()``.
    """
    marker = dict(
        text="",
        severity="Major",
        start=0,
        end=0,
        category="Non-translation",
    )
    current_errors.append(marker)
    status_text = "已標註為過多錯誤!"
    return "", status_text, get_error_dataframe()
|
295 |
|
296 |
def save_and_next(source, target, score, rater_selector, alternative_translation):
|
297 |
global current_index, data, annotations_file, current_errors, annotation_history
|
298 |
|
299 |
if not rater_selector:
|
300 |
return (
|
301 |
+
source, target, "", # return empty error_span
|
302 |
str(data.loc[current_index, "id"]),
|
303 |
"請先選擇標註人員!",
|
304 |
get_error_dataframe(),
|
|
|
340 |
|
341 |
if current_index >= len(data):
|
342 |
return (
|
343 |
+
"已完成所有文本標記", # source
|
344 |
+
"已完成所有文本標記", # target
|
345 |
+
"", # error_span
|
346 |
+
"", # current_index_display
|
347 |
+
f"標記完成並儲存到 {annotations_file.name}!(共 {len(data)} 筆)",
|
348 |
pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
|
349 |
""
|
350 |
)
|
351 |
|
352 |
next_source, next_target = get_current_text()
|
353 |
+
status_msg = f"已提交!目前進度:第 {current_index} 筆 / 共 {len(data)} 筆。"
|
354 |
return (
|
355 |
next_source,
|
356 |
next_target,
|
|
|
390 |
selected_id = parse_id_from_display(selected_display)
|
391 |
row_list = data.index[data["id"] == selected_id].tolist()
|
392 |
if not row_list:
|
393 |
+
return (
|
394 |
+
"", "", str(selected_id),
|
395 |
+
f"找不到 id: {selected_id}",
|
396 |
+
get_error_dataframe(),
|
397 |
+
""
|
398 |
+
)
|
399 |
current_index = row_list[0]
|
400 |
src, tgt = get_current_text()
|
401 |
+
return (
|
402 |
+
src, tgt,
|
403 |
+
str(selected_id),
|
404 |
+
f"已跳轉至 id={selected_id}",
|
405 |
+
get_error_dataframe(),
|
406 |
+
highlight_errors_in_text(tgt, current_errors)
|
407 |
+
)
|
408 |
|
409 |
+
DEMO_EXPLANATION = """
|
410 |
+
## 翻譯標記工具:階段性操作流程
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
411 |
|
412 |
+
### 操作步驟
|
413 |
+
1. **先選擇標註人員與檔案**,並在「索引」下拉中挑選要標註的句子。
|
414 |
+
2. 在「步驟 1:錯誤標註」中,若翻譯文本有錯,請輸入「錯誤區間」、選擇「錯誤類別/子類別/嚴重度」並點「保存並繼續標記」。多個錯誤可重複此步驟;若無錯誤則可直接點「完全正確」。
|
415 |
+
3. 錯誤標完後,在「步驟 2:評分與提交」中,拉動滑桿給分,若有更好譯文,可在「建議翻譯」填入。再按「保存並顯示下一筆」送出本句標註並進入下一句。
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
417 |
"""
|
418 |
|
419 |
with gr.Blocks(css="""
|
420 |
+
/* 整體字體與行距 */
|
421 |
* {
|
422 |
font-size: 15px;
|
423 |
line-height: 1.4;
|
|
|
427 |
padding: 10px;
|
428 |
margin-bottom: 10px;
|
429 |
}
|
430 |
+
/* 按鈕分色 */
|
431 |
#correct_button {
|
432 |
+
background-color: #4CAF50; /* 綠 */
|
433 |
color: white;
|
434 |
font-size: 14px;
|
435 |
margin-bottom: 5px;
|
436 |
}
|
437 |
#too_many_errors_button {
|
438 |
+
background-color: #f44336; /* 紅 */
|
439 |
color: white;
|
440 |
font-size: 14px;
|
441 |
margin-bottom: 5px;
|
442 |
}
|
443 |
+
/* 螢光標示外層加框,便於視覺聚焦 */
|
444 |
+
#highlight_box {
|
445 |
+
border: 1px solid #aaa;
|
446 |
+
padding: 10px;
|
447 |
+
min-height: 80px;
|
448 |
+
}
|
449 |
""") as demo:
|
450 |
gr.Markdown(DEMO_EXPLANATION)
|
451 |
|
452 |
with gr.Tab("標記工具"):
|
453 |
+
# ------------------- 頂部: 檔案 & 索引控制 -------------------
|
454 |
with gr.Row():
|
455 |
with gr.Column(scale=1):
|
456 |
rater_selector = gr.Dropdown(
|
|
|
464 |
value="test.csv"
|
465 |
)
|
466 |
index_selector = gr.Dropdown(
|
467 |
+
label="選擇索引 (id-原文前10字)",
|
468 |
choices=get_all_ids(),
|
469 |
value=f"{data.loc[current_index, 'id']}-{str(data.loc[current_index, 'source'])[:10]}"
|
470 |
)
|
|
|
474 |
interactive=False
|
475 |
)
|
476 |
|
477 |
+
# 左: 原始文本 / 右: 翻譯文本
|
478 |
with gr.Column(scale=4):
|
479 |
+
source = gr.Textbox(label="原始文本", lines=4, interactive=False)
|
|
|
|
|
480 |
with gr.Column(scale=4):
|
481 |
+
target = gr.Textbox(label="翻譯文本", lines=4, interactive=False)
|
482 |
|
483 |
+
# ------------------- 螢光標記區(帶外框)&錯誤紀錄表 -------------------
|
484 |
with gr.Row():
|
485 |
with gr.Column(scale=5):
|
486 |
+
with gr.Box(elem_id="highlight_box"):
|
487 |
+
highlighted_target = gr.HTML(value="", label="螢光標示區 (已標註的錯誤)")
|
488 |
with gr.Column(scale=5):
|
489 |
error_table = gr.Dataframe(
|
490 |
headers=["錯誤文字", "嚴重度", "分類"],
|
491 |
+
label="當前句子錯誤紀錄 (中文顯示)",
|
492 |
datatype=["str", "str", "str"],
|
493 |
interactive=False
|
494 |
)
|
495 |
|
496 |
+
# ------------------- 步驟1:錯誤標註 -------------------
|
497 |
+
with gr.Box(elem_id="step1_box", css="panel"):
|
498 |
+
gr.Markdown("### 步驟 1:錯誤標註")
|
499 |
+
|
500 |
+
with gr.Row():
|
501 |
+
# 錯誤區間 / 錯誤類別 / 子類別 / 嚴重度
|
502 |
+
error_span = gr.Textbox(label="錯誤區間 (可複製『翻譯文本』貼上)", lines=2)
|
503 |
+
|
504 |
category = gr.Dropdown(
|
505 |
label="錯誤類別",
|
506 |
choices=list(categories_display.keys()),
|
|
|
511 |
choices=categories_display["正確性"],
|
512 |
value="誤譯"
|
513 |
)
|
514 |
+
other = gr.Textbox(label="其他子類別(如選『其他』則填寫)")
|
515 |
+
|
516 |
severity = gr.Dropdown(
|
517 |
label="嚴重度",
|
518 |
choices=severity_choices_display,
|
519 |
value="輕微 (Minor)"
|
520 |
)
|
521 |
|
522 |
+
with gr.Row():
|
523 |
+
save_current_button = gr.Button("保存並繼續標記當前資料")
|
524 |
+
correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
|
525 |
+
too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
|
526 |
+
|
527 |
+
# ------------------- 步驟2:評分與提交 -------------------
|
528 |
+
with gr.Box(elem_id="step2_box", css="panel"):
|
529 |
+
gr.Markdown("### 步驟 2:評分與提交")
|
530 |
+
with gr.Row():
|
531 |
+
alternative_translation = gr.Textbox(
|
532 |
+
label="建議翻譯(如有更好譯法可填)",
|
533 |
+
lines=2
|
534 |
+
)
|
535 |
+
score = gr.Slider(
|
536 |
+
label="翻譯評分 (0=最差, 100=最好)",
|
537 |
+
minimum=0,
|
538 |
+
maximum=100,
|
539 |
+
step=1,
|
540 |
+
value=66
|
541 |
+
)
|
542 |
+
# 提交按鈕
|
543 |
save_next_button = gr.Button("保存並顯示下一筆")
|
544 |
|
545 |
+
# 最下方: 狀態
|
546 |
status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
|
547 |
|
548 |
+
# ------------------- 邏輯綁定 -------------------
|
549 |
+
def update_subcats(selected_category):
    """Return a ``gr.update`` with the sub-category choices for a category.

    The choices are looked up in ``categories_display`` and the first one
    becomes the selected value.
    """
    options = categories_display[selected_category]
    first_option = options[0]
    return gr.update(value=first_option, choices=options)
|
552 |
|
553 |
file_selector.change(
|
554 |
+
update_file_selection,
|
555 |
+
inputs=[file_selector],
|
556 |
outputs=[
|
557 |
+
source, target, error_span,
|
558 |
+
index_selector, current_index_display,
|
559 |
status, error_table, highlighted_target
|
560 |
]
|
561 |
)
|
562 |
index_selector.change(
|
563 |
+
update_index_selection,
|
564 |
+
inputs=[index_selector],
|
565 |
outputs=[
|
566 |
+
source, target, current_index_display,
|
567 |
status, error_table, highlighted_target
|
568 |
]
|
569 |
)
|
570 |
category.change(
|
571 |
+
update_subcats,
|
572 |
+
inputs=[category],
|
573 |
outputs=[subcategory]
|
574 |
)
|
575 |
|
576 |
correct_button.click(
|
577 |
+
mark_as_correct,
|
578 |
outputs=[error_span, status, error_table]
|
579 |
)
|
580 |
too_many_errors_button.click(
|
581 |
+
mark_as_too_many_errors,
|
582 |
outputs=[error_span, status, error_table]
|
583 |
)
|
584 |
|
585 |
+
# 按「保存並繼續標記」 -> 在同一句上加錯誤
|
586 |
save_current_button.click(
|
587 |
+
save_current,
|
588 |
inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
|
589 |
outputs=[error_span, status]
|
590 |
)
|
591 |
+
# 再更新表格 & 高亮
|
592 |
save_current_button.click(
|
593 |
fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
|
594 |
inputs=[target],
|
595 |
outputs=[error_table, highlighted_target]
|
596 |
)
|
597 |
|
598 |
+
# 按「保存並顯示下一筆」 -> 送出當前整句標註 & 進下一句
|
599 |
save_next_button.click(
|
600 |
save_and_next,
|
601 |
inputs=[source, target, score, rater_selector, alternative_translation],
|
|
|
606 |
]
|
607 |
)
|
608 |
|
609 |
+
# 初始化介面
|
610 |
init_src, init_tgt = get_current_text()
|
611 |
source.value = init_src
|
612 |
target.value = init_tgt
|