Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import os | |
import time | |
import json | |
from pathlib import Path | |
from huggingface_hub import CommitScheduler, snapshot_download | |
from uuid import uuid4 | |
import shutil | |
# --------------------------- Chinese/English lookup tables ---------------------------
# The backend stores English labels; the front end displays Chinese ones.
# Single taxonomy table: (Chinese category, English category, sub-category pairs).
_ERROR_TAXONOMY = (
    ("正確性", "Accuracy", (
        ("誤譯", "Mistranslation"),
        ("多譯", "Addition"),
        ("漏譯", "Omission"),
        ("其他", "Other"),
    )),
    ("流暢度", "Fluency", (
        ("文法", "Grammar"),
        ("拼字", "Spelling"),
        ("標點符號", "Punctuation"),
        ("前後不一致", "Inconsistency"),
        ("語域", "Register"),
        ("其他", "Other"),
    )),
    ("專有名詞", "Terminology", (
        ("使用不當", "Inappropriate"),
        ("不一致", "Inconsistent"),
        ("其他", "Other"),
    )),
    ("風格", "Style", (
        ("用字尷尬", "Awkward"),
        ("其他", "Other"),
    )),
    ("在地化", "Locale", (
        ("貨幣格式", "Currency format"),
        ("時間格式", "Time format"),
        ("姓名格式", "Name format"),
        ("日期格式", "Date format"),
        ("地址格式", "Address format"),
        ("其他", "Other"),
    )),
    # This category has no sub-categories.
    ("純正性", "Purity", ()),
)

# Chinese category -> English category.
category_map = {zh_cat: en_cat for zh_cat, en_cat, _ in _ERROR_TAXONOMY}

# (Chinese category, Chinese sub-category) -> (English category, English sub-category).
subcategory_map = {
    (zh_cat, zh_sub): (en_cat, en_sub)
    for zh_cat, en_cat, sub_pairs in _ERROR_TAXONOMY
    for zh_sub, en_sub in sub_pairs
}

# Chinese category -> ordered Chinese sub-category choices for the dropdowns.
categories_display = {
    zh_cat: [zh_sub for zh_sub, _ in sub_pairs]
    for zh_cat, _, sub_pairs in _ERROR_TAXONOMY
}

# Chinese severity -> English severity; the dropdown offers the Chinese keys.
severity_map = {"輕微": "Minor", "嚴重": "Major"}
severity_choices_display = list(severity_map)

# The two tables below turn stored English labels back into Chinese for display.
_SPECIAL_LABELS = {"No-error": "無錯誤", "Non-translation": "過多錯誤"}

severity_display_map = {en: zh for zh, en in severity_map.items()}
severity_display_map.update(_SPECIAL_LABELS)

category_display_map = {en: zh for zh, en in category_map.items()}
category_display_map["Other"] = "其他"
category_display_map.update(_SPECIAL_LABELS)
# --------------------------- Download the CSV data files ---------------------------
# Local folder that receives the annotation JSON files; created on import.
DATASET_DIR = Path("json_dataset")
DATASET_DIR.mkdir(exist_ok=True, parents=True)

# Scheduler bound to DATASET_DIR and the "data" folder of the dataset repo.
# Its lock is also used to guard writes to the annotation files.
scheduler = CommitScheduler(
    folder_path=DATASET_DIR,
    path_in_repo="data",
    repo_id="350016z/TaiwanCOMET_dataset",
    repo_type="dataset",
)
def download_dataset_file(dataset_id, local_dir):
    """Copy the top-level ``.csv`` files of a Hub dataset into ``local_dir``.

    Args:
        dataset_id: Repository id of the dataset, passed to ``snapshot_download``.
        local_dir: Destination directory; created when it does not exist.

    Returns:
        ``local_dir``, unchanged.
    """
    snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")

    # shutil.copy raises if the destination folder is missing.
    os.makedirs(local_dir, exist_ok=True)

    # Sorted so the copy order does not depend on the directory listing order.
    for file_name in sorted(os.listdir(snapshot_path)):
        if not file_name.endswith(".csv"):
            continue
        shutil.copy(
            os.path.join(snapshot_path, file_name),
            os.path.join(local_dir, file_name),
        )

    # NOTE(review): one-second pause kept from the original code; its purpose
    # is not visible here — confirm whether it is still required.
    time.sleep(1)
    return local_dir
# Source dataset whose CSV files are annotated by this tool.
DATASET_ID = "350016z/Taiwanese_dataset"
current_dir = os.getcwd()
download_dataset_file(DATASET_ID, current_dir)

# Sorted so the fallback file (csv_files[0]) and the dropdown order are
# deterministic instead of depending on the directory listing order.
csv_files = sorted(f for f in os.listdir(current_dir) if f.endswith(".csv"))
if not csv_files:
    # SystemExit with a message prints it to stderr and exits with status 1;
    # the interactive `exit()` helper would report success (status 0).
    raise SystemExit("Error: No CSV files found in the current directory.")

# Prefer test.csv when it is available, otherwise take the first CSV found.
data_path = os.path.join(
    current_dir, "test.csv" if "test.csv" in csv_files else csv_files[0]
)
if not os.path.exists(data_path):
    raise SystemExit(f"Error: {data_path} does not exist. Please check the file path.")

data = pd.read_csv(data_path, dtype={"id": "Int64"})
# Sort by id in ascending order and rebuild the index so that row position
# and index label coincide.
data = data.sort_values(by="id", ascending=True, ignore_index=True)

# Mutable session state shared by the callbacks below.
current_index = 0      # index label of the row currently being annotated
current_errors = []    # error spans recorded for the current row
# Named after the loaded CSV, matching what update_file_selection does.
annotations_file = DATASET_DIR / f"{Path(data_path).stem}_annotations-{uuid4()}.json"
annotation_history = []  # submitted entries; can be extended to show history
def get_all_ids():
    """Build the dropdown labels for every row of the loaded data.

    Each label is ``"<id>-<first 10 characters of the source text>"`` with
    newlines replaced by spaces, so a row can be located quickly.
    """
    def _label(row):
        record_id = data.at[row, "id"]
        preview = str(data.at[row, "source"])[:10].replace("\n", " ")
        return f"{record_id}-{preview}"

    return [_label(row) for row in range(len(data))]
def parse_id_from_display(display_str):
    """Return the integer id that precedes the first ``-`` of a dropdown label."""
    id_part, _, _ = display_str.partition("-")
    return int(id_part)
def get_current_text():
    """Return the ``(source, target)`` cells of the row at ``current_index``."""
    return (
        data.at[current_index, "source"],
        data.at[current_index, "target"],
    )
def save_to_json(entry: dict, json_file: Path):
    """Append ``entry`` to ``json_file`` as one JSON document per line.

    The write happens while holding ``scheduler.lock``.

    Args:
        entry: JSON-serializable annotation record.
        json_file: Destination file; created on first write.
    """
    with scheduler.lock:
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit instead of depending on the platform default.
        with json_file.open("a", encoding="utf-8") as f:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
def highlight_errors_in_text(text, errors):
    """Return ``text`` as HTML with the error spans highlighted in yellow.

    Each highlighted span is wrapped in
    ``<span style='background-color:yellow;'>...</span>``.

    Args:
        text: The translation text; a falsy value yields ``""``.
        errors: Iterable of dicts with integer ``"start"`` and ``"end"`` offsets.

    Returns:
        The HTML string. Text outside and inside the spans is HTML-escaped so
        that ``<``, ``>`` and ``&`` in the translation are shown literally.
    """
    import html  # local import keeps this block self-contained

    if not text:
        return ""

    open_tag = "<span style='background-color:yellow;'>"
    pieces = []
    cursor = 0  # end offset of the text already emitted
    for err in sorted(errors, key=lambda e: e["start"]):
        start, end = err["start"], err["end"]
        if start < 0 or end > len(text):
            continue
        # Clip spans that overlap an earlier one so no text is emitted twice.
        start = max(start, cursor)
        if end <= start:
            # Zero-width markers (e.g. No-error entries) or fully covered spans.
            continue
        pieces.append(html.escape(text[cursor:start], quote=False))
        pieces.append(f"{open_tag}{html.escape(text[start:end], quote=False)}</span>")
        cursor = end
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
def get_error_dataframe():
    """Build the display table for the errors recorded on the current row.

    Only the error text, severity and category are shown, all translated to
    Chinese; the stored records keep their English labels.

    Returns:
        A DataFrame with the columns "錯誤文字", "嚴重度" and "分類"; empty when
        no error has been recorded.
    """
    columns = ["錯誤文字", "嚴重度", "分類"]
    if not current_errors:
        return pd.DataFrame(columns=columns)

    # English sub-category -> Chinese sub-category, derived from the forward
    # table so both directions stay in sync (this also covers "Other").
    sub_zh_by_en = {
        en_sub: zh_sub for (_, zh_sub), (_, en_sub) in subcategory_map.items()
    }

    def map_category(cat_str):
        """Translate a stored "Main/Sub" (or single-part) category to Chinese."""
        if cat_str in ("No-error", "Non-translation"):
            # Whole-sentence markers: "no error" / "too many errors".
            return severity_display_map.get(cat_str, cat_str)
        main_cat, separator, sub_cat = cat_str.partition("/")
        main_cat_zh = category_display_map.get(main_cat, main_cat)
        if not separator:
            # Single-part category such as "Accuracy" or "Other".
            return main_cat_zh
        # Unknown sub-categories (free text typed by the rater) are kept as-is.
        return f"{main_cat_zh}/{sub_zh_by_en.get(sub_cat, sub_cat)}"

    return pd.DataFrame({
        "錯誤文字": [err["text"] for err in current_errors],
        "嚴重度": [
            severity_display_map.get(err["severity"], err["severity"])
            for err in current_errors
        ],
        "分類": [map_category(err["category"]) for err in current_errors],
    })
# === Key fix: after "save and continue annotating", refresh both the table
# === and the highlighted view in one go.
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span on the current sentence.

    Args:
        source: Source text (unused; kept for the callback signature).
        target: Translation text in which ``error_span`` must occur.
        rater_selector: Selected rater (unused; kept for the callback signature).
        error_span: Exact erroneous substring of ``target``.
        category: Chinese category label.
        subcategory: Chinese sub-category label.
        severity: Chinese severity label.
        other: Free-text sub-category, used when the sub-category is "Other".

    Returns:
        ``(error_span, status, error_table, highlighted_target)`` so the UI is
        refreshed in one step; the error-span box is always cleared.
    """
    def _response(message):
        return (
            "",  # clear the error-span box
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors),
        )

    # At most 5 errors may be recorded per sentence.
    if len(current_errors) >= 5:
        return _response("您已標記超過 5 處錯誤,可直接按『過多錯誤』或繼續。")
    if not error_span:
        # No error span was entered.
        return _response("尚未輸入錯誤區間,如無錯誤請按『完全正確』")

    target_text = target or ""
    if error_span not in target_text:
        return _response("錯誤區間不存在於翻譯文本,請檢查!")

    # Translate the Chinese labels to the English values that are stored.
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")
    custom_subcat = (other or "").strip()
    if subcat_val == "Other" and custom_subcat:
        subcat_val = custom_subcat

    # Pick the first occurrence that has not been recorded yet, so a substring
    # that appears several times can be annotated once per occurrence.
    recorded = {(err["start"], err["end"]) for err in current_errors}
    start = target_text.find(error_span)
    while start != -1 and (start, start + len(error_span)) in recorded:
        start = target_text.find(error_span, start + 1)
    if start == -1:
        # Every occurrence is already recorded.
        return _response("此錯誤區間已標記過,請勿重複。")

    end = start + len(error_span)
    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": f"{cat_val}/{subcat_val}",
    })
    return _response(f"已標記錯誤: {error_span} (範圍 {start}-{end})")
def mark_as_correct(target):
    """Mark the current sentence as fully correct (No-error).

    Appends a zero-width No-error marker to ``current_errors`` unless one is
    already present, so repeated clicks do not store duplicate markers.

    Args:
        target: Translation text, used to rebuild the highlighted view.

    Returns:
        ``(error_span, status, error_table, highlighted_target)``.
    """
    already_marked = any(err["category"] == "No-error" for err in current_errors)
    if not already_marked:
        current_errors.append({
            "text": "",
            "severity": "No-error",
            "start": 0,
            "end": 0,
            "category": "No-error",
        })
    return (
        "",  # error_span
        "標註為完全正確!",
        get_error_dataframe(),
        highlight_errors_in_text(target, current_errors),
    )
def mark_as_too_many_errors(target):
    """Mark the current sentence as having too many errors (Non-translation).

    Appends a zero-width Non-translation marker with severity "Major" to
    ``current_errors`` unless one is already present, so repeated clicks do
    not store duplicate markers.

    Args:
        target: Translation text, used to rebuild the highlighted view.

    Returns:
        ``(error_span, status, error_table, highlighted_target)``.
    """
    already_marked = any(
        err["category"] == "Non-translation" for err in current_errors
    )
    if not already_marked:
        current_errors.append({
            "text": "",
            "severity": "Major",
            "start": 0,
            "end": 0,
            "category": "Non-translation",
        })
    return (
        "",  # error_span
        "已標註為過多錯誤!",
        get_error_dataframe(),
        highlight_errors_in_text(target, current_errors),
    )
def _json_safe(value):
    """Convert a pandas/NumPy scalar into a plain JSON-serializable value."""
    if pd.isna(value):
        # NaN / NA would otherwise be written as invalid JSON or fail to dump.
        return None
    # NumPy scalars expose .item(), which returns the native Python value.
    return value.item() if hasattr(value, "item") else value


def save_and_next(source, target, score, rater_selector, alternative_translation):
    """Persist the annotation of the current row and move on to the next row.

    Args:
        source: Source text shown in the UI.
        target: Translation text shown in the UI.
        score: Rating from the slider; must not be ``None``.
        rater_selector: Selected rater; must not be empty.
        alternative_translation: Optional better translation.

    Returns:
        ``(source, target, error_span, current_index_display, status,
        error_table, highlighted_target)`` for the next row, for the same row
        when validation fails, or a "finished" state after the last row.
    """
    global current_index, current_errors

    empty_table = pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"])
    finished = (
        "已完成所有文本標記",  # source
        "已完成所有文本標記",  # target
        "",  # error_span
        "",  # current_index_display
        f"標記完成並儲存到 {annotations_file.name}!(共 {len(data)} 筆)",
        empty_table,
        "",
    )

    # Clicking again after the last row must not index past the data.
    if current_index >= len(data):
        return finished

    def _stay(message):
        """Keep showing the current row together with a validation message."""
        return (
            source, target, "",  # empty error_span
            str(data.loc[current_index, "id"]),
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors),
        )

    if not rater_selector:
        return _stay("請先選擇標註人員!")
    if score is None:
        return _stay("請先填寫評分!")

    id_val = int(data.loc[current_index, "id"])
    annotations_entry = {
        "system": _json_safe(data.loc[current_index, "system"]),
        "lp": _json_safe(data.loc[current_index, "lp"]),
        "doc": _json_safe(data.loc[current_index, "doc"]),
        "id": id_val,
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": _json_safe(data.loc[current_index, "reference"]),
        "esa_score": score,
        "esa_spans": current_errors,
        "alternative_translation": alternative_translation if alternative_translation else "",
    }
    save_to_json(annotations_entry, annotations_file)
    annotation_history.append(annotations_entry)

    # Rebind (rather than clear) so the saved entry keeps its own span list.
    current_errors = []
    current_index += 1
    if current_index >= len(data):
        return finished

    next_source, next_target = get_current_text()
    # Report the id of the row that was just submitted, not a row position.
    status_msg = f"已提交!目前進度:已完成第 {current_index} 筆 (id={id_val}) / 共 {len(data)} 筆。"
    return (
        next_source,
        next_target,
        "",
        str(data.loc[current_index, "id"]),
        status_msg,
        empty_table,
        highlight_errors_in_text(next_target, current_errors),
    )
def update_file_selection(selected_file):
    """Load another CSV file and reset the annotation session.

    Args:
        selected_file: File name relative to ``current_dir``.

    Returns:
        ``(source, target, error_span, index_selector update,
        current_index_display, status, error_table, highlighted_target)``.
    """
    global data_path, data, current_index, annotations_file, current_errors, annotation_history

    data_path = os.path.join(current_dir, selected_file)
    # Sort by id and rebuild the index, exactly like the initial load, so
    # stepping through rows with current_index follows ascending ids.
    data = pd.read_csv(data_path, dtype={"id": "Int64"}).sort_values(
        by="id", ascending=True, ignore_index=True
    )
    current_errors = []
    annotation_history = []
    current_index = 0  # after sorting, the smallest id is the first row

    file_base_name = os.path.splitext(selected_file)[0]
    annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"

    src, tgt = get_current_text()
    # Take the default from the generated choices so it always matches one
    # of them (the labels replace newlines in the source preview).
    id_choices = get_all_ids()
    return (
        src, tgt, "",
        gr.update(choices=id_choices, value=id_choices[current_index]),
        str(data.loc[current_index, "id"]),
        f"已加載檔案:{selected_file}",
        pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
        highlight_errors_in_text(tgt, []),
    )
def update_index_selection(selected_display):
    """Jump to the row whose id is encoded in the selected dropdown label.

    Args:
        selected_display: Label of the form ``"<id>-<source preview>"``; may be
            empty when the dropdown is cleared.

    Returns:
        ``(source, target, current_index_display, status, error_table,
        highlighted_target)``.
    """
    global current_index, current_errors

    if not selected_display:
        # Dropdown cleared: leave every component as it is.
        return (
            gr.update(), gr.update(), gr.update(),
            "請先選擇索引!",
            gr.update(),
            gr.update(),
        )

    selected_id = parse_id_from_display(selected_display)
    row_list = data.index[data["id"] == selected_id].tolist()
    if not row_list:
        return (
            "", "", str(selected_id),
            f"找不到 id: {selected_id}",
            get_error_dataframe(),
            "",
        )

    # Spans recorded for another sentence must not be shown on (or later
    # saved with) the newly selected one.
    if row_list[0] != current_index:
        current_errors = []
    current_index = row_list[0]

    src, tgt = get_current_text()
    return (
        src, tgt,
        str(selected_id),
        f"已跳轉至 id={selected_id}",
        get_error_dataframe(),
        highlight_errors_in_text(tgt, current_errors),
    )
# Markdown rendered at the top of the page: a link to the usage rules and the
# step-by-step annotation workflow (user-facing text, kept in Chinese).
DEMO_EXPLANATION = """
## 翻譯標記工具
### 💡[使用規則](https://huggingface.co/spaces/350016z/TranslationError_Gradio/blob/main/README.md) (第一次使用務必查看)
### 操作步驟
1. **先選擇標註人員與檔案**,並在「索引」下拉中挑選要標註的句子。
2. 在「步驟 1:錯誤標註」中,若翻譯文本有錯,請輸入「錯誤區間」、選擇「錯誤類別/子類別/嚴重度」並點「保存並繼續標記」。
- 多個錯誤可重複此步驟;若無錯誤則可直接點「完全正確」。
3. 錯誤標完後,在「步驟 2:評分與提交」中,拉動滑桿給分,若有更好譯文,可在「建議翻譯」填入。
4. 按「保存並顯示下一筆」送出本句標註並進入下一句。
"""
# Page layout and callback wiring. Initial component values are passed to the
# constructors instead of being assigned to `.value` afterwards.
with gr.Blocks(css="""
/* 整體字體與行距 */
* {
font-size: 15px;
line-height: 1.4;
}
/* 按鈕分色 */
#correct_button {
background-color: #4CAF50; /* 綠 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#too_many_errors_button {
background-color: #f44336; /* 紅 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#save_current_button {
background-color: #1565C0; /* 藍 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#save_next_button {
background-color: #1565C0; /* 藍 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
/* 模擬帶框風格 */
#highlight_box_group {
border: 1px solid #aaa;
padding: 10px;
margin-bottom: 10px;
min-height: 80px;
}
/* 讓「步驟區塊」顯示類似面板效果 */
#step1_box, #step2_box {
border: 1px solid #ccc;
padding: 10px;
margin-bottom: 10px;
}
""") as demo:
    gr.Markdown(DEMO_EXPLANATION)

    # Values shown before any callback has run.
    init_src, init_tgt = get_current_text()
    id_choices = get_all_ids()

    # ------------------- Top: rater, file and index controls -------------------
    with gr.Row():
        with gr.Column(scale=1):
            rater_selector = gr.Dropdown(
                label="標註人員",
                choices=["rater_test", "rater1", "rater2", "rater3", "rater4", "rater5", "rater6", "rater7"],
                value="rater_test",
            )
            file_selector = gr.Dropdown(
                label="選擇檔案",
                choices=csv_files,
                # The file that was actually loaded; it is not always test.csv.
                value=os.path.basename(data_path),
            )
            index_selector = gr.Dropdown(
                label="選擇索引 (id-原文前10字)",
                choices=id_choices,
                # Taken from the choices so the default always matches one.
                value=id_choices[current_index],
            )
            current_index_display = gr.Textbox(
                label="當前索引(id)",
                value=str(data.loc[current_index, "id"]),
                interactive=False,
            )
        # Left: source text / right: translation text
        with gr.Column(scale=4):
            source = gr.Textbox(label="原始文本", lines=14, interactive=False, value=init_src)
        with gr.Column(scale=4):
            target = gr.Textbox(label="翻譯文本", lines=14, interactive=False, value=init_tgt)

    with gr.Tab("步驟1:錯誤標註"):
        # ---------- Highlighted view (Group + elem_id) and error table ----------
        with gr.Row():
            with gr.Column(scale=5):
                with gr.Group(elem_id="highlight_box_group"):
                    highlighted_target = gr.HTML(
                        value=highlight_errors_in_text(init_tgt, []),
                        label="螢光標示區 (已標註的錯誤)",
                    )
            with gr.Column(scale=5):
                error_table = gr.Dataframe(
                    value=pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
                    headers=["錯誤文字", "嚴重度", "分類"],
                    label="當前句子錯誤紀錄 (中文顯示)",
                    datatype=["str", "str", "str"],
                    interactive=False,
                )
        # ------------------- Step 1: error annotation -------------------
        with gr.Row(equal_height=True):
            error_span = gr.Textbox(label="錯誤區間 (可複製『翻譯文本』貼上)", lines=2, placeholder="請輸入翻譯中文本的錯誤區間")
            category = gr.Dropdown(
                label="錯誤類別",
                choices=list(categories_display.keys()),
                value="正確性",
            )
            subcategory = gr.Dropdown(
                label="子類別",
                choices=categories_display["正確性"],
                value="誤譯",
            )
            other = gr.Textbox(label="其他子類別", placeholder="如子類別選『其他』則填寫")
            severity = gr.Dropdown(
                label="嚴重程度",
                choices=severity_choices_display,
                value="輕微",
            )
        with gr.Row():
            correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
            too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
            save_current_button = gr.Button("保存並繼續標記當前資料", elem_id="save_current_button")

    with gr.Tab("步驟2:評分與提交"):
        # ------------------- Step 2: scoring and submission -------------------
        with gr.Row():
            alternative_translation = gr.Textbox(
                label="建議翻譯(如有更好譯法可填)",
                lines=2,
            )
            score = gr.Slider(
                label="翻譯評分 (0=最差, 100=最好)",
                minimum=0,
                maximum=100,
                step=1,
                value=66,
            )
        save_next_button = gr.Button("保存並顯示下一筆", elem_id="save_next_button")

    # ------------------- Current status -------------------
    status = gr.Textbox(label="當前狀態", lines=1, interactive=False)

    # ------------------- Callback wiring -------------------
    def update_subcats(selected_category):
        """Refresh the sub-category dropdown for the selected category."""
        # .get tolerates a cleared dropdown (None) or an unknown label.
        subcats = categories_display.get(selected_category, [])
        if not subcats:
            # No sub-categories: empty the choices.
            return gr.update(choices=[], value=None)
        return gr.update(choices=subcats, value=subcats[0])

    file_selector.change(
        update_file_selection,
        inputs=[file_selector],
        outputs=[
            source, target, error_span,
            index_selector, current_index_display,
            status, error_table, highlighted_target,
        ],
    )
    index_selector.change(
        update_index_selection,
        inputs=[index_selector],
        outputs=[
            source, target, current_index_display,
            status, error_table, highlighted_target,
        ],
    )
    category.change(
        update_subcats,
        inputs=[category],
        outputs=[subcategory],
    )

    # The three buttons below each refresh the table and the highlighted view.
    correct_button.click(
        mark_as_correct,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target],
    )
    too_many_errors_button.click(
        mark_as_too_many_errors,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target],
    )
    # "Save and continue": add an error to the same sentence.
    save_current_button.click(
        save_current,
        inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
        outputs=[error_span, status, error_table, highlighted_target],
    )
    # "Save and show next": submit the whole sentence and load the next one.
    save_next_button.click(
        save_and_next,
        inputs=[source, target, score, rater_selector, alternative_translation],
        outputs=[
            source, target, error_span,
            current_index_display, status,
            error_table, highlighted_target,
        ],
    )

demo.launch()