# Origin: app.py from a Hugging Face Space by 350016z
# (commit 7550ffc, "Update app.py", verified).
import gradio as gr
import pandas as pd
import os
import time
import json
from pathlib import Path
from huggingface_hub import CommitScheduler, snapshot_download
from uuid import uuid4
import shutil
# ----------------------- Chinese <-> English lookup tables -----------------------
# The backend stores English labels; the front end displays Chinese ones.

# Chinese main-category label -> English label.
category_map = {
    "正確性": "Accuracy",
    "流暢度": "Fluency",
    "專有名詞": "Terminology",
    "風格": "Style",
    "在地化": "Locale",
    "純正性": "Purity",
}

# Chinese main category -> (English main category, {Chinese sub label: English sub label}).
_SUBCATEGORY_TABLE = {
    "正確性": ("Accuracy", {
        "誤譯": "Mistranslation",
        "多譯": "Addition",
        "漏譯": "Omission",
        "其他": "Other",
    }),
    "流暢度": ("Fluency", {
        "文法": "Grammar",
        "拼字": "Spelling",
        "標點符號": "Punctuation",
        "前後不一致": "Inconsistency",
        "語域": "Register",
        "其他": "Other",
    }),
    "專有名詞": ("Terminology", {
        "使用不當": "Inappropriate",
        "不一致": "Inconsistent",
        "其他": "Other",
    }),
    "風格": ("Style", {
        "用字尷尬": "Awkward",
        "其他": "Other",
    }),
    "在地化": ("Locale", {
        "貨幣格式": "Currency format",
        "時間格式": "Time format",
        "姓名格式": "Name format",
        "日期格式": "Date format",
        "地址格式": "Address format",
        "其他": "Other",
    }),
}

# (Chinese main, Chinese sub) -> (English main, English sub), flattened from the table above.
subcategory_map = {
    (main_zh, sub_zh): (main_en, sub_en)
    for main_zh, (main_en, sub_labels) in _SUBCATEGORY_TABLE.items()
    for sub_zh, sub_en in sub_labels.items()
}

# Sub-category choices offered in the UI for each main category.
categories_display = {
    "正確性": ["誤譯", "多譯", "漏譯", "其他"],
    "流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
    "專有名詞": ["使用不當", "不一致", "其他"],
    "風格": ["用字尷尬", "其他"],
    "在地化": ["貨幣格式", "時間格式", "姓名格式", "日期格式", "地址格式", "其他"],
    "純正性": [],
}

# Chinese severity label -> English label; the dropdown choices are its keys, in order.
severity_map = {
    "輕微": "Minor",
    "嚴重": "Major",
}
severity_choices_display = list(severity_map)

# The two tables below translate stored English values back to Chinese
# when the error table is shown in the front end.
severity_display_map = {
    "Minor": "輕微",
    "Major": "嚴重",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
}
category_display_map = {
    "Accuracy": "正確性",
    "Fluency": "流暢度",
    "Terminology": "專有名詞",
    "Style": "風格",
    "Locale": "在地化",
    "Other": "其他",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
    "Purity": "純正性",
}
# --------------------------- Download the CSV data files ---------------------------
# Local folder for the annotation JSON files; created at import time if missing.
DATASET_DIR = Path("json_dataset")
DATASET_DIR.mkdir(parents=True, exist_ok=True)
# Scheduler bound to DATASET_DIR and the "data" path of the dataset repo.
# Its ``lock`` is taken by save_to_json while writing.
# NOTE(review): presumably uploads the folder to the Hub in the background — confirm
# against the huggingface_hub CommitScheduler documentation.
scheduler = CommitScheduler(
    repo_id="350016z/TaiwanCOMET_dataset",
    repo_type="dataset",
    folder_path=DATASET_DIR,
    path_in_repo="data"
)
def download_dataset_file(dataset_id, local_dir):
    """Copy the top-level CSV files of a Hub dataset snapshot into ``local_dir``.

    Args:
        dataset_id: Repository id of the dataset, passed to ``snapshot_download``.
        local_dir: Destination directory; created if it does not exist.

    Returns:
        ``local_dir``, unchanged.
    """
    snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
    # Create the destination first so the copies cannot fail on a missing folder.
    os.makedirs(local_dir, exist_ok=True)
    for file_name in os.listdir(snapshot_path):
        source_file_path = os.path.join(snapshot_path, file_name)
        # Regular files only: a directory whose name ends in ".csv" would make
        # shutil.copy raise.
        if file_name.endswith(".csv") and os.path.isfile(source_file_path):
            shutil.copy(source_file_path, os.path.join(local_dir, file_name))
    # Short pause kept from the original implementation before the caller
    # lists the directory.
    time.sleep(1)
    return local_dir
DATASET_ID = "350016z/Taiwanese_dataset"
current_dir = os.getcwd()
download_dataset_file(DATASET_ID, current_dir)

# Sorted so the fallback choice below does not depend on directory listing order.
csv_files = sorted(f for f in os.listdir(current_dir) if f.endswith('.csv'))
if not csv_files:
    # SystemExit prints the message to stderr and exits with a non-zero status,
    # without relying on the interactive-only ``exit`` helper.
    raise SystemExit("Error: No CSV files found in the current directory.")

# Prefer test.csv; otherwise fall back to the first CSV found.
data_path = os.path.join(current_dir, 'test.csv' if 'test.csv' in csv_files else csv_files[0])
if not os.path.exists(data_path):
    raise SystemExit(f"Error: {data_path} does not exist. Please check the file path.")

data = pd.read_csv(data_path, dtype={"id": "Int64"})
# Sort by id in ascending order and rebuild the index as 0..n-1.
data = data.sort_values(by="id", ascending=True, ignore_index=True)

# Mutable application state shared by the Gradio callbacks.
current_index = 0        # row of ``data`` currently being annotated
current_errors = []      # error spans recorded for the current row
# Name the output after the CSV actually loaded (still "test_annotations-..." for
# test.csv), matching what update_file_selection does when the file is switched.
annotations_file = DATASET_DIR / f"{os.path.splitext(os.path.basename(data_path))[0]}_annotations-{uuid4()}.json"
annotation_history = []  # submitted entries; can be extended to show history
def get_all_ids():
    """Build the dropdown labels for every row of ``data``.

    Each label has the form ``"<id>-<first 10 characters of source>"`` so a
    sentence can be located quickly; newlines in the preview become spaces.

    Returns:
        A list of label strings, one per row, in row order.
    """
    # Iterate the columns positionally instead of looking up data.loc[i, ...] per
    # cell, so the result does not depend on the index labels being 0..n-1.
    previews = (str(text)[:10].replace("\n", " ") for text in data["source"])
    return [f"{row_id}-{preview}" for row_id, preview in zip(data["id"], previews)]
def parse_id_from_display(display_str):
    """Return the integer id that precedes the first ``-`` of a dropdown label."""
    id_part, _dash, _preview = display_str.partition("-")
    return int(id_part)
def get_current_text():
    """Return the ``(source, target)`` texts of the row at ``current_index``."""
    row_label = current_index
    return tuple(data.loc[row_label, column] for column in ("source", "target"))
def save_to_json(entry: dict, json_file: Path):
    """Append ``entry`` to ``json_file`` as a single JSON line.

    Args:
        entry: JSON-serialisable annotation record.
        json_file: Target file, opened in append mode.

    Raises:
        TypeError: If ``entry`` holds a value ``json`` cannot serialise. Nothing is
            written in that case.
    """
    # Serialise before touching the file so a failure cannot leave a partial line.
    line = json.dumps(entry, ensure_ascii=False)
    # The write happens under the scheduler's lock.
    with scheduler.lock:
        # Explicit UTF-8: the record holds non-ASCII text and the platform default
        # encoding is not guaranteed to represent it.
        with json_file.open("a", encoding="utf-8") as f:
            f.write(line + "\n")
def highlight_errors_in_text(text, errors):
    """Return ``text`` as HTML with every error span on a yellow background.

    Spans are wrapped as ``<span style='background-color:yellow;'>...</span>``.

    Args:
        text: The translated sentence; an empty value yields ``""``.
        errors: Iterable of dicts with integer ``"start"`` and ``"end"`` offsets
            into ``text`` (end exclusive).

    Returns:
        The HTML string. Text is HTML-escaped so markup characters in the
        sentence are displayed literally instead of being interpreted.
    """
    import html  # local import keeps this function self-contained

    if not text:
        return ""
    pieces = []
    last_end = 0
    for err in sorted(errors, key=lambda e: e["start"]):
        st, ed = err["start"], err["end"]
        if st < 0 or ed > len(text):
            continue
        # Clamp to what was already emitted so overlapping spans never repeat text.
        st = max(st, last_end)
        if ed <= st:
            # Zero-length markers (start == end) and fully covered spans add nothing.
            continue
        pieces.append(html.escape(text[last_end:st], quote=False))
        marked = html.escape(text[st:ed], quote=False)
        pieces.append(f"<span style='background-color:yellow;'>{marked}</span>")
        last_end = ed
    pieces.append(html.escape(text[last_end:], quote=False))
    return "".join(pieces)
def get_error_dataframe():
    """Build the table of errors recorded for the current sentence.

    Only the error text, severity and category are shown, all with Chinese
    labels; the stored records in ``current_errors`` keep their English values.

    Returns:
        A DataFrame with the columns "錯誤文字", "嚴重度" and "分類", empty when
        no error has been recorded.
    """
    columns = ["錯誤文字", "嚴重度", "分類"]
    if not current_errors:
        return pd.DataFrame(columns=columns)

    # English sub-category -> Chinese label, derived from subcategory_map so the
    # two cannot drift apart. This also shows "Other" as "其他".
    sub_cat_display = {
        en_sub: zh_sub for (_, zh_sub), (_, en_sub) in subcategory_map.items()
    }

    def map_category(cat_str):
        """Translate a stored "Main" or "Main/Sub" category into Chinese."""
        if cat_str in ("No-error", "Non-translation"):
            # Whole-sentence markers: "no error" / "too many errors".
            return severity_display_map.get(cat_str, cat_str)
        main_cat, sep, sub_cat = cat_str.partition("/")
        main_cat_zh = category_display_map.get(main_cat, main_cat)
        if not sep:
            # Single part, e.g. "Accuracy" or "Other".
            return main_cat_zh
        # Free-text sub-categories have no entry and are shown as typed.
        return f"{main_cat_zh}/{sub_cat_display.get(sub_cat, sub_cat)}"

    return pd.DataFrame(
        {
            "錯誤文字": [err["text"] for err in current_errors],
            "嚴重度": [
                severity_display_map.get(err["severity"], err["severity"])
                for err in current_errors
            ],
            "分類": [map_category(err["category"]) for err in current_errors],
        },
        columns=columns,
    )
# === Key fix: after "save and keep annotating", refresh both the table and the highlight area ===
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span for the current sentence.

    Args:
        source: Source text (unused; kept for the callback signature).
        target: Translated text in which ``error_span`` must occur.
        rater_selector: Selected rater (unused; kept for the callback signature).
        error_span: Exact text of the erroneous segment.
        category: Chinese main-category label.
        subcategory: Chinese sub-category label (may be ``None``).
        severity: Chinese severity label.
        other: Free-text sub-category, used when the sub-category is "Other".

    Returns:
        A tuple ``(error_span, status, error_table, highlighted_target)``: the
        cleared span input, a status message, the refreshed error table and the
        refreshed highlighted HTML, so one click updates the whole interface.
    """
    def respond(message):
        # Always clear the span input and re-render from the current error list.
        return (
            "",
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors),
        )

    # Five or more errors are already recorded for this sentence.
    if len(current_errors) >= 5:
        return respond("您已標記超過 5 處錯誤,可直接按『過多錯誤』或繼續。")
    if not error_span:
        # No span entered.
        return respond("尚未輸入錯誤區間,如無錯誤請按『完全正確』")
    if error_span not in target:
        return respond("錯誤區間不存在於翻譯文本,請檢查!")

    # Use the first occurrence that is not annotated yet, so a phrase appearing
    # several times can be marked once per occurrence.
    span_len = len(error_span)
    marked = {(err["start"], err["end"]) for err in current_errors}
    start = target.find(error_span)
    while start != -1 and (start, start + span_len) in marked:
        start = target.find(error_span, start + 1)
    if start == -1:
        # Every occurrence is already annotated.
        return respond("此錯誤區間已標記過,請勿重複。")
    end = start + span_len

    # Translate the Chinese labels to the English values that are stored.
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")
    custom_subcat = (other or "").strip()  # tolerate a missing textbox value
    if subcat_val == "Other" and custom_subcat:
        subcat_val = custom_subcat

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": f"{cat_val}/{subcat_val}"
    })
    return respond(f"已標記錯誤: {error_span} (範圍 {start}-{end})")
def mark_as_correct(target):
    """Mark the current sentence as fully correct (No-error).

    Args:
        target: Translated text, used to re-render the highlight area.

    Returns:
        A tuple ``(error_span, status, error_table, highlighted_target)`` that
        refreshes the table and the highlight area.
    """
    marker = {
        "text": "",
        "severity": "No-error",
        "start": 0,
        "end": 0,
        "category": "No-error"
    }
    # Repeated clicks must not stack duplicate markers in the saved spans.
    if marker not in current_errors:
        current_errors.append(marker)
    return (
        "",  # error_span
        "標註為完全正確!",
        get_error_dataframe(),
        highlight_errors_in_text(target, current_errors)
    )
def mark_as_too_many_errors(target):
    """Mark the current sentence as having too many errors (Non-translation).

    Args:
        target: Translated text, used to re-render the highlight area.

    Returns:
        A tuple ``(error_span, status, error_table, highlighted_target)`` that
        refreshes the table and the highlight area.
    """
    marker = {
        "text": "",
        "severity": "Major",
        "start": 0,
        "end": 0,
        "category": "Non-translation"
    }
    # Repeated clicks must not stack duplicate markers in the saved spans.
    if marker not in current_errors:
        current_errors.append(marker)
    return (
        "",
        "已標註為過多錯誤!",
        get_error_dataframe(),
        highlight_errors_in_text(target, current_errors)
    )
def save_and_next(source, target, score, rater_selector, alternative_translation):
    """Save the annotation of the current sentence and move to the next row.

    Args:
        source: Source text currently displayed.
        target: Translated text currently displayed.
        score: Slider score; ``None`` is rejected.
        rater_selector: Selected rater; an empty value is rejected.
        alternative_translation: Optional suggested translation.

    Returns:
        A tuple ``(source, target, error_span, current_index_display, status,
        error_table, highlighted_target)`` for the next sentence, for the same
        sentence when validation fails, or the completion message when no row
        is left.
    """
    global current_index, current_errors

    def empty_table():
        return pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"])

    def finished():
        return (
            "已完成所有文本標記",  # source
            "已完成所有文本標記",  # target
            "",  # error_span
            "",  # current_index_display
            f"標記完成並儲存到 {annotations_file.name}!(共 {len(data)} 筆)",
            empty_table(),
            ""
        )

    def stay(message):
        # Validation failed: keep the sentence and its recorded errors on screen.
        return (
            source, target, "",
            str(data.loc[current_index, "id"]),
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors)
        )

    def plain(value):
        # Turn a DataFrame cell into something json can write: missing values
        # become None (written as null) and NumPy scalars become Python scalars.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value.item() if hasattr(value, "item") else value

    # Clicking again after the last row must not index past the data.
    if current_index >= len(data):
        return finished()
    if not rater_selector:
        return stay("請先選擇標註人員!")
    if score is None:
        return stay("請先填寫評分!")

    row = current_index
    id_val = int(data.loc[row, "id"])
    annotations_entry = {
        "system": plain(data.loc[row, "system"]),
        "lp": plain(data.loc[row, "lp"]),
        "doc": plain(data.loc[row, "doc"]),
        "id": id_val,
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": plain(data.loc[row, "reference"]),
        "esa_score": score,
        "esa_spans": current_errors,
        "alternative_translation": alternative_translation if alternative_translation else ""
    }
    save_to_json(annotations_entry, annotations_file)
    annotation_history.append(annotations_entry)

    # Rebind (not clear) so the entry stored in the history keeps its spans.
    current_errors = []
    current_index += 1
    if current_index >= len(data):
        return finished()

    next_source, next_target = get_current_text()
    # Report the id of the sentence just submitted, not its row position.
    status_msg = f"已提交!目前進度:已完成第 {current_index} 筆 (id={id_val}) / 共 {len(data)} 筆。"
    return (
        next_source,
        next_target,
        "",
        str(data.loc[current_index, "id"]),
        status_msg,
        empty_table(),
        highlight_errors_in_text(next_target, current_errors)
    )
def update_file_selection(selected_file):
    """Load another CSV file and restart annotation at its smallest id.

    Args:
        selected_file: File name chosen in the dropdown, relative to ``current_dir``.

    Returns:
        A tuple ``(source, target, error_span, index_selector, current_index_display,
        status, error_table, highlighted_target)``. When nothing usable was
        selected, every output except the status is left unchanged.
    """
    global data_path, data, current_index, annotations_file, current_errors, annotation_history

    def keep_current(message):
        # gr.update() without arguments leaves a component as it is.
        unchanged = gr.update()
        return (
            unchanged, unchanged, unchanged, unchanged, unchanged,
            message,
            unchanged, unchanged
        )

    if not selected_file:
        return keep_current("請先選擇檔案!")

    new_path = os.path.join(current_dir, selected_file)
    new_data = pd.read_csv(new_path, dtype={"id": "Int64"})
    if new_data.empty:
        # Keep the loaded file: every other callback indexes into ``data``.
        return keep_current(f"檔案 {selected_file} 沒有任何資料,已保留原檔案。")

    # Same ordering as the initial load, so "next" walks the ids in ascending order.
    data = new_data.sort_values(by="id", ascending=True, ignore_index=True)
    data_path = new_path
    current_errors = []
    annotation_history = []
    current_index = 0  # after sorting, row 0 holds the smallest id

    file_base_name = os.path.splitext(selected_file)[0]
    annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"

    src, tgt = get_current_text()
    # Take the default label from the choices themselves so it always matches
    # one of them (get_all_ids replaces newlines in the preview).
    all_ids = get_all_ids()
    return (
        src, tgt, "",
        gr.update(choices=all_ids, value=all_ids[current_index]),
        str(data.loc[current_index, "id"]),
        f"已加載檔案:{selected_file}",
        pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
        highlight_errors_in_text(tgt, [])
    )
def update_index_selection(selected_display):
    """Jump to the sentence chosen in the index dropdown.

    Args:
        selected_display: Dropdown label of the form ``"<id>-<preview>"``.

    Returns:
        A tuple ``(source, target, current_index_display, status, error_table,
        highlighted_target)``.
    """
    global current_index, current_errors

    try:
        selected_id = parse_id_from_display(selected_display)
    except (AttributeError, ValueError):
        # Dropdown cleared (None) or a label without a numeric id prefix.
        return (
            "", "", "",
            f"無法解析索引: {selected_display}",
            get_error_dataframe(),
            ""
        )

    row_list = data.index[data["id"] == selected_id].tolist()
    if not row_list:
        return (
            "", "", str(selected_id),
            f"找不到 id: {selected_id}",
            get_error_dataframe(),
            ""
        )

    # Recorded spans are offsets into the previous sentence; drop them when the
    # row really changes so they are neither highlighted nor saved on another one.
    if row_list[0] != current_index:
        current_errors = []
    current_index = row_list[0]

    src, tgt = get_current_text()
    return (
        src, tgt,
        str(selected_id),
        f"已跳轉至 id={selected_id}",
        get_error_dataframe(),
        highlight_errors_in_text(tgt, current_errors)
    )
# Markdown instructions rendered at the top of the interface via gr.Markdown.
DEMO_EXPLANATION = """
## 翻譯標記工具
### 💡[使用規則](https://huggingface.co/spaces/350016z/TranslationError_Gradio/blob/main/README.md) (第一次使用務必查看)
### 操作步驟
1. **先選擇標註人員與檔案**,並在「索引」下拉中挑選要標註的句子。
2. 在「步驟 1:錯誤標註」中,若翻譯文本有錯,請輸入「錯誤區間」、選擇「錯誤類別/子類別/嚴重度」並點「保存並繼續標記」。
- 多個錯誤可重複此步驟;若無錯誤則可直接點「完全正確」。
3. 錯誤標完後,在「步驟 2:評分與提交」中,拉動滑桿給分,若有更好譯文,可在「建議翻譯」填入。
4. 按「保存並顯示下一筆」送出本句標註並進入下一句。
"""
# Stylesheet passed to gr.Blocks (global font, button colours, boxed panels).
CUSTOM_CSS = """
/* 整體字體與行距 */
* {
font-size: 15px;
line-height: 1.4;
}
/* 按鈕分色 */
#correct_button {
background-color: #4CAF50; /* 綠 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#too_many_errors_button {
background-color: #f44336; /* 紅 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#save_current_button {
background-color: #1565C0; /* 藍 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
#save_next_button {
background-color: #1565C0; /* 藍 */
color: white;
font-size: 14px;
margin-bottom: 5px;
}
/* 模擬帶框風格 */
#highlight_box_group {
border: 1px solid #aaa;
padding: 10px;
margin-bottom: 10px;
min-height: 80px;
}
/* 讓「步驟區塊」顯示類似面板效果 */
#step1_box, #step2_box {
border: 1px solid #ccc;
padding: 10px;
margin-bottom: 10px;
}
"""

# Initial content of the interface. It is computed up front and handed to the
# component constructors instead of being assigned to ``.value`` afterwards.
init_src, init_tgt = get_current_text()
initial_index_choices = get_all_ids()

# NOTE(review): the nesting of the rows/columns below was reconstructed from an
# unindented listing — confirm the layout against the running app.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown(DEMO_EXPLANATION)

    # ------------------- Top: file & index controls -------------------
    with gr.Row():
        with gr.Column(scale=1):
            rater_selector = gr.Dropdown(
                label="標註人員",
                choices=["rater_test", "rater1", "rater2", "rater3", "rater4", "rater5", "rater6", "rater7"],
                value="rater_test"
            )
            file_selector = gr.Dropdown(
                label="選擇檔案",
                choices=csv_files,
                # Show the file that was actually loaded; test.csv may be absent.
                value=os.path.basename(data_path)
            )
            index_selector = gr.Dropdown(
                label="選擇索引 (id-原文前10字)",
                choices=initial_index_choices,
                # Taken from the choices so the default always matches one of them.
                value=initial_index_choices[current_index]
            )
            current_index_display = gr.Textbox(
                label="當前索引(id)",
                value=str(data.loc[current_index, "id"]),
                interactive=False
            )
        # Left: source text / right: translated text
        with gr.Column(scale=4):
            source = gr.Textbox(label="原始文本", lines=14, interactive=False, value=init_src)
        with gr.Column(scale=4):
            target = gr.Textbox(label="翻譯文本", lines=14, interactive=False, value=init_tgt)

    with gr.Tab("步驟1:錯誤標註"):
        # ------------------- Highlight area (Group + elem_id) & error table -------------------
        with gr.Row():
            with gr.Column(scale=5):
                with gr.Group(elem_id="highlight_box_group"):
                    highlighted_target = gr.HTML(
                        value=highlight_errors_in_text(init_tgt, []),
                        label="螢光標示區 (已標註的錯誤)"
                    )
            with gr.Column(scale=5):
                error_table = gr.Dataframe(
                    value=pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
                    headers=["錯誤文字", "嚴重度", "分類"],
                    label="當前句子錯誤紀錄 (中文顯示)",
                    datatype=["str", "str", "str"],
                    interactive=False
                )

        # ------------------- Step 1: error annotation -------------------
        with gr.Row(equal_height=True):
            error_span = gr.Textbox(label="錯誤區間 (可複製『翻譯文本』貼上)", lines=2, placeholder="請輸入翻譯中文本的錯誤區間")
            category = gr.Dropdown(
                label="錯誤類別",
                choices=list(categories_display.keys()),
                value="正確性"
            )
            subcategory = gr.Dropdown(
                label="子類別",
                choices=categories_display["正確性"],
                value="誤譯"
            )
            other = gr.Textbox(label="其他子類別", placeholder="如子類別選『其他』則填寫")
            severity = gr.Dropdown(
                label="嚴重程度",
                choices=severity_choices_display,
                value="輕微"
            )
        with gr.Row():
            correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
            too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
            save_current_button = gr.Button("保存並繼續標記當前資料", elem_id="save_current_button")

    with gr.Tab("步驟2:評分與提交"):
        # ------------------- Step 2: scoring & submission -------------------
        with gr.Row():
            alternative_translation = gr.Textbox(
                label="建議翻譯(如有更好譯法可填)",
                lines=2
            )
            score = gr.Slider(
                label="翻譯評分 (0=最差, 100=最好)",
                minimum=0,
                maximum=100,
                step=1,
                value=66
            )
        save_next_button = gr.Button("保存並顯示下一筆", elem_id="save_next_button")

    # ------------------- Current status -------------------
    status = gr.Textbox(label="當前狀態", lines=1, interactive=False)

    # ------------------- Callback wiring -------------------
    def update_subcats(selected_category):
        """Refresh the sub-category dropdown for the chosen main category."""
        # .get: a cleared dropdown passes None, which is not a key.
        subcats = categories_display.get(selected_category, [])
        if not subcats:
            # No sub-categories -> empty choices and no selection.
            return gr.update(choices=[], value=None)
        return gr.update(choices=subcats, value=subcats[0])

    file_selector.change(
        update_file_selection,
        inputs=[file_selector],
        outputs=[
            source, target, error_span,
            index_selector, current_index_display,
            status, error_table, highlighted_target
        ]
    )
    index_selector.change(
        update_index_selection,
        inputs=[index_selector],
        outputs=[
            source, target, current_index_display,
            status, error_table, highlighted_target
        ]
    )
    category.change(
        update_subcats,
        inputs=[category],
        outputs=[subcategory]
    )

    # The three buttons below each refresh the error table and the highlight
    # area in one go while staying on the same sentence.
    correct_button.click(
        mark_as_correct,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target]
    )
    too_many_errors_button.click(
        mark_as_too_many_errors,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target]
    )
    save_current_button.click(
        save_current,
        inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
        outputs=[error_span, status, error_table, highlighted_target]
    )

    # "Save and show next": submit the whole sentence and move to the next one.
    save_next_button.click(
        save_and_next,
        inputs=[source, target, score, rater_selector, alternative_translation],
        outputs=[
            source, target, error_span,
            current_index_display, status,
            error_table, highlighted_target
        ]
    )

demo.launch()