"""Gradio app for annotating translation errors.

The app shows a source sentence and its translation, lets a rater mark error
spans (category / sub-category / severity), give an overall score, and
appends one JSON line per submitted sentence to a file inside ``DATASET_DIR``
(the folder the ``CommitScheduler`` below is configured with).

The UI is displayed in Chinese; everything that is stored uses the English
labels defined in the mapping tables below.
"""

import html
import json
import os
import shutil
import sys
import time
from pathlib import Path
from uuid import uuid4

import gradio as gr
import pandas as pd
from huggingface_hub import CommitScheduler, snapshot_download

# --------------------------- Chinese <-> English lookup tables ---------------------------
# The backend stores English labels; the frontend displays Chinese labels.
category_map = {
    "正確性": "Accuracy",
    "流暢度": "Fluency",
    "專有名詞": "Terminology",
    "風格": "Style",
    "在地化": "Locale",
    "純正性": "Purity",
}

# (Chinese category, Chinese sub-category) -> (English category, English sub-category)
subcategory_map = {
    ("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
    ("正確性", "多譯"): ("Accuracy", "Addition"),
    ("正確性", "漏譯"): ("Accuracy", "Omission"),
    ("正確性", "其他"): ("Accuracy", "Other"),
    ("流暢度", "文法"): ("Fluency", "Grammar"),
    ("流暢度", "拼字"): ("Fluency", "Spelling"),
    ("流暢度", "標點符號"): ("Fluency", "Punctuation"),
    ("流暢度", "前後不一致"): ("Fluency", "Inconsistency"),
    ("流暢度", "語域"): ("Fluency", "Register"),
    ("流暢度", "其他"): ("Fluency", "Other"),
    ("專有名詞", "使用不當"): ("Terminology", "Inappropriate"),
    ("專有名詞", "不一致"): ("Terminology", "Inconsistent"),
    ("專有名詞", "其他"): ("Terminology", "Other"),
    ("風格", "用字尷尬"): ("Style", "Awkward"),
    ("風格", "其他"): ("Style", "Other"),
    ("在地化", "貨幣格式"): ("Locale", "Currency format"),
    ("在地化", "時間格式"): ("Locale", "Time format"),
    ("在地化", "姓名格式"): ("Locale", "Name format"),
    ("在地化", "日期格式"): ("Locale", "Date format"),
    ("在地化", "地址格式"): ("Locale", "Address format"),
    ("在地化", "其他"): ("Locale", "Other"),
}

# Sub-category choices offered in the UI for each (Chinese) category.
categories_display = {
    "正確性": ["誤譯", "多譯", "漏譯", "其他"],
    "流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
    "專有名詞": ["使用不當", "不一致", "其他"],
    "風格": ["用字尷尬", "其他"],
    "在地化": ["貨幣格式", "時間格式", "姓名格式", "日期格式", "地址格式", "其他"],
    "純正性": [],
}

severity_choices_display = ["輕微", "嚴重"]
severity_map = {
    "輕微": "Minor",
    "嚴重": "Major",
}

# The next two tables convert stored English labels back to Chinese when the
# error table is rendered in the frontend.
severity_display_map = {
    "Minor": "輕微",
    "Major": "嚴重",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
}
category_display_map = {
    "Accuracy": "正確性",
    "Fluency": "流暢度",
    "Terminology": "專有名詞",
    "Style": "風格",
    "Locale": "在地化",
    "Other": "其他",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
    "Purity": "純正性",
}

# English sub-category -> Chinese sub-category, derived from ``subcategory_map``
# so the two directions cannot drift apart.
_SUBCATEGORY_DISPLAY_MAP = {
    en_sub: zh_sub for (_, zh_sub), (_, en_sub) in subcategory_map.items()
}

# Column headers of the (Chinese) error table shown in the UI.
ERROR_TABLE_COLUMNS = ["錯誤文字", "嚴重度", "分類"]

# Upper bound enforced by ``save_current``; the user-facing message there
# mentions the same number.
MAX_ERRORS_PER_SENTENCE = 5

# --------------------------- Download the CSV data files ---------------------------
DATASET_DIR = Path("json_dataset")
DATASET_DIR.mkdir(parents=True, exist_ok=True)

scheduler = CommitScheduler(
    repo_id="350016z/TaiwanCOMET_dataset",
    repo_type="dataset",
    folder_path=DATASET_DIR,
    path_in_repo="data",
)


def download_dataset_file(dataset_id, local_dir):
    """Copy every top-level ``.csv`` of a Hub dataset snapshot into ``local_dir``.

    Args:
        dataset_id: Hub dataset repository id passed to ``snapshot_download``.
        local_dir: Directory that receives the copied CSV files.

    Returns:
        ``local_dir``, unchanged.
    """
    snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
    for file_name in os.listdir(snapshot_path):
        if file_name.endswith(".csv"):
            shutil.copy(
                os.path.join(snapshot_path, file_name),
                os.path.join(local_dir, file_name),
            )
    # NOTE(review): the original indentation of this pause was lost; a single
    # short pause after copying is kept here - confirm whether it is needed.
    time.sleep(1)
    return local_dir


def _load_dataset(csv_path):
    """Read an annotation CSV and order it by ascending ``id`` with a fresh 0..n-1 index."""
    frame = pd.read_csv(csv_path, dtype={"id": "Int64"})
    return frame.sort_values(by="id", ascending=True, ignore_index=True)


def _new_annotations_file(csv_name):
    """Return a unique JSON-lines path in ``DATASET_DIR`` named after ``csv_name``."""
    base_name = os.path.splitext(os.path.basename(csv_name))[0]
    return DATASET_DIR / f"{base_name}_annotations-{uuid4()}.json"


DATASET_ID = "350016z/Taiwanese_dataset"
current_dir = os.getcwd()
download_dataset_file(DATASET_ID, current_dir)

# Sorted so the fallback choice below does not depend on directory order.
csv_files = sorted(f for f in os.listdir(current_dir) if f.endswith(".csv"))
if not csv_files:
    # sys.exit with a message prints to stderr and exits with a non-zero status.
    sys.exit("Error: No CSV files found in the current directory.")

data_path = os.path.join(
    current_dir, "test.csv" if "test.csv" in csv_files else csv_files[0]
)
if not os.path.exists(data_path):
    sys.exit(f"Error: {data_path} does not exist. Please check the file path.")

# Rows are sorted by id so that positional index 0 is the smallest id.
data = _load_dataset(data_path)

# Module-level annotation state shared by the Gradio callbacks below.
current_index = 0          # positional row index into ``data``
current_errors = []        # error spans collected for the current sentence
annotations_file = _new_annotations_file(data_path)
annotation_history = []    # submitted entries; kept so a history view can be added


def _format_index_label(id_value, source_text):
    """Build the ``<id>-<first 10 chars of source>`` label used by the index dropdown."""
    snippet = str(source_text)[:10].replace("\n", " ")
    return f"{id_value}-{snippet}"


def _empty_error_table():
    """Return an empty error table with the Chinese display headers."""
    return pd.DataFrame(columns=ERROR_TABLE_COLUMNS)


def get_all_ids():
    """Return one dropdown label per row, formatted as ``id-first 10 source chars``."""
    return [
        _format_index_label(id_value, source_text)
        for id_value, source_text in zip(data["id"], data["source"])
    ]


def parse_id_from_display(display_str):
    """Extract the integer id from a label produced by ``get_all_ids``.

    A leading minus sign is treated as part of the id rather than as the
    id/text separator.

    Raises:
        ValueError: If the part before the separator is not an integer.
    """
    sign = "-" if display_str.startswith("-") else ""
    digits = display_str[len(sign):].split("-", 1)[0]
    return int(sign + digits)


def get_current_text():
    """Return ``(source, target)`` of the row at ``current_index``."""
    source = data.loc[current_index, "source"]
    target = data.loc[current_index, "target"]
    return source, target


def save_to_json(entry: dict, json_file: Path):
    """Append ``entry`` to ``json_file`` as one JSON line.

    The write happens while holding ``scheduler.lock``. The file is opened as
    UTF-8 explicitly because the entry is dumped with ``ensure_ascii=False``
    and the platform default encoding may not be able to represent the text.
    """
    with scheduler.lock:
        with json_file.open("a", encoding="utf-8") as f:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")


def highlight_errors_in_text(text, errors):
    """Return ``text`` as HTML with every marked error span highlighted.

    Args:
        text: The translation text; falsy values yield ``""``.
        errors: Iterable of dicts with integer ``start`` / ``end`` offsets
            into ``text``.

    Returns:
        An HTML string. All text is HTML-escaped because the result is
        rendered by a ``gr.HTML`` component. Spans that are empty, fall
        outside ``text``, or overlap an already highlighted span are skipped
        so no text is duplicated.
    """
    if not text:
        return ""
    pieces = []
    last_end = 0
    for err in sorted(errors, key=lambda e: e["start"]):
        st, ed = err["start"], err["end"]
        if st < 0 or ed > len(text) or ed <= st or st < last_end:
            continue
        pieces.append(html.escape(text[last_end:st]))
        pieces.append(
            '<span style="background-color: yellow; color: black;">'
            f"{html.escape(text[st:ed])}</span>"
        )
        last_end = ed
    pieces.append(html.escape(text[last_end:]))
    return "".join(pieces)


def _category_to_display(cat_str):
    """Translate a stored ``Category/Subcategory`` string into Chinese for display."""
    if cat_str in ("No-error", "Non-translation"):
        # Sentence-level marks: "completely correct" or "too many errors".
        return severity_display_map.get(cat_str, cat_str)
    if "/" not in cat_str:
        # Single part, e.g. "Accuracy" or "Other".
        return category_display_map.get(cat_str, cat_str)
    main_cat, sub_cat = cat_str.split("/", 1)
    main_cat_zh = category_display_map.get(main_cat, main_cat)
    # Free-text sub-categories entered by the rater are shown unchanged.
    sub_cat_zh = _SUBCATEGORY_DISPLAY_MAP.get(sub_cat, sub_cat)
    return f"{main_cat_zh}/{sub_cat_zh}"


def get_error_dataframe():
    """Build the table of ``current_errors`` for display.

    Only the error text, severity and category are shown, all in Chinese;
    the stored values stay in English.
    """
    df = pd.DataFrame(current_errors)
    if df.empty:
        return _empty_error_table()
    display_df = pd.DataFrame()
    display_df["錯誤文字"] = df["text"]
    display_df["嚴重度"] = df["severity"].apply(lambda x: severity_display_map.get(x, x))
    display_df["分類"] = df["category"].apply(_category_to_display)
    return display_df


def _annotation_outputs(status_msg, target):
    """Return the ``(error_span, status, error_table, highlighted_target)`` tuple.

    The error-span textbox is always cleared; table and highlight reflect the
    current content of ``current_errors``.
    """
    return (
        "",
        status_msg,
        get_error_dataframe(),
        highlight_errors_in_text(target, current_errors),
    )


def _find_unmarked_span(text, span, marked):
    """Return ``(start, end)`` of the first occurrence of ``span`` not yet in ``marked``.

    Returns ``None`` when every occurrence has already been marked.
    """
    taken = {(err["start"], err["end"]) for err in marked}
    start = text.find(span)
    while start != -1:
        end = start + len(span)
        if (start, end) not in taken:
            return start, end
        start = text.find(span, start + 1)
    return None


def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Add one error span to the current sentence and refresh table and highlight.

    Args:
        source: Source text (unused here; part of the callback signature).
        target: Translation text the span must occur in.
        rater_selector: Selected rater (unused here; part of the callback signature).
        error_span: Text copied from the translation that is erroneous.
        category: Chinese category label.
        subcategory: Chinese sub-category label (may be ``None``).
        severity: Chinese severity label.
        other: Free-text sub-category, used when the sub-category is "Other".

    Returns:
        ``(error_span, status, error_table, highlighted_target)``.
    """
    global current_errors

    if len(current_errors) >= MAX_ERRORS_PER_SENTENCE:
        return _annotation_outputs("您已標記超過 5 處錯誤,可直接按『過多錯誤』或繼續。", target)

    if not error_span:
        return _annotation_outputs("尚未輸入錯誤區間,如無錯誤請按『完全正確』", target)

    target_text = target or ""
    if error_span not in target_text:
        return _annotation_outputs("錯誤區間不存在於翻譯文本,請檢查!", target)

    # When the same text occurs several times, mark the first occurrence that
    # has not been marked yet; report a duplicate only when none is left.
    span = _find_unmarked_span(target_text, error_span, current_errors)
    if span is None:
        return _annotation_outputs("此錯誤區間已標記過,請勿重複。", target)
    start, end = span

    # Convert the Chinese UI labels to the stored English labels.
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")
    custom_subcat = (other or "").strip()
    if subcat_val == "Other" and custom_subcat:
        subcat_val = custom_subcat

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": f"{cat_val}/{subcat_val}",
    })
    return _annotation_outputs(f"已標記錯誤: {error_span} (範圍 {start}-{end})", target)


def _add_sentence_level_mark(entry):
    """Append a sentence-level mark unless an identical one is already recorded."""
    if entry not in current_errors:
        current_errors.append(entry)


def mark_as_correct(target):
    """Mark the sentence as completely correct (No-error) and refresh table and highlight."""
    _add_sentence_level_mark({
        "text": "",
        "severity": "No-error",
        "start": 0,
        "end": 0,
        "category": "No-error",
    })
    return _annotation_outputs("標註為完全正確!", target)


def mark_as_too_many_errors(target):
    """Mark the sentence as having too many errors (Non-translation) and refresh the UI."""
    _add_sentence_level_mark({
        "text": "",
        "severity": "Major",
        "start": 0,
        "end": 0,
        "category": "Non-translation",
    })
    return _annotation_outputs("已標註為過多錯誤!", target)


def _to_jsonable(value):
    """Convert a pandas/NumPy scalar to a plain Python value; missing values become ``None``."""
    if hasattr(value, "item"):
        # NumPy scalars (e.g. int64) are not JSON serializable as-is.
        value = value.item()
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _finished_outputs():
    """Return the ``save_and_next`` outputs shown once every row has been annotated."""
    return (
        "已完成所有文本標記",  # source
        "已完成所有文本標記",  # target
        "",                    # error_span
        "",                    # current_index_display
        f"標記完成並儲存到 {annotations_file.name}!(共 {len(data)} 筆)",
        _empty_error_table(),
        "",
    )


def save_and_next(source, target, score, rater_selector, alternative_translation):
    """Persist the annotation of the current sentence and move to the next row.

    Returns:
        ``(source, target, error_span, current_index_display, status,
        error_table, highlighted_target)`` for the next sentence, for the
        unchanged current sentence when validation fails, or the "finished"
        state when there is no row left.
    """
    global current_index, current_errors

    # Clicking again after the last row must not index past the end of the data.
    if current_index >= len(data):
        return _finished_outputs()

    current_id = str(data.loc[current_index, "id"])
    if not rater_selector:
        return (
            source, target, "", current_id, "請先選擇標註人員!",
            get_error_dataframe(), highlight_errors_in_text(target, current_errors),
        )
    if score is None:
        return (
            source, target, "", current_id, "請先填寫評分!",
            get_error_dataframe(), highlight_errors_in_text(target, current_errors),
        )

    id_val = int(data.loc[current_index, "id"])
    annotations_entry = {
        "system": _to_jsonable(data.loc[current_index, "system"]),
        "lp": _to_jsonable(data.loc[current_index, "lp"]),
        "doc": _to_jsonable(data.loc[current_index, "doc"]),
        "id": id_val,
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": _to_jsonable(data.loc[current_index, "reference"]),
        "esa_score": score,
        "esa_spans": current_errors,
        "alternative_translation": alternative_translation if alternative_translation else "",
    }
    save_to_json(annotations_entry, annotations_file)
    annotation_history.append(annotations_entry)

    current_errors = []
    current_index += 1
    if current_index >= len(data):
        return _finished_outputs()

    next_source, next_target = get_current_text()
    # Report the id of the row that was just submitted, not a positional index.
    status_msg = f"已提交!目前進度:已完成第 {current_index} 筆 (id={id_val}) / 共 {len(data)} 筆。"
    return (
        next_source,
        next_target,
        "",
        str(data.loc[current_index, "id"]),
        status_msg,
        _empty_error_table(),
        highlight_errors_in_text(next_target, current_errors),
    )


def update_file_selection(selected_file):
    """Load another CSV, reset the annotation state and refresh every widget.

    The data is sorted by id exactly like the initial load, so the first row
    is the smallest id and "next" walks the ids in ascending order.

    Returns:
        ``(source, target, error_span, index_selector update,
        current_index_display, status, error_table, highlighted_target)``.
    """
    global data_path, data, current_index, annotations_file, current_errors, annotation_history

    data_path = os.path.join(current_dir, selected_file)
    data = _load_dataset(data_path)
    current_errors = []
    annotation_history = []
    current_index = 0
    annotations_file = _new_annotations_file(selected_file)

    src, tgt = get_current_text()
    # Use the same formatter as the choices so the default value always matches one.
    default_index_display = _format_index_label(data.loc[current_index, "id"], src)
    return (
        src,
        tgt,
        "",
        gr.update(choices=get_all_ids(), value=default_index_display),
        str(data.loc[current_index, "id"]),
        f"已加載檔案:{selected_file}",
        _empty_error_table(),
        highlight_errors_in_text(tgt, []),
    )


def update_index_selection(selected_display):
    """Jump to the row chosen in the index dropdown.

    Error spans collected for another sentence are discarded when the row
    changes, because their offsets refer to the previous translation.

    Returns:
        ``(source, target, current_index_display, status, error_table,
        highlighted_target)``.
    """
    global current_index, current_errors

    try:
        selected_id = parse_id_from_display(selected_display)
    except (AttributeError, TypeError, ValueError):
        # Dropdown cleared or label without a numeric id: leave the UI untouched.
        return (
            gr.update(), gr.update(), gr.update(),
            "請選擇有效的索引",
            gr.update(), gr.update(),
        )

    row_list = data.index[data["id"] == selected_id].tolist()
    if not row_list:
        return (
            "", "", str(selected_id), f"找不到 id: {selected_id}",
            get_error_dataframe(), "",
        )

    if row_list[0] != current_index:
        current_errors = []
    current_index = row_list[0]
    src, tgt = get_current_text()
    return (
        src,
        tgt,
        str(selected_id),
        f"已跳轉至 id={selected_id}",
        get_error_dataframe(),
        highlight_errors_in_text(tgt, current_errors),
    )


DEMO_EXPLANATION = """
## 翻譯標記工具
### 💡[使用規則](https://huggingface.co/spaces/350016z/TranslationError_Gradio/blob/main/README.md) (第一次使用務必查看)
### 操作步驟
1. **先選擇標註人員與檔案**,並在「索引」下拉中挑選要標註的句子。
2. 在「步驟 1:錯誤標註」中,若翻譯文本有錯,請輸入「錯誤區間」、選擇「錯誤類別/子類別/嚴重度」並點「保存並繼續標記」。
   - 多個錯誤可重複此步驟;若無錯誤則可直接點「完全正確」。
3. 錯誤標完後,在「步驟 2:評分與提交」中,拉動滑桿給分,若有更好譯文,可在「建議翻譯」填入。
4. 按「保存並顯示下一筆」送出本句標註並進入下一句。
"""

# NOTE: the #step1_box / #step2_box rules are kept although no component in
# this file currently carries those element ids.
CUSTOM_CSS = """
/* 整體字體與行距 */
* {
    font-size: 15px;
    line-height: 1.4;
}
/* 按鈕分色 */
#correct_button {
    background-color: #4CAF50; /* 綠 */
    color: white;
    font-size: 14px;
    margin-bottom: 5px;
}
#too_many_errors_button {
    background-color: #f44336; /* 紅 */
    color: white;
    font-size: 14px;
    margin-bottom: 5px;
}
#save_current_button {
    background-color: #1565C0; /* 藍 */
    color: white;
    font-size: 14px;
    margin-bottom: 5px;
}
#save_next_button {
    background-color: #1565C0; /* 藍 */
    color: white;
    font-size: 14px;
    margin-bottom: 5px;
}
/* 模擬帶框風格 */
#highlight_box_group {
    border: 1px solid #aaa;
    padding: 10px;
    margin-bottom: 10px;
    min-height: 80px;
}
/* 讓「步驟區塊」顯示類似面板效果 */
#step1_box, #step2_box {
    border: 1px solid #ccc;
    padding: 10px;
    margin-bottom: 10px;
}
"""

# Initial widget values are computed up front and passed to the component
# constructors instead of being assigned to ``.value`` afterwards.
init_src, init_tgt = get_current_text()

# NOTE(review): the container nesting below was reconstructed from the
# statement order of the original file; the placement of the save button and
# the status box in particular should be checked against the intended layout.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown(DEMO_EXPLANATION)

    # ------------------- Top: rater, file and index controls -------------------
    with gr.Row():
        with gr.Column(scale=1):
            rater_selector = gr.Dropdown(
                label="標註人員",
                choices=["rater_test", "rater1", "rater2", "rater3", "rater4", "rater5", "rater6", "rater7"],
                value="rater_test",
            )
            file_selector = gr.Dropdown(
                label="選擇檔案",
                choices=csv_files,
                # Show the file that was actually loaded (test.csv may be absent).
                value=os.path.basename(data_path),
            )
            index_selector = gr.Dropdown(
                label="選擇索引 (id-原文前10字)",
                choices=get_all_ids(),
                value=_format_index_label(data.loc[current_index, "id"], init_src),
            )
            current_index_display = gr.Textbox(
                label="當前索引(id)",
                value=str(data.loc[current_index, "id"]),
                interactive=False,
            )
        # Left: source text / right: translated text.
        with gr.Column(scale=4):
            source = gr.Textbox(label="原始文本", lines=14, interactive=False, value=init_src)
        with gr.Column(scale=4):
            target = gr.Textbox(label="翻譯文本", lines=14, interactive=False, value=init_tgt)

    with gr.Tab("步驟1:錯誤標註"):
        # ------------------- Highlight area and error table -------------------
        with gr.Row():
            with gr.Column(scale=5):
                with gr.Group(elem_id="highlight_box_group"):
                    highlighted_target = gr.HTML(
                        value=highlight_errors_in_text(init_tgt, []),
                        label="螢光標示區 (已標註的錯誤)",
                    )
            with gr.Column(scale=5):
                error_table = gr.Dataframe(
                    value=_empty_error_table(),
                    headers=ERROR_TABLE_COLUMNS,
                    label="當前句子錯誤紀錄 (中文顯示)",
                    datatype=["str", "str", "str"],
                    interactive=False,
                )

        # ------------------- Step 1: error annotation inputs -------------------
        with gr.Row(equal_height=True):
            error_span = gr.Textbox(
                label="錯誤區間 (可複製『翻譯文本』貼上)",
                lines=2,
                placeholder="請輸入翻譯中文本的錯誤區間",
            )
            category = gr.Dropdown(
                label="錯誤類別",
                choices=list(categories_display.keys()),
                value="正確性",
            )
            subcategory = gr.Dropdown(
                label="子類別",
                choices=categories_display["正確性"],
                value="誤譯",
            )
            other = gr.Textbox(label="其他子類別", placeholder="如子類別選『其他』則填寫")
            severity = gr.Dropdown(
                label="嚴重程度",
                choices=severity_choices_display,
                value="輕微",
            )
        with gr.Row():
            correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
            too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
            save_current_button = gr.Button("保存並繼續標記當前資料", elem_id="save_current_button")

    with gr.Tab("步驟2:評分與提交"):
        # ------------------- Step 2: score and submit -------------------
        with gr.Row():
            alternative_translation = gr.Textbox(
                label="建議翻譯(如有更好譯法可填)",
                lines=2,
            )
            score = gr.Slider(
                label="翻譯評分 (0=最差, 100=最好)",
                minimum=0,
                maximum=100,
                step=1,
                value=66,
            )
        save_next_button = gr.Button("保存並顯示下一筆", elem_id="save_next_button")

    # ------------------- Current status -------------------
    status = gr.Textbox(label="當前狀態", lines=1, interactive=False)

    # ------------------- Callback wiring -------------------
    def update_subcats(selected_category):
        """Refresh the sub-category dropdown for the chosen category."""
        subcats = categories_display.get(selected_category, [])
        if not subcats:
            # Category without sub-categories (or none selected): empty choices.
            return gr.update(choices=[], value=None)
        return gr.update(choices=subcats, value=subcats[0])

    file_selector.change(
        update_file_selection,
        inputs=[file_selector],
        outputs=[
            source,
            target,
            error_span,
            index_selector,
            current_index_display,
            status,
            error_table,
            highlighted_target,
        ],
    )
    index_selector.change(
        update_index_selection,
        inputs=[index_selector],
        outputs=[
            source,
            target,
            current_index_display,
            status,
            error_table,
            highlighted_target,
        ],
    )
    category.change(
        update_subcats,
        inputs=[category],
        outputs=[subcategory],
    )

    # The three step-1 buttons each update the error table and the highlight
    # area in a single callback.
    correct_button.click(
        mark_as_correct,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target],
    )
    too_many_errors_button.click(
        mark_as_too_many_errors,
        inputs=[target],
        outputs=[error_span, status, error_table, highlighted_target],
    )
    save_current_button.click(
        save_current,
        inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
        outputs=[error_span, status, error_table, highlighted_target],
    )

    # "Save and show next": submit the whole sentence and load the next one.
    save_next_button.click(
        save_and_next,
        inputs=[source, target, score, rater_selector, alternative_translation],
        outputs=[
            source,
            target,
            error_span,
            current_index_display,
            status,
            error_table,
            highlighted_target,
        ],
    )

demo.launch()