Spaces:

350016z
/

TranslationError_Gradio

Sleeping

App Files Community

350016z committed on Feb 14

Commit

4f4750e

verified ·

1 Parent(s): b5dc54b

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -42

app.py CHANGED Viewed

@@ -64,17 +64,22 @@ if not os.path.exists(data_path):
     exit()
-# Loading Data-----------------------------------------------------------------------------------------------------------
 data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
 current_index = 0
-def get_all_ids():
-    return [str(id) for id in data["id"].tolist()]
 # ---------------------------------------------------------------------------------------------------------------------
-annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
-score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
 def get_current_text():
     global current_index, data
     source = data.loc[current_index, "source"]
@@ -91,80 +96,156 @@ def save_to_json(entry: dict, json_file: Path):
             f.write("\n")
         # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
 def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
-    global current_index, data, annotations_file
     system = data.loc[current_index, "system"]
     lp = data.loc[current_index, "lp"]
     doc = data.loc[current_index, "doc"]
     id = int(data.loc[current_index, "id"])
     reference = data.loc[current_index, "reference"]
-    if category != "Non-translation" and category != "No-error":
-        category_value = f"{category}/{subcategory}"
     else:
         category_value = category
     if error_span and error_span in target:
-        highlighted_error_span = target.replace(error_span, f"<v>{error_span}</v>")
-    elif not error_span:
-        highlighted_error_span = target
     else:
-        highlighted_error_span = error_span  # 若 error_span 不存在於 target，則保持原樣
-    new_entry = {
-        "system": system,
-        "lp": lp,
-        "doc": doc,
-        "id": id,
-        "source": source,
-        "mt": target,
-        "target": highlighted_error_span,
         "category": category_value,
-        "severity": severity,
-        "other": other if other else "",
-        "rater": rater_selector,
-    }
-    save_to_json(new_entry, annotations_file)
     # [error_span, status]
-    return "", f"當前資料已保存到 {annotations_file.name}，請繼續標記！"
 def save_and_next(source, target, score, rater_selector):
-    global current_index, data, score_file
     system = data.loc[current_index, "system"]
     lp = data.loc[current_index, "lp"]
     doc = data.loc[current_index, "doc"]
     id = int(data.loc[current_index, "id"])
     reference = data.loc[current_index, "reference"]
-    id_list = [str(id) for id in sorted(data["id"].unique())]
-    max_id = int(id_list[-1])  # 取得最大的 ID
-    new_entry = {
         "system": system,
         "lp": lp,
         "doc": doc,
         "id": id,
         "src": source,
         "mt": target,
         "ref": reference,
-        "score": score,
-        "rater": rater_selector,
     }
-    save_to_json(new_entry, score_file)
     current_index += 1
     if current_index >= len(data):
-        return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {score_file.name}！"
     next_source, next_target = get_current_text()
-    # [source, target, error_span, current_index_display, status]
-    return next_source, next_target, "", str(current_index), f"分數已保存到 {score_file.name}，請繼續下一筆！"
 def update_file_selection(selected_file):
-    global data_path, data, current_index, annotations_file, score_file
     data_path = os.path.join(current_dir, selected_file)
     data = pd.read_csv(data_path)
@@ -175,7 +256,7 @@ def update_file_selection(selected_file):
     file_base_name = os.path.splitext(selected_file)[0]
     annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
-    score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
     # [source, target, error_span, index_selector, current_index_display, status]
     return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案：{selected_file}")

     exit()
+# Loading & Setting --------------------------------------------------------------------------------------------------
 data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
 current_index = 0
+current_errors = []
+current_others = []
+annotations_file = DATASET_DIR / f"test-{uuid4()}.json"
 # ---------------------------------------------------------------------------------------------------------------------
+# score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
+def get_all_ids():
+    return [str(id) for id in data["id"].tolist()]
 def get_current_text():
     global current_index, data
     source = data.loc[current_index, "source"]
             f.write("\n")
         # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
+# def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
+#     global current_index, data, annotations_file
+#     system = data.loc[current_index, "system"]
+#     lp = data.loc[current_index, "lp"]
+#     doc = data.loc[current_index, "doc"]
+#     id = int(data.loc[current_index, "id"])
+#     reference = data.loc[current_index, "reference"]
+#     if category != "Non-translation" and category != "No-error":
+#         category_value = f"{category}/{subcategory}"
+#     else:
+#         category_value = category
+#     if error_span and error_span in target:
+#         highlighted_error_span = target.replace(error_span, f"<v>{error_span}</v>")
+#     elif not error_span:
+#         highlighted_error_span = target
+#     else:
+#         highlighted_error_span = error_span  # 若 error_span 不存在於 target，則保持原樣
+#     new_entry = {
+#         "system": system,
+#         "lp": lp,
+#         "doc": doc,
+#         "id": id,
+#         "source": source,
+#         "mt": target,
+#         "target": highlighted_error_span,
+#         "category": category_value,
+#         "severity": severity,
+#         "other": other if other else "",
+#         "rater": rater_selector,
+#     }
+#     save_to_json(new_entry, annotations_file)
+#     # [error_span, status]
+#     return "", f"當前資料已保存到 {annotations_file.name}，請繼續標記！"
 def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
+    global current_index, data, current_errors
+    if category == "No-error":
+        return "", "無錯誤，不需要保存錯誤區間。"
     system = data.loc[current_index, "system"]
     lp = data.loc[current_index, "lp"]
     doc = data.loc[current_index, "doc"]
     id = int(data.loc[current_index, "id"])
     reference = data.loc[current_index, "reference"]
+    if category != "Non-translation":
+        category_value = f"{category}/{subcategory}" if subcategory else category
     else:
         category_value = category
     if error_span and error_span in target:
+        start = target.find(error_span)
+        end = start + len(error_span)
+        print(f"start: {start}, end: {end}")
     else:
+        return "", "錯誤區間不存在於翻譯文本中，請檢查！"
+    current_errors.append({
+        "text": error_span,
+        "severity": severity.lower(),
+        "start": start,
+        "end": end
+    })
+    current_others.append({
         "category": category_value,
+        "others": other if other else "",
+    })
     # [error_span, status]
+    return "", f"已記錄錯誤區間: {error_span}，範圍 {start}-{end}。"
+# def save_and_next(source, target, score, rater_selector):
+#     global current_index, data, score_file
+#     system = data.loc[current_index, "system"]
+#     lp = data.loc[current_index, "lp"]
+#     doc = data.loc[current_index, "doc"]
+#     id = int(data.loc[current_index, "id"])
+#     reference = data.loc[current_index, "reference"]
+#     id_list = [str(id) for id in sorted(data["id"].unique())]
+#     max_id = int(id_list[-1])  # 取得最大的 ID
+#     new_entry = {
+#         "system": system,
+#         "lp": lp,
+#         "doc": doc,
+#         "id": id,
+#         "src": source,
+#         "mt": target,
+#         "ref": reference,
+#         "score": score,
+#         "rater": rater_selector,
+#     }
+#     save_to_json(new_entry, score_file)
+#     current_index += 1
+#     if current_index >= len(data):
+#         return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {score_file.name}！"
+#     next_source, next_target = get_current_text()
+#     # [source, target, error_span, current_index_display, status]
+#     return next_source, next_target, "", str(current_index), f"分數已保存到 {score_file.name}，請繼續下一筆！"
 def save_and_next(source, target, score, rater_selector):
+    global current_index, data, annotations_file, current_errors, current_others
     system = data.loc[current_index, "system"]
     lp = data.loc[current_index, "lp"]
     doc = data.loc[current_index, "doc"]
     id = int(data.loc[current_index, "id"])
     reference = data.loc[current_index, "reference"]
+    annotations_entry = {
         "system": system,
         "lp": lp,
         "doc": doc,
         "id": id,
+        "rater": rater_selector,
         "src": source,
         "mt": target,
         "ref": reference,
+        "sentence_score": score / 100.0,  # 標準化到 [0, 1]
+        "errors": current_errors,
+        "others": current_others,
     }
+    save_to_json(annotations_entry, annotations_file)
+    # 清空當前錯誤緩存
+    current_errors = []
+    current_others = []
     current_index += 1
     if current_index >= len(data):
+        return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {annotations_file.name}！"
     next_source, next_target = get_current_text()
+    return next_source, next_target, "", str(current_index), f"分數與錯誤已保存到 {annotations_file.name}，請繼續下一筆！"
 def update_file_selection(selected_file):
+    global data_path, data, current_index, annotations_file
     data_path = os.path.join(current_dir, selected_file)
     data = pd.read_csv(data_path)
     file_base_name = os.path.splitext(selected_file)[0]
     annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
+    # score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
     # [source, target, error_span, index_selector, current_index_display, status]
     return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案：{selected_file}")