350016z committed on
Commit
ae82a4a
·
verified ·
1 Parent(s): eda02ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -137
app.py CHANGED
@@ -8,7 +8,8 @@ from huggingface_hub import CommitScheduler, snapshot_download
8
  from uuid import uuid4
9
  import shutil
10
 
11
- # ------------------------- 中英對照 (前端中文顯示, 後台英文儲存) -------------------------
 
12
  category_map = {
13
  "正確性": "Accuracy",
14
  "流暢度": "Fluency",
@@ -16,7 +17,6 @@ category_map = {
16
  "風格": "Style",
17
  "在地化": "Locale"
18
  }
19
-
20
  subcategory_map = {
21
  ("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
22
  ("正確性", "新增"): ("Accuracy", "Addition"),
@@ -44,7 +44,6 @@ subcategory_map = {
44
  ("在地化", "地址格式"): ("Locale", "Address format"),
45
  ("在地化", "其他"): ("Locale", "Other"),
46
  }
47
-
48
  categories_display = {
49
  "正確性": ["誤譯", "新增", "漏譯", "其他"],
50
  "流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
@@ -53,35 +52,50 @@ categories_display = {
53
  "在地化": ["貨幣格式", "時間格式", "人名格式", "日期格式", "地址格式", "其他"]
54
  }
55
 
56
- severity_choices_display = ["輕微 (Minor)", "嚴重 (Major)"]
57
  severity_map = {
58
  "輕微 (Minor)": "Minor",
59
  "嚴重 (Major)": "Major"
60
  }
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  DATASET_DIR = Path("json_dataset")
63
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
64
 
65
  scheduler = CommitScheduler(
66
- repo_id="350016z/TaiwanCOMET_dataset",
67
  repo_type="dataset",
68
  folder_path=DATASET_DIR,
69
  path_in_repo="data"
70
  )
71
 
72
- # ---------------------------下載CSV資料檔--------------------------------
73
  def download_dataset_file(dataset_id, local_dir):
74
  snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
75
  contents = os.listdir(snapshot_path)
76
 
77
  for file_name in contents:
78
- print("Checking file: ", file_name)
79
  if file_name.endswith(".csv"):
80
  source_file_path = os.path.join(snapshot_path, file_name)
81
  local_file_path = os.path.join(local_dir, file_name)
82
  shutil.copy(source_file_path, local_file_path)
83
- print(f"Copied {file_name} to {local_file_path}")
84
- print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
85
  time.sleep(1)
86
  return local_dir
87
 
@@ -95,8 +109,6 @@ if not csv_files:
95
  exit()
96
 
97
  data_path = os.path.join(current_dir, 'test.csv') if 'test.csv' in csv_files else os.path.join(current_dir, csv_files[0])
98
- print(f"Data path: {data_path}")
99
-
100
  if not os.path.exists(data_path):
101
  print(f"Error: {data_path} does not exist. Please check the file path.")
102
  exit()
@@ -106,9 +118,7 @@ current_index = 0
106
  current_errors = []
107
 
108
  annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
109
-
110
- # 存放所有已提交標註(方便後續查看歷史),此範例主要顯示當前紀錄
111
- annotation_history = []
112
 
113
  def get_all_ids():
114
  """
@@ -137,24 +147,12 @@ def save_to_json(entry: dict, json_file: Path):
137
  json.dump(entry, f, ensure_ascii=False)
138
  f.write("\n")
139
 
140
- def get_error_dataframe():
141
- """
142
- 只顯示「text」「severity」「category」三個欄位,不顯示 start/end。
143
- """
144
- df = pd.DataFrame(current_errors)
145
- if df.empty:
146
- return pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"])
147
- # 轉成中文欄位
148
- display_df = pd.DataFrame()
149
- display_df["錯誤文字"] = df["text"]
150
- display_df["嚴重度"] = df["severity"]
151
- display_df["分類"] = df["category"]
152
- return display_df
153
-
154
  def highlight_errors_in_text(text, errors):
155
  """
156
- 在文本中以 <span style="background-color:yellow;">...</span> 方式高亮顯示錯誤區間。
157
  """
 
 
158
  highlighted = ""
159
  last_end = 0
160
  for err in sorted(errors, key=lambda e: e["start"]):
@@ -168,45 +166,139 @@ def highlight_errors_in_text(text, errors):
168
  highlighted += text[last_end:]
169
  return highlighted
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
172
  global current_index, data, current_errors
173
  if len(current_errors) >= 5:
174
- return "", "您已標記超過 5 處錯誤,可直接按「過多錯誤」或繼續標注。"
175
 
176
  if error_span and error_span not in target:
177
  return "", "錯誤區間不存在於翻譯文本,請檢查!"
178
 
179
- # 中英轉換
180
  cat_val, subcat_val = subcategory_map.get((category, subcategory), (category_map.get(category, "Other"), "Other"))
181
  severity_val = severity_map.get(severity, "Minor")
182
 
183
  if error_span:
184
  start = target.find(error_span)
185
  end = start + len(error_span)
 
186
  for err in current_errors:
187
  if err["start"] == start and err["end"] == end:
188
- return "", "此錯誤區間已標記過,請勿重複標記。"
189
- # 若子類別是 "其他" 且 user 有填 other,就使用 other
190
- if subcat_val == "Other" and other:
191
- subcat_val = other
 
192
 
193
  current_errors.append({
194
  "text": error_span,
195
  "severity": severity_val,
196
  "start": start,
197
  "end": end,
198
- "category": f"{cat_val}/{subcat_val}",
199
  })
200
- return "", f"已記錄錯誤區間: {error_span},範圍 {start}-{end}"
201
  else:
202
- return "", "請輸入錯誤區間或點選『完全正確』"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  def save_and_next(source, target, score, rater_selector, alternative_translation):
205
  global current_index, data, annotations_file, current_errors, annotation_history
206
 
207
  if not rater_selector:
208
  return (
209
- source, target, "",
210
  str(data.loc[current_index, "id"]),
211
  "請先選擇標註人員!",
212
  get_error_dataframe(),
@@ -248,17 +340,17 @@ def save_and_next(source, target, score, rater_selector, alternative_translation
248
 
249
  if current_index >= len(data):
250
  return (
251
- "已完成所有文本標記",
252
- "已完成所有文本標記",
253
- "",
254
- "",
255
- f"所有標記已完成並保存到 {annotations_file.name}! (總共 {len(data)} 筆)",
256
  pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
257
  ""
258
  )
259
 
260
  next_source, next_target = get_current_text()
261
- status_msg = f"評分與標記已提交!已完成第 {current_index} 筆 / 共 {len(data)} 筆。"
262
  return (
263
  next_source,
264
  next_target,
@@ -298,45 +390,34 @@ def update_index_selection(selected_display):
298
  selected_id = parse_id_from_display(selected_display)
299
  row_list = data.index[data["id"] == selected_id].tolist()
300
  if not row_list:
301
- return "", "", str(selected_id), f"找不到id: {selected_id}", get_error_dataframe(), ""
 
 
 
 
 
302
  current_index = row_list[0]
303
  src, tgt = get_current_text()
304
- return src, tgt, str(selected_id), f"已跳轉至 id: {selected_id}", get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)
 
 
 
 
 
 
305
 
306
- def mark_as_correct():
307
- global current_errors
308
- current_errors.append({
309
- "text": "",
310
- "severity": "No-error",
311
- "start": 0,
312
- "end": 0,
313
- "category": "No-error"
314
- })
315
- return "", "標註為完全正確,無錯誤!", get_error_dataframe()
316
 
317
- def mark_as_too_many_errors():
318
- global current_errors
319
- current_errors.append({
320
- "text": "",
321
- "severity": "Major",
322
- "start": 0,
323
- "end": 0,
324
- "category": "Non-translation"
325
- })
326
- return "", "已標註為過多錯誤!", get_error_dataframe()
327
 
328
- DEMO_EXPLANATION = """
329
- ## 翻譯標記工具
330
- 1. 選擇檔案、標註人員、以及想要檢視的索引(句子)。
331
- 2. 檢查「翻譯文本」是否有錯誤,如有,請選擇「錯誤類別」、「子類別」、「嚴重度」,並在「錯誤區間」貼上有問題的翻譯文字。
332
- 3. 按「保存並繼續標記當前資料」,錯誤會暫時列在右方的「當前句子錯誤紀錄」中。
333
- 4. 全部錯誤標記完後,可給分(0-100),並可在「建議翻譯」中填寫更好的譯文。
334
- 5. 按「保存並顯示下一筆」,會提交當前這筆紀錄並跳至下一筆。
335
- 6. 若整句都正確,可按「完全正確」。若錯誤超過五處,可按「過多錯誤」。
336
  """
337
 
338
  with gr.Blocks(css="""
339
- /* 調整整體字體大小與行距 */
340
  * {
341
  font-size: 15px;
342
  line-height: 1.4;
@@ -346,24 +427,30 @@ with gr.Blocks(css="""
346
  padding: 10px;
347
  margin-bottom: 10px;
348
  }
349
- /* 調整按鈕外觀 */
350
  #correct_button {
351
- background-color: #4CAF50;
352
  color: white;
353
  font-size: 14px;
354
  margin-bottom: 5px;
355
  }
356
  #too_many_errors_button {
357
- background-color: #f44336;
358
  color: white;
359
  font-size: 14px;
360
  margin-bottom: 5px;
361
  }
 
 
 
 
 
 
362
  """) as demo:
363
  gr.Markdown(DEMO_EXPLANATION)
364
 
365
  with gr.Tab("標記工具"):
366
- # ------------------- 第一行:上方控制 -------------------
367
  with gr.Row():
368
  with gr.Column(scale=1):
369
  rater_selector = gr.Dropdown(
@@ -377,7 +464,7 @@ with gr.Blocks(css="""
377
  value="test.csv"
378
  )
379
  index_selector = gr.Dropdown(
380
- label="選擇索引(id-原文前10字)",
381
  choices=get_all_ids(),
382
  value=f"{data.loc[current_index, 'id']}-{str(data.loc[current_index, 'source'])[:10]}"
383
  )
@@ -387,34 +474,33 @@ with gr.Blocks(css="""
387
  interactive=False
388
  )
389
 
390
- # ----------------- 中間:原始文本 -----------------
391
  with gr.Column(scale=4):
392
- source = gr.Textbox(label="原始文本", lines=5, interactive=False)
393
-
394
- # ----------------- 右側:翻譯文本 -----------------
395
  with gr.Column(scale=4):
396
- target = gr.Textbox(label="翻譯文本", lines=5, interactive=False)
397
 
398
- # ------------------- 第二行:高亮 & 錯誤紀錄 -------------------
399
  with gr.Row():
400
  with gr.Column(scale=5):
401
- highlighted_target = gr.HTML(label="高亮顯示錯誤區間")
 
402
  with gr.Column(scale=5):
403
  error_table = gr.Dataframe(
404
  headers=["錯誤文字", "嚴重度", "分類"],
405
- label="當前句子錯誤紀錄",
406
  datatype=["str", "str", "str"],
407
  interactive=False
408
  )
409
 
410
- # ------------------- 第三行:錯誤標註相關區 -------------------
411
- with gr.Row():
412
- with gr.Column(scale=4):
413
- error_span = gr.Textbox(
414
- label="錯誤區間 (可複製『翻譯文本』貼上)",
415
- lines=2
416
- )
417
- with gr.Column(scale=3):
418
  category = gr.Dropdown(
419
  label="錯誤類別",
420
  choices=list(categories_display.keys()),
@@ -425,89 +511,91 @@ with gr.Blocks(css="""
425
  choices=categories_display["正確性"],
426
  value="誤譯"
427
  )
428
- with gr.Column(scale=3):
429
- other = gr.Textbox(label="其他子類別(如選『其他』則在此填)")
430
  severity = gr.Dropdown(
431
  label="嚴重度",
432
  choices=severity_choices_display,
433
  value="輕微 (Minor)"
434
  )
435
 
436
- # ------------------- 第四行:錯誤標註按鈕區 -------------------
437
- with gr.Row():
438
- save_current_button = gr.Button("保存並繼續標記當前資料")
439
- correct_button = gr.Button(" 完全正確", elem_id="correct_button")
440
- too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
441
-
442
- # ------------------- 第五行:建議翻譯 & 評分 & 送出 -------------------
443
- with gr.Row():
444
- alternative_translation = gr.Textbox(
445
- label="建議翻譯 (如有更好譯法,可填)",
446
- lines=2
447
- )
448
- score = gr.Slider(
449
- label="翻譯評分 (0=最差, 100=最好)",
450
- minimum=0,
451
- maximum=100,
452
- step=1,
453
- value=66
454
- )
 
 
455
  save_next_button = gr.Button("保存並顯示下一筆")
456
 
457
- # ------------------- 狀態顯示 -------------------
458
  status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
459
 
460
- # ------------------- 互動邏輯綁定 -------------------
461
- def update_subcategories(selected_category):
462
  subcats = categories_display[selected_category]
463
  return gr.update(choices=subcats, value=subcats[0])
464
 
465
  file_selector.change(
466
- update_file_selection,
467
- inputs=[file_selector],
468
  outputs=[
469
- source, target, error_span,
470
- index_selector, current_index_display,
471
  status, error_table, highlighted_target
472
  ]
473
  )
474
  index_selector.change(
475
- update_index_selection,
476
- inputs=[index_selector],
477
  outputs=[
478
- source, target, current_index_display,
479
  status, error_table, highlighted_target
480
  ]
481
  )
482
  category.change(
483
- update_subcategories,
484
- inputs=[category],
485
  outputs=[subcategory]
486
  )
487
 
488
  correct_button.click(
489
- mark_as_correct,
490
  outputs=[error_span, status, error_table]
491
  )
492
  too_many_errors_button.click(
493
- mark_as_too_many_errors,
494
  outputs=[error_span, status, error_table]
495
  )
496
 
497
- # 「保存並繼續標記當前資料」(只是暫時往 current_errors 加)
498
  save_current_button.click(
499
- save_current,
500
  inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
501
  outputs=[error_span, status]
502
  )
503
- # 再次更新表格 & 高亮
504
  save_current_button.click(
505
  fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
506
  inputs=[target],
507
  outputs=[error_table, highlighted_target]
508
  )
509
 
510
- # 「保存並顯示下一筆」(正式存檔到 JSON 並跳下一筆)
511
  save_next_button.click(
512
  save_and_next,
513
  inputs=[source, target, score, rater_selector, alternative_translation],
@@ -518,7 +606,7 @@ with gr.Blocks(css="""
518
  ]
519
  )
520
 
521
- # 初始化畫面
522
  init_src, init_tgt = get_current_text()
523
  source.value = init_src
524
  target.value = init_tgt
 
8
  from uuid import uuid4
9
  import shutil
10
 
11
+ # --------------------------- 中英對照的字典 ---------------------------
12
+ # 後端儲存(English),前端顯示(中文)
13
  category_map = {
14
  "正確性": "Accuracy",
15
  "流暢度": "Fluency",
 
17
  "風格": "Style",
18
  "在地化": "Locale"
19
  }
 
20
  subcategory_map = {
21
  ("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
22
  ("正確性", "新增"): ("Accuracy", "Addition"),
 
44
  ("在地化", "地址格式"): ("Locale", "Address format"),
45
  ("在地化", "其他"): ("Locale", "Other"),
46
  }
 
47
  categories_display = {
48
  "正確性": ["誤譯", "新增", "漏譯", "其他"],
49
  "流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
 
52
  "在地化": ["貨幣格式", "時間格式", "人名格式", "日期格式", "地址格式", "其他"]
53
  }
54
 
# UI severity label (Chinese with English gloss) -> English value written to
# the annotation records.
severity_map = {
    "輕微 (Minor)": "Minor",
    "嚴重 (Major)": "Major",
}

# Choices offered by the severity dropdown. Derived from severity_map so the
# dropdown and the translation table cannot drift apart.
severity_choices_display = list(severity_map)

# Reverse lookups used when rendering the error table: stored English value
# -> Chinese text shown to the rater. "No-error" and "Non-translation" are the
# sentinel values written by the "fully correct" / "too many errors" buttons.
severity_display_map = {
    "Minor": "輕微 (Minor)",
    "Major": "嚴重 (Major)",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
}
category_display_map = {
    "Accuracy": "正確性",
    "Fluency": "流暢度",
    "Terminology": "專有名詞",
    "Style": "風格",
    "Locale": "在地化",
    "Other": "其他",
    "No-error": "無錯誤",
    "Non-translation": "過多錯誤",
}
78
+
# --------------------------- Annotation storage & upload ---------------------------
# Annotation files are written under DATASET_DIR. The CommitScheduler is
# configured to push that folder into the "data" directory of the dataset
# repository below (see the huggingface_hub CommitScheduler documentation for
# the upload interval and threading behaviour).
DATASET_DIR = Path("json_dataset")
DATASET_DIR.mkdir(parents=True, exist_ok=True)

scheduler = CommitScheduler(
    repo_id="350016z/TaiwanCOMET_dataset",
    repo_type="dataset",
    folder_path=DATASET_DIR,
    path_in_repo="data",
)
89
 
 
def download_dataset_file(dataset_id, local_dir):
    """Copy every top-level ``.csv`` file of a Hub dataset into ``local_dir``.

    Args:
        dataset_id: Repository id of the dataset, passed to
            ``snapshot_download`` with ``repo_type="dataset"``.
        local_dir: Directory that receives the copied CSV files. It is
            created when it does not exist yet.

    Returns:
        ``local_dir``, unchanged, so the caller can keep using the same path.
    """
    snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")

    # An empty string means "current directory", which always exists and
    # which os.makedirs would reject.
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    for file_name in os.listdir(snapshot_path):
        if not file_name.endswith(".csv"):
            continue
        shutil.copy(
            os.path.join(snapshot_path, file_name),
            os.path.join(local_dir, file_name),
        )
        # NOTE(review): the one-second pause after each copy is kept from the
        # original; its purpose is not visible in this file -- confirm whether
        # it is still needed.
        time.sleep(1)
    return local_dir
101
 
 
109
  exit()
110
 
111
  data_path = os.path.join(current_dir, 'test.csv') if 'test.csv' in csv_files else os.path.join(current_dir, csv_files[0])
 
 
112
  if not os.path.exists(data_path):
113
  print(f"Error: {data_path} does not exist. Please check the file path.")
114
  exit()
 
118
  current_errors = []
119
 
120
  annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
121
+ annotation_history = [] # 若需顯示歷史可擴充
 
 
122
 
123
  def get_all_ids():
124
  """
 
147
  json.dump(entry, f, ensure_ascii=False)
148
  f.write("\n")
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  def highlight_errors_in_text(text, errors):
151
  """
152
+ 在文本中以 <span style="background-color:yellow;">...</span> 方式高亮。
153
  """
154
+ if not text:
155
+ return ""
156
  highlighted = ""
157
  last_end = 0
158
  for err in sorted(errors, key=lambda e: e["start"]):
 
166
  highlighted += text[last_end:]
167
  return highlighted
168
 
# Stored English sub-category -> Chinese label shown in the error table.
# Anything missing here (for example a free-text label typed by the rater)
# is displayed as stored.
_SUBCATEGORY_DISPLAY_MAP = {
    "Mistranslation": "誤譯",
    "Addition": "新增",
    "Omission": "漏譯",
    "Grammar": "文法",
    "Spelling": "拼字",
    "Punctuation": "標點符號",
    "Inconsistency": "前後不一致",
    "Register": "語域",
    "Inappropriate": "使用不當",
    "Inconsistent": "不一致",
    "Awkward": "用字笨拙",
    "Currency format": "貨幣格式",
    "Time format": "時間格式",
    "Name format": "人名格式",
    "Date format": "日期格式",
    "Address format": "地址格式",
    # Previously untranslated, so "Other" leaked into the Chinese-only table.
    "Other": "其他",
}


def _category_to_display(cat_str):
    """Translate a stored category string into its Chinese display form.

    Stored values are either a sentinel ("No-error", "Non-translation"), a
    bare main category ("Accuracy"), or "Main/Sub" ("Accuracy/Mistranslation").
    """
    if cat_str in ("No-error", "Non-translation"):
        return severity_display_map.get(cat_str, cat_str)

    # Split on the first "/" only: a free-text sub-category may contain "/".
    main_cat, separator, sub_cat = cat_str.partition("/")
    main_cat_zh = category_display_map.get(main_cat, main_cat)
    if not separator:
        return main_cat_zh
    return f"{main_cat_zh}/{_SUBCATEGORY_DISPLAY_MAP.get(sub_cat, sub_cat)}"


def get_error_dataframe():
    """Build the table of errors recorded for the current sentence.

    Reads the module-level ``current_errors`` list. The stored records stay
    in English; only this display copy is translated to Chinese.

    Returns:
        A ``pandas.DataFrame`` with the columns "錯誤文字" (error text),
        "嚴重度" (severity) and "分類" (category), one row per recorded error.
        The frame has no rows when nothing has been recorded yet.
    """
    columns = ["錯誤文字", "嚴重度", "分類"]
    if not current_errors:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(
        {
            "錯誤文字": [err["text"] for err in current_errors],
            "嚴重度": [
                severity_display_map.get(err["severity"], err["severity"])
                for err in current_errors
            ],
            "分類": [_category_to_display(err["category"]) for err in current_errors],
        },
        columns=columns,
    )
238
+
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span for the sentence currently being annotated.

    The span is appended to the module-level ``current_errors`` list using the
    English category/severity values; nothing is written to disk here.

    Args:
        source: Source text (accepted for the UI wiring, not used).
        target: Translated text the span must occur in.
        rater_selector: Selected rater (accepted for the UI wiring, not used).
        error_span: Text copied from ``target`` that is wrong.
        category: Chinese main-category label chosen in the UI.
        subcategory: Chinese sub-category label chosen in the UI.
        severity: Severity label chosen in the UI; unknown labels fall back
            to "Minor".
        other: Free-text sub-category, used only when the selected
            sub-category maps to "Other". May be ``None`` or blank.

    Returns:
        ``("", status_message)``: the empty string clears the error-span
        textbox, the message is shown in the status field.
    """
    # NOTE(review): the message says the rater may continue, yet no further
    # span is recorded once five entries exist -- confirm the intended limit.
    if len(current_errors) >= 5:
        return "", "您已標記超過 5 處錯誤,可直接按『過多錯誤』或繼續。"

    if not error_span:
        return "", "尚未輸入錯誤區間,如無錯誤請按『完全正確』"

    # Treat a missing translation like an empty one instead of raising.
    target = target or ""
    if error_span not in target:
        return "", "錯誤區間不存在於翻譯文本,請檢查!"

    # Chinese UI labels -> English values that are stored.
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")

    # NOTE(review): only the first occurrence is located, so a phrase that
    # appears twice in the translation can be marked once.
    start = target.find(error_span)
    end = start + len(error_span)
    if any(err["start"] == start and err["end"] == end for err in current_errors):
        return "", "此錯誤區間已標記過,請勿重複。"

    # A custom label replaces the generic "Other"; `other` can be None when
    # the textbox was never touched.
    custom_label = (other or "").strip()
    if subcat_val == "Other" and custom_label:
        subcat_val = custom_label

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": f"{cat_val}/{subcat_val}",
    })
    return "", f"已標記錯誤: {error_span} (範圍 {start}-{end})"
273
+
def mark_as_correct():
    """Flag the current sentence as containing no translation error.

    Adds a "No-error" sentinel record to the module-level ``current_errors``
    list. Clicking the button again does not add a second identical record.

    Returns:
        ``("", status_message, error_table)``: the empty string clears the
        error-span textbox; the table comes from ``get_error_dataframe()``.
    """
    no_error_entry = {
        "text": "",
        "severity": "No-error",
        "start": 0,
        "end": 0,
        "category": "No-error",
    }
    # Repeated clicks used to stack duplicate sentinel rows in the record.
    if no_error_entry not in current_errors:
        current_errors.append(no_error_entry)
    return "", "標註為完全正確!", get_error_dataframe()
284
+
def mark_as_too_many_errors():
    """Flag the current sentence as having too many errors to mark one by one.

    Adds a "Non-translation" sentinel record with severity "Major" to the
    module-level ``current_errors`` list. Clicking the button again does not
    add a second identical record.

    Returns:
        ``("", status_message, error_table)``: the empty string clears the
        error-span textbox; the table comes from ``get_error_dataframe()``.
    """
    non_translation_entry = {
        "text": "",
        "severity": "Major",
        "start": 0,
        "end": 0,
        "category": "Non-translation",
    }
    # Repeated clicks used to stack duplicate sentinel rows in the record.
    if non_translation_entry not in current_errors:
        current_errors.append(non_translation_entry)
    return "", "已標註為過多錯誤!", get_error_dataframe()
295
 
296
  def save_and_next(source, target, score, rater_selector, alternative_translation):
297
  global current_index, data, annotations_file, current_errors, annotation_history
298
 
299
  if not rater_selector:
300
  return (
301
+ source, target, "", # return empty error_span
302
  str(data.loc[current_index, "id"]),
303
  "請先選擇標註人員!",
304
  get_error_dataframe(),
 
340
 
341
  if current_index >= len(data):
342
  return (
343
+ "已完成所有文本標記", # source
344
+ "已完成所有文本標記", # target
345
+ "", # error_span
346
+ "", # current_index_display
347
+ f"標記完成並儲存到 {annotations_file.name}!( {len(data)} 筆)",
348
  pd.DataFrame(columns=["錯誤文字", "嚴重度", "分類"]),
349
  ""
350
  )
351
 
352
  next_source, next_target = get_current_text()
353
+ status_msg = f"已提交!目前進度:第 {current_index} 筆 / 共 {len(data)} 筆。"
354
  return (
355
  next_source,
356
  next_target,
 
390
  selected_id = parse_id_from_display(selected_display)
391
  row_list = data.index[data["id"] == selected_id].tolist()
392
  if not row_list:
393
+ return (
394
+ "", "", str(selected_id),
395
+ f"找不到 id: {selected_id}",
396
+ get_error_dataframe(),
397
+ ""
398
+ )
399
  current_index = row_list[0]
400
  src, tgt = get_current_text()
401
+ return (
402
+ src, tgt,
403
+ str(selected_id),
404
+ f"已跳轉至 id={selected_id}",
405
+ get_error_dataframe(),
406
+ highlight_errors_in_text(tgt, current_errors)
407
+ )
408
 
# Markdown instructions rendered at the top of the annotation page.
DEMO_EXPLANATION = """
## 翻譯標記工具:階段性操作流程

### 操作步驟
1. **先選擇標註人員與檔案**,並在「索引」下拉中挑選要標註的句子。
2. 在「步驟 1:錯誤標註」中,若翻譯文本有錯,請輸入「錯誤區間」、選擇「錯誤類別/子類別/嚴重度」並點「保存並繼續標記」。多個錯誤可重複此步驟;若無錯誤則可直接點「完全正確」。
3. 錯誤標完後,在「步驟 2:評分與提交」中,拉動滑桿給分,若有更好譯文,可在「建議翻譯」填入。再按「保存並顯示下一筆」送出本句標註並進入下一句。

"""
418
 
419
  with gr.Blocks(css="""
420
+ /* 整體字體與行距 */
421
  * {
422
  font-size: 15px;
423
  line-height: 1.4;
 
427
  padding: 10px;
428
  margin-bottom: 10px;
429
  }
430
+ /* 按鈕分色 */
431
  #correct_button {
432
+ background-color: #4CAF50; /* 綠 */
433
  color: white;
434
  font-size: 14px;
435
  margin-bottom: 5px;
436
  }
437
  #too_many_errors_button {
438
+ background-color: #f44336; /* 紅 */
439
  color: white;
440
  font-size: 14px;
441
  margin-bottom: 5px;
442
  }
443
+ /* 螢光標示外層加框,便於視覺聚焦 */
444
+ #highlight_box {
445
+ border: 1px solid #aaa;
446
+ padding: 10px;
447
+ min-height: 80px;
448
+ }
449
  """) as demo:
450
  gr.Markdown(DEMO_EXPLANATION)
451
 
452
  with gr.Tab("標記工具"):
453
+ # ------------------- 頂部: 檔案 & 索引控制 -------------------
454
  with gr.Row():
455
  with gr.Column(scale=1):
456
  rater_selector = gr.Dropdown(
 
464
  value="test.csv"
465
  )
466
  index_selector = gr.Dropdown(
467
+ label="選擇索引 (id-原文前10字)",
468
  choices=get_all_ids(),
469
  value=f"{data.loc[current_index, 'id']}-{str(data.loc[current_index, 'source'])[:10]}"
470
  )
 
474
  interactive=False
475
  )
476
 
477
+ # 左: 原始文本 / 右: 翻譯文本
478
  with gr.Column(scale=4):
479
+ source = gr.Textbox(label="原始文本", lines=4, interactive=False)
 
 
480
  with gr.Column(scale=4):
481
+ target = gr.Textbox(label="翻譯文本", lines=4, interactive=False)
482
 
483
+ # ------------------- 螢光標記區(帶外框)&錯誤紀錄表 -------------------
484
  with gr.Row():
485
  with gr.Column(scale=5):
486
+ with gr.Box(elem_id="highlight_box"):
487
+ highlighted_target = gr.HTML(value="", label="螢光標示區 (已標註的錯誤)")
488
  with gr.Column(scale=5):
489
  error_table = gr.Dataframe(
490
  headers=["錯誤文字", "嚴重度", "分類"],
491
+ label="當前句子錯誤紀錄 (中文顯示)",
492
  datatype=["str", "str", "str"],
493
  interactive=False
494
  )
495
 
496
+ # ------------------- 步驟1:錯誤標註 -------------------
497
+ with gr.Box(elem_id="step1_box", css="panel"):
498
+ gr.Markdown("### 步驟 1:錯誤標註")
499
+
500
+ with gr.Row():
501
+ # 錯誤區間 / 錯誤類別 / 子類別 / 嚴重度
502
+ error_span = gr.Textbox(label="錯誤區間 (可複製『翻譯文本』貼上)", lines=2)
503
+
504
  category = gr.Dropdown(
505
  label="錯誤類別",
506
  choices=list(categories_display.keys()),
 
511
  choices=categories_display["正確性"],
512
  value="誤譯"
513
  )
514
+ other = gr.Textbox(label="其他子類別(如選『其他』則填寫)")
515
+
516
  severity = gr.Dropdown(
517
  label="嚴重度",
518
  choices=severity_choices_display,
519
  value="輕微 (Minor)"
520
  )
521
 
522
+ with gr.Row():
523
+ save_current_button = gr.Button("保存並繼續標記當前資料")
524
+ correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
525
+ too_many_errors_button = gr.Button(" 過多錯誤", elem_id="too_many_errors_button")
526
+
527
+ # ------------------- 步驟2:評分與提交 -------------------
528
+ with gr.Box(elem_id="step2_box", css="panel"):
529
+ gr.Markdown("### 步驟 2:評分與提交")
530
+ with gr.Row():
531
+ alternative_translation = gr.Textbox(
532
+ label="建議翻譯(如有更好譯法可填)",
533
+ lines=2
534
+ )
535
+ score = gr.Slider(
536
+ label="翻譯評分 (0=最差, 100=最好)",
537
+ minimum=0,
538
+ maximum=100,
539
+ step=1,
540
+ value=66
541
+ )
542
+ # 提交按鈕
543
  save_next_button = gr.Button("保存並顯示下一筆")
544
 
545
+ # 最下方: 狀態
546
  status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
547
 
548
+ # ------------------- 邏輯綁定 -------------------
549
+ def update_subcats(selected_category):
550
  subcats = categories_display[selected_category]
551
  return gr.update(choices=subcats, value=subcats[0])
552
 
553
  file_selector.change(
554
+ update_file_selection,
555
+ inputs=[file_selector],
556
  outputs=[
557
+ source, target, error_span,
558
+ index_selector, current_index_display,
559
  status, error_table, highlighted_target
560
  ]
561
  )
562
  index_selector.change(
563
+ update_index_selection,
564
+ inputs=[index_selector],
565
  outputs=[
566
+ source, target, current_index_display,
567
  status, error_table, highlighted_target
568
  ]
569
  )
570
  category.change(
571
+ update_subcats,
572
+ inputs=[category],
573
  outputs=[subcategory]
574
  )
575
 
576
  correct_button.click(
577
+ mark_as_correct,
578
  outputs=[error_span, status, error_table]
579
  )
580
  too_many_errors_button.click(
581
+ mark_as_too_many_errors,
582
  outputs=[error_span, status, error_table]
583
  )
584
 
585
+ # 按「保存並繼續標記」 -> 在同一句上加錯誤
586
  save_current_button.click(
587
+ save_current,
588
  inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
589
  outputs=[error_span, status]
590
  )
591
+ # 再更新表格 & 高亮
592
  save_current_button.click(
593
  fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
594
  inputs=[target],
595
  outputs=[error_table, highlighted_target]
596
  )
597
 
598
+ # 按「保存並顯示下一筆」 -> 送出當前整句標註 & 進下一句
599
  save_next_button.click(
600
  save_and_next,
601
  inputs=[source, target, score, rater_selector, alternative_translation],
 
606
  ]
607
  )
608
 
609
+ # 初始化介面
610
  init_src, init_tgt = get_current_text()
611
  source.value = init_src
612
  target.value = init_tgt