350016z committed on
Commit
b0d473b
·
verified ·
1 Parent(s): 437738a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +447 -138
app.py CHANGED
@@ -9,6 +9,65 @@ from uuid import uuid4
9
  from datasets import load_dataset
10
  import shutil
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  DATASET_DIR = Path("json_dataset")
13
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
14
 
@@ -19,9 +78,7 @@ scheduler = CommitScheduler(
19
  path_in_repo="data"
20
  )
21
 
22
- # Loading dataset from HuggingFace -------------------------------------------------------------------------------------
23
  def download_dataset_file(dataset_id, local_dir):
24
- # /home/user/.cache/huggingface/hub/datasets--350016z--Taiwanese_dataset/snapshots/22594253c63bd80e85b5255f948432014c37373a
25
  snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
26
  contents = os.listdir(snapshot_path)
27
 
@@ -33,10 +90,7 @@ def download_dataset_file(dataset_id, local_dir):
33
 
34
  shutil.copy(source_file_path, local_file_path)
35
  print(f"Copied {file_name} to {local_file_path}")
36
-
37
- # Check file permissions
38
  print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
39
-
40
  time.sleep(1)
41
 
42
  return local_dir
@@ -57,19 +111,33 @@ if not os.path.exists(data_path):
57
  print(f"Error: {data_path} does not exist. Please check the file path.")
58
  exit()
59
 
60
-
61
- # Loading & Setting --------------------------------------------------------------------------------------------------
62
- data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
63
 
64
  current_index = 0
65
  current_errors = []
66
 
67
  annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
68
- # ---------------------------------------------------------------------------------------------------------------------
 
 
 
69
 
70
  def get_all_ids():
71
- return [str(id) for id in data["id"].tolist()]
72
-
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def get_current_text():
74
  global current_index, data
75
  source = data.loc[current_index, "source"]
@@ -77,122 +145,211 @@ def get_current_text():
77
  return source, target
78
 
79
  def save_to_json(entry: dict, json_file: Path):
80
- """
81
- 將資料保存到指定的 JSON 檔案,並推送到 Hugging Face Dataset。
82
- """
83
  with scheduler.lock:
84
  with json_file.open("a") as f:
85
  json.dump(entry, f, ensure_ascii=False)
86
  f.write("\n")
 
87
  # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
91
  global current_index, data, current_errors
92
-
93
- system = data.loc[current_index, "system"]
94
- lp = data.loc[current_index, "lp"]
95
- doc = data.loc[current_index, "doc"]
96
- id = int(data.loc[current_index, "id"])
97
- reference = data.loc[current_index, "reference"]
98
 
99
- if subcategory:
100
- if subcategory == "Other":
101
- category_value = f"{category}/{other}"
102
- else:
103
- category_value = f"{category}/{subcategory}"
104
-
 
 
105
 
106
- if error_span and error_span in target:
107
  start = target.find(error_span)
108
  end = start + len(error_span)
109
- print(f"start: {start}, end: {end}")
 
 
 
 
 
 
 
 
 
 
 
 
110
  else:
111
- return "", "錯誤區間不存在於翻譯文本中,請檢查!"
112
-
113
- current_errors.append({
114
- "text": error_span,
115
- "severity": severity,
116
- "start": start,
117
- "end": end,
118
- "category": category_value,
119
- })
120
-
121
- # [error_span, status]
122
- return "", f"已記錄錯誤區間: {error_span},範圍 {start}-{end}。"
123
-
124
-
125
- def save_and_next(source, target, score, rater_selector):
126
- global current_index, data, annotations_file, current_errors
 
 
 
 
 
 
 
 
 
 
127
 
128
  system = data.loc[current_index, "system"]
129
  lp = data.loc[current_index, "lp"]
130
  doc = data.loc[current_index, "doc"]
131
- id = int(data.loc[current_index, "id"])
132
  reference = data.loc[current_index, "reference"]
133
 
134
  annotations_entry = {
135
  "system": system,
136
  "lp": lp,
137
  "doc": doc,
138
- "id": id,
139
  "rater": rater_selector,
140
  "src": source,
141
  "mt": target,
142
  "ref": reference,
143
  "esa_score": score,
144
  "esa_spans": current_errors,
 
145
  }
146
  save_to_json(annotations_entry, annotations_file)
147
 
148
- # 清空當前錯誤緩存
 
 
 
149
  current_errors = []
150
 
151
  current_index += 1
152
  if current_index >= len(data):
153
- return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {annotations_file.name}!"
 
 
 
 
 
 
 
 
154
 
155
  next_source, next_target = get_current_text()
156
- return next_source, next_target, "", str(current_index), f"分數與錯誤已保存到 {annotations_file.name},請繼續下一筆!"
157
-
 
 
 
 
 
 
 
 
 
158
 
159
  def update_file_selection(selected_file):
160
- global data_path, data, current_index, annotations_file
161
  data_path = os.path.join(current_dir, selected_file)
162
- data = pd.read_csv(data_path)
163
-
164
- id_list = [str(id) for id in sorted(data["id"].unique())] # 轉為字串,確保 Gradio Dropdown 兼容
165
- min_id = int(id_list[0]) # 取得最小的 ID
166
-
167
- current_index = data.index[data["id"] == int(min_id)].tolist()[0] # DataFrame 的行索引(row index);而非檔案中的id
 
168
 
169
  file_base_name = os.path.splitext(selected_file)[0]
 
170
  annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
171
 
172
- # [source, target, error_span, index_selector, current_index_display, status]
173
- return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案:{selected_file}")
174
-
175
-
176
- def update_index_selection(selected_index):
177
- global current_index, data
178
- selected_index = int(selected_index)
179
- current_index = data.index[data["id"] == selected_index].tolist()[0]
180
- # [source, target, current_index_display, status]
181
- return get_current_text() + (str(selected_index), f"已跳轉至 id: {selected_index}")
182
-
183
- categories = {
184
- "Accuracy": ["Mistranslation", "Addition", "Omission", "Other"],
185
- "Fluency": ["Grammar", "Spelling", "Punctuation", "Inconsistency", "Register", "Other"],
186
- "Terminology": ["Inappropriate", "Inconsistent", "Other"],
187
- "Style": ["Awkward", "Other"],
188
- "Locale": ["Currency format", "Time format", "Name format", "Date format", "Address format", "Other"],
189
- }
190
- severity_choices = ["Minor", "Major"]
191
- rater = ['rater1', 'rater2','rater3', 'rater4', 'rater5', 'rater6', 'rater7']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  def mark_as_correct():
194
  global current_errors
195
-
196
  current_errors.append({
197
  "text": "",
198
  "severity": "No-error",
@@ -200,11 +357,14 @@ def mark_as_correct():
200
  "end": 0,
201
  "category": "No-error"
202
  })
203
- return "", "標註為完全正確,無錯誤!"
204
-
 
 
 
 
205
  def mark_as_too_many_errors():
206
  global current_errors
207
-
208
  current_errors.append({
209
  "text": "",
210
  "severity": "Major",
@@ -212,104 +372,253 @@ def mark_as_too_many_errors():
212
  "end": 0,
213
  "category": "Non-translation"
214
  })
215
- return "", "已標註為過多錯誤!"
216
-
 
 
 
 
 
 
217
  DEMO_EXPLANATION = """
218
  ## 翻譯標記工具
219
- ### 使用規則 [更多細節](https://huggingface.co/spaces/350016z/TranslationError_Gradio/blob/main/README.md)
220
- 1. **開始作業**
221
- - 在「標註人員」選擇您的編號以識別。
222
- - 左側「原始文本」顯示原文,右側「翻譯文本」為機器翻譯結果,請檢查右側內容是否有錯誤。
223
- 2. **錯誤標註**
224
- - 發現翻譯錯誤時,將錯誤部分標註到「錯誤區間」欄位,錯誤需連接成最長可能區間,若中間有正確翻譯,需分段標註,避免連續標記。
225
- - 若有多處錯誤,可逐一標註並點擊「保存並繼續標記當前資料」後繼續修正。
226
- - 若錯誤超過五處,直接按下「過多錯誤」按鈕,再進行後續的評分。
227
- - 若無錯誤,直接按下「完全正確」按鈕,再進行後續的評分。
228
- 3. **評分**
229
- - 標記完所有錯誤區間以後,對每個翻譯文本的整體品質進行評分 (0-100分,0分最差,100分最好)。
230
- - 0:幾乎沒有保留原文意思,大部分資訊遺失。
231
- - 33:保留部分原文意思,但有明顯遺漏,敘述難以理解,文法可能很差。
232
- - 66:保留大部分原文意思,有一些文法錯誤或輕微不一致。
233
- - 100:原文意思和文法完全正確。
234
- (即使選擇 **「完全正確」**,分數也不一定需要評100分)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  """
236
 
237
  with gr.Blocks(css="""
 
 
 
 
 
 
 
 
 
 
238
  #correct_button {
239
- background-color: #4CAF50;
240
  color: white;
241
- font-size: 12px;
242
  padding: 5px 5px;
243
  border-radius: 5px;
244
  min-height: 0px;
 
245
  }
246
  #too_many_errors_button {
247
- background-color: #f44336;
248
  color: white;
249
- font-size: 12px;
250
  padding: 5px 5px;
251
  border-radius: 5px;
252
  min-height: 0px;
 
 
 
 
 
253
  }
254
  """) as demo:
255
  gr.Markdown(DEMO_EXPLANATION)
256
 
257
-
258
- with gr.Tab("標記工具"):
259
  with gr.Row():
260
  with gr.Column(scale=1):
261
- rater_selector = gr.Dropdown(label="標註人員", choices=rater, value="rater1")
262
- file_selector = gr.Dropdown(label="選擇檔案", choices=csv_files, value="test.csv")
263
- index_selector = gr.Dropdown(label="選擇索引", choices=get_all_ids())
264
- current_index_display = gr.Textbox(label="當前索引", value=str(current_index), interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  with gr.Column(scale=8):
266
- source = gr.Textbox(label="原始文本", lines=15, interactive=False)
267
  with gr.Column(scale=8):
268
- target = gr.Textbox(label="翻譯文本", lines=15, interactive=False)
269
 
 
 
 
 
 
 
 
270
  with gr.Row(variant='panel', equal_height=True):
271
  with gr.Column(scale=3):
272
- error_span = gr.Textbox(label="錯誤區間 (💡可以直接複製「翻譯文本」欄位,並在此貼上)", lines=6, placeholder="請輸入翻譯中文本的錯誤區間 (如無錯誤則不需)")
 
 
 
 
273
  with gr.Column(scale=3):
274
  with gr.Row(equal_height=True):
275
- category = gr.Dropdown(label="錯誤類別", choices=list(categories.keys()), value="Accuracy")
276
- subcategory = gr.Dropdown(label="子類別", choices=categories["Accuracy"], value="Mistranslation")
277
- other = gr.Textbox(label="其他子類別", placeholder="若無法歸類,請填寫其他")
278
- with gr.Row(equal_height=True):
279
- severity = gr.Radio(label="錯誤嚴重程度", choices=severity_choices, value="Minor")
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  save_current_button = gr.Button("保存並繼續標記當前資料")
281
- with gr.Column(scale=1):
282
  correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
283
  too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
284
 
 
 
 
 
 
 
 
 
 
 
285
  with gr.Row(variant='panel', equal_height=True):
286
- with gr.Column(scale=8):
287
- score = gr.Slider(label="翻譯評分", minimum=0, maximum=100, step=1, value=66)
 
 
 
 
 
 
 
 
 
 
 
288
  with gr.Column(scale=1):
289
  save_next_button = gr.Button("保存並顯示下一筆")
290
 
 
291
  status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
292
 
 
293
  def update_subcategories(selected_category):
294
- subcategories = categories[selected_category]
295
  if subcategories:
296
  return gr.update(choices=subcategories, value=subcategories[0])
297
  else:
298
  return gr.update(choices=[], value=None)
299
-
300
-
301
- file_selector.change(update_file_selection, inputs=[file_selector], outputs=[source, target, error_span, index_selector, current_index_display, status])
302
- index_selector.change(update_index_selection, inputs=[index_selector], outputs=[source, target, current_index_display, status])
303
- category.change(update_subcategories, inputs=[category], outputs=[subcategory])
304
-
305
- correct_button.click(mark_as_correct, outputs=[error_span, status])
306
- too_many_errors_button.click(mark_as_too_many_errors, outputs=[error_span, status])
307
-
308
- save_current_button.click(save_current, inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other], outputs=[error_span, status])
309
- save_next_button.click(save_and_next, inputs=[source, target, score, rater_selector], outputs=[source, target, error_span, current_index_display, status])
310
-
311
- original, translated = get_current_text()
312
- source.value = original
313
- target.value = translated
314
-
315
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from datasets import load_dataset
10
  import shutil
11
 
12
+ # ------------------------- 更新:新增顯示用與儲存用的雙層字典 (中文顯示 → 英文儲存) -------------------------
13
+ # 以下為「錯誤類別」(category) 與「子類別」(subcategory) 的中英對照
14
+ category_map = {
15
+ "正確性": "Accuracy",
16
+ "流暢度": "Fluency",
17
+ "專有名詞": "Terminology",
18
+ "風格": "Style",
19
+ "在地化": "Locale"
20
+ }
21
+
22
+ subcategory_map = {
23
+ # 正確性
24
+ ("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
25
+ ("正確性", "新增"): ("Accuracy", "Addition"),
26
+ ("正確性", "漏譯"): ("Accuracy", "Omission"),
27
+ ("正確性", "其他"): ("Accuracy", "Other"),
28
+
29
+ # 流暢度
30
+ ("流暢度", "文法"): ("Fluency", "Grammar"),
31
+ ("流暢度", "拼字"): ("Fluency", "Spelling"),
32
+ ("流暢度", "標點符號"): ("Fluency", "Punctuation"),
33
+ ("流暢度", "前後不一致"): ("Fluency", "Inconsistency"),
34
+ ("流暢度", "語域"): ("Fluency", "Register"),
35
+ ("流暢度", "其他"): ("Fluency", "Other"),
36
+
37
+ # 專有名詞
38
+ ("專有名詞", "使用不當"): ("Terminology", "Inappropriate"),
39
+ ("專有名詞", "不一致"): ("Terminology", "Inconsistent"),
40
+ ("專有名詞", "其他"): ("Terminology", "Other"),
41
+
42
+ # 風格
43
+ ("風格", "用字笨拙"): ("Style", "Awkward"),
44
+ ("風格", "其他"): ("Style", "Other"),
45
+
46
+ # 在地化
47
+ ("在地化", "貨幣格式"): ("Locale", "Currency format"),
48
+ ("在地化", "時間格式"): ("Locale", "Time format"),
49
+ ("在地化", "人名格式"): ("Locale", "Name format"),
50
+ ("在地化", "日期格式"): ("Locale", "Date format"),
51
+ ("在地化", "地址格式"): ("Locale", "Address format"),
52
+ ("在地化", "其他"): ("Locale", "Other"),
53
+ }
54
+
55
+ # 這些為前端顯示的中文選項,對應到上面 map 中的 key
56
+ categories_display = {
57
+ "正確性": ["誤譯", "新增", "漏譯", "其他"],
58
+ "流暢度": ["文法", "拼字", "標點符號", "前後不一致", "語域", "其他"],
59
+ "專有名詞": ["使用不當", "不一致", "其他"],
60
+ "風格": ["用字笨拙", "其他"],
61
+ "在地化": ["貨幣格式", "時間格式", "人名格式", "日期格式", "地址格式", "其他"]
62
+ }
63
+
64
+ severity_choices_display = ["輕微 (Minor)", "嚴重 (Major)"] # 仍然儲存成 Minor / Major
65
+ severity_map = {
66
+ "輕微 (Minor)": "Minor",
67
+ "嚴重 (Major)": "Major"
68
+ }
69
+
70
+ # ---------------------------------- 其餘程式基本結構不變 -------------------------------------
71
  DATASET_DIR = Path("json_dataset")
72
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
73
 
 
78
  path_in_repo="data"
79
  )
80
 
 
81
  def download_dataset_file(dataset_id, local_dir):
 
82
  snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
83
  contents = os.listdir(snapshot_path)
84
 
 
90
 
91
  shutil.copy(source_file_path, local_file_path)
92
  print(f"Copied {file_name} to {local_file_path}")
 
 
93
  print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
 
94
  time.sleep(1)
95
 
96
  return local_dir
 
111
  print(f"Error: {data_path} does not exist. Please check the file path.")
112
  exit()
113
 
114
+ data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
 
 
115
 
116
  current_index = 0
117
  current_errors = []
118
 
119
  annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
120
+
121
+ # --------------------- 改善:可顯示歷史紀錄並顯示錯誤區間狀態 ---------------------
122
+ # 新增一個資料結構「annotation_history」用來暫存所有標記結果
123
+ annotation_history = []
124
 
125
def get_all_ids():
    """Build the labels offered by the index dropdown.

    Each label has the form ``"{id}-{preview}"`` where ``preview`` is the
    first 10 characters of the row's ``source`` text with newlines replaced
    by spaces. ``parse_id_from_display`` turns a label back into the id.

    Returns:
        list[str]: one label per row of the module-level ``data`` frame,
        in row order.
    """
    labels = []
    # Walk the two columns positionally; looking rows up with
    # ``data.loc[i, ...]`` for ``i in range(len(data))`` only works while
    # the index labels happen to be exactly 0..n-1.
    for row_id, src in zip(data["id"], data["source"]):
        preview = str(src)[:10].replace("\n", " ")
        labels.append(f"{row_id}-{preview}")
    return labels
135
+
136
def parse_id_from_display(display_str):
    """Extract the numeric id from a dropdown label of the form ``"{id}-{text}"``.

    Args:
        display_str: label such as ``"12-Hello wor"``; a bare id such as
            ``"12"`` is accepted as well.

    Returns:
        int: the id that precedes the first separator.

    Raises:
        ValueError: if the label does not start with an integer followed by
            ``-`` or the end of the string (for example ``None`` or
            ``"<NA>-text"``).
    """
    import re

    # Match an optional sign plus digits instead of splitting on the first
    # "-": a split returns an empty string for negative ids ("-3-text").
    match = re.match(r"\s*([+-]?\d+)\s*(?:-|$)", str(display_str))
    if match is None:
        raise ValueError(f"cannot parse an id from {display_str!r}")
    return int(match.group(1))
140
+
141
  def get_current_text():
142
  global current_index, data
143
  source = data.loc[current_index, "source"]
 
145
  return source, target
146
 
147
def save_to_json(entry: dict, json_file: Path):
    """Append ``entry`` to ``json_file`` as a single JSON line.

    The write happens while holding ``scheduler.lock``.

    Args:
        entry: JSON-serializable annotation record.
        json_file: path of the JSON-lines file to append to.
    """
    with scheduler.lock:
        # ``ensure_ascii=False`` emits non-ASCII text verbatim, so the file
        # encoding must be pinned rather than left to the platform default.
        with json_file.open("a", encoding="utf-8") as f:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
    # To push to the hub immediately, uncomment the next line.
    # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
154
 
155
+ # -------------------------- 新增:將錯誤標示記錄在前端介面 --------------------------
156
def get_error_dataframe():
    """Return the pending error spans of the current item as a DataFrame.

    Reads the module-level ``current_errors`` list (spans recorded for the
    item being annotated, not yet submitted).

    Returns:
        pandas.DataFrame: columns ``text``, ``severity``, ``start``, ``end``
        and ``category``; an empty frame with those columns when nothing has
        been recorded.
    """
    columns = ["text", "severity", "start", "end", "category"]
    pending = pd.DataFrame(current_errors)
    return pending[columns] if not pending.empty else pd.DataFrame(columns=columns)
166
+
167
+ # ---------------------- 高亮顯示錯誤區間 (基於 HTML) 的示範 ----------------------
168
def highlight_errors_in_text(text, errors):
    """Render ``text`` as HTML with every error span on a yellow background.

    Args:
        text: the plain translation text.
        errors: iterable of dicts carrying integer ``start`` and ``end``
            offsets into ``text`` (end exclusive).

    Returns:
        str: HTML in which each span is wrapped in
        ``<span style='background-color:yellow;'>``. Spans that fall outside
        ``text`` or are empty are ignored; where spans overlap, only the part
        not already highlighted is wrapped, so no character is emitted twice.
    """
    import html

    pieces = []
    cursor = 0  # offset up to which ``text`` has already been emitted
    for err in sorted(errors, key=lambda e: e["start"]):
        begin, stop = err["start"], err["end"]
        # Guard: ignore spans that do not fit inside the text.
        if begin < 0 or stop > len(text):
            continue
        # Clip the overlap with the previous span; without this the
        # overlapping characters would be written a second time.
        begin = max(begin, cursor)
        if begin >= stop:
            # Empty span (e.g. the 0-0 "No-error" marker) or fully covered.
            continue
        # The result is rendered as HTML, so the raw text must be escaped.
        pieces.append(html.escape(text[cursor:begin]))
        pieces.append(
            f"<span style='background-color:yellow;'>{html.escape(text[begin:stop])}</span>"
        )
        cursor = stop
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)
187
 
188
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span for the item currently being annotated.

    The span is appended to the module-level ``current_errors`` list; nothing
    is written to disk here.

    Args:
        source: source text (unused here; supplied by the UI wiring).
        target: translation text the span must occur in.
        rater_selector: selected rater (unused here).
        error_span: text copied from ``target`` that is considered wrong.
        category: displayed (Chinese) error category.
        subcategory: displayed (Chinese) error subcategory.
        severity: displayed severity label.
        other: free-text subcategory, used when the subcategory maps to
            ``"Other"``.

    Returns:
        tuple[str, str]: new value for the error-span textbox (always empty)
        and a status message.
    """
    global current_index, data, current_errors

    if not error_span:
        return "", "請輸入錯誤區間,或選擇『完全正確』按鈕。"
    # Guard: the span has to be present in the translation text.
    if error_span not in target:
        return "", "錯誤區間不存在於翻譯文本中,請檢查!"

    # Map the displayed category / subcategory / severity to stored English values.
    cat_val, subcat_val = subcategory_map.get(
        (category, subcategory),
        (category_map.get(category, "Other"), "Other"),
    )
    severity_val = severity_map.get(severity, "Minor")

    # Use the first occurrence that has not been recorded yet, so text that
    # appears several times in the translation can be marked more than once.
    taken = {(err["start"], err["end"]) for err in current_errors}
    start = target.find(error_span)
    while start != -1 and (start, start + len(error_span)) in taken:
        start = target.find(error_span, start + 1)
    if start == -1:
        # Every occurrence is already recorded.
        return "", "此錯誤區間已經標記過,請勿重複標記。"
    end = start + len(error_span)

    if subcat_val != "Other":
        category_value = f"{cat_val}/{subcat_val}"
    elif other:
        category_value = f"{cat_val}/{other}"
    else:
        category_value = f"{cat_val}/Other"

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": category_value,
    })

    status = f"已記錄錯誤區間: {error_span},範圍 {start}-{end}。"
    # Past five entries only a hint is shown. The hint tells the user they
    # may keep annotating, so the span must still be recorded.
    if len(current_errors) > 5:
        status += "您已標記超過 5 處錯誤,如錯誤非常多,可直接按「過多錯誤」,或繼續標注。"
    return "", status
221
+
222
def save_and_next(source, target, score, rater_selector, alternative_translation):
    """Persist the annotation of the current item and move on to the next row.

    Args:
        source: source text shown in the UI.
        target: translation text shown in the UI.
        score: overall quality score chosen by the rater.
        rater_selector: identifier of the rater.
        alternative_translation: optional suggested translation.

    Returns:
        tuple: values for, in order, the source textbox, target textbox,
        error-span textbox, current-id display, status line, error table and
        highlighted-target HTML.
    """
    global current_index, data, annotations_file, current_errors, annotation_history

    def all_done():
        # Outputs shown once every row has been annotated.
        return (
            "已完成所有文本標記",
            "已完成所有文本標記",
            "",
            "",
            f"所有標記已完成並保存到 {annotations_file.name}! (共 {len(data)} 筆)",
            pd.DataFrame(),  # empty table
            "",
        )

    def stay(message):
        # Keep showing the current item and only change the status line.
        return (
            source,
            target,
            "",
            str(data.loc[current_index, "id"]),
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors),
        )

    def to_builtin(value):
        # Values read from the frame can be numpy scalars, which the json
        # module refuses to serialize; ``.item()`` yields the Python value.
        return value.item() if hasattr(value, "item") else value

    # A further click after the last row must not index past the data.
    if current_index >= len(data):
        return all_done()

    # Guard: a rater and a score are required.
    if rater_selector is None or rater_selector.strip() == "":
        return stay("請先選擇標註人員 (rater)!")
    if score is None:
        return stay("請先填寫評分!")

    annotations_entry = {
        "system": to_builtin(data.loc[current_index, "system"]),
        "lp": to_builtin(data.loc[current_index, "lp"]),
        "doc": to_builtin(data.loc[current_index, "doc"]),
        "id": int(data.loc[current_index, "id"]),
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": to_builtin(data.loc[current_index, "reference"]),
        "esa_score": score,
        "esa_spans": current_errors,
        "alternative_translation": alternative_translation if alternative_translation else "",
    }
    save_to_json(annotations_entry, annotations_file)

    # Keep an in-memory history so the session can be reviewed.
    annotation_history.append(annotations_entry)

    # Start the next item with no pending spans.
    current_errors = []

    current_index += 1
    if current_index >= len(data):
        return all_done()

    next_source, next_target = get_current_text()
    # Report progress as "finished item X / Y in total".
    status_msg = f"評分與標記已提交!已完成第 {current_index} 筆 / 共 {len(data)} 筆。"
    return (
        next_source,
        next_target,
        "",
        str(data.loc[current_index, "id"]),
        status_msg,
        pd.DataFrame(),  # the new item has no recorded spans yet
        "",  # nothing to highlight yet
    )
298
 
299
def update_file_selection(selected_file):
    """Load another CSV file and reset the annotation state for it.

    Reloads the module-level ``data`` frame, clears ``current_errors`` and
    ``annotation_history``, positions ``current_index`` on the row holding
    the smallest id, and starts a fresh annotations file named after the
    selected file.

    Args:
        selected_file: file name, resolved relative to ``current_dir``.

    Returns:
        tuple: values for, in order, the source textbox, target textbox,
        error-span textbox, index dropdown, current-id display, status line,
        error table and highlighted-target HTML.
    """
    global data_path, data, current_index, annotations_file, current_errors, annotation_history
    data_path = os.path.join(current_dir, selected_file)
    data = pd.read_csv(data_path, dtype={"id": "Int64"})
    current_errors = []
    annotation_history = []

    # Reposition on the row with the smallest id.
    min_id = data["id"].min()
    current_index = data.index[data["id"] == min_id].tolist()[0]

    # Start a new annotations file for this data file.
    file_base_name = os.path.splitext(selected_file)[0]
    annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"

    src, tgt = get_current_text()
    # Take the dropdown value from the generated choices instead of
    # rebuilding the label by hand: the labels replace newlines in the
    # preview, so a hand-built value may match none of the choices.
    choices = get_all_ids()
    return (
        src,  # source
        tgt,  # target
        "",  # error_span
        gr.update(choices=choices, value=choices[current_index]),  # index_selector
        str(data.loc[current_index, "id"]),  # current_index_display
        f"已加載檔案:{selected_file}",
        pd.DataFrame(columns=["text", "severity", "start", "end", "category"]),
        highlight_errors_in_text(tgt, []),  # nothing to highlight yet
    )
325
+
326
def update_index_selection(selected_display):
    """Jump to the row whose id is encoded in the selected dropdown label.

    Args:
        selected_display: dropdown label of the form ``"{id}-{text}"``.

    Returns:
        tuple: values for, in order, the source textbox, target textbox,
        current-id display, status line, error table and highlighted-target
        HTML.
    """
    global current_index, data, current_errors

    # The dropdown may deliver no value or a label that carries no id;
    # leave the displayed item untouched in that case.
    try:
        selected_id = parse_id_from_display(selected_display)
    except (AttributeError, TypeError, ValueError):
        return (
            gr.update(), gr.update(), gr.update(),
            f"無法解析索引: {selected_display}",
            get_error_dataframe(),
            gr.update(),
        )

    # Locate the matching row.
    row_list = data.index[data["id"] == selected_id].tolist()
    if not row_list:
        return (
            "", "", str(selected_id),
            f"找不到id: {selected_id}",
            get_error_dataframe(),
            "",
        )

    # Pending spans carry offsets into the previous translation; drop them
    # when moving to a different row so they are not saved with the new one.
    if row_list[0] != current_index:
        current_errors = []
    current_index = row_list[0]

    src, tgt = get_current_text()
    return (
        src,
        tgt,
        str(selected_id),
        f"已跳轉至 id: {selected_id}",
        get_error_dataframe(),
        highlight_errors_in_text(tgt, current_errors),
    )
349
 
350
  def mark_as_correct():
351
  global current_errors
352
+ # 標註無錯誤
353
  current_errors.append({
354
  "text": "",
355
  "severity": "No-error",
 
357
  "end": 0,
358
  "category": "No-error"
359
  })
360
+ return (
361
+ "",
362
+ "標註為完全正確,無錯誤!",
363
+ get_error_dataframe()
364
+ )
365
+
366
  def mark_as_too_many_errors():
367
  global current_errors
 
368
  current_errors.append({
369
  "text": "",
370
  "severity": "Major",
 
372
  "end": 0,
373
  "category": "Non-translation"
374
  })
375
+ return (
376
+ "",
377
+ "已標註為過多錯誤!",
378
+ get_error_dataframe()
379
+ )
380
+
381
+ # ------------------------- 新增:提供一個「建議翻譯」欄位 -------------------------
382
+ # ------------------------- 新增:在界面加上較明顯的評分標準提示 -------------------
383
  DEMO_EXPLANATION = """
384
  ## 翻譯標記工具
385
+ ### 使用規則
386
+ 1. **開始作業**
387
+ - 在「標註人員」選擇您的編號以識別。
388
+ - 下方「原始文本」顯示原文,右側「翻譯文本」為機器翻譯結果,請仔細檢查右側翻譯並標註錯誤。
389
+
390
+ 2. **錯誤標註**
391
+ - 若發現翻譯錯誤,請在「錯誤區間」欄位填入此錯誤在「翻譯文本」中的對應文字。
392
+ - 選擇「錯誤類別」、「子類別」,以及「錯誤嚴重程度」。
393
+ - 按下「保存並繼續標記當前資料」即可臨時儲存。
394
+ - 若錯誤超過五處,請使用「過多錯誤」按鈕(標註為 Major/Non-translation)。
395
+ - 若無任何錯誤,可直接按「完全正確」。
396
+ - **系統將在畫面下方顯示錯誤紀錄**,避免重複標記或遺漏。
397
+
398
+ 3. **評分** (0–100)
399
+ - 0分:幾乎無法理解,大部分意思遺失。
400
+ - 33分:保留部分原文意思,有明顯遺漏,句子不流暢或文法差。
401
+ - 66分:大部分原文意思保留,僅部分文法瑕疵或不一致。
402
+ - 100分:完全保留原文意思,語句通順無誤。
403
+ - 註:就算選擇「完全正確」,也可酌情給分,例如 90 或 100。
404
+
405
+ 4. **建議翻譯**
406
+ - 若您有更好的譯文想法,可在「建議翻譯」輸入框提供,利於後續改進翻譯品質。
407
+
408
+ 5. **送出與查看進度**
409
+ - 按「保存並顯示下一筆」後,系統會進行保存並顯示下一筆資料。
410
+ - 在畫面下方之「當前狀態」會顯示目前進度,例如「已完成第 X 筆 / 共 Y 筆」。
411
+
412
+ 6. **注意**
413
+ - 若需要跳至其他索引,可在「選擇索引」裡選擇。請留意:存檔後才會保留當前標記與評分。
414
+ - 此平台暫無法動態調整全部字體大小;若有視覺需要,可放大瀏覽器或按下 Ctrl + 滑鼠滾輪。
415
+
416
+ 以上說明若有不足,請直接留言反饋。
417
  """
418
 
419
  with gr.Blocks(css="""
420
+ /* 提高整體字體大小 (部分瀏覽器可能需自行縮放) */
421
+ * {
422
+ font-size: 16px;
423
+ }
424
+ /* 分區的樣式調整 */
425
+ .panel {
426
+ border: 1px solid #ccc;
427
+ padding: 10px;
428
+ }
429
+ /* 按鈕樣式區分 */
430
  #correct_button {
431
+ background-color: #4CAF50; /* 綠色 */
432
  color: white;
433
+ font-size: 14px;
434
  padding: 5px 5px;
435
  border-radius: 5px;
436
  min-height: 0px;
437
+ margin-bottom: 10px;
438
  }
439
  #too_many_errors_button {
440
+ background-color: #f44336; /* 紅色 */
441
  color: white;
442
+ font-size: 14px;
443
  padding: 5px 5px;
444
  border-radius: 5px;
445
  min-height: 0px;
446
+ margin-bottom: 10px;
447
+ }
448
+ /* 優化下拉清單字體 */
449
+ label, select {
450
+ font-size: 16px;
451
  }
452
  """) as demo:
453
  gr.Markdown(DEMO_EXPLANATION)
454
 
455
+ with gr.Tab("標記工具"):
456
+ # ------------------- 上方: 檔案與索引選擇 -------------------
457
  with gr.Row():
458
  with gr.Column(scale=1):
459
+ rater_selector = gr.Dropdown(
460
+ label="標註人員",
461
+ choices=["rater1", "rater2", "rater3", "rater4", "rater5", "rater6", "rater7"],
462
+ value="rater1"
463
+ )
464
+ file_selector = gr.Dropdown(
465
+ label="選擇檔案",
466
+ choices=csv_files,
467
+ value="test.csv"
468
+ )
469
+ index_selector = gr.Dropdown(
470
+ label="選擇索引 (顯示: id-原文前10字)",
471
+ choices=get_all_ids(),
472
+ value=f"{data.loc[current_index, 'id']}-{str(data.loc[current_index, 'source'])[:10]}"
473
+ )
474
+ current_index_display = gr.Textbox(
475
+ label="當前索引(id)",
476
+ value=str(data.loc[current_index, "id"]),
477
+ interactive=False
478
+ )
479
+
480
+ # ------------------- 中間: 原文 & 右側: 機翻 -------------------
481
  with gr.Column(scale=8):
482
+ source = gr.Textbox(label="原始文本", lines=6, interactive=False)
483
  with gr.Column(scale=8):
484
+ target = gr.Textbox(label="翻譯文本", lines=6, interactive=False)
485
 
486
+ # ------------------- 顯示錯誤高亮 -------------------
487
+ with gr.Row():
488
+ # 高亮後的翻譯文本(只讀, HTML 顯示)
489
+ with gr.Column(scale=8):
490
+ highlighted_target = gr.HTML(label="錯誤高亮顯示(僅供參考)")
491
+
492
+ # ------------------- 中段: 錯誤標註相關 -------------------
493
  with gr.Row(variant='panel', equal_height=True):
494
  with gr.Column(scale=3):
495
+ error_span = gr.Textbox(
496
+ label="錯誤區間 (請直接複製『翻譯文本』文字貼上)",
497
+ lines=3,
498
+ placeholder="如無錯誤,可按『完全正確』"
499
+ )
500
  with gr.Column(scale=3):
501
  with gr.Row(equal_height=True):
502
+ category = gr.Dropdown(
503
+ label="錯誤類別",
504
+ choices=list(categories_display.keys()),
505
+ value="正確性"
506
+ )
507
+ subcategory = gr.Dropdown(
508
+ label="子類別",
509
+ choices=categories_display["正確性"],
510
+ value="誤譯"
511
+ )
512
+ with gr.Row(equal_height=True):
513
+ other = gr.Textbox(label="其他子類別", placeholder="若無法歸類,請填寫")
514
+ severity = gr.Dropdown(
515
+ label="錯誤嚴重程度",
516
+ choices=severity_choices_display,
517
+ value="輕微 (Minor)"
518
+ )
519
+ with gr.Row():
520
  save_current_button = gr.Button("保存並繼續標記當前資料")
521
+ with gr.Column(scale=2):
522
  correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
523
  too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
524
 
525
+ # ----------------- 錯誤紀錄表格 -----------------
526
+ with gr.Column(scale=4):
527
+ error_table = gr.Dataframe(
528
+ headers=["text", "severity", "start", "end", "category"],
529
+ label="當前句子錯誤紀錄",
530
+ datatype=["str", "str", "number", "number", "str"],
531
+ interactive=False
532
+ )
533
+
534
+ # ------------------- 建議翻譯與評分 -------------------
535
  with gr.Row(variant='panel', equal_height=True):
536
+ with gr.Column(scale=4):
537
+ alternative_translation = gr.Textbox(
538
+ label="建議翻譯 (如有更適合的譯文,可在此提供)",
539
+ lines=2
540
+ )
541
+ with gr.Column(scale=4):
542
+ score = gr.Slider(
543
+ label="翻譯評分 (0=最差, 100=最好)",
544
+ minimum=0,
545
+ maximum=100,
546
+ step=1,
547
+ value=66
548
+ )
549
  with gr.Column(scale=1):
550
  save_next_button = gr.Button("保存並顯示下一筆")
551
 
552
+ # ------------------- 最下方: 狀態列 -------------------
553
  status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
554
 
555
+ # ------------------- 事件處理 -------------------
556
  def update_subcategories(selected_category):
557
+ subcategories = categories_display[selected_category]
558
  if subcategories:
559
  return gr.update(choices=subcategories, value=subcategories[0])
560
  else:
561
  return gr.update(choices=[], value=None)
562
+
563
+ file_selector.change(
564
+ update_file_selection,
565
+ inputs=[file_selector],
566
+ outputs=[
567
+ source, target, error_span,
568
+ index_selector, current_index_display,
569
+ status, error_table, highlighted_target
570
+ ]
571
+ )
572
+ index_selector.change(
573
+ update_index_selection,
574
+ inputs=[index_selector],
575
+ outputs=[
576
+ source, target,
577
+ current_index_display, status,
578
+ error_table, highlighted_target
579
+ ]
580
+ )
581
+ category.change(
582
+ update_subcategories,
583
+ inputs=[category],
584
+ outputs=[subcategory]
585
+ )
586
+ correct_button.click(
587
+ mark_as_correct,
588
+ outputs=[error_span, status, error_table]
589
+ )
590
+ too_many_errors_button.click(
591
+ mark_as_too_many_errors,
592
+ outputs=[error_span, status, error_table]
593
+ )
594
+ save_current_button.click(
595
+ save_current,
596
+ inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
597
+ outputs=[error_span, status]
598
+ )
599
+
600
+ # 當保存當前錯誤後,也要更新錯誤表格和高亮
601
+ save_current_button.click(
602
+ fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
603
+ inputs=[target],
604
+ outputs=[error_table, highlighted_target]
605
+ )
606
+
607
+ save_next_button.click(
608
+ save_and_next,
609
+ inputs=[source, target, score, rater_selector, alternative_translation],
610
+ outputs=[
611
+ source, target, error_span,
612
+ current_index_display, status,
613
+ error_table, highlighted_target
614
+ ]
615
+ )
616
+
617
+ # 預設載入時顯示
618
+ initial_src, initial_tgt = get_current_text()
619
+ source.value = initial_src
620
+ target.value = initial_tgt
621
+ error_table.value = pd.DataFrame(columns=["text","severity","start","end","category"])
622
+ highlighted_target.value = highlight_errors_in_text(initial_tgt, [])
623
+
624
+ demo.launch()