350016z committed on
Commit
4f4750e
·
verified ·
1 Parent(s): b5dc54b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -42
app.py CHANGED
@@ -64,17 +64,22 @@ if not os.path.exists(data_path):
64
  exit()
65
 
66
 
67
- # Loading Data-----------------------------------------------------------------------------------------------------------
68
  data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
69
 
70
  current_index = 0
71
- def get_all_ids():
72
- return [str(id) for id in data["id"].tolist()]
 
 
73
  # ---------------------------------------------------------------------------------------------------------------------
74
 
75
- annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
76
- score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
77
 
 
 
 
 
 
78
  def get_current_text():
79
  global current_index, data
80
  source = data.loc[current_index, "source"]
@@ -91,80 +96,156 @@ def save_to_json(entry: dict, json_file: Path):
91
  f.write("\n")
92
  # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
95
- global current_index, data, annotations_file
 
 
 
96
 
97
  system = data.loc[current_index, "system"]
98
  lp = data.loc[current_index, "lp"]
99
  doc = data.loc[current_index, "doc"]
100
  id = int(data.loc[current_index, "id"])
101
  reference = data.loc[current_index, "reference"]
102
-
103
-
104
- if category != "Non-translation" and category != "No-error":
105
- category_value = f"{category}/{subcategory}"
106
  else:
107
  category_value = category
108
 
109
  if error_span and error_span in target:
110
- highlighted_error_span = target.replace(error_span, f"<v>{error_span}</v>")
111
- elif not error_span:
112
- highlighted_error_span = target
113
  else:
114
- highlighted_error_span = error_span # 若 error_span 不存在於 target,則保持原樣
115
 
116
- new_entry = {
117
- "system": system,
118
- "lp": lp,
119
- "doc": doc,
120
- "id": id,
121
- "source": source,
122
- "mt": target,
123
- "target": highlighted_error_span,
124
  "category": category_value,
125
- "severity": severity,
126
- "other": other if other else "",
127
- "rater": rater_selector,
128
- }
129
- save_to_json(new_entry, annotations_file)
130
 
131
  # [error_span, status]
132
- return "", f"當前資料已保存到 {annotations_file.name},請繼續標記!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  def save_and_next(source, target, score, rater_selector):
135
- global current_index, data, score_file
 
136
  system = data.loc[current_index, "system"]
137
  lp = data.loc[current_index, "lp"]
138
  doc = data.loc[current_index, "doc"]
139
  id = int(data.loc[current_index, "id"])
140
  reference = data.loc[current_index, "reference"]
141
 
142
- id_list = [str(id) for id in sorted(data["id"].unique())]
143
- max_id = int(id_list[-1]) # 取得最大的 ID
144
-
145
- new_entry = {
146
  "system": system,
147
  "lp": lp,
148
  "doc": doc,
149
  "id": id,
 
150
  "src": source,
151
  "mt": target,
152
  "ref": reference,
153
- "score": score,
154
- "rater": rater_selector,
 
155
  }
156
- save_to_json(new_entry, score_file)
 
 
 
 
157
 
158
  current_index += 1
159
  if current_index >= len(data):
160
- return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {score_file.name}!"
161
-
162
  next_source, next_target = get_current_text()
163
- # [source, target, error_span, current_index_display, status]
164
- return next_source, next_target, "", str(current_index), f"分數已保存到 {score_file.name},請繼續下一筆!"
165
 
166
  def update_file_selection(selected_file):
167
- global data_path, data, current_index, annotations_file, score_file
168
  data_path = os.path.join(current_dir, selected_file)
169
  data = pd.read_csv(data_path)
170
 
@@ -175,7 +256,7 @@ def update_file_selection(selected_file):
175
 
176
  file_base_name = os.path.splitext(selected_file)[0]
177
  annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
178
- score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
179
 
180
  # [source, target, error_span, index_selector, current_index_display, status]
181
  return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案:{selected_file}")
 
64
  exit()
65
 
66
 
67
+ # Loading & Setting --------------------------------------------------------------------------------------------------
68
  data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
69
 
70
  current_index = 0
71
+ current_errors = []
72
+ current_others = []
73
+
74
+ annotations_file = DATASET_DIR / f"test-{uuid4()}.json"
75
  # ---------------------------------------------------------------------------------------------------------------------
76
 
 
 
77
 
78
+ # score_file = DATASET_DIR / f"test_score-{uuid4()}.json"
79
+
80
def get_all_ids():
    """Return every value of the ``id`` column of ``data`` as a string.

    The values are returned in row order and duplicates are kept.
    """
    return list(map(str, data["id"].tolist()))
82
+
83
  def get_current_text():
84
  global current_index, data
85
  source = data.loc[current_index, "source"]
 
96
  f.write("\n")
97
  # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
98
 
99
+ # def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
100
+ # global current_index, data, annotations_file
101
+
102
+ # system = data.loc[current_index, "system"]
103
+ # lp = data.loc[current_index, "lp"]
104
+ # doc = data.loc[current_index, "doc"]
105
+ # id = int(data.loc[current_index, "id"])
106
+ # reference = data.loc[current_index, "reference"]
107
+
108
+
109
+ # if category != "Non-translation" and category != "No-error":
110
+ # category_value = f"{category}/{subcategory}"
111
+ # else:
112
+ # category_value = category
113
+
114
+ # if error_span and error_span in target:
115
+ # highlighted_error_span = target.replace(error_span, f"<v>{error_span}</v>")
116
+ # elif not error_span:
117
+ # highlighted_error_span = target
118
+ # else:
119
+ # highlighted_error_span = error_span # 若 error_span 不存在於 target,則保持原樣
120
+
121
+ # new_entry = {
122
+ # "system": system,
123
+ # "lp": lp,
124
+ # "doc": doc,
125
+ # "id": id,
126
+ # "source": source,
127
+ # "mt": target,
128
+ # "target": highlighted_error_span,
129
+ # "category": category_value,
130
+ # "severity": severity,
131
+ # "other": other if other else "",
132
+ # "rater": rater_selector,
133
+ # }
134
+ # save_to_json(new_entry, annotations_file)
135
+
136
+ # # [error_span, status]
137
+ # return "", f"當前資料已保存到 {annotations_file.name},請繼續標記!"
138
+
139
  def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
140
+ global current_index, data, current_errors
141
+
142
+ if category == "No-error":
143
+ return "", "無錯誤,不需要保存錯誤區間。"
144
 
145
  system = data.loc[current_index, "system"]
146
  lp = data.loc[current_index, "lp"]
147
  doc = data.loc[current_index, "doc"]
148
  id = int(data.loc[current_index, "id"])
149
  reference = data.loc[current_index, "reference"]
150
+
151
+ if category != "Non-translation":
152
+ category_value = f"{category}/{subcategory}" if subcategory else category
 
153
  else:
154
  category_value = category
155
 
156
  if error_span and error_span in target:
157
+ start = target.find(error_span)
158
+ end = start + len(error_span)
159
+ print(f"start: {start}, end: {end}")
160
  else:
161
+ return "", "錯誤區間不存在於翻譯文本中,請檢查!"
162
 
163
+ current_errors.append({
164
+ "text": error_span,
165
+ "severity": severity.lower(),
166
+ "start": start,
167
+ "end": end
168
+ })
169
+
170
+ current_others.append({
171
  "category": category_value,
172
+ "others": other if other else "",
173
+ })
 
 
 
174
 
175
  # [error_span, status]
176
+ return "", f"已記錄錯誤區間: {error_span},範圍 {start}-{end}。"
177
+
178
+
179
+ # def save_and_next(source, target, score, rater_selector):
180
+ # global current_index, data, score_file
181
+ # system = data.loc[current_index, "system"]
182
+ # lp = data.loc[current_index, "lp"]
183
+ # doc = data.loc[current_index, "doc"]
184
+ # id = int(data.loc[current_index, "id"])
185
+ # reference = data.loc[current_index, "reference"]
186
+
187
+ # id_list = [str(id) for id in sorted(data["id"].unique())]
188
+ # max_id = int(id_list[-1]) # 取得最大的 ID
189
+
190
+ # new_entry = {
191
+ # "system": system,
192
+ # "lp": lp,
193
+ # "doc": doc,
194
+ # "id": id,
195
+ # "src": source,
196
+ # "mt": target,
197
+ # "ref": reference,
198
+ # "score": score,
199
+ # "rater": rater_selector,
200
+ # }
201
+ # save_to_json(new_entry, score_file)
202
+
203
+ # current_index += 1
204
+ # if current_index >= len(data):
205
+ # return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {score_file.name}!"
206
+
207
+ # next_source, next_target = get_current_text()
208
+ # # [source, target, error_span, current_index_display, status]
209
+ # return next_source, next_target, "", str(current_index), f"分數已保存到 {score_file.name},請繼續下一筆!"
210
 
211
  def save_and_next(source, target, score, rater_selector):
212
+ global current_index, data, annotations_file, current_errors, current_others
213
+
214
  system = data.loc[current_index, "system"]
215
  lp = data.loc[current_index, "lp"]
216
  doc = data.loc[current_index, "doc"]
217
  id = int(data.loc[current_index, "id"])
218
  reference = data.loc[current_index, "reference"]
219
 
220
+ annotations_entry = {
 
 
 
221
  "system": system,
222
  "lp": lp,
223
  "doc": doc,
224
  "id": id,
225
+ "rater": rater_selector,
226
  "src": source,
227
  "mt": target,
228
  "ref": reference,
229
+ "sentence_score": score / 100.0, # 標準化到 [0, 1]
230
+ "errors": current_errors,
231
+ "others": current_others,
232
  }
233
+ save_to_json(annotations_entry, annotations_file)
234
+
235
+ # 清空當前錯誤緩存
236
+ current_errors = []
237
+ current_others = []
238
 
239
  current_index += 1
240
  if current_index >= len(data):
241
+ return "已完成所有文本標記", "已完成所有文本標記", "", "", f"所有標記已完成並保存到 {annotations_file.name}!"
242
+
243
  next_source, next_target = get_current_text()
244
+ return next_source, next_target, "", str(current_index), f"分數與錯誤已保存到 {annotations_file.name},請繼續下一筆!"
245
+
246
 
247
  def update_file_selection(selected_file):
248
+ global data_path, data, current_index, annotations_file
249
  data_path = os.path.join(current_dir, selected_file)
250
  data = pd.read_csv(data_path)
251
 
 
256
 
257
  file_base_name = os.path.splitext(selected_file)[0]
258
  annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
259
+ # score_file = DATASET_DIR / f"{file_base_name}_score-{uuid4()}.json"
260
 
261
  # [source, target, error_span, index_selector, current_index_display, status]
262
  return get_current_text() + ("", gr.update(choices=id_list, value=str(min_id)), str(min_id), f"已加載檔案:{selected_file}")