Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,65 @@ from uuid import uuid4
|
|
9 |
from datasets import load_dataset
|
10 |
import shutil
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
DATASET_DIR = Path("json_dataset")
|
13 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
14 |
|
@@ -19,9 +78,7 @@ scheduler = CommitScheduler(
|
|
19 |
path_in_repo="data"
|
20 |
)
|
21 |
|
22 |
-
# Loading dataset from HuggingFace -------------------------------------------------------------------------------------
|
23 |
def download_dataset_file(dataset_id, local_dir):
|
24 |
-
# /home/user/.cache/huggingface/hub/datasets--350016z--Taiwanese_dataset/snapshots/22594253c63bd80e85b5255f948432014c37373a
|
25 |
snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
|
26 |
contents = os.listdir(snapshot_path)
|
27 |
|
@@ -33,10 +90,7 @@ def download_dataset_file(dataset_id, local_dir):
|
|
33 |
|
34 |
shutil.copy(source_file_path, local_file_path)
|
35 |
print(f"Copied {file_name} to {local_file_path}")
|
36 |
-
|
37 |
-
# Check file permissions
|
38 |
print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
|
39 |
-
|
40 |
time.sleep(1)
|
41 |
|
42 |
return local_dir
|
@@ -57,19 +111,33 @@ if not os.path.exists(data_path):
|
|
57 |
print(f"Error: {data_path} does not exist. Please check the file path.")
|
58 |
exit()
|
59 |
|
60 |
-
|
61 |
-
# Loading & Setting --------------------------------------------------------------------------------------------------
|
62 |
-
data = pd.read_csv(data_path, dtype={"id": "Int64"}) # 確保 id 為標準 Python int
|
63 |
|
64 |
current_index = 0
|
65 |
current_errors = []
|
66 |
|
67 |
annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"
|
68 |
-
|
|
|
|
|
|
|
69 |
|
70 |
def get_all_ids():
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def get_current_text():
|
74 |
global current_index, data
|
75 |
source = data.loc[current_index, "source"]
|
@@ -77,122 +145,211 @@ def get_current_text():
|
|
77 |
return source, target
|
78 |
|
79 |
def save_to_json(entry: dict, json_file: Path):
|
80 |
-
"""
|
81 |
-
將資料保存到指定的 JSON 檔案,並推送到 Hugging Face Dataset。
|
82 |
-
"""
|
83 |
with scheduler.lock:
|
84 |
with json_file.open("a") as f:
|
85 |
json.dump(entry, f, ensure_ascii=False)
|
86 |
f.write("\n")
|
|
|
87 |
# scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
|
91 |
global current_index, data, current_errors
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
doc = data.loc[current_index, "doc"]
|
96 |
-
id = int(data.loc[current_index, "id"])
|
97 |
-
reference = data.loc[current_index, "reference"]
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
105 |
|
106 |
-
if error_span
|
107 |
start = target.find(error_span)
|
108 |
end = start + len(error_span)
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
else:
|
111 |
-
return "", "
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
system = data.loc[current_index, "system"]
|
129 |
lp = data.loc[current_index, "lp"]
|
130 |
doc = data.loc[current_index, "doc"]
|
131 |
-
|
132 |
reference = data.loc[current_index, "reference"]
|
133 |
|
134 |
annotations_entry = {
|
135 |
"system": system,
|
136 |
"lp": lp,
|
137 |
"doc": doc,
|
138 |
-
"id":
|
139 |
"rater": rater_selector,
|
140 |
"src": source,
|
141 |
"mt": target,
|
142 |
"ref": reference,
|
143 |
"esa_score": score,
|
144 |
"esa_spans": current_errors,
|
|
|
145 |
}
|
146 |
save_to_json(annotations_entry, annotations_file)
|
147 |
|
148 |
-
#
|
|
|
|
|
|
|
149 |
current_errors = []
|
150 |
|
151 |
current_index += 1
|
152 |
if current_index >= len(data):
|
153 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
next_source, next_target = get_current_text()
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
def update_file_selection(selected_file):
|
160 |
-
global data_path, data, current_index, annotations_file
|
161 |
data_path = os.path.join(current_dir, selected_file)
|
162 |
-
data = pd.read_csv(data_path)
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
168 |
|
169 |
file_base_name = os.path.splitext(selected_file)[0]
|
|
|
170 |
annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"
|
171 |
|
172 |
-
|
173 |
-
return
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
def mark_as_correct():
|
194 |
global current_errors
|
195 |
-
|
196 |
current_errors.append({
|
197 |
"text": "",
|
198 |
"severity": "No-error",
|
@@ -200,11 +357,14 @@ def mark_as_correct():
|
|
200 |
"end": 0,
|
201 |
"category": "No-error"
|
202 |
})
|
203 |
-
return
|
204 |
-
|
|
|
|
|
|
|
|
|
205 |
def mark_as_too_many_errors():
|
206 |
global current_errors
|
207 |
-
|
208 |
current_errors.append({
|
209 |
"text": "",
|
210 |
"severity": "Major",
|
@@ -212,104 +372,253 @@ def mark_as_too_many_errors():
|
|
212 |
"end": 0,
|
213 |
"category": "Non-translation"
|
214 |
})
|
215 |
-
return
|
216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
DEMO_EXPLANATION = """
|
218 |
## 翻譯標記工具
|
219 |
-
### 使用規則
|
220 |
-
1. **開始作業**
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
"""
|
236 |
|
237 |
with gr.Blocks(css="""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
#correct_button {
|
239 |
-
background-color: #4CAF50;
|
240 |
color: white;
|
241 |
-
font-size:
|
242 |
padding: 5px 5px;
|
243 |
border-radius: 5px;
|
244 |
min-height: 0px;
|
|
|
245 |
}
|
246 |
#too_many_errors_button {
|
247 |
-
background-color: #f44336;
|
248 |
color: white;
|
249 |
-
font-size:
|
250 |
padding: 5px 5px;
|
251 |
border-radius: 5px;
|
252 |
min-height: 0px;
|
|
|
|
|
|
|
|
|
|
|
253 |
}
|
254 |
""") as demo:
|
255 |
gr.Markdown(DEMO_EXPLANATION)
|
256 |
|
257 |
-
|
258 |
-
|
259 |
with gr.Row():
|
260 |
with gr.Column(scale=1):
|
261 |
-
rater_selector = gr.Dropdown(
|
262 |
-
|
263 |
-
|
264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
with gr.Column(scale=8):
|
266 |
-
source = gr.Textbox(label="原始文本", lines=
|
267 |
with gr.Column(scale=8):
|
268 |
-
target = gr.Textbox(label="翻譯文本", lines=
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
with gr.Row(variant='panel', equal_height=True):
|
271 |
with gr.Column(scale=3):
|
272 |
-
error_span = gr.Textbox(
|
|
|
|
|
|
|
|
|
273 |
with gr.Column(scale=3):
|
274 |
with gr.Row(equal_height=True):
|
275 |
-
category = gr.Dropdown(
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
save_current_button = gr.Button("保存並繼續標記當前資料")
|
281 |
-
with gr.Column(scale=
|
282 |
correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
|
283 |
too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
with gr.Row(variant='panel', equal_height=True):
|
286 |
-
with gr.Column(scale=
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
with gr.Column(scale=1):
|
289 |
save_next_button = gr.Button("保存並顯示下一筆")
|
290 |
|
|
|
291 |
status = gr.Textbox(label="當前狀態", lines=1, interactive=False)
|
292 |
|
|
|
293 |
def update_subcategories(selected_category):
|
294 |
-
subcategories =
|
295 |
if subcategories:
|
296 |
return gr.update(choices=subcategories, value=subcategories[0])
|
297 |
else:
|
298 |
return gr.update(choices=[], value=None)
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from datasets import load_dataset
|
10 |
import shutil
|
11 |
|
12 |
+
# ------------------------- 更新:新增顯示用與儲存用的雙層字典 (中文顯示 → 英文儲存) -------------------------
|
13 |
+
# 以下為「錯誤類別」(category) 與「子類別」(subcategory) 的中英對照
|
14 |
+
# Chinese category label shown in the UI -> English value stored on disk.
category_map = {
    "正確性": "Accuracy",
    "流暢度": "Fluency",
    "專有名詞": "Terminology",
    "風格": "Style",
    "在地化": "Locale",
}

# (category label, subcategory label) shown in the UI
#   -> (category, subcategory) stored on disk.
subcategory_map = {
    # Accuracy
    ("正確性", "誤譯"): ("Accuracy", "Mistranslation"),
    ("正確性", "新增"): ("Accuracy", "Addition"),
    ("正確性", "漏譯"): ("Accuracy", "Omission"),
    ("正確性", "其他"): ("Accuracy", "Other"),

    # Fluency
    ("流暢度", "文法"): ("Fluency", "Grammar"),
    ("流暢度", "拼字"): ("Fluency", "Spelling"),
    ("流暢度", "標點符號"): ("Fluency", "Punctuation"),
    ("流暢度", "前後不一致"): ("Fluency", "Inconsistency"),
    ("流暢度", "語域"): ("Fluency", "Register"),
    ("流暢度", "其他"): ("Fluency", "Other"),

    # Terminology
    ("專有名詞", "使用不當"): ("Terminology", "Inappropriate"),
    ("專有名詞", "不一致"): ("Terminology", "Inconsistent"),
    ("專有名詞", "其他"): ("Terminology", "Other"),

    # Style
    ("風格", "用字笨拙"): ("Style", "Awkward"),
    ("風格", "其他"): ("Style", "Other"),

    # Locale
    ("在地化", "貨幣格式"): ("Locale", "Currency format"),
    ("在地化", "時間格式"): ("Locale", "Time format"),
    ("在地化", "人名格式"): ("Locale", "Name format"),
    ("在地化", "日期格式"): ("Locale", "Date format"),
    ("在地化", "地址格式"): ("Locale", "Address format"),
    ("在地化", "其他"): ("Locale", "Other"),
}

# Subcategory labels offered in the UI for each category label. Derived from
# subcategory_map (dicts keep insertion order) so the dropdown choices can
# never drift away from the keys the save logic looks up.
categories_display = {
    category_label: [
        sub_label
        for cat_label, sub_label in subcategory_map
        if cat_label == category_label
    ]
    for category_label in category_map
}

# Severity label shown in the UI -> value stored on disk.
severity_map = {
    "輕微 (Minor)": "Minor",
    "嚴重 (Major)": "Major",
}
# Dropdown choices, in the order the labels are declared above.
severity_choices_display = list(severity_map)
|
69 |
+
|
70 |
+
# ---------------------------------- 其餘程式基本結構不變 -------------------------------------
|
71 |
DATASET_DIR = Path("json_dataset")
|
72 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
73 |
|
|
|
78 |
path_in_repo="data"
|
79 |
)
|
80 |
|
|
|
81 |
def download_dataset_file(dataset_id, local_dir):
|
|
|
82 |
snapshot_path = snapshot_download(repo_id=dataset_id, repo_type="dataset")
|
83 |
contents = os.listdir(snapshot_path)
|
84 |
|
|
|
90 |
|
91 |
shutil.copy(source_file_path, local_file_path)
|
92 |
print(f"Copied {file_name} to {local_file_path}")
|
|
|
|
|
93 |
print(f"Permissions for {local_file_path}: {oct(os.stat(local_file_path).st_mode)}")
|
|
|
94 |
time.sleep(1)
|
95 |
|
96 |
return local_dir
|
|
|
111 |
print(f"Error: {data_path} does not exist. Please check the file path.")
|
112 |
exit()
|
113 |
|
114 |
+
# Load the startup CSV. "id" is read with pandas' nullable Int64 dtype, so the
# column stays integer-typed; callers still convert single values with int().
data = pd.read_csv(data_path, dtype={"id": "Int64"})

# Row label in ``data`` of the item currently shown (used with ``data.loc``).
current_index = 0
# Error spans recorded for the current item that have not been saved yet.
current_errors = []

# Output file for this session; the uuid keeps sessions from sharing a file.
annotations_file = DATASET_DIR / f"test_annotations-{uuid4()}.json"

# Entries already written by save_and_next during this session.
annotation_history = []
|
124 |
|
125 |
def get_all_ids():
    """Build the labels for the index dropdown, one per row of ``data``.

    Each label is ``"{id}-{excerpt}"`` where the excerpt is the first 10
    characters of the row's ``source`` with newlines replaced by spaces.
    ``parse_id_from_display`` turns a label back into the id.

    Returns:
        list[str]: The labels, in row order.
    """
    labels = []
    # Walk the two columns directly. Looking rows up with
    # ``data.loc[i, ...]`` for ``i in range(len(data))`` treats positions as
    # labels, which only works on a default RangeIndex, and does one scalar
    # lookup per cell.
    for row_id, source_text in zip(data["id"], data["source"]):
        excerpt = str(source_text)[:10].replace("\n", " ")
        labels.append(f"{row_id}-{excerpt}")
    return labels
|
135 |
+
|
136 |
+
def parse_id_from_display(display_str):
    """Extract the integer id from a ``"{id}-{excerpt}"`` dropdown label.

    Args:
        display_str: Label as produced by ``get_all_ids``. A bare id with no
            excerpt (``"12"``) is accepted as well.

    Returns:
        int: The id encoded at the start of the label.

    Raises:
        ValueError: If the label does not start with an integer.
    """
    text = display_str.strip()
    # A leading "-" is the sign of a negative id, not the separator; peel it
    # off first so the split below does not produce an empty id.
    sign = ""
    if text.startswith("-"):
        sign, text = "-", text[1:]
    id_part = text.split("-", 1)[0]
    try:
        return int(sign + id_part)
    except ValueError as err:
        raise ValueError(
            f"Cannot parse an id from dropdown label {display_str!r}"
        ) from err
|
140 |
+
|
141 |
def get_current_text():
|
142 |
global current_index, data
|
143 |
source = data.loc[current_index, "source"]
|
|
|
145 |
return source, target
|
146 |
|
147 |
def save_to_json(entry: dict, json_file: Path):
    """Append ``entry`` to ``json_file`` as a single JSON line.

    Args:
        entry: JSON-serialisable annotation record.
        json_file: File to append to; created if it does not exist.
    """
    # The write is done while holding ``scheduler.lock`` -- presumably so the
    # scheduler's background commit never uploads a half-written line.
    with scheduler.lock:
        # The records contain non-ASCII text written verbatim
        # (ensure_ascii=False), so the encoding is fixed to UTF-8 instead of
        # depending on the platform's default locale.
        with json_file.open("a", encoding="utf-8") as f:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
    # To push immediately instead of waiting for the scheduler, uncomment:
    # scheduler.push_to_hub(commit_message=f"更新檔案 {json_file.name}")
|
154 |
|
155 |
+
# -------------------------- 新增:將錯誤標示記錄在前端介面 --------------------------
|
156 |
+
def get_error_dataframe():
    """Return the pending errors of the current item as a table for the UI.

    Returns:
        pandas.DataFrame: One row per entry of ``current_errors`` with the
        columns ``text``, ``severity``, ``start``, ``end``, ``category`` (in
        that order). The frame is empty, but keeps those columns, when no
        error has been recorded yet.
    """
    columns = ["text", "severity", "start", "end", "category"]
    # Passing ``columns`` to the constructor covers both the empty and the
    # populated case with a single column list, and an entry lacking one of
    # the keys yields a missing value instead of a KeyError.
    return pd.DataFrame(current_errors, columns=columns)
|
166 |
+
|
167 |
+
# ---------------------- 高亮顯示錯誤區間 (基於 HTML) 的示範 ----------------------
|
168 |
+
def highlight_errors_in_text(text, errors):
    """Render ``text`` as HTML with every error span on a yellow background.

    Args:
        text: The translation text the spans refer to.
        errors: Iterable of dicts with integer ``"start"`` and ``"end"``
            offsets into ``text`` (end exclusive).

    Returns:
        str: HTML in which each usable span is wrapped in
        ``<span style='background-color:yellow;'>``. Spans that start before
        0 or end past the text are skipped, as are empty spans.
    """
    import html  # local import: only this helper needs it

    pieces = []
    cursor = 0  # offset in ``text`` up to which output has been emitted
    for err in sorted(errors, key=lambda e: e["start"]):
        start, end = err["start"], err["end"]
        # Guard: ignore spans that fall outside the text.
        if start < 0 or end > len(text):
            continue
        # A span overlapping the previous one only highlights the part that
        # has not been emitted yet, so no character is output twice.
        start = max(start, cursor)
        if end <= start:
            continue
        # The text is data, not markup: escape it so "<" or "&" in a
        # translation cannot break or inject HTML.
        pieces.append(html.escape(text[cursor:start], quote=False))
        pieces.append(
            "<span style='background-color:yellow;'>"
            + html.escape(text[start:end], quote=False)
            + "</span>"
        )
        cursor = end
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
|
187 |
|
188 |
def save_current(source, target, rater_selector, error_span, category, subcategory, severity, other):
    """Record one error span for the item currently being annotated.

    The span is appended to ``current_errors``; nothing is written to disk
    here.

    Args:
        source: Source text (unused; part of the Gradio input list).
        target: Translation text the span must occur in.
        rater_selector: Selected rater (unused; part of the Gradio inputs).
        error_span: Text copied from ``target`` that is erroneous.
        category: Category label as shown in the UI.
        subcategory: Subcategory label as shown in the UI.
        severity: Severity label as shown in the UI.
        other: Free-text subcategory, used when the subcategory maps to
            ``"Other"``.

    Returns:
        tuple[str, str]: New content of the error-span box and the status
        message.
    """
    # At most five spans per item; beyond that the rater is pointed at the
    # "too many errors" button.
    if len(current_errors) >= 5:
        return "", "您已標記超過 5 處錯誤,如錯誤非常多,可直接按「過多錯誤」,或繼續標注。"

    if not error_span:
        return "", "請輸入錯誤區間,或選擇『完全正確』按鈕。"

    # Guard: the span has to be literal text of the translation. The input
    # is handed back unchanged so the rater can correct it, as the message
    # asks them to.
    if error_span not in target:
        return error_span, "錯誤區間不存在於翻譯文本中,請檢查!"

    # Translate the UI labels into the English values that get stored.
    fallback = (category_map.get(category, "Other"), "Other")
    cat_val, subcat_val = subcategory_map.get((category, subcategory), fallback)
    severity_val = severity_map.get(severity, "Minor")

    start = target.find(error_span)
    end = start + len(error_span)

    # Reject a span that has already been recorded for this item.
    if any(err["start"] == start and err["end"] == end for err in current_errors):
        return "", "此錯誤區間已經標記過,請勿重複標記。"

    # For the "Other" subcategory, prefer the rater's free-text description.
    if subcat_val != "Other":
        category_label = f"{cat_val}/{subcat_val}"
    elif other:
        category_label = f"{cat_val}/{other}"
    else:
        category_label = f"{cat_val}/Other"

    current_errors.append({
        "text": error_span,
        "severity": severity_val,
        "start": start,
        "end": end,
        "category": category_label,
    })
    return "", f"已記錄錯誤區間: {error_span},範圍 {start}-{end}。"
|
221 |
+
|
222 |
+
def save_and_next(source, target, score, rater_selector, alternative_translation):
    """Save the current item's annotation and advance to the next row.

    Args:
        source: Source text shown in the UI.
        target: Translation text shown in the UI.
        score: Score chosen on the slider.
        rater_selector: Selected rater name.
        alternative_translation: Optional translation suggested by the rater.

    Returns:
        tuple: Values for, in order, the source box, target box, error-span
        box, current-id box, status box, error table and highlight panel.
    """
    global current_index, current_errors

    empty_table = pd.DataFrame(columns=["text", "severity", "start", "end", "category"])
    finished = (
        "已完成所有文本標記",
        "已完成所有文本標記",
        "",
        "",
        f"所有標記已完成並保存到 {annotations_file.name}! (共 {len(data)} 筆)",
        empty_table,
        "",
    )

    # Every row has been saved already: another click must not index past
    # the end of the data or write a bogus entry.
    if current_index >= len(data):
        return finished

    def stay_on_current(message):
        # Validation failed: keep showing the same item and its pending errors.
        return (
            source,
            target,
            "",  # error_span
            str(data.loc[current_index, "id"]),
            message,
            get_error_dataframe(),
            highlight_errors_in_text(target, current_errors),
        )

    # Guard: a rater and a score are required before anything is saved.
    if rater_selector is None or rater_selector.strip() == "":
        return stay_on_current("請先選擇標註人員 (rater)!")
    if score is None:
        return stay_on_current("請先填寫評分!")

    row = data.loc[current_index]
    annotations_entry = {
        "system": row["system"],
        "lp": row["lp"],
        "doc": row["doc"],
        "id": int(row["id"]),  # plain int so json can serialise it
        "rater": rater_selector,
        "src": source,
        "mt": target,
        "ref": row["reference"],
        "esa_score": score,
        "esa_spans": current_errors,
        "alternative_translation": alternative_translation if alternative_translation else "",
    }
    save_to_json(annotations_entry, annotations_file)

    # Keep the entry in the in-memory session history as well.
    annotation_history.append(annotations_entry)

    # Rebind rather than clear: the saved entry still references the old list.
    current_errors = []

    current_index += 1
    if current_index >= len(data):
        return finished

    next_source, next_target = get_current_text()
    status_msg = f"評分與標記已提交!已完成第 {current_index} 筆 / 共 {len(data)} 筆。"
    return (
        next_source,
        next_target,
        "",
        str(data.loc[current_index, "id"]),
        status_msg,
        empty_table,  # the new item has no errors yet; headers are kept
        "",  # nothing to highlight yet
    )
|
298 |
|
299 |
def update_file_selection(selected_file):
    """Switch the session to another CSV file and show its first item.

    Resets the pending errors and the session history, points
    ``annotations_file`` at a fresh output file and selects the row with the
    smallest id.

    Args:
        selected_file: File name chosen in the file dropdown, relative to
            ``current_dir``.

    Returns:
        tuple: Values for, in order, the source box, target box, error-span
        box, index dropdown, current-id box, status box, error table and
        highlight panel.
    """
    global data_path, data, current_index, annotations_file, current_errors, annotation_history

    new_path = os.path.join(current_dir, selected_file)
    new_data = pd.read_csv(new_path, dtype={"id": "Int64"})
    if new_data.empty:
        # Validate before touching any global: a file without rows would
        # otherwise leave the session half-switched and every handler broken.
        unchanged = gr.update()
        return (
            unchanged, unchanged, unchanged, unchanged, unchanged,
            f"檔案 {selected_file} 沒有任何資料,已保留目前的檔案。",
            unchanged, unchanged,
        )

    data_path = new_path
    data = new_data
    current_errors = []
    annotation_history = []

    # Start at the first row carrying the smallest id.
    min_id = data["id"].min()
    current_index = data.index[data["id"] == min_id].tolist()[0]

    # New output file for the newly selected input file.
    file_base_name = os.path.splitext(selected_file)[0]
    annotations_file = DATASET_DIR / f"{file_base_name}_annotations-{uuid4()}.json"

    src, tgt = get_current_text()

    # Take the dropdown value from the generated choices instead of
    # re-formatting it here: the choices replace newlines in the excerpt, so
    # a hand-built label could fail to match any choice.
    choices = get_all_ids()
    selected_label = choices[data.index.get_loc(current_index)]

    return (
        src,  # source
        tgt,  # target
        "",  # error_span
        gr.update(choices=choices, value=selected_label),  # index_selector
        str(data.loc[current_index, "id"]),  # current_index_display
        f"已加載檔案:{selected_file}",
        pd.DataFrame(columns=["text", "severity", "start", "end", "category"]),
        highlight_errors_in_text(tgt, []),  # no spans yet
    )
|
325 |
+
|
326 |
+
def update_index_selection(selected_display):
    """Jump to the row chosen in the index dropdown.

    Args:
        selected_display: Dropdown label of the form ``"{id}-{excerpt}"``.

    Returns:
        tuple: Values for, in order, the source box, target box, current-id
        box, status box, error table and highlight panel.
    """
    global current_index, current_errors

    # The dropdown can be cleared, in which case there is nothing to parse.
    if not selected_display:
        unchanged = gr.update()
        return (
            unchanged, unchanged, unchanged,
            "請先選擇索引。",
            get_error_dataframe(),
            unchanged,
        )

    # Recover the real id from the "id-excerpt" label and find its row.
    selected_id = parse_id_from_display(selected_display)
    row_list = data.index[data["id"] == selected_id].tolist()
    if not row_list:
        return (
            "", "", str(selected_id),
            f"找不到id: {selected_id}",
            get_error_dataframe(),
            "",
        )

    if row_list[0] != current_index:
        # Pending spans belong to the item being left; carrying them over
        # would highlight and later save them against the wrong sentence.
        current_errors = []
        current_index = row_list[0]

    src, tgt = get_current_text()
    return (
        src,
        tgt,
        str(selected_id),
        f"已跳轉至 id: {selected_id}",
        get_error_dataframe(),
        highlight_errors_in_text(tgt, current_errors),
    )
|
349 |
|
350 |
def mark_as_correct():
|
351 |
global current_errors
|
352 |
+
# 標註無錯誤
|
353 |
current_errors.append({
|
354 |
"text": "",
|
355 |
"severity": "No-error",
|
|
|
357 |
"end": 0,
|
358 |
"category": "No-error"
|
359 |
})
|
360 |
+
return (
|
361 |
+
"",
|
362 |
+
"標註為完全正確,無錯誤!",
|
363 |
+
get_error_dataframe()
|
364 |
+
)
|
365 |
+
|
366 |
def mark_as_too_many_errors():
|
367 |
global current_errors
|
|
|
368 |
current_errors.append({
|
369 |
"text": "",
|
370 |
"severity": "Major",
|
|
|
372 |
"end": 0,
|
373 |
"category": "Non-translation"
|
374 |
})
|
375 |
+
return (
|
376 |
+
"",
|
377 |
+
"已標註為過多錯誤!",
|
378 |
+
get_error_dataframe()
|
379 |
+
)
|
380 |
+
|
381 |
+
# ------------------------- 新增:提供一個「建議翻譯」欄位 -------------------------
|
382 |
+
# ------------------------- 新增:在界面加上較明顯的評分標準提示 -------------------
|
383 |
DEMO_EXPLANATION = """
|
384 |
## 翻譯標記工具
|
385 |
+
### 使用規則
|
386 |
+
1. **開始作業**
|
387 |
+
- 在「標註人員」選擇您的編號以識別。
|
388 |
+
- 下方「原始文本」顯示原文,右側「翻譯文本」為機器翻譯結果,請仔細檢查右側翻譯並標註錯誤。
|
389 |
+
|
390 |
+
2. **錯誤標註**
|
391 |
+
- 若發現翻譯錯誤,請在「錯誤區間」欄位填入此錯誤在「翻譯文本」中的對應文字。
|
392 |
+
- 選擇「錯誤類別」、「子類別」,以及「錯誤嚴重程度」。
|
393 |
+
- 按下「保存並繼續標記當前資料」即可臨時儲存。
|
394 |
+
- 若錯誤超過五處,請使用「過多錯誤」按鈕(標註為 Major/Non-translation)。
|
395 |
+
- 若無任何錯誤,可直接按「完全正確」。
|
396 |
+
- **系統將在畫面下方顯示錯誤紀錄**,避免重複標記或遺漏。
|
397 |
+
|
398 |
+
3. **評分** (0–100)
|
399 |
+
- 0分:幾乎無法理解,大部分意思遺失。
|
400 |
+
- 33分:保留部分原文意思,有明顯遺漏,句子不流暢或文法差。
|
401 |
+
- 66分:大部分原文意思保留,僅部分文法瑕疵或不一致。
|
402 |
+
- 100分:完全保留原文意思,語句通順無誤。
|
403 |
+
- 註:就算選擇「完全正確」,也可酌情給分,例如 90 或 100。
|
404 |
+
|
405 |
+
4. **建議翻譯**
|
406 |
+
- 若您有更好的譯文想法,可在「建議翻譯」輸入框提供,利於後續改進翻譯品質。
|
407 |
+
|
408 |
+
5. **送出與查看進度**
|
409 |
+
- 按「保存並顯示下一筆」後,系統會進行保存並顯示下一筆資料。
|
410 |
+
- 在畫面下方之「當前狀態」會顯示目前進度,例如「已完成第 X 筆 / 共 Y 筆」。
|
411 |
+
|
412 |
+
6. **注意**
|
413 |
+
- 若需要跳至其他索引,可在「選擇索引」裡選擇。請留意:存檔後才會保留當前標記與評分。
|
414 |
+
- 此平台暫無法動態調整全部字體大小;若有視覺需要,可放大瀏覽器或按下 Ctrl + 滑鼠滾輪。
|
415 |
+
|
416 |
+
以上說明若有不足,請直接留言反饋。
|
417 |
"""
|
418 |
|
419 |
# NOTE(review): the nesting of the layout containers below is reconstructed
# from statement order (the indentation was not available) -- confirm it
# against the deployed layout.
with gr.Blocks(css="""
/* 提高整體字體大小 (部分瀏覽器可能需自行縮放) */
* {
font-size: 16px;
}
/* 分區的樣式調整 */
.panel {
border: 1px solid #ccc;
padding: 10px;
}
/* 按鈕樣式區分 */
#correct_button {
background-color: #4CAF50; /* 綠色 */
color: white;
font-size: 14px;
padding: 5px 5px;
border-radius: 5px;
min-height: 0px;
margin-bottom: 10px;
}
#too_many_errors_button {
background-color: #f44336; /* 紅色 */
color: white;
font-size: 14px;
padding: 5px 5px;
border-radius: 5px;
min-height: 0px;
margin-bottom: 10px;
}
/* 優化下拉清單字體 */
label, select {
font-size: 16px;
}
""") as demo:
    gr.Markdown(DEMO_EXPLANATION)

    with gr.Tab("標記工具"):
        # ---- Top: rater / file / index selection, source and translation ----
        with gr.Row():
            with gr.Column(scale=1):
                rater_selector = gr.Dropdown(
                    label="標註人員",
                    choices=["rater1", "rater2", "rater3", "rater4", "rater5", "rater6", "rater7"],
                    value="rater1"
                )
                file_selector = gr.Dropdown(
                    label="選擇檔案",
                    choices=csv_files,
                    value="test.csv"
                )
                # Take the initial value from the generated choices so it is
                # guaranteed to match one of them (the choices replace
                # newlines in the excerpt).
                index_choices = get_all_ids()
                index_selector = gr.Dropdown(
                    label="選擇索引 (顯示: id-原文前10字)",
                    choices=index_choices,
                    value=index_choices[current_index] if index_choices else None
                )
                current_index_display = gr.Textbox(
                    label="當前索引(id)",
                    value=str(data.loc[current_index, "id"]),
                    interactive=False
                )

            # Source text in the middle, machine translation on the right.
            with gr.Column(scale=8):
                source = gr.Textbox(label="原始文本", lines=6, interactive=False)
            with gr.Column(scale=8):
                target = gr.Textbox(label="翻譯文本", lines=6, interactive=False)

        # ---- Translation with the recorded error spans highlighted ----
        with gr.Row():
            # Read-only HTML rendering of the translation.
            with gr.Column(scale=8):
                highlighted_target = gr.HTML(label="錯誤高亮顯示(僅供參考)")

        # ---- Middle: error annotation controls ----
        with gr.Row(variant='panel', equal_height=True):
            with gr.Column(scale=3):
                error_span = gr.Textbox(
                    label="錯誤區間 (請直接複製『翻譯文本』文字貼上)",
                    lines=3,
                    placeholder="如無錯誤,可按『完全正確』"
                )
            with gr.Column(scale=3):
                with gr.Row(equal_height=True):
                    category = gr.Dropdown(
                        label="錯誤類別",
                        choices=list(categories_display.keys()),
                        value="正確性"
                    )
                    subcategory = gr.Dropdown(
                        label="子類別",
                        choices=categories_display["正確性"],
                        value="誤譯"
                    )
                with gr.Row(equal_height=True):
                    other = gr.Textbox(label="其他子類別", placeholder="若無法歸類,請填寫")
                    severity = gr.Dropdown(
                        label="錯誤嚴重程度",
                        choices=severity_choices_display,
                        value="輕微 (Minor)"
                    )
                with gr.Row():
                    save_current_button = gr.Button("保存並繼續標記當前資料")
            with gr.Column(scale=2):
                correct_button = gr.Button("✔ 完全正確", elem_id="correct_button")
                too_many_errors_button = gr.Button("✖ 過多錯誤", elem_id="too_many_errors_button")

        # ---- Table of the errors recorded for the current sentence ----
        with gr.Column(scale=4):
            error_table = gr.Dataframe(
                headers=["text", "severity", "start", "end", "category"],
                label="當前句子錯誤紀錄",
                datatype=["str", "str", "number", "number", "str"],
                interactive=False
            )

        # ---- Suggested translation and score ----
        with gr.Row(variant='panel', equal_height=True):
            with gr.Column(scale=4):
                alternative_translation = gr.Textbox(
                    label="建議翻譯 (如有更適合的譯文,可在此提供)",
                    lines=2
                )
            with gr.Column(scale=4):
                score = gr.Slider(
                    label="翻譯評分 (0=最差, 100=最好)",
                    minimum=0,
                    maximum=100,
                    step=1,
                    value=66
                )
            with gr.Column(scale=1):
                save_next_button = gr.Button("保存並顯示下一筆")

        # ---- Bottom: status line ----
        status = gr.Textbox(label="當前狀態", lines=1, interactive=False)

        # ---- Event handling ----
        def update_subcategories(selected_category):
            """Refresh the subcategory dropdown for the chosen category."""
            # .get: a cleared category dropdown passes None, which is not a key.
            subcategories = categories_display.get(selected_category, [])
            if subcategories:
                return gr.update(choices=subcategories, value=subcategories[0])
            else:
                return gr.update(choices=[], value=None)

        file_selector.change(
            update_file_selection,
            inputs=[file_selector],
            outputs=[
                source, target, error_span,
                index_selector, current_index_display,
                status, error_table, highlighted_target
            ]
        )
        index_selector.change(
            update_index_selection,
            inputs=[index_selector],
            outputs=[
                source, target,
                current_index_display, status,
                error_table, highlighted_target
            ]
        )
        category.change(
            update_subcategories,
            inputs=[category],
            outputs=[subcategory]
        )
        correct_button.click(
            mark_as_correct,
            outputs=[error_span, status, error_table]
        )
        too_many_errors_button.click(
            mark_as_too_many_errors,
            outputs=[error_span, status, error_table]
        )
        # Record the span first, THEN refresh the table and the highlight.
        # Two independent click listeners give no ordering guarantee, so the
        # refresh could run before the span was appended and show stale data.
        save_current_button.click(
            save_current,
            inputs=[source, target, rater_selector, error_span, category, subcategory, severity, other],
            outputs=[error_span, status]
        ).then(
            fn=lambda tgt: (get_error_dataframe(), highlight_errors_in_text(tgt, current_errors)),
            inputs=[target],
            outputs=[error_table, highlighted_target]
        )

        save_next_button.click(
            save_and_next,
            inputs=[source, target, score, rater_selector, alternative_translation],
            outputs=[
                source, target, error_span,
                current_index_display, status,
                error_table, highlighted_target
            ]
        )

        # Content shown when the page is first loaded.
        initial_src, initial_tgt = get_current_text()
        source.value = initial_src
        target.value = initial_tgt
        error_table.value = pd.DataFrame(columns=["text", "severity", "start", "end", "category"])
        highlighted_target.value = highlight_errors_in_text(initial_tgt, [])

demo.launch()
|