Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- app.py +297 -0
- isco_imperfect.xlsx +0 -0
- isco_predictions.xlsx +0 -0
- requirements.txt +3 -0
- rowsquared-logo-large.png +0 -0
app.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Dict, List, Tuple
|
5 |
+
|
6 |
+
import base64
|
7 |
+
|
8 |
+
PROCESSED_DATA_DIR = Path("./data/processed")
DATA_DIR = Path("./data")

# This upload ships the workbooks and the logo flat, next to app.py, rather
# than under ./data. Fall back to the working directory when the nested layout
# is absent so the module does not fail at import time.
if not PROCESSED_DATA_DIR.is_dir():
    PROCESSED_DATA_DIR = Path(".")

# Embed logo as a base64 data URI to avoid Gradio toolbar interactions
logo_path = DATA_DIR / "asset" / "rowsquared-logo-large.png"
if not logo_path.is_file():
    logo_path = Path("rowsquared-logo-large.png")
logo_b64 = base64.b64encode(logo_path.read_bytes()).decode("utf-8")
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
# ----------------------------
|
18 |
+
# Data loading & preprocessing
|
19 |
+
# ----------------------------
|
20 |
+
# Read the ISCO reference workbook; the code columns are parsed as strings.
df_isco = pd.read_excel(
    PROCESSED_DATA_DIR / "isco_imperfect.xlsx",
    converters=dict.fromkeys(["major", "sub_major", "minor", "unit"], str),
)
# Keep only the four label columns, then drop incomplete and repeated paths.
df_isco = df_isco[["major_label", "sub_major_label", "minor_label", "unit_label"]]
df_isco = df_isco.dropna().drop_duplicates().reset_index(drop=True)
|
29 |
+
|
30 |
+
# Build nested hierarchy dict: {major: {sub: {minor: [units]}}}
hierarchy: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
for row in df_isco.itertuples(index=False):
    by_sub = hierarchy.setdefault(row.major_label, {})
    by_minor = by_sub.setdefault(row.sub_major_label, {})
    by_minor.setdefault(row.minor_label, []).append(row.unit_label)

# Ensure uniqueness & sorting at leaf lists
for by_sub in hierarchy.values():
    for by_minor in by_sub.values():
        # Only existing keys are reassigned, so the dict size is stable while iterating.
        for minor_label, unit_labels in by_minor.items():
            by_minor[minor_label] = sorted(set(unit_labels))
|
43 |
+
|
44 |
+
# Fast helpers for children
|
45 |
+
def majors() -> List[str]:
    """Return every top-level (major) label in sorted order."""
    return sorted(hierarchy)
|
47 |
+
|
48 |
+
def submajors(maj: str) -> List[str]:
    """Return the sorted sub-major labels under ``maj`` (empty if ``maj`` is unknown)."""
    by_sub = hierarchy.get(maj, {})
    return sorted(by_sub)
|
50 |
+
|
51 |
+
def minors(maj: str, sub: str) -> List[str]:
    """Return the sorted minor labels under ``maj`` / ``sub`` (empty if the path is unknown)."""
    by_sub = hierarchy.get(maj, {})
    by_minor = by_sub.get(sub, {})
    return sorted(by_minor)
|
53 |
+
|
54 |
+
def units(maj: str, sub: str, mn: str) -> List[str]:
    """Return the unit labels stored under ``maj`` / ``sub`` / ``mn``.

    An unknown path yields an empty list. For a known path the list held in
    ``hierarchy`` is returned directly, not a copy.
    """
    by_sub = hierarchy.get(maj, {})
    by_minor = by_sub.get(sub, {})
    return by_minor.get(mn, [])
|
56 |
+
|
57 |
+
# ----------------------------
|
58 |
+
# Records to annotate
|
59 |
+
# ----------------------------
|
60 |
+
records = pd.read_excel(PROCESSED_DATA_DIR / "isco_predictions.xlsx").copy()

# Make sure the four label columns and the bookkeeping flag exist, creating
# them with an empty label / False when the workbook lacks them.
for col, default in (
    ("major_label", ""),
    ("sub_major_label", ""),
    ("minor_label", ""),
    ("unit_label", ""),
    ("annotated", False),
):
    if col not in records:
        records[col] = default

# ensure not views
for col in ("major_label", "sub_major_label", "minor_label", "unit_label", "annotated"):
    records[col] = records[col].copy()

records.reset_index(drop=True, inplace=True)
|
73 |
+
|
74 |
+
# -----------------------------------
|
75 |
+
# Core logic: clamp & state management
|
76 |
+
# -----------------------------------
|
77 |
+
def clamp_path(maj: str, sub: str, mn: str, un: str
               ) -> Tuple[str, str, str, str, List[str], List[str], List[str], List[str]]:
    """Coerce a four-level selection into one that exists in the hierarchy.

    Each level is kept when it is a valid choice under the (already clamped)
    levels above it; otherwise it is replaced by the first available choice,
    or ``""`` when there are none.

    Returns:
        ``(maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices)``.
    """

    def keep_or_first(value, options):
        # Leave a valid value alone; otherwise fall back to the first option.
        if value in options:
            return value
        return options[0] if options else ""

    maj_choices = majors()
    maj = keep_or_first(maj, maj_choices)

    sub_choices = submajors(maj) if maj else []
    sub = keep_or_first(sub, sub_choices)

    mn_choices = minors(maj, sub) if sub else []
    mn = keep_or_first(mn, mn_choices)

    un_choices = units(maj, sub, mn) if mn else []
    un = keep_or_first(un, un_choices)

    return maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices
|
98 |
+
|
99 |
+
def save_record(i: int, maj: str, sub: str, mn: str, un: str) -> None:
    """Write the four labels into row ``i`` of ``records`` and flag it as annotated."""
    label_columns = ["major_label", "sub_major_label", "minor_label", "unit_label"]
    records.loc[i, label_columns] = [maj, sub, mn, un]
    records.loc[i, "annotated"] = True
|
102 |
+
|
103 |
+
def status_text(i: int) -> str:
    """Return the Markdown status line for row ``i`` based on its ``annotated`` flag."""
    if records.loc[i, "annotated"]:
        label = "✅ Annotated"
    else:
        label = "❌ Not Annotated"
    return f"**Status**: {label}"
|
105 |
+
|
106 |
+
def load_record(i: int):
    """Prepare everything the UI needs to display row ``i`` of ``records``.

    The stored labels are clamped to a valid path in the hierarchy and the
    clamped labels are written back to the row. The ``annotated`` flag is left
    untouched: displaying a record must not count as annotating it.

    Returns:
        A 6-tuple ``(record_markdown, status_markdown, major_update,
        sub_update, minor_update, unit_update)`` where the last four are
        ``gr.update`` payloads carrying choices and value for each radio.
    """
    rec = records.loc[i]
    maj, sub, mn, un, maj_c, sub_c, mn_c, un_c = clamp_path(
        rec["major_label"], rec["sub_major_label"], rec["minor_label"], rec["unit_label"]
    )
    # Persist the clamped labels only. Going through save_record() here would
    # also set annotated=True, so every record would read as annotated the
    # moment it was shown.
    records.loc[i, ["major_label", "sub_major_label", "minor_label", "unit_label"]] = [maj, sub, mn, un]

    record_md = f"## Occupation: {rec['occupation_title_main']}\n## Industry: {rec['industry_title_main']}"
    return (
        record_md,
        status_text(i),
        gr.update(choices=maj_c, value=maj),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
    )
|
123 |
+
|
124 |
+
# ---------------------
|
125 |
+
# Event handler helpers
|
126 |
+
# ---------------------
|
127 |
+
def on_major_change(new_major: str, i: int):
    """Handle a new major selection for row ``i``.

    Every lower level is reset to the first choice available under the new
    major (or ``""`` when there is none), the row is saved and flagged as
    annotated, and updates for all four radios plus the status text are
    returned.
    """
    sub_options = submajors(new_major)
    first_sub = sub_options[0] if sub_options else ""

    minor_options = minors(new_major, first_sub) if first_sub else []
    first_minor = minor_options[0] if minor_options else ""

    unit_options = units(new_major, first_sub, first_minor) if first_minor else []
    first_unit = unit_options[0] if unit_options else ""

    save_record(i, new_major, first_sub, first_minor, first_unit)

    major_update = gr.update(choices=majors(), value=new_major)
    sub_update = gr.update(choices=sub_options, value=first_sub)
    minor_update = gr.update(choices=minor_options, value=first_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return major_update, sub_update, minor_update, unit_update, status_text(i)
|
142 |
+
|
143 |
+
def on_sub_change(new_sub: str, i: int, major: str):
    """Handle a new sub-major selection for row ``i``.

    Minor and unit are reset to the first choice under the new sub-major (or
    ``""``), the three lower labels are written to the row, and the row is
    flagged as annotated. The stored major label is not rewritten.
    """
    minor_options = minors(major, new_sub)
    first_minor = minor_options[0] if minor_options else ""

    unit_options = units(major, new_sub, first_minor) if first_minor else []
    first_unit = unit_options[0] if unit_options else ""

    lower_columns = ["sub_major_label", "minor_label", "unit_label"]
    records.loc[i, lower_columns] = [new_sub, first_minor, first_unit]
    records.loc[i, "annotated"] = True

    sub_update = gr.update(choices=submajors(major), value=new_sub)
    minor_update = gr.update(choices=minor_options, value=first_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return sub_update, minor_update, unit_update, status_text(i)
|
156 |
+
|
157 |
+
def on_minor_change(new_minor: str, i: int, major: str, sub: str):
    """Handle a new minor selection for row ``i``.

    The unit is reset to the first choice under the new minor (or ``""``),
    both labels are written to the row, and the row is flagged as annotated.
    """
    unit_options = units(major, sub, new_minor)
    first_unit = unit_options[0] if unit_options else ""

    records.loc[i, ["minor_label", "unit_label"]] = [new_minor, first_unit]
    records.loc[i, "annotated"] = True

    minor_update = gr.update(choices=minors(major, sub), value=new_minor)
    unit_update = gr.update(choices=unit_options, value=first_unit)
    return minor_update, unit_update, status_text(i)
|
167 |
+
|
168 |
+
def on_unit_change(new_unit: str, i: int, major: str, sub: str, mn: str):
    """Handle a new unit selection for row ``i``.

    A unit that is not valid under ``major`` / ``sub`` / ``mn`` is replaced by
    the first valid one (or ``""``). The result is stored and the row is
    flagged as annotated.
    """
    allowed = units(major, sub, mn)
    if new_unit in allowed:
        chosen = new_unit
    elif allowed:
        chosen = allowed[0]
    else:
        chosen = ""

    records.loc[i, "unit_label"] = chosen
    records.loc[i, "annotated"] = True
    return gr.update(choices=allowed, value=chosen), status_text(i)
|
175 |
+
|
176 |
+
def go_next(i: int) -> int:
    """Return the index after ``i``, wrapping to 0 past the last record."""
    total = len(records)
    return (i + 1) % total
|
178 |
+
|
179 |
+
def go_prev(i: int) -> int:
    """Return the index before ``i``, wrapping to the last record below 0."""
    total = len(records)
    return (i - 1) % total
|
181 |
+
|
182 |
+
# ---- NAVIGATION: save + move + reload in ONE callback ----
|
183 |
+
|
184 |
+
def save_and_jump(i: int, direction: str):
    """Save row ``i``, move to a neighbouring row, and return its display payload.

    The labels currently stored for row ``i`` are clamped and saved (which
    flags the row as annotated). ``direction == "next"`` moves forward; any
    other value moves backward.

    Returns:
        ``(new_index, *load_record(new_index))``.
    """
    # Final safety net: clamp and persist whatever is currently stored
    current = records.loc[i]
    clamped = clamp_path(
        current["major_label"],
        current["sub_major_label"],
        current["minor_label"],
        current["unit_label"],
    )
    save_record(i, *clamped[:4])

    if direction == "next":
        target = go_next(i)
    else:
        target = go_prev(i)
    return (target, *load_record(target))
|
193 |
+
|
194 |
+
def download_annotations() -> str:
    """Write ``records`` to ``annotated_output.csv`` (no index column) and return its path."""
    out_file = PROCESSED_DATA_DIR.joinpath("annotated_output.csv")
    records.to_csv(out_file, index=False)
    return str(out_file)
|
198 |
+
|
199 |
+
# --------------
|
200 |
+
# Build the UI
|
201 |
+
# --------------
|
202 |
+
def build_gradio_app():
    """Assemble the annotation UI and wire up its callbacks.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """

    def user_event(component):
        # Prefer `.input`, which fires only on user interaction. `.change` also
        # fires when a callback sets the value, so loading a record would
        # re-run the cascade handlers, resetting the lower levels to their
        # first choice and flagging the row as annotated. Fall back to
        # `.change` on Gradio versions whose components lack `.input`.
        return getattr(component, "input", component.change)

    # NOTE(review): nesting below was reconstructed from a paste that lost its
    # indentation — confirm the placement of the <style> block.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1):
                # Static logo, non-interactive
                gr.HTML(
                    f'<img src="data:image/png;base64,{logo_b64}" width="200" style="pointer-events:none; user-select:none; display:block;" />'
                )
        with gr.Row():
            gr.Markdown("# ISCO Annotation", elem_id="isco-title")
        gr.HTML("""
<style>
#isco-title {
text-align: center;
width: 100%;
margin: 0.5em 0;
}
footer { display: none !important; }
.gradio-container .api-link, .gradio-container .share-link { display: none !important; }
</style>
""")

        # Index of the record currently shown.
        idx_state = gr.State(0)

        with gr.Group():
            record_md = gr.Markdown()
            status_md = gr.Markdown()

        with gr.Row():
            prev_btn = gr.Button("⬅ Previous")
            next_btn = gr.Button("✅ Next")

        with gr.Row():
            with gr.Column():
                major_radio = gr.Radio(label="Level 1: Major", choices=[], interactive=True)
            with gr.Column():
                sub_radio = gr.Radio(label="Level 2: Sub-major", choices=[], interactive=True)
            with gr.Column():
                minor_radio = gr.Radio(label="Level 3: Minor", choices=[], interactive=True)
            with gr.Column():
                unit_radio = gr.Radio(label="Level 4: Unit", choices=[], interactive=True)

        download_btn = gr.Button("📥 Download Annotations")
        download_file = gr.File(label="Annotated CSV", visible=False)

        # Components refreshed whenever a record is (re)loaded.
        nav_outputs = [idx_state, record_md, status_md, major_radio, sub_radio, minor_radio, unit_radio]

        # Initial load
        demo.load(lambda: (0,) + load_record(0), outputs=nav_outputs)

        next_btn.click(lambda i: save_and_jump(i, "next"), inputs=[idx_state], outputs=nav_outputs)
        prev_btn.click(lambda i: save_and_jump(i, "prev"), inputs=[idx_state], outputs=nav_outputs)

        # Selection handlers (also update status)
        user_event(major_radio)(
            on_major_change,
            inputs=[major_radio, idx_state],
            outputs=[major_radio, sub_radio, minor_radio, unit_radio, status_md],
        )

        user_event(sub_radio)(
            on_sub_change,
            inputs=[sub_radio, idx_state, major_radio],
            outputs=[sub_radio, minor_radio, unit_radio, status_md],
        )

        user_event(minor_radio)(
            on_minor_change,
            inputs=[minor_radio, idx_state, major_radio, sub_radio],
            outputs=[minor_radio, unit_radio, status_md],
        )

        user_event(unit_radio)(
            on_unit_change,
            inputs=[unit_radio, idx_state, major_radio, sub_radio, minor_radio],
            outputs=[unit_radio, status_md],
        )

        # Download: write the CSV, then reveal the file component.
        download_btn.click(download_annotations, outputs=[download_file]).then(
            lambda: gr.update(visible=True), None, [download_file]
        )

    return demo
|
292 |
+
|
293 |
+
|
294 |
+
if __name__ == "__main__":
    # Build the app, then launch it through the queue with show_api=False.
    demo = build_gradio_app()
    queued_app = demo.queue()
    queued_app.launch(show_api=False)
|
297 |
+
|
isco_imperfect.xlsx
ADDED
Binary file (41.8 kB). View file
|
|
isco_predictions.xlsx
ADDED
Binary file (46.1 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=3.0
|
2 |
+
pandas
|
3 |
+
openpyxl
|
rowsquared-logo-large.png
ADDED
![]() |