gtani committed on
Commit
e3e0b69
·
verified ·
1 Parent(s): 4913301

Upload 5 files

Browse files
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple
5
+
6
+ import base64
7
+
8
# Data locations, relative to the process working directory.
# NOTE(review): confirm these directories exist wherever the app is deployed.
DATA_DIR = Path("./data")
PROCESSED_DATA_DIR = Path("./data/processed")

# Embed logo as a base64 string (used in a data URI by the UI) to avoid
# Gradio toolbar interactions.
logo_path = DATA_DIR / "asset" / "rowsquared-logo-large.png"
logo_b64 = base64.b64encode(logo_path.read_bytes()).decode("utf-8")
14
+
15
+
16
+
17
+ # ----------------------------
18
+ # Data loading & preprocessing
19
+ # ----------------------------
20
# ISCO reference table reduced to its four label columns, with incomplete rows
# and duplicate label paths removed and the index renumbered from 0.
df_isco = (
    pd.read_excel(
        PROCESSED_DATA_DIR / "isco_imperfect.xlsx",
        # Pass the code columns through ``str`` while reading.
        converters=dict.fromkeys(["major", "sub_major", "minor", "unit"], str),
    )
    .loc[:, ["major_label", "sub_major_label", "minor_label", "unit_label"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
29
+
30
# Nested lookup of the label tree: {major: {sub_major: {minor: [unit, ...]}}}
hierarchy: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
for maj, sub, mn, un in zip(
    df_isco["major_label"],
    df_isco["sub_major_label"],
    df_isco["minor_label"],
    df_isco["unit_label"],
):
    sub_level = hierarchy.setdefault(maj, {})
    minor_level = sub_level.setdefault(sub, {})
    minor_level.setdefault(mn, []).append(un)

# Leaf lists are left de-duplicated and sorted.
for sub_level in hierarchy.values():
    for minor_level in sub_level.values():
        for mn, leaf in minor_level.items():
            # Re-assigning an existing key is safe while iterating the dict.
            minor_level[mn] = sorted(set(leaf))
43
+
44
+ # Fast helpers for children
45
def majors() -> List[str]:
    """Return all major labels present in ``hierarchy``, sorted."""
    return sorted(hierarchy)
47
+
48
def submajors(maj: str) -> List[str]:
    """Return the sorted sub-major labels under ``maj`` (empty list if ``maj`` is unknown)."""
    children = hierarchy.get(maj, {})
    return sorted(children)
50
+
51
def minors(maj: str, sub: str) -> List[str]:
    """Return the sorted minor labels under ``maj`` / ``sub`` (empty list if the path is unknown)."""
    by_sub = hierarchy.get(maj, {})
    return sorted(by_sub.get(sub, {}))
53
+
54
def units(maj: str, sub: str, mn: str) -> List[str]:
    """Return the unit labels stored under ``maj`` / ``sub`` / ``mn`` (empty list if the path is unknown).

    The list held in ``hierarchy`` is returned as-is, not a copy.
    """
    by_minor = hierarchy.get(maj, {}).get(sub, {})
    return by_minor.get(mn, [])
56
+
57
+ # ----------------------------
58
+ # Records to annotate
59
+ # ----------------------------
60
# Table of records to annotate. The four label columns (default "") and the
# "annotated" flag (default False) are created when the spreadsheet lacks them.
records = pd.read_excel(PROCESSED_DATA_DIR / "isco_predictions.xlsx").copy()

for col, default in (
    ("major_label", ""),
    ("sub_major_label", ""),
    ("minor_label", ""),
    ("unit_label", ""),
    ("annotated", False),
):
    if col not in records:
        records[col] = default
    # ensure not views
    records[col] = records[col].copy()

records = records.reset_index(drop=True)
73
+
74
+ # -----------------------------------
75
+ # Core logic: clamp & state management
76
+ # -----------------------------------
77
def clamp_path(maj: str, sub: str, mn: str, un: str
               ) -> Tuple[str, str, str, str, List[str], List[str], List[str], List[str]]:
    """Coerce a (major, sub-major, minor, unit) path into one that exists in the hierarchy.

    Levels are checked top-down. A level keeps its value when that value is
    among the choices available under the (already clamped) levels above it;
    otherwise it becomes the first available choice, or "" when there is none.

    Returns:
        ``(maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices)``
        -- the clamped path followed by the choice list for each level.
    """
    def _keep_or_first(current, options):
        # Keep a valid value; otherwise fall back to the first option (or "").
        if current in options:
            return current
        return options[0] if options else ""

    maj_choices = majors()
    maj = _keep_or_first(maj, maj_choices)

    # A falsy parent means there is nothing to look up below it.
    sub_choices = submajors(maj) if maj else []
    sub = _keep_or_first(sub, sub_choices)

    mn_choices = minors(maj, sub) if sub else []
    mn = _keep_or_first(mn, mn_choices)

    un_choices = units(maj, sub, mn) if mn else []
    un = _keep_or_first(un, un_choices)

    return maj, sub, mn, un, maj_choices, sub_choices, mn_choices, un_choices
98
+
99
def save_record(i: int, maj: str, sub: str, mn: str, un: str) -> None:
    """Store the four labels in row ``i`` of ``records`` and set its ``annotated`` flag to True."""
    label_cols = ["major_label", "sub_major_label", "minor_label", "unit_label"]
    new_labels = [maj, sub, mn, un]
    records.loc[i, label_cols] = new_labels
    records.loc[i, "annotated"] = True
102
+
103
def status_text(i: int) -> str:
    """Return the Markdown status line for row ``i``, based on its ``annotated`` flag."""
    if records.loc[i, "annotated"]:
        label = "✅ Annotated"
    else:
        label = "❌ Not Annotated"
    return f"**Status**: {label}"
105
+
106
def load_record(i: int):
    """Build the UI payload for record ``i``.

    The stored labels are clamped to a path that exists in the hierarchy.
    The clamped labels are written back only when clamping changed them, and
    the ``annotated`` flag is never touched here: displaying a record must not
    mark it as annotated.

    Returns:
        A 6-tuple: the record Markdown (occupation and industry titles), the
        status Markdown, and one ``gr.update(choices=..., value=...)`` for each
        of the major, sub-major, minor and unit radios.
    """
    label_cols = ["major_label", "sub_major_label", "minor_label", "unit_label"]
    rec = records.loc[i]
    stored = [rec[c] for c in label_cols]
    maj, sub, mn, un, maj_c, sub_c, mn_c, un_c = clamp_path(*stored)

    # Persist clamped values back (only if changed), leaving "annotated" alone.
    clamped = [maj, sub, mn, un]
    if clamped != stored:
        records.loc[i, label_cols] = clamped

    record_md = f"## Occupation: {rec['occupation_title_main']}\n## Industry: {rec['industry_title_main']}"
    return (
        record_md,
        status_text(i),
        gr.update(choices=maj_c, value=maj),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
    )
123
+
124
+ # ---------------------
125
+ # Event handler helpers
126
+ # ---------------------
127
def on_major_change(new_major: str, i: int):
    """Handle a new value of the major radio for record ``i``.

    Gradio fires ``.change`` for user clicks and also for values pushed into
    the radio by another callback, so this handler may be called with the
    major that is already stored for the record. In that case the stored
    sub-major / minor / unit are kept (when still valid) and nothing is
    written, so stored labels are not overwritten and the record is not
    flagged as annotated. When the major really differs from the stored one,
    each lower level is reset to its first available choice ("" if none) and
    the new path is saved.

    Returns:
        ``gr.update`` objects for the major, sub-major, minor and unit radios,
        followed by the status Markdown.
    """
    stored_maj, stored_sub, stored_mn, stored_un = records.loc[
        i, ["major_label", "sub_major_label", "minor_label", "unit_label"]
    ].tolist()
    same_major = stored_maj == new_major

    def _keep_or_first(stored, options):
        # Stored descendants are only meaningful while the major is unchanged.
        if same_major and stored in options:
            return stored
        return options[0] if options else ""

    sub_c = submajors(new_major)
    sub = _keep_or_first(stored_sub, sub_c)
    mn_c = minors(new_major, sub) if sub else []
    mn = _keep_or_first(stored_mn, mn_c)
    un_c = units(new_major, sub, mn) if mn else []
    un = _keep_or_first(stored_un, un_c)

    # Write (and flag as annotated) only when the path actually changed.
    if [new_major, sub, mn, un] != [stored_maj, stored_sub, stored_mn, stored_un]:
        save_record(i, new_major, sub, mn, un)
    return (
        gr.update(choices=majors(), value=new_major),
        gr.update(choices=sub_c, value=sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )
142
+
143
def on_sub_change(new_sub: str, i: int, major: str):
    """Handle a new value of the sub-major radio for record ``i``.

    Gradio fires ``.change`` for user clicks and also for values pushed into
    the radio by another callback. When ``major`` / ``new_sub`` match what is
    already stored for the record, the stored minor / unit are kept (when
    still valid) and nothing is written, so stored labels are not overwritten
    and the record is not flagged as annotated. Otherwise minor and unit are
    reset to their first available choice ("" if none) and the sub-major,
    minor and unit columns are saved.

    Returns:
        ``gr.update`` objects for the sub-major, minor and unit radios,
        followed by the status Markdown.
    """
    stored_maj, stored_sub, stored_mn, stored_un = records.loc[
        i, ["major_label", "sub_major_label", "minor_label", "unit_label"]
    ].tolist()
    same_branch = (stored_maj, stored_sub) == (major, new_sub)

    def _keep_or_first(stored, options):
        # Stored descendants are only meaningful while the branch is unchanged.
        if same_branch and stored in options:
            return stored
        return options[0] if options else ""

    mn_c = minors(major, new_sub)
    mn = _keep_or_first(stored_mn, mn_c)
    un_c = units(major, new_sub, mn) if mn else []
    un = _keep_or_first(stored_un, un_c)

    # Write (and flag as annotated) only when something actually changed.
    if [new_sub, mn, un] != [stored_sub, stored_mn, stored_un]:
        records.loc[i, ["sub_major_label", "minor_label", "unit_label"]] = [new_sub, mn, un]
        records.loc[i, "annotated"] = True
    return (
        gr.update(choices=submajors(major), value=new_sub),
        gr.update(choices=mn_c, value=mn),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )
156
+
157
def on_minor_change(new_minor: str, i: int, major: str, sub: str):
    """Handle a new value of the minor radio for record ``i``.

    Gradio fires ``.change`` for user clicks and also for values pushed into
    the radio by another callback. When ``major`` / ``sub`` / ``new_minor``
    match what is already stored for the record, the stored unit is kept
    (when still valid) and nothing is written, so the stored label is not
    overwritten and the record is not flagged as annotated. Otherwise the
    unit is reset to the first available choice ("" if none) and the minor
    and unit columns are saved.

    Returns:
        ``gr.update`` objects for the minor and unit radios, followed by the
        status Markdown.
    """
    stored_maj, stored_sub, stored_mn, stored_un = records.loc[
        i, ["major_label", "sub_major_label", "minor_label", "unit_label"]
    ].tolist()
    same_branch = (stored_maj, stored_sub, stored_mn) == (major, sub, new_minor)

    un_c = units(major, sub, new_minor)
    if same_branch and stored_un in un_c:
        un = stored_un
    else:
        un = un_c[0] if un_c else ""

    # Write (and flag as annotated) only when something actually changed.
    if [new_minor, un] != [stored_mn, stored_un]:
        records.loc[i, ["minor_label", "unit_label"]] = [new_minor, un]
        records.loc[i, "annotated"] = True
    return (
        gr.update(choices=minors(major, sub), value=new_minor),
        gr.update(choices=un_c, value=un),
        status_text(i),
    )
167
+
168
def on_unit_change(new_unit: str, i: int, major: str, sub: str, mn: str):
    """Handle a new value of the unit radio for record ``i``.

    ``new_unit`` is replaced by the first available unit ("" if none) when it
    is not valid under ``major`` / ``sub`` / ``mn``. Gradio fires ``.change``
    for user clicks and also for values pushed into the radio by another
    callback, so the record is written and flagged as annotated only when the
    resulting unit differs from the stored one.

    Returns:
        A ``gr.update`` for the unit radio and the status Markdown.
    """
    un_c = units(major, sub, mn)
    if new_unit not in un_c:
        new_unit = un_c[0] if un_c else ""

    # Skip the write when the event merely echoes the stored value.
    if records.loc[i, "unit_label"] != new_unit:
        records.loc[i, "unit_label"] = new_unit
        records.loc[i, "annotated"] = True
    return gr.update(choices=un_c, value=new_unit), status_text(i)
175
+
176
def go_next(i: int) -> int:
    """Return the index following ``i``, wrapping around at ``len(records)``."""
    total = len(records)
    return (i + 1) % total
178
+
179
def go_prev(i: int) -> int:
    """Return the index preceding ``i``, wrapping around at ``len(records)``."""
    total = len(records)
    return (i - 1) % total
181
+
182
+ # ---- NAVIGATION: save + move + reload in ONE callback ----
183
+
184
def save_and_jump(i: int, direction: str):
    """Save record ``i``, move to a neighbouring record and load it.

    The labels currently stored for ``i`` are clamped to a valid path and
    saved through ``save_record``. ``direction == "next"`` moves forward; any
    other value moves backward.

    Returns:
        The new index followed by everything ``load_record`` returns for it.
    """
    # Final safety net: clamp and persist whatever is currently stored
    current = records.loc[i]
    clamped = clamp_path(
        current["major_label"],
        current["sub_major_label"],
        current["minor_label"],
        current["unit_label"],
    )[:4]
    save_record(i, *clamped)

    if direction == "next":
        new_i = go_next(i)
    else:
        new_i = go_prev(i)
    return (new_i, *load_record(new_i))
193
+
194
def download_annotations() -> str:
    """Write ``records`` to ``annotated_output.csv`` (no index column) and return its path as a string."""
    out_file = PROCESSED_DATA_DIR.joinpath("annotated_output.csv")
    records.to_csv(out_file, index=False)
    return str(out_file)
198
+
199
+ # --------------
200
+ # Build the UI
201
+ # --------------
202
def build_gradio_app():
    """Assemble the annotation UI, wire its callbacks, and return the ``gr.Blocks`` app.

    Layout: logo, title plus a CSS snippet, the record/status Markdown,
    Previous/Next buttons, four radios (one per hierarchy level) and a
    download button with an initially hidden file component.
    """
    # NOTE(review): the nesting of the layout containers was reconstructed
    # from flattened text -- confirm the placement of the <style> block and of
    # the navigation row against the intended layout.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1):
                # Static logo, non-interactive
                gr.HTML(
                    f'<img src="data:image/png;base64,{logo_b64}" width="200" style="pointer-events:none; user-select:none; display:block;" />'
                )
        with gr.Row():
            gr.Markdown("# ISCO Annotation", elem_id="isco-title")
        gr.HTML("""
        <style>
        #isco-title {
        text-align: center;
        width: 100%;
        margin: 0.5em 0;
        }
        footer { display: none !important; }
        .gradio-container .api-link, .gradio-container .share-link { display: none !important; }
        </style>
        """)

        # Index of the record currently shown.
        idx_state = gr.State(0)

        with gr.Group():
            record_md = gr.Markdown()
            status_md = gr.Markdown()

        with gr.Row():
            prev_btn = gr.Button("⬅ Previous")
            next_btn = gr.Button("✅ Next")

        with gr.Row():
            with gr.Column():
                major_radio = gr.Radio(label="Level 1: Major", choices=[], interactive=True)
            with gr.Column():
                sub_radio = gr.Radio(label="Level 2: Sub-major", choices=[], interactive=True)
            with gr.Column():
                minor_radio = gr.Radio(label="Level 3: Minor", choices=[], interactive=True)
            with gr.Column():
                unit_radio = gr.Radio(label="Level 4: Unit", choices=[], interactive=True)

        download_btn = gr.Button("📥 Download Annotations")
        download_file = gr.File(label="Annotated CSV", visible=False)

        # Components refreshed whenever a record is (re)loaded; the order
        # matches the tuples returned by the load/navigation callbacks.
        record_outputs = [
            idx_state,
            record_md,
            status_md,
            major_radio,
            sub_radio,
            minor_radio,
            unit_radio,
        ]

        # Initial load
        demo.load(lambda: (0,) + load_record(0), outputs=record_outputs)

        # Navigation: save + move + reload in one callback
        next_btn.click(
            lambda i: save_and_jump(i, "next"),
            inputs=[idx_state],
            outputs=record_outputs,
        )
        prev_btn.click(
            lambda i: save_and_jump(i, "prev"),
            inputs=[idx_state],
            outputs=record_outputs,
        )

        # Change handlers (also update status)
        major_radio.change(
            on_major_change,
            inputs=[major_radio, idx_state],
            outputs=[major_radio, sub_radio, minor_radio, unit_radio, status_md],
        )
        sub_radio.change(
            on_sub_change,
            inputs=[sub_radio, idx_state, major_radio],
            outputs=[sub_radio, minor_radio, unit_radio, status_md],
        )
        minor_radio.change(
            on_minor_change,
            inputs=[minor_radio, idx_state, major_radio, sub_radio],
            outputs=[minor_radio, unit_radio, status_md],
        )
        unit_radio.change(
            on_unit_change,
            inputs=[unit_radio, idx_state, major_radio, sub_radio, minor_radio],
            outputs=[unit_radio, status_md],
        )

        # Download: write the CSV, then reveal the file component
        export_event = download_btn.click(download_annotations, outputs=[download_file])
        export_event.then(lambda: gr.update(visible=True), None, [download_file])

    return demo
292
+
293
+
294
if __name__ == "__main__":
    # Script entry point: build the app, call ``queue()`` on it, then launch
    # with ``show_api=False``.
    demo = build_gradio_app()
    queued_app = demo.queue()
    queued_app.launch(show_api=False)
297
+
isco_imperfect.xlsx ADDED
Binary file (41.8 kB). View file
 
isco_predictions.xlsx ADDED
Binary file (46.1 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=3.0
2
+ pandas
3
+ openpyxl
rowsquared-logo-large.png ADDED