LeonceNsh committed · verified
Commit 1885ec3 · 1 Parent(s): e9bc2bd

Upload folder using huggingface_hub

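The commit message above points to huggingface_hub's upload_folder; a minimal sketch of such a push, assuming the target is the LeonceNsh/hones Space and that a write token is already configured locally:

from huggingface_hub import HfApi

api = HfApi()  # uses the token from the environment or a previous `huggingface-cli login`
api.upload_folder(
    folder_path=".",                    # local Space folder (app.py, data/, etc.)
    repo_id="LeonceNsh/hones",          # assumed target Space
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)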
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Hones
- emoji: 🏃
- colorFrom: blue
- colorTo: gray
+ title: hones
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.34.2
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/model_and_load_toduckdb.cpython-311.pyc ADDED
Binary file (50.4 kB).
 
app.py ADDED
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import duckdb
7
+ import numpy as np
8
+ from datetime import datetime
9
+ import os
10
+
11
+ # Database connection
12
+ DATABASE_PATH = "./data/h1bs_analytics.duckdb"
13
+
14
+ def get_db_connection():
15
+ """Create a connection to the DuckDB database"""
16
+ if os.path.exists(DATABASE_PATH):
17
+ return duckdb.connect(DATABASE_PATH, read_only=True)
18
+ else:
19
+ # Create sample data if database doesn't exist
20
+ return create_sample_data()
21
+
22
+ def create_sample_data():
23
+ """Create sample H1B facts data for demonstration"""
24
+ conn = duckdb.connect(":memory:")
25
+
26
+ # Sample fact table based on H1B schema
27
+ np.random.seed(42)
28
+ n_records = 5000
29
+
30
+ sample_facts = pd.DataFrame({
31
+ 'record_id': range(1, n_records + 1),
32
+ 'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
33
+ 'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
34
+ 'country_of_birth': np.random.choice([
35
+ 'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
36
+ 'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
37
+ ], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
38
+ 'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0), # Log-normal for realistic wage distribution
39
+ 'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
40
+ 'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
41
+ 'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
42
+ 'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
43
+ 'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
44
+ 'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
45
+ 'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
46
+ 'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
47
+ })
48
+
49
+ conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")
50
+
51
+ return conn
52
+
53
+ # Load data
54
+ conn = get_db_connection()
55
+
56
+ def load_facts_data():
57
+ """Load H1B applications fact table"""
58
+ try:
59
+ query = """
60
+ SELECT * FROM fct_h1b_applications
61
+ WHERE wage_amt IS NOT NULL
62
+ LIMIT 10000
63
+ """
64
+ return conn.execute(query).df()
65
+ except Exception as e:
66
+ print(f"Error loading facts data: {e}")
67
+ return pd.DataFrame()
68
+
69
+ # Load the facts data
70
+ facts_df = load_facts_data()
71
+
72
+ # ---- FACTS TABLE VISUALIZATIONS ----
73
+
74
+ def facts_overview():
75
+ """Overview of the facts table with key metrics"""
76
+ if facts_df.empty:
77
+ return go.Figure().update_layout(title="No facts data available")
78
+
79
+ # Key metrics
80
+ total_records = len(facts_df)
81
+ avg_wage = facts_df['wage_amt'].mean()
82
+ median_wage = facts_df['wage_amt'].median()
83
+ multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100
84
+
85
+ # Create metrics dashboard
86
+ fig = make_subplots(
87
+ rows=2, cols=2,
88
+ specs=[[{"type": "indicator"}, {"type": "indicator"}],
89
+ [{"type": "indicator"}, {"type": "indicator"}]],
90
+ subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
91
+ )
92
+
93
+ fig.add_trace(
94
+ go.Indicator(
95
+ mode="number",
96
+ value=total_records,
97
+ number={"valueformat": ","},
98
+ title={"text": "Total Records"}
99
+ ),
100
+ row=1, col=1
101
+ )
102
+
103
+ fig.add_trace(
104
+ go.Indicator(
105
+ mode="number",
106
+ value=avg_wage,
107
+ number={"prefix": "$", "valueformat": ",.0f"},
108
+ title={"text": "Average Wage"}
109
+ ),
110
+ row=1, col=2
111
+ )
112
+
113
+ fig.add_trace(
114
+ go.Indicator(
115
+ mode="number",
116
+ value=median_wage,
117
+ number={"prefix": "$", "valueformat": ",.0f"},
118
+ title={"text": "Median Wage"}
119
+ ),
120
+ row=2, col=1
121
+ )
122
+
123
+ fig.add_trace(
124
+ go.Indicator(
125
+ mode="number",
126
+ value=multiple_reg_pct,
127
+ number={"suffix": "%", "valueformat": ".1f"},
128
+ title={"text": "Multiple Registrations"}
129
+ ),
130
+ row=2, col=2
131
+ )
132
+
133
+ fig.update_layout(
134
+ height=400,
135
+ title_text="H1B Facts Table - Key Metrics"
136
+ )
137
+
138
+ return fig
139
+
140
+ def wage_distribution():
141
+ """Visualize wage distribution from facts table"""
142
+ if facts_df.empty:
143
+ return go.Figure().update_layout(title="No data available")
144
+
145
+ fig = make_subplots(
146
+ rows=1, cols=2,
147
+ specs=[[{"type": "histogram"}, {"type": "box"}]],
148
+ subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
149
+ )
150
+
151
+ # Histogram
152
+ fig.add_trace(
153
+ go.Histogram(
154
+ x=facts_df['wage_amt'],
155
+ nbinsx=50,
156
+ marker_color='skyblue',
157
+ opacity=0.7,
158
+ name='Wage Distribution'
159
+ ),
160
+ row=1, col=1
161
+ )
162
+
163
+ # Box plot
164
+ fig.add_trace(
165
+ go.Box(
166
+ y=facts_df['wage_amt'],
167
+ marker_color='lightcoral',
168
+ name='Wage Box Plot'
169
+ ),
170
+ row=1, col=2
171
+ )
172
+
173
+ fig.update_layout(
174
+ height=500,
175
+ title_text="Wage Analysis from Facts Table",
176
+ showlegend=False
177
+ )
178
+
179
+ fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
180
+ fig.update_yaxes(title_text="Frequency", row=1, col=1)
181
+ fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)
182
+
183
+ return fig
184
+
185
+ def country_analysis():
186
+ """Analyze country distribution from facts table"""
187
+ if facts_df.empty:
188
+ return go.Figure().update_layout(title="No data available")
189
+
190
+ # Country counts
191
+ country_counts = facts_df['country_of_birth'].value_counts().head(10)
192
+
193
+ # Average wage by country
194
+ country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
195
+ country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean') # Min 50 applications
196
+
197
+ fig = make_subplots(
198
+ rows=1, cols=2,
199
+ specs=[[{"type": "bar"}, {"type": "bar"}]],
200
+ subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
201
+ )
202
+
203
+ # Applications by country
204
+ fig.add_trace(
205
+ go.Bar(
206
+ x=country_counts.index,
207
+ y=country_counts.values,
208
+ marker_color='teal',
209
+ text=country_counts.values,
210
+ textposition='auto',
211
+ name='Application Count'
212
+ ),
213
+ row=1, col=1
214
+ )
215
+
216
+ # Average wage by country
217
+ fig.add_trace(
218
+ go.Bar(
219
+ x=country_wages['country_of_birth'],
220
+ y=country_wages['mean'],
221
+ marker_color='orange',
222
+ text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
223
+ textposition='auto',
224
+ name='Average Wage'
225
+ ),
226
+ row=1, col=2
227
+ )
228
+
229
+ fig.update_layout(
230
+ height=500,
231
+ title_text="Country Analysis from Facts Table",
232
+ showlegend=False
233
+ )
234
+
235
+ fig.update_xaxes(tickangle=45, row=1, col=1)
236
+ fig.update_xaxes(tickangle=45, row=1, col=2)
237
+ fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
238
+ fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)
239
+
240
+ return fig
241
+
242
+ def temporal_analysis():
243
+ """Analyze temporal patterns from facts table"""
244
+ if facts_df.empty:
245
+ return go.Figure().update_layout(title="No data available")
246
+
247
+ # Yearly trends
248
+ yearly_stats = facts_df.groupby('fiscal_year').agg({
249
+ 'record_id': 'count',
250
+ 'wage_amt': 'mean',
251
+ 'is_multiple_registration': 'mean'
252
+ }).reset_index()
253
+
254
+ yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100
255
+
256
+ fig = make_subplots(
257
+ rows=2, cols=1,
258
+ specs=[[{"secondary_y": True}], [{"type": "bar"}]],
259
+ subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
260
+ )
261
+
262
+ # Applications count
263
+ fig.add_trace(
264
+ go.Scatter(
265
+ x=yearly_stats['fiscal_year'],
266
+ y=yearly_stats['record_id'],
267
+ mode='lines+markers',
268
+ name='Applications',
269
+ line=dict(color='blue', width=3),
270
+ marker=dict(size=8)
271
+ ),
272
+ row=1, col=1
273
+ )
274
+
275
+ # Average wage (secondary y-axis)
276
+ fig.add_trace(
277
+ go.Scatter(
278
+ x=yearly_stats['fiscal_year'],
279
+ y=yearly_stats['wage_amt'],
280
+ mode='lines+markers',
281
+ name='Average Wage',
282
+ line=dict(color='red', width=3),
283
+ marker=dict(size=8),
284
+ # the secondary y-axis is assigned via add_trace(..., secondary_y=True) below
285
+ ),
286
+ row=1, col=1, secondary_y=True
287
+ )
288
+
289
+ # Multiple registration percentage
290
+ fig.add_trace(
291
+ go.Bar(
292
+ x=yearly_stats['fiscal_year'],
293
+ y=yearly_stats['multiple_reg_pct'],
294
+ marker_color='green',
295
+ text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
296
+ textposition='auto',
297
+ name='Multiple Registration %'
298
+ ),
299
+ row=2, col=1
300
+ )
301
+
302
+ # Update layout
303
+ fig.update_layout(
304
+ height=600,
305
+ title_text="Temporal Analysis from Facts Table"
306
+ )
307
+
308
+ # Update y-axes
309
+ fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
310
+ fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
311
+ fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
312
+ fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)
313
+
314
+ return fig
315
+
316
+ def demographic_analysis():
317
+ """Analyze demographic patterns from facts table"""
318
+ if facts_df.empty:
319
+ return go.Figure().update_layout(title="No data available")
320
+
321
+ # Age distribution
322
+ age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
323
+ age_counts = age_bins.value_counts().sort_index()
324
+
325
+ # Full-time vs Part-time
326
+ employment_type = facts_df['full_time_ind'].value_counts()
327
+ employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]
328
+
329
+ # Same state employment
330
+ same_state = facts_df['employer_worksite_same_state'].value_counts()
331
+ same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]
332
+
333
+ fig = make_subplots(
334
+ rows=2, cols=2,
335
+ specs=[[{"type": "bar"}, {"type": "pie"}],
336
+ [{"type": "pie"}, {"type": "histogram"}]],
337
+ subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
338
+ )
339
+
340
+ # Age distribution
341
+ fig.add_trace(
342
+ go.Bar(
343
+ x=[str(interval) for interval in age_counts.index],
344
+ y=age_counts.values,
345
+ marker_color='lightblue',
346
+ name='Age Distribution'
347
+ ),
348
+ row=1, col=1
349
+ )
350
+
351
+ # Employment type pie chart
352
+ fig.add_trace(
353
+ go.Pie(
354
+ labels=employment_labels,
355
+ values=employment_type.values,
356
+ name="Employment Type"
357
+ ),
358
+ row=1, col=2
359
+ )
360
+
361
+ # Same state pie chart
362
+ fig.add_trace(
363
+ go.Pie(
364
+ labels=same_state_labels,
365
+ values=same_state.values,
366
+ name="Location"
367
+ ),
368
+ row=2, col=1
369
+ )
370
+
371
+ # Years since application
372
+ years_since = facts_df['years_since_application'].value_counts().sort_index()
373
+ fig.add_trace(
374
+ go.Histogram(
375
+ x=facts_df['years_since_application'],
376
+ nbinsx=10,
377
+ marker_color='lightgreen',
378
+ name='Years Since Application'
379
+ ),
380
+ row=2, col=2
381
+ )
382
+
383
+ fig.update_layout(
384
+ height=600,
385
+ title_text="Demographic Analysis from Facts Table",
386
+ showlegend=False
387
+ )
388
+
389
+ return fig
390
+
391
+ def facts_data_table():
392
+ """Display sample of facts table data"""
393
+ if facts_df.empty:
394
+ return pd.DataFrame()
395
+
396
+ # Return first 100 rows with key columns
397
+ display_cols = [
398
+ 'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
399
+ 'wage_amt', 'age_at_application', 'is_multiple_registration',
400
+ 'full_time_ind', 'employer_worksite_same_state'
401
+ ]
402
+
403
+ sample_data = facts_df[display_cols].head(100).copy()
404
+
405
+ # Format wage column
406
+ sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")
407
+
408
+ return sample_data
409
+
410
+ # ---- GRADIO INTERFACE ----
411
+
412
+ with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
413
+ gr.Markdown("# 📊 H1B Facts Table Analytics Dashboard")
414
+ gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")
415
+
416
+ with gr.Tab("📈 Facts Overview"):
417
+ gr.Markdown("### Key Metrics from Facts Table")
418
+ facts_overview_plot = gr.Plot()
419
+ gr.Button("Load Facts Overview", variant="primary").click(
420
+ fn=facts_overview,
421
+ outputs=facts_overview_plot
422
+ )
423
+
424
+ with gr.Tab("💰 Wage Analysis"):
425
+ gr.Markdown("### Wage Distribution from Facts Table")
426
+ wage_plot = gr.Plot()
427
+ gr.Button("Analyze Wages", variant="primary").click(
428
+ fn=wage_distribution,
429
+ outputs=wage_plot
430
+ )
431
+
432
+ with gr.Tab("🌍 Country Analysis"):
433
+ gr.Markdown("### Country-wise Analysis from Facts Table")
434
+ country_plot = gr.Plot()
435
+ gr.Button("Analyze Countries", variant="primary").click(
436
+ fn=country_analysis,
437
+ outputs=country_plot
438
+ )
439
+
440
+ with gr.Tab("📅 Temporal Analysis"):
441
+ gr.Markdown("### Time-based Trends from Facts Table")
442
+ temporal_plot = gr.Plot()
443
+ gr.Button("Analyze Trends", variant="primary").click(
444
+ fn=temporal_analysis,
445
+ outputs=temporal_plot
446
+ )
447
+
448
+ with gr.Tab("👥 Demographics"):
449
+ gr.Markdown("### Demographic Patterns from Facts Table")
450
+ demo_plot = gr.Plot()
451
+ gr.Button("Analyze Demographics", variant="primary").click(
452
+ fn=demographic_analysis,
453
+ outputs=demo_plot
454
+ )
455
+
456
+ with gr.Tab("📋 Raw Data"):
457
+ gr.Markdown("### Sample Facts Table Data (First 100 rows)")
458
+ data_table = gr.DataFrame()
459
+ gr.Button("Load Sample Data", variant="primary").click(
460
+ fn=facts_data_table,
461
+ outputs=data_table
462
+ )
463
+
464
+ # Footer
465
+ gr.Markdown("---")
466
+ gr.Markdown("### Facts Table Schema")
467
+ gr.Markdown("""
468
+ **Table**: `fct_h1b_applications`
469
+
470
+ **Key Columns**:
471
+ - `record_id`: Unique identifier for each application
472
+ - `lottery_year`, `fiscal_year`: Temporal dimensions
473
+ - `country_of_birth`: Beneficiary country
474
+ - `wage_amt`: Offered wage amount
475
+ - `age_at_application`: Beneficiary age
476
+ - `is_multiple_registration`: Multiple lottery entries flag
477
+ - `full_time_ind`: Full-time employment indicator
478
+ - `employer_worksite_same_state`: Location alignment flag
479
+ - Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
480
+ """)
481
+
482
+ # Launch the app
483
+ if __name__ == "__main__":
484
+ demo.launch(
485
+ server_name="0.0.0.0",
486
+ server_port=7860,
487
+ share=True,
488
+ show_error=True
489
+ )
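app.py above opens ./data/h1bs_analytics.duckdb read-only and silently falls back to generated sample data when the query fails, so it can be worth inspecting the database before launching the dashboard. A minimal check, assuming the file has already been produced by the pipeline below:

import duckdb

# open the database the dashboard expects (DATABASE_PATH in app.py)
con = duckdb.connect("./data/h1bs_analytics.duckdb", read_only=True)
print(con.execute("SHOW TABLES").df())  # list whatever tables the pipeline created
con.close()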
model_and_load_toduckdb.py ADDED
@@ -0,0 +1,973 @@
1
+ """
2
+ H1B Data Analytics Pipeline
3
+
4
+ This module provides a comprehensive ETL pipeline for processing H1B visa application data.
5
+ It loads CSV files into DuckDB, creates dimensional models, and performs data quality checks.
6
+ """
7
+
8
+ import os
9
+ import gc
10
+ import logging
11
+ import hashlib
12
+ from datetime import datetime
13
+ from typing import List, Optional, Tuple
14
+ import traceback
15
+
16
+ import duckdb
17
+ import pandas as pd
18
+ import numpy as np
19
+ import psutil
20
+
21
+
22
+ class H1BDataPipeline:
23
+ """
24
+ Main pipeline class for processing H1B visa application data.
25
+
26
+ This class handles the complete ETL process including:
27
+ - Loading CSV files into DuckDB
28
+ - Creating dimensional models
29
+ - Data quality checks
30
+ - Database persistence
31
+ """
32
+
33
+ def __init__(self, db_path: str = ':memory:', log_level: int = logging.INFO):
34
+ """
35
+ Initialize the H1B data pipeline.
36
+
37
+ Args:
38
+ db_path: Path to DuckDB database file. Use ':memory:' for in-memory database.
39
+ log_level: Logging level for the pipeline.
40
+ """
41
+ self.db_path = db_path
42
+ self.conn = None
43
+ self.logger = self._setup_logging(log_level)
44
+ self._setup_database()
45
+
46
+ def _setup_logging(self, log_level: int) -> logging.Logger:
47
+ """Set up logging configuration for the pipeline."""
48
+ logger = logging.getLogger(__name__)
49
+ logging.basicConfig(
50
+ level=log_level,
51
+ format="{asctime} - {name} - {levelname} - {message}",
52
+ style="{",
53
+ datefmt="%Y-%m-%d %H:%M:%S",
54
+ )
55
+ return logger
56
+
57
+ def _setup_database(self) -> None:
58
+ """Initialize DuckDB connection."""
59
+ try:
60
+ self.conn = duckdb.connect(self.db_path)
61
+ self.logger.info(f"DuckDB connection established to {self.db_path}")
62
+ self.logger.info(f"DuckDB version: {duckdb.__version__}")
63
+
64
+ # Test connection
65
+ test_result = self.conn.execute("SELECT 'Hello DuckDB!' as message").fetchone()
66
+ self.logger.info(f"Connection test: {test_result[0]}")
67
+
68
+ except Exception as e:
69
+ self.logger.error(f"Failed to establish database connection: {e}")
70
+ raise
71
+
72
+ def __enter__(self):
73
+ """Context manager entry."""
74
+ return self
75
+
76
+ def close(self) -> None:
77
+ """Close database connection and cleanup resources."""
78
+ if self.conn:
79
+ self.conn.close()
80
+ self.logger.info("Database connection closed")
81
+
82
+ def __exit__(self, exc_type, exc_val, exc_tb):
83
+ """Context manager exit with cleanup."""
84
+ self.close()
85
+
86
+
87
+ class MemoryManager:
88
+ """Utility class for monitoring and managing memory usage."""
89
+
90
+ @staticmethod
91
+ def check_memory_usage() -> float:
92
+ """
93
+ Check current memory usage of the process.
94
+
95
+ Returns:
96
+ Memory usage in MB.
97
+ """
98
+ process = psutil.Process(os.getpid())
99
+ memory_mb = process.memory_info().rss / 1024 / 1024
100
+ print(f"Current memory usage: {memory_mb:.1f} MB")
101
+ return memory_mb
102
+
103
+ @staticmethod
104
+ def clear_memory() -> None:
105
+ """Force garbage collection to clear memory."""
106
+ gc.collect()
107
+ print("Memory cleared")
108
+
109
+
110
+ class FileValidator:
111
+ """Utility class for validating file existence and accessibility."""
112
+
113
+ @staticmethod
114
+ def validate_files(file_paths: List[str]) -> Tuple[List[str], List[str]]:
115
+ """
116
+ Validate that files exist and are accessible.
117
+
118
+ Args:
119
+ file_paths: List of file paths to validate.
120
+
121
+ Returns:
122
+ Tuple of (existing_files, missing_files).
123
+ """
124
+ existing_files = []
125
+ missing_files = []
126
+
127
+ for file_path in file_paths:
128
+ if os.path.exists(file_path):
129
+ existing_files.append(file_path)
130
+ print(f"✓ Found: {file_path}")
131
+ else:
132
+ missing_files.append(file_path)
133
+ print(f"✗ Missing: {file_path}")
134
+
135
+ return existing_files, missing_files
136
+
137
+
138
+ class DataLoader:
139
+ """Handles loading data from various sources into DuckDB."""
140
+
141
+ def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
142
+ """
143
+ Initialize data loader.
144
+
145
+ Args:
146
+ conn: DuckDB connection object.
147
+ logger: Logger instance for tracking operations.
148
+ """
149
+ self.conn = conn
150
+ self.logger = logger
151
+
152
+ def load_csv_files(self, file_paths: List[str]) -> None:
153
+ """
154
+ Load CSV files directly into DuckDB without loading into pandas first.
155
+
156
+ Args:
157
+ file_paths: List of CSV file paths to load.
158
+ """
159
+ self.logger.info("Loading CSV files directly into DuckDB...")
160
+
161
+ for file_path in file_paths:
162
+ try:
163
+ self._load_single_csv(file_path)
164
+ except Exception as e:
165
+ self.logger.error(f"Error loading {file_path}: {e}")
166
+
167
+ def _load_single_csv(self, file_path: str) -> None:
168
+ """
169
+ Load a single CSV file into DuckDB.
170
+
171
+ Args:
172
+ file_path: Path to the CSV file.
173
+ """
174
+ self.logger.info(f"Loading {file_path}")
175
+
176
+ # Extract metadata from filename
177
+ filename = file_path.split('/')[-1].replace('.csv', '')
178
+ table_name = f"raw_{filename}"
179
+ fiscal_year = self._extract_fiscal_year(filename)
180
+
181
+ # Load CSV directly into DuckDB
182
+ self.conn.execute(f"""
183
+ CREATE TABLE {table_name} AS
184
+ SELECT *,
185
+ '{file_path}' as source_file,
186
+ '{fiscal_year}' as fiscal_year
187
+ FROM read_csv_auto('{file_path}', header=true, normalize_names=true, ignore_errors=true)
188
+ """)
189
+
190
+ # Clean column names
191
+ self._clean_column_names(table_name)
192
+
193
+ # Log success
194
+ count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
195
+ self.logger.info(f"Loaded {count:,} records from {file_path} into {table_name}")
196
+
197
+ def _extract_fiscal_year(self, filename: str) -> str:
198
+ """Extract fiscal year from filename."""
199
+ import re
200
+ match = re.search(r'FY(\d{4})', filename)
201
+ if match:
202
+ return match.group(1) # Return only the year digits
203
+ return "unknown"
204
+
205
+ def _clean_column_names(self, table_name: str) -> None:
206
+ """
207
+ Clean column names in DuckDB table.
208
+
209
+ Args:
210
+ table_name: Name of the table to clean.
211
+ """
212
+ columns_query = f"PRAGMA table_info('{table_name}')"
213
+ columns_info = self.conn.execute(columns_query).fetchall()
214
+
215
+ for col_info in columns_info:
216
+ old_name = col_info[1]
217
+ new_name = self._normalize_column_name(old_name)
218
+
219
+ if old_name != new_name:
220
+ self.conn.execute(f"""
221
+ ALTER TABLE {table_name}
222
+ RENAME COLUMN "{old_name}" TO {new_name}
223
+ """)
224
+
225
+ @staticmethod
226
+ def _normalize_column_name(column_name: str) -> str:
227
+ """
228
+ Normalize column name to follow consistent naming convention.
229
+
230
+ Args:
231
+ column_name: Original column name.
232
+
233
+ Returns:
234
+ Normalized column name.
235
+ """
236
+ import re
237
+
238
+ # Remove URLs and other problematic patterns
239
+ normalized = re.sub(r'https?://[^\s]+', '', str(column_name))
240
+ normalized = re.sub(r'[^\w\s]', '_', normalized) # Replace special chars with underscore
241
+ normalized = re.sub(r'\s+', '_', normalized) # Replace spaces with underscore
242
+ normalized = re.sub(r'_+', '_', normalized) # Replace multiple underscores with single
243
+ normalized = normalized.lower().strip('_') # Lowercase and trim underscores
244
+
245
+ # Ensure it starts with letter or underscore
246
+ if normalized and not (normalized[0].isalpha() or normalized[0] == '_'):
247
+ normalized = f'col_{normalized}'
248
+
249
+ return normalized if normalized else 'unnamed_column'
250
+
251
+
252
+ class DataTransformer:
253
+ """Handles data transformation and dimensional modeling."""
254
+
255
+ def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
256
+ """
257
+ Initialize data transformer.
258
+
259
+ Args:
260
+ conn: DuckDB connection object.
261
+ logger: Logger instance for tracking operations.
262
+ """
263
+ self.conn = conn
264
+ self.logger = logger
265
+
266
+ def create_combined_table(self) -> None:
267
+ """Create a combined table from all raw tables in DuckDB."""
268
+ self.logger.info("Creating combined table in DuckDB...")
269
+
270
+ # Get list of raw tables
271
+ raw_tables = self.conn.execute("""
272
+ SELECT table_name
273
+ FROM information_schema.tables
274
+ WHERE table_name LIKE 'raw_%'
275
+ """).fetchall()
276
+
277
+ if not raw_tables:
278
+ raise ValueError("No raw tables found")
279
+
280
+ # Create union query
281
+ union_parts = [f"SELECT * FROM {table_info[0]}" for table_info in raw_tables]
282
+ union_query = " UNION ALL ".join(union_parts)
283
+
284
+ # Create combined table
285
+ self.conn.execute(f"""
286
+ CREATE TABLE combined_data AS
287
+ {union_query}
288
+ """)
289
+
290
+ count = self.conn.execute("SELECT COUNT(*) FROM combined_data").fetchone()[0]
291
+ self.logger.info(f"Created combined table with {count:,} records")
292
+
293
+ def remove_columns_with_missing_data(self, table_name: str, threshold: float = 0.8) -> List[str]:
294
+ """
295
+ Remove columns with high missing data directly in DuckDB.
296
+
297
+ Args:
298
+ table_name: Name of the table to clean.
299
+ threshold: Threshold for missing data ratio (0.0 to 1.0).
300
+
301
+ Returns:
302
+ List of columns that were kept.
303
+ """
304
+ self.logger.info(f"Removing columns with >{threshold*100}% missing data from {table_name}...")
305
+
306
+ total_rows = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
307
+ columns_info = self.conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()
308
+
309
+ columns_to_keep = []
310
+ columns_removed = []
311
+
312
+ for col_info in columns_info:
313
+ col_name = col_info[1]
314
+ col_type = col_info[2] # Get column type for better handling
315
+
316
+ try:
317
+ # Handle different column types appropriately
318
+ if col_type.upper() in ['INTEGER', 'BIGINT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC']:
319
+ # For numeric columns, only check for NULL
320
+ non_null_count = self.conn.execute(f"""
321
+ SELECT COUNT(*)
322
+ FROM {table_name}
323
+ WHERE "{col_name}" IS NOT NULL
324
+ """).fetchone()[0]
325
+ else:
326
+ # For text columns, check for NULL and empty strings
327
+ non_null_count = self.conn.execute(f"""
328
+ SELECT COUNT(*)
329
+ FROM {table_name}
330
+ WHERE "{col_name}" IS NOT NULL
331
+ AND TRIM(CAST("{col_name}" AS VARCHAR)) != ''
332
+ """).fetchone()[0]
333
+
334
+ missing_ratio = 1 - (non_null_count / total_rows)
335
+
336
+ self.logger.debug(f"Column {col_name}: {non_null_count}/{total_rows} non-null ({missing_ratio:.2%} missing)")
337
+
338
+ if missing_ratio <= threshold:
339
+ columns_to_keep.append(col_name)
340
+ else:
341
+ columns_removed.append(col_name)
342
+ self.logger.info(f"Removing column {col_name} with {missing_ratio:.2%} missing data")
343
+
344
+ except Exception as e:
345
+ self.logger.warning(f"Error processing column {col_name}: {e}")
346
+ # When in doubt, keep the column
347
+ columns_to_keep.append(col_name)
348
+
349
+ if columns_removed:
350
+ self.logger.info(f"Removing {len(columns_removed)} columns with high missing data")
351
+ self._recreate_table_with_columns(table_name, columns_to_keep)
352
+
353
+ return columns_to_keep
354
+
355
+ def _recreate_table_with_columns(self, table_name: str, columns_to_keep: List[str]) -> None:
356
+ """
357
+ Recreate table with only specified columns.
358
+
359
+ Args:
360
+ table_name: Original table name.
361
+ columns_to_keep: List of column names to retain.
362
+ """
363
+ columns_str = ', '.join(columns_to_keep)
364
+ self.conn.execute(f"""
365
+ CREATE TABLE {table_name}_clean AS
366
+ SELECT {columns_str}
367
+ FROM {table_name}
368
+ """)
369
+
370
+ self.conn.execute(f"DROP TABLE {table_name}")
371
+ self.conn.execute(f"ALTER TABLE {table_name}_clean RENAME TO {table_name}")
372
+
373
+
374
+ class DimensionalModeler:
375
+ """Creates dimensional model tables for analytics."""
376
+
377
+ def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
378
+ """
379
+ Initialize dimensional modeler.
380
+
381
+ Args:
382
+ conn: DuckDB connection object.
383
+ logger: Logger instance for tracking operations.
384
+ """
385
+ self.conn = conn
386
+ self.logger = logger
387
+
388
+ def create_all_dimensions(self) -> None:
389
+ """Create all dimension tables."""
390
+ self._prepare_cleaned_data()
391
+ self._create_beneficiary_dimension()
392
+ self._create_employer_dimension()
393
+ self._create_job_dimension()
394
+ self._create_agent_dimension()
395
+ self._create_status_dimension()
396
+ self._create_date_dimension()
397
+
398
+ def _prepare_cleaned_data(self) -> None:
399
+ """Prepare cleaned data table for dimension creation."""
400
+ self.logger.info("Preparing cleaned data...")
401
+ # First, check what columns actually exist in combined_data
402
+ columns_info = self.conn.execute("PRAGMA table_info('combined_data')").fetchall()
403
+ available_columns = [col[1] for col in columns_info]
404
+ self.logger.info(f"Available columns in combined_data: {available_columns}")
405
+ self.conn.execute("""
406
+ CREATE TABLE cleaned_data AS
407
+ SELECT
408
+ ROW_NUMBER() OVER () as original_row_id,
409
+ *
410
+ FROM combined_data
411
+ """)
412
+
413
+ def _create_beneficiary_dimension(self) -> None:
414
+ """Create beneficiary dimension table."""
415
+ self.logger.info("Creating dim_beneficiary...")
416
+ self.conn.execute("""
417
+ CREATE TABLE dim_beneficiary AS
418
+ SELECT DISTINCT
419
+ ROW_NUMBER() OVER () as beneficiary_key,
420
+ MD5(CONCAT(
421
+ COALESCE(country_of_birth, ''), '|',
422
+ COALESCE(country_of_nationality, ''), '|',
423
+ COALESCE(CAST(ben_year_of_birth AS VARCHAR), ''), '|',
424
+ COALESCE(gender, '')
425
+ )) as beneficiary_id,
426
+ country_of_birth,
427
+ country_of_nationality,
428
+ ben_year_of_birth,
429
+ gender,
430
+ ben_sex,
431
+ ben_country_of_birth,
432
+ ben_current_class,
433
+ ben_education_code,
434
+ ed_level_definition,
435
+ ben_pfield_of_study
436
+ FROM cleaned_data
437
+ WHERE country_of_birth IS NOT NULL
438
+ OR country_of_nationality IS NOT NULL
439
+ OR ben_year_of_birth IS NOT NULL
440
+ OR gender IS NOT NULL
441
+ """)
442
+
443
+ def _create_employer_dimension(self) -> None:
444
+ """Create employer dimension table."""
445
+ self.logger.info("Creating dim_employer...")
446
+ self.conn.execute("""
447
+ CREATE TABLE dim_employer AS
448
+ SELECT DISTINCT
449
+ ROW_NUMBER() OVER () as employer_key,
450
+ MD5(CONCAT(
451
+ COALESCE(employer_name, ''), '|',
452
+ COALESCE(fein, '')
453
+ )) as employer_id,
454
+ employer_name,
455
+ fein,
456
+ mail_addr,
457
+ city,
458
+ state,
459
+ zip
460
+ FROM cleaned_data
461
+ WHERE employer_name IS NOT NULL OR fein IS NOT NULL
462
+ """)
463
+
464
+ def _create_job_dimension(self) -> None:
465
+ """Create job dimension table."""
466
+ self.logger.info("Creating dim_job...")
467
+ self.conn.execute("""
468
+ CREATE TABLE dim_job AS
469
+ SELECT DISTINCT
470
+ ROW_NUMBER() OVER () as job_key,
471
+ MD5(CONCAT(
472
+ COALESCE(job_title, ''), '|',
473
+ COALESCE(naics_code, '')
474
+ )) as job_id,
475
+ job_title,
476
+ dot_code,
477
+ naics_code,
478
+ wage_amt,
479
+ wage_unit,
480
+ full_time_ind,
481
+ ben_comp_paid,
482
+ worksite_city,
483
+ worksite_state
484
+ FROM cleaned_data
485
+ WHERE job_title IS NOT NULL OR naics_code IS NOT NULL
486
+ """)
487
+
488
+ def _create_agent_dimension(self) -> None:
489
+ """Create agent dimension table."""
490
+ self.logger.info("Creating dim_agent...")
491
+ self.conn.execute("""
492
+ CREATE TABLE dim_agent AS
493
+ SELECT DISTINCT
494
+ ROW_NUMBER() OVER () as agent_key,
495
+ MD5(CONCAT(
496
+ COALESCE(agent_first_name, ''), '|',
497
+ COALESCE(agent_last_name, '')
498
+ )) as agent_id,
499
+ agent_first_name,
500
+ agent_last_name
501
+ FROM cleaned_data
502
+ WHERE agent_first_name IS NOT NULL OR agent_last_name IS NOT NULL
503
+ """)
504
+
505
+ def _create_status_dimension(self) -> None:
506
+ """Create status dimension table."""
507
+ self.logger.info("Creating dim_status...")
508
+ self.conn.execute("""
509
+ CREATE TABLE dim_status AS
510
+ SELECT DISTINCT
511
+ ROW_NUMBER() OVER () as status_key,
512
+ status_type,
513
+ first_decision
514
+ FROM cleaned_data
515
+ WHERE status_type IS NOT NULL OR first_decision IS NOT NULL
516
+ """)
517
+
518
+ def _create_date_dimension(self) -> None:
519
+ """Create date dimension table."""
520
+ self.logger.info("Creating dim_date...")
521
+ self.conn.execute("""
522
+ CREATE TABLE dim_date AS
523
+ WITH all_dates AS (
524
+ -- Handle MM/DD/YYYY format
525
+ SELECT TRY_STRPTIME(rec_date, '%m/%d/%Y') as date_value
526
+ FROM cleaned_data
527
+ WHERE rec_date IS NOT NULL
528
+ AND rec_date NOT LIKE '%(%'
529
+ AND LENGTH(rec_date) >= 8
530
+ AND rec_date ~ '^[0-9/-]+$'
531
+ AND TRY_STRPTIME(rec_date, '%m/%d/%Y') IS NOT NULL
532
+
533
+ UNION
534
+
535
+ -- Handle YYYY-MM-DD format
536
+ SELECT TRY_STRPTIME(rec_date, '%Y-%m-%d') as date_value
537
+ FROM cleaned_data
538
+ WHERE rec_date IS NOT NULL
539
+ AND rec_date NOT LIKE '%(%'
540
+ AND LENGTH(rec_date) >= 8
541
+ AND rec_date ~ '^[0-9-]+$'
542
+ AND TRY_STRPTIME(rec_date, '%Y-%m-%d') IS NOT NULL
543
+
544
+ UNION
545
+
546
+ -- Handle first_decision_date MM/DD/YYYY format
547
+ SELECT TRY_STRPTIME(first_decision_date, '%m/%d/%Y') as date_value
548
+ FROM cleaned_data
549
+ WHERE first_decision_date IS NOT NULL
550
+ AND first_decision_date NOT LIKE '%(%'
551
+ AND LENGTH(first_decision_date) >= 8
552
+ AND first_decision_date ~ '^[0-9/-]+$'
553
+ AND TRY_STRPTIME(first_decision_date, '%m/%d/%Y') IS NOT NULL
554
+
555
+ UNION
556
+
557
+ -- Handle first_decision_date YYYY-MM-DD format
558
+ SELECT TRY_STRPTIME(first_decision_date, '%Y-%m-%d') as date_value
559
+ FROM cleaned_data
560
+ WHERE first_decision_date IS NOT NULL
561
+ AND first_decision_date NOT LIKE '%(%'
562
+ AND LENGTH(first_decision_date) >= 8
563
+ AND first_decision_date ~ '^[0-9-]+$'
564
+ AND TRY_STRPTIME(first_decision_date, '%Y-%m-%d') IS NOT NULL
565
+
566
+ UNION
567
+
568
+ -- Handle valid_from MM/DD/YYYY format
569
+ SELECT TRY_STRPTIME(valid_from, '%m/%d/%Y') as date_value
570
+ FROM cleaned_data
571
+ WHERE valid_from IS NOT NULL
572
+ AND valid_from NOT LIKE '%(%'
573
+ AND LENGTH(valid_from) >= 8
574
+ AND valid_from ~ '^[0-9/-]+$'
575
+ AND TRY_STRPTIME(valid_from, '%m/%d/%Y') IS NOT NULL
576
+
577
+ UNION
578
+
579
+ -- Handle valid_from YYYY-MM-DD format
580
+ SELECT TRY_STRPTIME(valid_from, '%Y-%m-%d') as date_value
581
+ FROM cleaned_data
582
+ WHERE valid_from IS NOT NULL
583
+ AND valid_from NOT LIKE '%(%'
584
+ AND LENGTH(valid_from) >= 8
585
+ AND valid_from ~ '^[0-9-]+$'
586
+ AND TRY_STRPTIME(valid_from, '%Y-%m-%d') IS NOT NULL
587
+
588
+ UNION
589
+
590
+ -- Handle valid_to MM/DD/YYYY format
591
+ SELECT TRY_STRPTIME(valid_to, '%m/%d/%Y') as date_value
592
+ FROM cleaned_data
593
+ WHERE valid_to IS NOT NULL
594
+ AND valid_to NOT LIKE '%(%'
595
+ AND LENGTH(valid_to) >= 8
596
+ AND valid_to ~ '^[0-9/]+$'
597
+ AND valid_to LIKE '%/%/%'
598
+ AND TRY_STRPTIME(valid_to, '%m/%d/%Y') IS NOT NULL
599
+
600
+ UNION
601
+
602
+ -- Handle valid_to YYYY-MM-DD format
603
+ SELECT TRY_STRPTIME(valid_to, '%Y-%m-%d') as date_value
604
+ FROM cleaned_data
605
+ WHERE valid_to IS NOT NULL
606
+ AND valid_to NOT LIKE '%(%'
607
+ AND LENGTH(valid_to) >= 8
608
+ AND valid_to ~ '^[0-9-]+$'
609
+ AND TRY_STRPTIME(valid_to, '%Y-%m-%d') IS NOT NULL
610
+ )
611
+ SELECT DISTINCT
612
+ date_value as date,
613
+ EXTRACT(YEAR FROM date_value) as year,
614
+ EXTRACT(MONTH FROM date_value) as month,
615
+ EXTRACT(QUARTER FROM date_value) as quarter,
616
+ EXTRACT(DOW FROM date_value) as day_of_week,
617
+ MONTHNAME(date_value) as month_name,
618
+ 'Q' || CAST(EXTRACT(QUARTER FROM date_value) AS VARCHAR) as quarter_name,
619
+ CASE
620
+ WHEN EXTRACT(MONTH FROM date_value) >= 10
621
+ THEN EXTRACT(YEAR FROM date_value)
622
+ ELSE EXTRACT(YEAR FROM date_value) - 1
623
+ END as fiscal_year
624
+ FROM all_dates
625
+ WHERE date_value IS NOT NULL
626
+ ORDER BY date_value
627
+ """)
628
+
629
+ def create_fact_table(self) -> None:
630
+ """Create the fact table with foreign keys."""
631
+ self.logger.info("Creating fact table in DuckDB...")
632
+
633
+ self.conn.execute("""
634
+ CREATE TABLE fact_h1b_applications AS
635
+ SELECT
636
+ ROW_NUMBER() OVER () as record_id,
637
+
638
+ COALESCE(db.beneficiary_key, -1) as beneficiary_key,
639
+ COALESCE(de.employer_key, -1) as employer_key,
640
+ COALESCE(dj.job_key, -1) as job_key,
641
+ COALESCE(da.agent_key, -1) as agent_key,
642
+ COALESCE(ds.status_key, -1) as status_key,
643
+
644
+ -- Handle multiple date formats for rec_date
645
+ CASE
646
+ WHEN cd.rec_date IS NOT NULL AND cd.rec_date NOT LIKE '%(%'
647
+ THEN CASE
648
+ WHEN TRY_STRPTIME(cd.rec_date, '%m/%d/%Y') IS NOT NULL
649
+ THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%m/%d/%Y')) AS INTEGER)
650
+ WHEN TRY_STRPTIME(cd.rec_date, '%Y-%m-%d') IS NOT NULL
651
+ THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%Y-%m-%d')) AS INTEGER)
652
+ ELSE NULL
653
+ END
654
+ ELSE NULL
655
+ END as rec_date_key,
656
+
657
+ -- Handle multiple date formats for first_decision_date
658
+ CASE
659
+ WHEN cd.first_decision_date IS NOT NULL AND cd.first_decision_date NOT LIKE '%(%'
660
+ THEN CASE
661
+ WHEN TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y') IS NOT NULL
662
+ THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y')) AS INTEGER)
663
+ WHEN TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d') IS NOT NULL
664
+ THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d')) AS INTEGER)
665
+ ELSE NULL
666
+ END
667
+ ELSE NULL
668
+ END as first_decision_date_key,
669
+
670
+ cd.lottery_year,
671
+ cd.ben_multi_reg_ind,
672
+ cd.receipt_number,
673
+ cd.source_file,
674
+ cd.fiscal_year
675
+
676
+ FROM cleaned_data cd
677
+
678
+ LEFT JOIN dim_beneficiary db ON
679
+ cd.original_row_id = db.beneficiary_key AND
680
+ COALESCE(cd.country_of_birth, '') = COALESCE(db.country_of_birth, '') AND
681
+ COALESCE(cd.country_of_nationality, '') = COALESCE(db.country_of_nationality, '') AND
682
+ COALESCE(cd.ben_year_of_birth, '0') = COALESCE(db.ben_year_of_birth, '0') AND
683
+ COALESCE(cd.gender, '') = COALESCE(db.gender, '')
684
+
685
+ LEFT JOIN dim_employer de ON
686
+ cd.original_row_id = de.employer_key AND
687
+ COALESCE(cd.employer_name, '') = COALESCE(de.employer_name, '') AND
688
+ COALESCE(cd.fein, '') = COALESCE(de.fein, '')
689
+
690
+ LEFT JOIN dim_job dj ON
691
+ cd.original_row_id = dj.job_key AND
692
+ COALESCE(cd.job_title, '') = COALESCE(dj.job_title, '') AND
693
+ COALESCE(cd.naics_code, '') = COALESCE(dj.naics_code, '')
694
+
695
+ LEFT JOIN dim_agent da ON
696
+ cd.original_row_id = da.agent_key AND
697
+ COALESCE(cd.agent_first_name, '') = COALESCE(da.agent_first_name, '') AND
698
+ COALESCE(cd.agent_last_name, '') = COALESCE(da.agent_last_name, '')
699
+
700
+ LEFT JOIN dim_status ds ON
701
+ cd.original_row_id = ds.status_key AND
702
+ COALESCE(cd.status_type, '') = COALESCE(ds.status_type, '') AND
703
+ COALESCE(cd.first_decision, '') = COALESCE(ds.first_decision, '')
704
+ """)
705
+
706
+ def create_lookup_tables(self) -> None:
707
+ """Create lookup tables for reference data."""
708
+ self.logger.info("Creating lookup tables in DuckDB...")
709
+
710
+ # Country codes lookup
711
+ self.conn.execute("""
712
+ CREATE TABLE lookup_country_codes AS
713
+ SELECT * FROM VALUES
714
+ ('IND', 'India', 'Asia'),
715
+ ('CHN', 'China', 'Asia'),
716
+ ('KOR', 'South Korea', 'Asia'),
717
+ ('CAN', 'Canada', 'North America'),
718
+ ('NPL', 'Nepal', 'Asia'),
719
+ ('USA', 'United States', 'North America')
720
+ AS t(country_code, country_name, region)
721
+ """)
722
+
723
+ # Education levels
724
+ self.conn.execute("""
725
+ CREATE TABLE lookup_education_levels AS
726
+ SELECT * FROM VALUES
727
+ ('A', 'No Diploma', 'Basic'),
728
+ ('B', 'High School', 'Basic'),
729
+ ('C', 'Some College', 'Undergraduate'),
730
+ ('D', 'College No Degree', 'Undergraduate'),
731
+ ('E', 'Associates', 'Undergraduate'),
732
+ ('F', 'Bachelors', 'Undergraduate'),
733
+ ('G', 'Masters', 'Graduate'),
734
+ ('H', 'Professional', 'Graduate'),
735
+ ('I', 'Doctorate', 'Graduate')
736
+ AS t(education_code, education_level, education_category)
737
+ """)
738
+
739
+ # Application status types
740
+ self.conn.execute("""
741
+ CREATE TABLE lookup_status_types AS
742
+ SELECT * FROM VALUES
743
+ ('ELIGIBLE', 'Application is eligible for lottery', 'Lottery'),
744
+ ('SELECTED', 'Selected in H-1B lottery', 'Lottery'),
745
+ ('CREATED', 'Application record created', 'Administrative')
746
+ AS t(status_type, status_description, status_category)
747
+ """)
748
+
749
+
750
+ class DatabaseOptimizer:
751
+ """Handles database optimization and indexing."""
752
+
753
+ def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
754
+ """
755
+ Initialize database optimizer.
756
+
757
+ Args:
758
+ conn: DuckDB connection object.
759
+ logger: Logger instance for tracking operations.
760
+ """
761
+ self.conn = conn
762
+ self.logger = logger
763
+
764
+ def create_indexes(self) -> None:
765
+ """Create indexes for better query performance."""
766
+ self.logger.info("Creating indexes in DuckDB...")
767
+
768
+ indexes = [
769
+ ("idx_fact_beneficiary", "fact_h1b_applications", "beneficiary_key"),
770
+ ("idx_fact_employer", "fact_h1b_applications", "employer_key"),
771
+ ("idx_fact_job", "fact_h1b_applications", "job_key"),
772
+ ("idx_fact_lottery_year", "fact_h1b_applications", "lottery_year"),
773
+ ("idx_fact_fiscal_year", "fact_h1b_applications", "fiscal_year"),
774
+ ("idx_fact_rec_date", "fact_h1b_applications", "rec_date_key"),
775
+ ("idx_dim_beneficiary_id", "dim_beneficiary", "beneficiary_id"),
776
+ ("idx_dim_employer_id", "dim_employer", "employer_id"),
777
+ ("idx_dim_job_id", "dim_job", "job_id"),
778
+ ]
779
+
780
+ for index_name, table_name, column_name in indexes:
781
+ try:
782
+ self.conn.execute(f"CREATE INDEX {index_name} ON {table_name}({column_name})")
783
+ except Exception as e:
784
+ self.logger.warning(f"Could not create index {index_name}: {e}")
785
+
786
+ self.logger.info("Indexes created successfully!")
787
+
788
+
789
+ class DataQualityChecker:
790
+ """Performs data quality checks and validation."""
791
+
792
+ def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
793
+ """
794
+ Initialize data quality checker.
795
+
796
+ Args:
797
+ conn: DuckDB connection object.
798
+ logger: Logger instance for tracking operations.
799
+ """
800
+ self.conn = conn
801
+ self.logger = logger
802
+
803
+ def run_all_checks(self) -> bool:
804
+ """
805
+ Run all data quality checks.
806
+
807
+ Returns:
808
+ True if all checks pass, False otherwise.
809
+ """
810
+ self.logger.info("Running data quality checks...")
811
+
812
+ try:
813
+ self._check_table_counts()
814
+ self._check_fact_table_integrity()
815
+ return True
816
+ except Exception as e:
817
+ self.logger.error(f"Error in data quality checks: {e}")
818
+ return False
819
+
820
+ def _check_table_counts(self) -> None:
821
+ """Check row counts for all tables."""
822
+ tables_query = """
823
+ SELECT table_name, estimated_size as row_count
824
+ FROM duckdb_tables()
825
+ WHERE schema_name = 'main'
826
+ ORDER BY table_name
827
+ """
828
+ tables_info = self.conn.execute(tables_query).fetchall()
829
+
830
+ self.logger.info("Table row counts:")
831
+ for table_name, _ in tables_info:
832
+ if not table_name.startswith('raw_'):
833
+ count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
834
+ self.logger.info(f" {table_name}: {count:,} records")
835
+
836
+ def _check_fact_table_integrity(self) -> None:
837
+ """Check fact table for duplicates and integrity."""
838
+ dup_check = self.conn.execute("""
839
+ SELECT COUNT(*) as total_records,
840
+ COUNT(DISTINCT record_id) as unique_records
841
+ FROM fact_h1b_applications
842
+ """).fetchone()
843
+
844
+ self.logger.info(f"Fact table: {dup_check[0]:,} total records, {dup_check[1]:,} unique records")
845
+
846
+
847
+ class DatabasePersistence:
848
+ """Handles database persistence operations."""
849
+
850
+ def __init__(self, logger: logging.Logger):
851
+ """
852
+ Initialize database persistence handler.
853
+
854
+ Args:
855
+ logger: Logger instance for tracking operations.
856
+ """
857
+ self.logger = logger
858
+
859
+ def save_to_persistent_database(self, source_conn: duckdb.DuckDBPyConnection,
860
+ target_path: str) -> None:
861
+ """
862
+ Save tables to a persistent database file.
863
+
864
+ Args:
865
+ source_conn: Source database connection.
866
+ target_path: Path to the target persistent database file.
867
+ """
868
+ self.logger.info(f"Saving to persistent database: {target_path}")
869
+
870
+ # Remove existing file if it exists
871
+ if os.path.exists(target_path):
872
+ os.remove(target_path)
873
+ self.logger.info(f"Removed existing database file: {target_path}")
874
+
875
+ # Create persistent database connection
876
+ with duckdb.connect(target_path) as persistent_conn:
877
+ # Get tables to copy (exclude temporary tables)
878
+ tables_to_keep = source_conn.execute("""
879
+ SELECT table_name
880
+ FROM information_schema.tables
881
+ WHERE table_name NOT LIKE 'raw_%'
882
+ AND table_name NOT IN ('combined_data', 'cleaned_data')
883
+ AND table_schema = 'main'
884
+ """).fetchall()
885
+
886
+ # Copy tables
887
+ for table_info in tables_to_keep:
888
+ table_name = table_info[0]
889
+ df = source_conn.execute(f"SELECT * FROM {table_name}").df()
890
+ persistent_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
891
+ self.logger.info(f"Copied table {table_name} to persistent database")
892
+
893
+ self.logger.info(f"Persistent database saved to: {target_path}")
894
+
895
+
896
+ # Configuration and Constants
897
+ class Config:
898
+ """Configuration class for the H1B data pipeline."""
899
+
900
+ CSV_FILES = [
901
+ './data/TRK_13139_FY2021.csv',
902
+ './data/TRK_13139_FY2022.csv',
903
+ './data/TRK_13139_FY2023.csv',
904
+ './data/TRK_13139_FY2024_single_reg.csv',
905
+ './data/TRK_13139_FY2024_multi_reg.csv'
906
+ ]
907
+
908
+ XLSX_FILE = './data/TRK_13139_I129_H1B_Registrations_FY21_FY24_FOIA_FIN.xlsx'
909
+ PERSISTENT_DB_PATH = './data/h1bs_analytics.duckdb'
910
+ MISSING_DATA_THRESHOLD = 0.99
911
+
912
+
913
+ def main():
914
+ """Main execution function for the H1B data pipeline."""
915
+ print("Starting H1B Data Analytics Pipeline...")
916
+ print("All imports successful!")
917
+
918
+ # Check memory usage
919
+ MemoryManager.check_memory_usage()
920
+
921
+ # Validate input files
922
+ existing_files, missing_files = FileValidator.validate_files(Config.CSV_FILES)
923
+ if missing_files:
924
+ print(f"Warning: {len(missing_files)} files are missing")
925
+
926
+ # Run the pipeline
927
+ try:
928
+ with H1BDataPipeline() as pipeline:
929
+ # Load data
930
+ data_loader = DataLoader(pipeline.conn, pipeline.logger)
931
+ data_loader.load_csv_files(existing_files)
932
+
933
+ # Transform data
934
+ transformer = DataTransformer(pipeline.conn, pipeline.logger)
935
+ transformer.create_combined_table()
936
+ kept_columns = transformer.remove_columns_with_missing_data(
937
+ 'combined_data', Config.MISSING_DATA_THRESHOLD
938
+ )
939
+ print(f"Kept {len(kept_columns)} columns after cleaning")
940
+
941
+ # Create dimensional model
942
+ modeler = DimensionalModeler(pipeline.conn, pipeline.logger)
943
+ modeler.create_all_dimensions()
944
+ modeler.create_fact_table()
945
+ modeler.create_lookup_tables()
946
+
947
+ # Optimize database
948
+ optimizer = DatabaseOptimizer(pipeline.conn, pipeline.logger)
949
+ optimizer.create_indexes()
950
+
951
+ # Run quality checks
952
+ quality_checker = DataQualityChecker(pipeline.conn, pipeline.logger)
953
+ quality_checker.run_all_checks()
954
+
955
+ # Save to persistent database
956
+ persistence = DatabasePersistence(pipeline.logger)
957
+ persistence.save_to_persistent_database(
958
+ pipeline.conn, Config.PERSISTENT_DB_PATH
959
+ )
960
+
961
+ # Final memory check
962
+ MemoryManager.check_memory_usage()
963
+ MemoryManager.clear_memory()
964
+
965
+ print("Pipeline completed successfully!")
966
+
967
+ except Exception as e:
968
+ print(f"Pipeline failed with error: {e}")
969
+ traceback.print_exc()
970
+
971
+
972
+ if __name__ == "__main__":
973
+ main()
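Once the pipeline has written ./data/h1bs_analytics.duckdb, the star schema can be queried directly. A sketch of one such query, using only tables and columns created above (fact_h1b_applications, dim_employer), assuming the pipeline has been run to completion:

import duckdb

con = duckdb.connect("./data/h1bs_analytics.duckdb", read_only=True)
# applications per employer and lottery year, joining the fact table to its employer dimension
print(con.execute("""
    SELECT f.lottery_year, e.employer_name, COUNT(*) AS applications
    FROM fact_h1b_applications f
    JOIN dim_employer e ON f.employer_key = e.employer_key
    GROUP BY f.lottery_year, e.employer_name
    ORDER BY applications DESC
    LIMIT 10
""").df())
con.close()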