Upload folder using huggingface_hub
- .gradio/certificate.pem +31 -0
- README.md +2 -8
- __pycache__/model_and_load_toduckdb.cpython-311.pyc +0 -0
- app.py +489 -0
- model_and_load_toduckdb.py +973 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: blue
-colorTo: gray
+title: hones
+app_file: app.py
 sdk: gradio
 sdk_version: 5.34.2
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/model_and_load_toduckdb.cpython-311.pyc
ADDED
Binary file (50.4 kB)
app.py
ADDED
@@ -0,0 +1,489 @@
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import duckdb
import numpy as np
from datetime import datetime
import os

# Database connection
DATABASE_PATH = "./data/h1bs_analytics.duckdb"

def get_db_connection():
    """Create a connection to the DuckDB database"""
    if os.path.exists(DATABASE_PATH):
        return duckdb.connect(DATABASE_PATH, read_only=True)
    else:
        # Create sample data if database doesn't exist
        return create_sample_data()

def create_sample_data():
    """Create sample H1B facts data for demonstration"""
    conn = duckdb.connect(":memory:")

    # Sample fact table based on H1B schema
    np.random.seed(42)
    n_records = 5000

    sample_facts = pd.DataFrame({
        'record_id': range(1, n_records + 1),
        'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'country_of_birth': np.random.choice([
            'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
            'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
        ], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
        'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0),  # Log-normal for realistic wage distribution
        'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
        'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
        'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
        'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
        'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
        'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
        'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
        'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
    })

    conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")

    return conn

# Load data
conn = get_db_connection()

def load_facts_data():
    """Load H1B applications fact table"""
    try:
        query = """
            SELECT * FROM fct_h1b_applications
            WHERE wage_amt IS NOT NULL
            LIMIT 10000
        """
        return conn.execute(query).df()
    except Exception as e:
        print(f"Error loading facts data: {e}")
        return pd.DataFrame()

# Load the facts data
facts_df = load_facts_data()

# ---- FACTS TABLE VISUALIZATIONS ----

def facts_overview():
    """Overview of the facts table with key metrics"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No facts data available")

    # Key metrics
    total_records = len(facts_df)
    avg_wage = facts_df['wage_amt'].mean()
    median_wage = facts_df['wage_amt'].median()
    multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100

    # Create metrics dashboard
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "indicator"}, {"type": "indicator"}],
               [{"type": "indicator"}, {"type": "indicator"}]],
        subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=total_records,
            number={"valueformat": ","},
            title={"text": "Total Records"}
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=avg_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Average Wage"}
        ),
        row=1, col=2
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=median_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Median Wage"}
        ),
        row=2, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=multiple_reg_pct,
            number={"suffix": "%", "valueformat": ".1f"},
            title={"text": "Multiple Registrations"}
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=400,
        title_text="H1B Facts Table - Key Metrics"
    )

    return fig

def wage_distribution():
    """Visualize wage distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "histogram"}, {"type": "box"}]],
        subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
    )

    # Histogram
    fig.add_trace(
        go.Histogram(
            x=facts_df['wage_amt'],
            nbinsx=50,
            marker_color='skyblue',
            opacity=0.7,
            name='Wage Distribution'
        ),
        row=1, col=1
    )

    # Box plot
    fig.add_trace(
        go.Box(
            y=facts_df['wage_amt'],
            marker_color='lightcoral',
            name='Wage Box Plot'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Wage Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
    fig.update_yaxes(title_text="Frequency", row=1, col=1)
    fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)

    return fig

def country_analysis():
    """Analyze country distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Country counts
    country_counts = facts_df['country_of_birth'].value_counts().head(10)

    # Average wage by country
    country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
    country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean')  # Min 50 applications

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "bar"}, {"type": "bar"}]],
        subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
    )

    # Applications by country
    fig.add_trace(
        go.Bar(
            x=country_counts.index,
            y=country_counts.values,
            marker_color='teal',
            text=country_counts.values,
            textposition='auto',
            name='Application Count'
        ),
        row=1, col=1
    )

    # Average wage by country
    fig.add_trace(
        go.Bar(
            x=country_wages['country_of_birth'],
            y=country_wages['mean'],
            marker_color='orange',
            text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
            textposition='auto',
            name='Average Wage'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Country Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=1, col=2)
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)

    return fig

def temporal_analysis():
    """Analyze temporal patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Yearly trends
    yearly_stats = facts_df.groupby('fiscal_year').agg({
        'record_id': 'count',
        'wage_amt': 'mean',
        'is_multiple_registration': 'mean'
    }).reset_index()

    yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100

    fig = make_subplots(
        rows=2, cols=1,
        specs=[[{"secondary_y": True}], [{"type": "bar"}]],
        subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
    )

    # Applications count
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['record_id'],
            mode='lines+markers',
            name='Applications',
            line=dict(color='blue', width=3),
            marker=dict(size=8)
        ),
        row=1, col=1
    )

    # Average wage (secondary y-axis)
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['wage_amt'],
            mode='lines+markers',
            name='Average Wage',
            line=dict(color='red', width=3),
            marker=dict(size=8),
            yaxis='y2'
        ),
        row=1, col=1
    )

    # Multiple registration percentage
    fig.add_trace(
        go.Bar(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['multiple_reg_pct'],
            marker_color='green',
            text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
            textposition='auto',
            name='Multiple Registration %'
        ),
        row=2, col=1
    )

    # Update layout
    fig.update_layout(
        height=600,
        title_text="Temporal Analysis from Facts Table"
    )

    # Update y-axes
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
    fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
    fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)

    return fig

def demographic_analysis():
    """Analyze demographic patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Age distribution
    age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
    age_counts = age_bins.value_counts().sort_index()

    # Full-time vs Part-time
    employment_type = facts_df['full_time_ind'].value_counts()
    employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]

    # Same state employment
    same_state = facts_df['employer_worksite_same_state'].value_counts()
    same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]

    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "pie"}, {"type": "histogram"}]],
        subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
    )

    # Age distribution
    fig.add_trace(
        go.Bar(
            x=[str(interval) for interval in age_counts.index],
            y=age_counts.values,
            marker_color='lightblue',
            name='Age Distribution'
        ),
        row=1, col=1
    )

    # Employment type pie chart
    fig.add_trace(
        go.Pie(
            labels=employment_labels,
            values=employment_type.values,
            name="Employment Type"
        ),
        row=1, col=2
    )

    # Same state pie chart
    fig.add_trace(
        go.Pie(
            labels=same_state_labels,
            values=same_state.values,
            name="Location"
        ),
        row=2, col=1
    )

    # Years since application
    years_since = facts_df['years_since_application'].value_counts().sort_index()
    fig.add_trace(
        go.Histogram(
            x=facts_df['years_since_application'],
            nbinsx=10,
            marker_color='lightgreen',
            name='Years Since Application'
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=600,
        title_text="Demographic Analysis from Facts Table",
        showlegend=False
    )

    return fig

def facts_data_table():
    """Display sample of facts table data"""
    if facts_df.empty:
        return pd.DataFrame()

    # Return first 100 rows with key columns
    display_cols = [
        'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
        'wage_amt', 'age_at_application', 'is_multiple_registration',
        'full_time_ind', 'employer_worksite_same_state'
    ]

    sample_data = facts_df[display_cols].head(100).copy()

    # Format wage column
    sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")

    return sample_data

# ---- GRADIO INTERFACE ----

with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
    gr.Markdown("# 📊 H1B Facts Table Analytics Dashboard")
    gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")

    with gr.Tab("📈 Facts Overview"):
        gr.Markdown("### Key Metrics from Facts Table")
        facts_overview_plot = gr.Plot()
        gr.Button("Load Facts Overview", variant="primary").click(
            fn=facts_overview,
            outputs=facts_overview_plot
        )

    with gr.Tab("💰 Wage Analysis"):
        gr.Markdown("### Wage Distribution from Facts Table")
        wage_plot = gr.Plot()
        gr.Button("Analyze Wages", variant="primary").click(
            fn=wage_distribution,
            outputs=wage_plot
        )

    with gr.Tab("🌍 Country Analysis"):
        gr.Markdown("### Country-wise Analysis from Facts Table")
        country_plot = gr.Plot()
        gr.Button("Analyze Countries", variant="primary").click(
            fn=country_analysis,
            outputs=country_plot
        )

    with gr.Tab("📅 Temporal Analysis"):
        gr.Markdown("### Time-based Trends from Facts Table")
        temporal_plot = gr.Plot()
        gr.Button("Analyze Trends", variant="primary").click(
            fn=temporal_analysis,
            outputs=temporal_plot
        )

    with gr.Tab("👥 Demographics"):
        gr.Markdown("### Demographic Patterns from Facts Table")
        demo_plot = gr.Plot()
        gr.Button("Analyze Demographics", variant="primary").click(
            fn=demographic_analysis,
            outputs=demo_plot
        )

    with gr.Tab("📋 Raw Data"):
        gr.Markdown("### Sample Facts Table Data (First 100 rows)")
        data_table = gr.DataFrame()
        gr.Button("Load Sample Data", variant="primary").click(
            fn=facts_data_table,
            outputs=data_table
        )

    # Footer
    gr.Markdown("---")
    gr.Markdown("### Facts Table Schema")
    gr.Markdown("""
    **Table**: `fct_h1b_applications`

    **Key Columns**:
    - `record_id`: Unique identifier for each application
    - `lottery_year`, `fiscal_year`: Temporal dimensions
    - `country_of_birth`: Beneficiary country
    - `wage_amt`: Offered wage amount
    - `age_at_application`: Beneficiary age
    - `is_multiple_registration`: Multiple lottery entries flag
    - `full_time_ind`: Full-time employment indicator
    - `employer_worksite_same_state`: Location alignment flag
    - Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
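A quick way to check the added dashboard without launching the Gradio server is to render its figures headlessly. The sketch below is not part of the commit; it assumes it runs from the repo root (so `import app` picks up the app.py above) and that either ./data/h1bs_analytics.duckdb exists or the in-memory sample-data fallback is used.

# smoke_test_plots.py - hypothetical helper, a minimal sketch only
import app  # importing app.py builds the Blocks UI but does not call demo.launch()

# Render two dashboard figures to standalone HTML files for inspection.
app.facts_overview().write_html("facts_overview.html")
app.wage_distribution().write_html("wage_distribution.html")
print(f"Rendered plots for {len(app.facts_df):,} fact rows")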
model_and_load_toduckdb.py
ADDED
@@ -0,0 +1,973 @@
"""
H1B Data Analytics Pipeline

This module provides a comprehensive ETL pipeline for processing H1B visa application data.
It loads CSV files into DuckDB, creates dimensional models, and performs data quality checks.
"""

import os
import gc
import logging
import hashlib
from datetime import datetime
from typing import List, Optional, Tuple
import traceback

import duckdb
import pandas as pd
import numpy as np
import psutil


class H1BDataPipeline:
    """
    Main pipeline class for processing H1B visa application data.

    This class handles the complete ETL process including:
    - Loading CSV files into DuckDB
    - Creating dimensional models
    - Data quality checks
    - Database persistence
    """

    def __init__(self, db_path: str = ':memory:', log_level: int = logging.INFO):
        """
        Initialize the H1B data pipeline.

        Args:
            db_path: Path to DuckDB database file. Use ':memory:' for in-memory database.
            log_level: Logging level for the pipeline.
        """
        self.db_path = db_path
        self.conn = None
        self.logger = self._setup_logging(log_level)
        self._setup_database()

    def _setup_logging(self, log_level: int) -> logging.Logger:
        """Set up logging configuration for the pipeline."""
        logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=log_level,
            format="{asctime} - {name} - {levelname} - {message}",
            style="{",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        return logger

    def _setup_database(self) -> None:
        """Initialize DuckDB connection."""
        try:
            self.conn = duckdb.connect(self.db_path)
            self.logger.info(f"DuckDB connection established to {self.db_path}")
            self.logger.info(f"DuckDB version: {duckdb.__version__}")

            # Test connection
            test_result = self.conn.execute("SELECT 'Hello DuckDB!' as message").fetchone()
            self.logger.info(f"Connection test: {test_result[0]}")

        except Exception as e:
            self.logger.error(f"Failed to establish database connection: {e}")
            raise

    def __enter__(self):
        """Context manager entry."""
        return self

    def close(self) -> None:
        """Close database connection and cleanup resources."""
        if self.conn:
            self.conn.close()
            self.logger.info("Database connection closed")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""
        self.close()


class MemoryManager:
    """Utility class for monitoring and managing memory usage."""

    @staticmethod
    def check_memory_usage() -> float:
        """
        Check current memory usage of the process.

        Returns:
            Memory usage in MB.
        """
        process = psutil.Process(os.getpid())
        memory_mb = process.memory_info().rss / 1024 / 1024
        print(f"Current memory usage: {memory_mb:.1f} MB")
        return memory_mb

    @staticmethod
    def clear_memory() -> None:
        """Force garbage collection to clear memory."""
        gc.collect()
        print("Memory cleared")


class FileValidator:
    """Utility class for validating file existence and accessibility."""

    @staticmethod
    def validate_files(file_paths: List[str]) -> Tuple[List[str], List[str]]:
        """
        Validate that files exist and are accessible.

        Args:
            file_paths: List of file paths to validate.

        Returns:
            Tuple of (existing_files, missing_files).
        """
        existing_files = []
        missing_files = []

        for file_path in file_paths:
            if os.path.exists(file_path):
                existing_files.append(file_path)
                print(f"✓ Found: {file_path}")
            else:
                missing_files.append(file_path)
                print(f"✗ Missing: {file_path}")

        return existing_files, missing_files


class DataLoader:
    """Handles loading data from various sources into DuckDB."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data loader.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def load_csv_files(self, file_paths: List[str]) -> None:
        """
        Load CSV files directly into DuckDB without loading into pandas first.

        Args:
            file_paths: List of CSV file paths to load.
        """
        self.logger.info("Loading CSV files directly into DuckDB...")

        for file_path in file_paths:
            try:
                self._load_single_csv(file_path)
            except Exception as e:
                self.logger.error(f"Error loading {file_path}: {e}")

    def _load_single_csv(self, file_path: str) -> None:
        """
        Load a single CSV file into DuckDB.

        Args:
            file_path: Path to the CSV file.
        """
        self.logger.info(f"Loading {file_path}")

        # Extract metadata from filename
        filename = file_path.split('/')[-1].replace('.csv', '')
        table_name = f"raw_{filename}"
        fiscal_year = self._extract_fiscal_year(filename)

        # Load CSV directly into DuckDB
        self.conn.execute(f"""
            CREATE TABLE {table_name} AS
            SELECT *,
                   '{file_path}' as source_file,
                   '{fiscal_year}' as fiscal_year
            FROM read_csv_auto('{file_path}', header=true, normalize_names=true, ignore_errors=true)
        """)

        # Clean column names
        self._clean_column_names(table_name)

        # Log success
        count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        self.logger.info(f"Loaded {count:,} records from {file_path} into {table_name}")

    def _extract_fiscal_year(self, filename: str) -> str:
        """Extract fiscal year from filename."""
        import re
        match = re.search(r'FY(\d{4})', filename)
        if match:
            return match.group(1)  # Return only the year digits
        return "unknown"

    def _clean_column_names(self, table_name: str) -> None:
        """
        Clean column names in DuckDB table.

        Args:
            table_name: Name of the table to clean.
        """
        columns_query = f"PRAGMA table_info('{table_name}')"
        columns_info = self.conn.execute(columns_query).fetchall()

        for col_info in columns_info:
            old_name = col_info[1]
            new_name = self._normalize_column_name(old_name)

            if old_name != new_name:
                self.conn.execute(f"""
                    ALTER TABLE {table_name}
                    RENAME COLUMN "{old_name}" TO {new_name}
                """)

    @staticmethod
    def _normalize_column_name(column_name: str) -> str:
        """
        Normalize column name to follow consistent naming convention.

        Args:
            column_name: Original column name.

        Returns:
            Normalized column name.
        """
        import re

        # Remove URLs and other problematic patterns
        normalized = re.sub(r'https?://[^\s]+', '', str(column_name))
        normalized = re.sub(r'[^\w\s]', '_', normalized)  # Replace special chars with underscore
        normalized = re.sub(r'\s+', '_', normalized)  # Replace spaces with underscore
        normalized = re.sub(r'_+', '_', normalized)  # Replace multiple underscores with single
        normalized = normalized.lower().strip('_')  # Lowercase and trim underscores

        # Ensure it starts with letter or underscore
        if normalized and not (normalized[0].isalpha() or normalized[0] == '_'):
            normalized = f'col_{normalized}'

        return normalized if normalized else 'unnamed_column'


class DataTransformer:
    """Handles data transformation and dimensional modeling."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data transformer.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_combined_table(self) -> None:
        """Create a combined table from all raw tables in DuckDB."""
        self.logger.info("Creating combined table in DuckDB...")

        # Get list of raw tables
        raw_tables = self.conn.execute("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_name LIKE 'raw_%'
        """).fetchall()

        if not raw_tables:
            raise ValueError("No raw tables found")

        # Create union query
        union_parts = [f"SELECT * FROM {table_info[0]}" for table_info in raw_tables]
        union_query = " UNION ALL ".join(union_parts)

        # Create combined table
        self.conn.execute(f"""
            CREATE TABLE combined_data AS
            {union_query}
        """)

        count = self.conn.execute("SELECT COUNT(*) FROM combined_data").fetchone()[0]
        self.logger.info(f"Created combined table with {count:,} records")

    def remove_columns_with_missing_data(self, table_name: str, threshold: float = 0.8) -> List[str]:
        """
        Remove columns with high missing data directly in DuckDB.

        Args:
            table_name: Name of the table to clean.
            threshold: Threshold for missing data ratio (0.0 to 1.0).

        Returns:
            List of columns that were kept.
        """
        self.logger.info(f"Removing columns with >{threshold*100}% missing data from {table_name}...")

        total_rows = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = self.conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()

        columns_to_keep = []
        columns_removed = []

        for col_info in columns_info:
            col_name = col_info[1]
            col_type = col_info[2]  # Get column type for better handling

            try:
                # Handle different column types appropriately
                if col_type.upper() in ['INTEGER', 'BIGINT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC']:
                    # For numeric columns, only check for NULL
                    non_null_count = self.conn.execute(f"""
                        SELECT COUNT(*)
                        FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                    """).fetchone()[0]
                else:
                    # For text columns, check for NULL and empty strings
                    non_null_count = self.conn.execute(f"""
                        SELECT COUNT(*)
                        FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                          AND TRIM(CAST("{col_name}" AS VARCHAR)) != ''
                    """).fetchone()[0]

                missing_ratio = 1 - (non_null_count / total_rows)

                self.logger.debug(f"Column {col_name}: {non_null_count}/{total_rows} non-null ({missing_ratio:.2%} missing)")

                if missing_ratio <= threshold:
                    columns_to_keep.append(col_name)
                else:
                    columns_removed.append(col_name)
                    self.logger.info(f"Removing column {col_name} with {missing_ratio:.2%} missing data")

            except Exception as e:
                self.logger.warning(f"Error processing column {col_name}: {e}")
                # When in doubt, keep the column
                columns_to_keep.append(col_name)

        if columns_removed:
            self.logger.info(f"Removing {len(columns_removed)} columns with high missing data")
            self._recreate_table_with_columns(table_name, columns_to_keep)

        return columns_to_keep

    def _recreate_table_with_columns(self, table_name: str, columns_to_keep: List[str]) -> None:
        """
        Recreate table with only specified columns.

        Args:
            table_name: Original table name.
            columns_to_keep: List of column names to retain.
        """
        columns_str = ', '.join(columns_to_keep)
        self.conn.execute(f"""
            CREATE TABLE {table_name}_clean AS
            SELECT {columns_str}
            FROM {table_name}
        """)

        self.conn.execute(f"DROP TABLE {table_name}")
        self.conn.execute(f"ALTER TABLE {table_name}_clean RENAME TO {table_name}")


class DimensionalModeler:
    """Creates dimensional model tables for analytics."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize dimensional modeler.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_all_dimensions(self) -> None:
        """Create all dimension tables."""
        self._prepare_cleaned_data()
        self._create_beneficiary_dimension()
        self._create_employer_dimension()
        self._create_job_dimension()
        self._create_agent_dimension()
        self._create_status_dimension()
        self._create_date_dimension()

    def _prepare_cleaned_data(self) -> None:
        """Prepare cleaned data table for dimension creation."""
        self.logger.info("Preparing cleaned data...")
        # First, check what columns actually exist in combined_data
        columns_info = self.conn.execute("PRAGMA table_info('combined_data')").fetchall()
        available_columns = [col[1] for col in columns_info]
        self.logger.info(f"Available columns in combined_data: {available_columns}")
        self.conn.execute("""
            CREATE TABLE cleaned_data AS
            SELECT
                ROW_NUMBER() OVER () as original_row_id,
                *
            FROM combined_data
        """)

    def _create_beneficiary_dimension(self) -> None:
        """Create beneficiary dimension table."""
        self.logger.info("Creating dim_beneficiary...")
        self.conn.execute("""
            CREATE TABLE dim_beneficiary AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as beneficiary_key,
                MD5(CONCAT(
                    COALESCE(country_of_birth, ''), '|',
                    COALESCE(country_of_nationality, ''), '|',
                    COALESCE(CAST(ben_year_of_birth AS VARCHAR), ''), '|',
                    COALESCE(gender, '')
                )) as beneficiary_id,
                country_of_birth,
                country_of_nationality,
                ben_year_of_birth,
                gender,
                ben_sex,
                ben_country_of_birth,
                ben_current_class,
                ben_education_code,
                ed_level_definition,
                ben_pfield_of_study
            FROM cleaned_data
            WHERE country_of_birth IS NOT NULL
               OR country_of_nationality IS NOT NULL
               OR ben_year_of_birth IS NOT NULL
               OR gender IS NOT NULL
        """)

    def _create_employer_dimension(self) -> None:
        """Create employer dimension table."""
        self.logger.info("Creating dim_employer...")
        self.conn.execute("""
            CREATE TABLE dim_employer AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as employer_key,
                MD5(CONCAT(
                    COALESCE(employer_name, ''), '|',
                    COALESCE(fein, '')
                )) as employer_id,
                employer_name,
                fein,
                mail_addr,
                city,
                state,
                zip
            FROM cleaned_data
            WHERE employer_name IS NOT NULL OR fein IS NOT NULL
        """)

    def _create_job_dimension(self) -> None:
        """Create job dimension table."""
        self.logger.info("Creating dim_job...")
        self.conn.execute("""
            CREATE TABLE dim_job AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as job_key,
                MD5(CONCAT(
                    COALESCE(job_title, ''), '|',
                    COALESCE(naics_code, '')
                )) as job_id,
                job_title,
                dot_code,
                naics_code,
                wage_amt,
                wage_unit,
                full_time_ind,
                ben_comp_paid,
                worksite_city,
                worksite_state
            FROM cleaned_data
            WHERE job_title IS NOT NULL OR naics_code IS NOT NULL
        """)

    def _create_agent_dimension(self) -> None:
        """Create agent dimension table."""
        self.logger.info("Creating dim_agent...")
        self.conn.execute("""
            CREATE TABLE dim_agent AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as agent_key,
                MD5(CONCAT(
                    COALESCE(agent_first_name, ''), '|',
                    COALESCE(agent_last_name, '')
                )) as agent_id,
                agent_first_name,
                agent_last_name
            FROM cleaned_data
            WHERE agent_first_name IS NOT NULL OR agent_last_name IS NOT NULL
        """)

    def _create_status_dimension(self) -> None:
        """Create status dimension table."""
        self.logger.info("Creating dim_status...")
        self.conn.execute("""
            CREATE TABLE dim_status AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as status_key,
                status_type,
                first_decision
            FROM cleaned_data
            WHERE status_type IS NOT NULL OR first_decision IS NOT NULL
        """)

    def _create_date_dimension(self) -> None:
        """Create date dimension table."""
        self.logger.info("Creating dim_date...")
        self.conn.execute("""
            CREATE TABLE dim_date AS
            WITH all_dates AS (
                -- Handle MM/DD/YYYY format
                SELECT TRY_STRPTIME(rec_date, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE rec_date IS NOT NULL
                  AND rec_date NOT LIKE '%(%'
                  AND LENGTH(rec_date) >= 8
                  AND rec_date ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(rec_date, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle YYYY-MM-DD format
                SELECT TRY_STRPTIME(rec_date, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE rec_date IS NOT NULL
                  AND rec_date NOT LIKE '%(%'
                  AND LENGTH(rec_date) >= 8
                  AND rec_date ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(rec_date, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle first_decision_date MM/DD/YYYY format
                SELECT TRY_STRPTIME(first_decision_date, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE first_decision_date IS NOT NULL
                  AND first_decision_date NOT LIKE '%(%'
                  AND LENGTH(first_decision_date) >= 8
                  AND first_decision_date ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(first_decision_date, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle first_decision_date YYYY-MM-DD format
                SELECT TRY_STRPTIME(first_decision_date, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE first_decision_date IS NOT NULL
                  AND first_decision_date NOT LIKE '%(%'
                  AND LENGTH(first_decision_date) >= 8
                  AND first_decision_date ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(first_decision_date, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle valid_from MM/DD/YYYY format
                SELECT TRY_STRPTIME(valid_from, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE valid_from IS NOT NULL
                  AND valid_from NOT LIKE '%(%'
                  AND LENGTH(valid_from) >= 8
                  AND valid_from ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(valid_from, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle valid_from YYYY-MM-DD format
                SELECT TRY_STRPTIME(valid_from, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE valid_from IS NOT NULL
                  AND valid_from NOT LIKE '%(%'
                  AND LENGTH(valid_from) >= 8
                  AND valid_from ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(valid_from, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle valid_to MM/DD/YYYY format
                SELECT TRY_STRPTIME(valid_to, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE valid_to IS NOT NULL
                  AND valid_to NOT LIKE '%(%'
                  AND LENGTH(valid_to) >= 8
                  AND valid_to ~ '^[0-9/]+$'
                  AND valid_to LIKE '%/%/%'
                  AND TRY_STRPTIME(valid_to, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle valid_to YYYY-MM-DD format
                SELECT TRY_STRPTIME(valid_to, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE valid_to IS NOT NULL
                  AND valid_to NOT LIKE '%(%'
                  AND LENGTH(valid_to) >= 8
                  AND valid_to ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(valid_to, '%Y-%m-%d') IS NOT NULL
            )
            SELECT DISTINCT
                date_value as date,
                EXTRACT(YEAR FROM date_value) as year,
                EXTRACT(MONTH FROM date_value) as month,
                EXTRACT(QUARTER FROM date_value) as quarter,
                EXTRACT(DOW FROM date_value) as day_of_week,
                MONTHNAME(date_value) as month_name,
                'Q' || CAST(EXTRACT(QUARTER FROM date_value) AS VARCHAR) as quarter_name,
                CASE
                    WHEN EXTRACT(MONTH FROM date_value) >= 10
                    THEN EXTRACT(YEAR FROM date_value)
                    ELSE EXTRACT(YEAR FROM date_value) - 1
                END as fiscal_year
            FROM all_dates
            WHERE date_value IS NOT NULL
            ORDER BY date_value
        """)

    def create_fact_table(self) -> None:
        """Create the fact table with foreign keys."""
        self.logger.info("Creating fact table in DuckDB...")

        self.conn.execute("""
            CREATE TABLE fact_h1b_applications AS
            SELECT
                ROW_NUMBER() OVER () as record_id,

                COALESCE(db.beneficiary_key, -1) as beneficiary_key,
                COALESCE(de.employer_key, -1) as employer_key,
                COALESCE(dj.job_key, -1) as job_key,
                COALESCE(da.agent_key, -1) as agent_key,
                COALESCE(ds.status_key, -1) as status_key,

                -- Handle multiple date formats for rec_date
                CASE
                    WHEN cd.rec_date IS NOT NULL AND cd.rec_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.rec_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.rec_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as rec_date_key,

                -- Handle multiple date formats for first_decision_date
                CASE
                    WHEN cd.first_decision_date IS NOT NULL AND cd.first_decision_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as first_decision_date_key,

                cd.lottery_year,
                cd.ben_multi_reg_ind,
                cd.receipt_number,
                cd.source_file,
                cd.fiscal_year

            FROM cleaned_data cd

            LEFT JOIN dim_beneficiary db ON
                cd.original_row_id = db.beneficiary_key AND
                COALESCE(cd.country_of_birth, '') = COALESCE(db.country_of_birth, '') AND
                COALESCE(cd.country_of_nationality, '') = COALESCE(db.country_of_nationality, '') AND
                COALESCE(cd.ben_year_of_birth, '0') = COALESCE(db.ben_year_of_birth, '0') AND
                COALESCE(cd.gender, '') = COALESCE(db.gender, '')

            LEFT JOIN dim_employer de ON
                cd.original_row_id = de.employer_key AND
                COALESCE(cd.employer_name, '') = COALESCE(de.employer_name, '') AND
                COALESCE(cd.fein, '') = COALESCE(de.fein, '')

            LEFT JOIN dim_job dj ON
                cd.original_row_id = dj.job_key AND
                COALESCE(cd.job_title, '') = COALESCE(dj.job_title, '') AND
                COALESCE(cd.naics_code, '') = COALESCE(dj.naics_code, '')

            LEFT JOIN dim_agent da ON
                cd.original_row_id = da.agent_key AND
                COALESCE(cd.agent_first_name, '') = COALESCE(da.agent_first_name, '') AND
                COALESCE(cd.agent_last_name, '') = COALESCE(da.agent_last_name, '')

            LEFT JOIN dim_status ds ON
                cd.original_row_id = ds.status_key AND
                COALESCE(cd.status_type, '') = COALESCE(ds.status_type, '') AND
                COALESCE(cd.first_decision, '') = COALESCE(ds.first_decision, '')
        """)

    def create_lookup_tables(self) -> None:
        """Create lookup tables for reference data."""
        self.logger.info("Creating lookup tables in DuckDB...")

        # Country codes lookup
        self.conn.execute("""
            CREATE TABLE lookup_country_codes AS
            SELECT * FROM VALUES
                ('IND', 'India', 'Asia'),
                ('CHN', 'China', 'Asia'),
                ('KOR', 'South Korea', 'Asia'),
                ('CAN', 'Canada', 'North America'),
                ('NPL', 'Nepal', 'Asia'),
                ('USA', 'United States', 'North America')
            AS t(country_code, country_name, region)
        """)

        # Education levels
        self.conn.execute("""
            CREATE TABLE lookup_education_levels AS
            SELECT * FROM VALUES
                ('A', 'No Diploma', 'Basic'),
                ('B', 'High School', 'Basic'),
                ('C', 'Some College', 'Undergraduate'),
                ('D', 'College No Degree', 'Undergraduate'),
                ('E', 'Associates', 'Undergraduate'),
                ('F', 'Bachelors', 'Undergraduate'),
                ('G', 'Masters', 'Graduate'),
                ('H', 'Professional', 'Graduate'),
                ('I', 'Doctorate', 'Graduate')
            AS t(education_code, education_level, education_category)
        """)

        # Application status types
        self.conn.execute("""
            CREATE TABLE lookup_status_types AS
            SELECT * FROM VALUES
                ('ELIGIBLE', 'Application is eligible for lottery', 'Lottery'),
                ('SELECTED', 'Selected in H-1B lottery', 'Lottery'),
                ('CREATED', 'Application record created', 'Administrative')
            AS t(status_type, status_description, status_category)
        """)


class DatabaseOptimizer:
    """Handles database optimization and indexing."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize database optimizer.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_indexes(self) -> None:
        """Create indexes for better query performance."""
        self.logger.info("Creating indexes in DuckDB...")

        indexes = [
            ("idx_fact_beneficiary", "fact_h1b_applications", "beneficiary_key"),
            ("idx_fact_employer", "fact_h1b_applications", "employer_key"),
            ("idx_fact_job", "fact_h1b_applications", "job_key"),
            ("idx_fact_lottery_year", "fact_h1b_applications", "lottery_year"),
            ("idx_fact_fiscal_year", "fact_h1b_applications", "fiscal_year"),
            ("idx_fact_rec_date", "fact_h1b_applications", "rec_date_key"),
            ("idx_dim_beneficiary_id", "dim_beneficiary", "beneficiary_id"),
            ("idx_dim_employer_id", "dim_employer", "employer_id"),
            ("idx_dim_job_id", "dim_job", "job_id"),
        ]

        for index_name, table_name, column_name in indexes:
            try:
                self.conn.execute(f"CREATE INDEX {index_name} ON {table_name}({column_name})")
            except Exception as e:
                self.logger.warning(f"Could not create index {index_name}: {e}")

        self.logger.info("Indexes created successfully!")


class DataQualityChecker:
    """Performs data quality checks and validation."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data quality checker.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def run_all_checks(self) -> bool:
        """
        Run all data quality checks.

        Returns:
            True if all checks pass, False otherwise.
        """
        self.logger.info("Running data quality checks...")

        try:
            self._check_table_counts()
            self._check_fact_table_integrity()
            return True
        except Exception as e:
            self.logger.error(f"Error in data quality checks: {e}")
            return False

    def _check_table_counts(self) -> None:
        """Check row counts for all tables."""
        tables_query = """
            SELECT table_name, estimated_size as row_count
            FROM duckdb_tables()
            WHERE schema_name = 'main'
            ORDER BY table_name
        """
        tables_info = self.conn.execute(tables_query).fetchall()

        self.logger.info("Table row counts:")
        for table_name, _ in tables_info:
            if not table_name.startswith('raw_'):
                count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                self.logger.info(f"  {table_name}: {count:,} records")

    def _check_fact_table_integrity(self) -> None:
        """Check fact table for duplicates and integrity."""
        dup_check = self.conn.execute("""
            SELECT COUNT(*) as total_records,
                   COUNT(DISTINCT record_id) as unique_records
            FROM fact_h1b_applications
        """).fetchone()

        self.logger.info(f"Fact table: {dup_check[0]:,} total records, {dup_check[1]:,} unique records")


class DatabasePersistence:
    """Handles database persistence operations."""

    def __init__(self, logger: logging.Logger):
        """
        Initialize database persistence handler.

        Args:
            logger: Logger instance for tracking operations.
        """
        self.logger = logger

    def save_to_persistent_database(self, source_conn: duckdb.DuckDBPyConnection,
                                    target_path: str) -> None:
        """
        Save tables to a persistent database file.

        Args:
            source_conn: Source database connection.
            target_path: Path to the target persistent database file.
        """
        self.logger.info(f"Saving to persistent database: {target_path}")

        # Remove existing file if it exists
        if os.path.exists(target_path):
            os.remove(target_path)
            self.logger.info(f"Removed existing database file: {target_path}")

        # Create persistent database connection
        with duckdb.connect(target_path) as persistent_conn:
            # Get tables to copy (exclude temporary tables)
            tables_to_keep = source_conn.execute("""
                SELECT table_name
                FROM information_schema.tables
                WHERE table_name NOT LIKE 'raw_%'
                  AND table_name NOT IN ('combined_data', 'cleaned_data')
                  AND table_schema = 'main'
            """).fetchall()

            # Copy tables
            for table_info in tables_to_keep:
                table_name = table_info[0]
                df = source_conn.execute(f"SELECT * FROM {table_name}").df()
                persistent_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
                self.logger.info(f"Copied table {table_name} to persistent database")

        self.logger.info(f"Persistent database saved to: {target_path}")


# Configuration and Constants
class Config:
    """Configuration class for the H1B data pipeline."""

    CSV_FILES = [
        './data/TRK_13139_FY2021.csv',
        './data/TRK_13139_FY2022.csv',
        './data/TRK_13139_FY2023.csv',
        './data/TRK_13139_FY2024_single_reg.csv',
        './data/TRK_13139_FY2024_multi_reg.csv'
    ]

    XLSX_FILE = './data/TRK_13139_I129_H1B_Registrations_FY21_FY24_FOIA_FIN.xlsx'
    PERSISTENT_DB_PATH = './data/h1bs_analytics.duckdb'
    MISSING_DATA_THRESHOLD = 0.99


def main():
    """Main execution function for the H1B data pipeline."""
    print("Starting H1B Data Analytics Pipeline...")
    print("All imports successful!")

    # Check memory usage
    MemoryManager.check_memory_usage()

    # Validate input files
    existing_files, missing_files = FileValidator.validate_files(Config.CSV_FILES)
    if missing_files:
        print(f"Warning: {len(missing_files)} files are missing")

    # Run the pipeline
    try:
        with H1BDataPipeline() as pipeline:
            # Load data
            data_loader = DataLoader(pipeline.conn, pipeline.logger)
            data_loader.load_csv_files(existing_files)

            # Transform data
            transformer = DataTransformer(pipeline.conn, pipeline.logger)
            transformer.create_combined_table()
            kept_columns = transformer.remove_columns_with_missing_data(
                'combined_data', Config.MISSING_DATA_THRESHOLD
            )
            print(f"Kept {len(kept_columns)} columns after cleaning")

            # Create dimensional model
            modeler = DimensionalModeler(pipeline.conn, pipeline.logger)
            modeler.create_all_dimensions()
            modeler.create_fact_table()
            modeler.create_lookup_tables()

            # Optimize database
            optimizer = DatabaseOptimizer(pipeline.conn, pipeline.logger)
            optimizer.create_indexes()

            # Run quality checks
            quality_checker = DataQualityChecker(pipeline.conn, pipeline.logger)
            quality_checker.run_all_checks()

            # Save to persistent database
            persistence = DatabasePersistence(pipeline.logger)
            persistence.save_to_persistent_database(
                pipeline.conn, Config.PERSISTENT_DB_PATH
            )

            # Final memory check
            MemoryManager.check_memory_usage()
            MemoryManager.clear_memory()

        print("Pipeline completed successfully!")

    except Exception as e:
        print(f"Pipeline failed with error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
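Once main() above has been run and ./data/h1bs_analytics.duckdb exists, the persisted star schema can be queried directly. The following is a minimal sketch, not part of the commit; it assumes the fact_h1b_applications and dim_employer tables created by DimensionalModeler are present with the column names used in the script.

# query_star_schema.py - hypothetical example, a minimal sketch only
import duckdb

with duckdb.connect("./data/h1bs_analytics.duckdb", read_only=True) as conn:
    # Top employers by application count, broken out by lottery year.
    top_employers = conn.execute("""
        SELECT de.employer_name,
               f.lottery_year,
               COUNT(*) AS applications
        FROM fact_h1b_applications f
        JOIN dim_employer de ON f.employer_key = de.employer_key
        GROUP BY de.employer_name, f.lottery_year
        ORDER BY applications DESC
        LIMIT 10
    """).df()
    print(top_employers)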