Upload folder using huggingface_hub
- .gradio/certificate.pem +31 -0
- README.md +2 -8
- __pycache__/model_and_load_toduckdb.cpython-311.pyc +0 -0
- app.py +489 -0
- model_and_load_toduckdb.py +973 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: blue
-colorTo: gray
+title: hones
+app_file: app.py
 sdk: gradio
 sdk_version: 5.34.2
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/model_and_load_toduckdb.cpython-311.pyc
ADDED
Binary file (50.4 kB)
app.py
ADDED
@@ -0,0 +1,489 @@
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import duckdb
import numpy as np
from datetime import datetime
import os

# Database connection
DATABASE_PATH = "./data/h1bs_analytics.duckdb"

def get_db_connection():
    """Create a connection to the DuckDB database"""
    if os.path.exists(DATABASE_PATH):
        return duckdb.connect(DATABASE_PATH, read_only=True)
    else:
        # Create sample data if database doesn't exist
        return create_sample_data()

def create_sample_data():
    """Create sample H1B facts data for demonstration"""
    conn = duckdb.connect(":memory:")

    # Sample fact table based on H1B schema
    np.random.seed(42)
    n_records = 5000

    sample_facts = pd.DataFrame({
        'record_id': range(1, n_records + 1),
        'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'country_of_birth': np.random.choice([
            'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
            'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
        ], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
        'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0),  # Log-normal for realistic wage distribution
        'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
        'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
        'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
        'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
        'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
        'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
        'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
        'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
    })

    conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")

    return conn

# Load data
conn = get_db_connection()

def load_facts_data():
    """Load H1B applications fact table"""
    try:
        query = """
            SELECT * FROM fct_h1b_applications
            WHERE wage_amt IS NOT NULL
            LIMIT 10000
        """
        return conn.execute(query).df()
    except Exception as e:
        print(f"Error loading facts data: {e}")
        return pd.DataFrame()

# Load the facts data
facts_df = load_facts_data()

# ---- FACTS TABLE VISUALIZATIONS ----

def facts_overview():
    """Overview of the facts table with key metrics"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No facts data available")

    # Key metrics
    total_records = len(facts_df)
    avg_wage = facts_df['wage_amt'].mean()
    median_wage = facts_df['wage_amt'].median()
    multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100

    # Create metrics dashboard
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "indicator"}, {"type": "indicator"}],
               [{"type": "indicator"}, {"type": "indicator"}]],
        subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=total_records,
            number={"valueformat": ","},
            title={"text": "Total Records"}
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=avg_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Average Wage"}
        ),
        row=1, col=2
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=median_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Median Wage"}
        ),
        row=2, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=multiple_reg_pct,
            number={"suffix": "%", "valueformat": ".1f"},
            title={"text": "Multiple Registrations"}
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=400,
        title_text="H1B Facts Table - Key Metrics"
    )

    return fig

def wage_distribution():
    """Visualize wage distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "histogram"}, {"type": "box"}]],
        subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
    )

    # Histogram
    fig.add_trace(
        go.Histogram(
            x=facts_df['wage_amt'],
            nbinsx=50,
            marker_color='skyblue',
            opacity=0.7,
            name='Wage Distribution'
        ),
        row=1, col=1
    )

    # Box plot
    fig.add_trace(
        go.Box(
            y=facts_df['wage_amt'],
            marker_color='lightcoral',
            name='Wage Box Plot'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Wage Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
    fig.update_yaxes(title_text="Frequency", row=1, col=1)
    fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)

    return fig

def country_analysis():
    """Analyze country distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Country counts
    country_counts = facts_df['country_of_birth'].value_counts().head(10)

    # Average wage by country
    country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
    country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean')  # Min 50 applications

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "bar"}, {"type": "bar"}]],
        subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
    )

    # Applications by country
    fig.add_trace(
        go.Bar(
            x=country_counts.index,
            y=country_counts.values,
            marker_color='teal',
            text=country_counts.values,
            textposition='auto',
            name='Application Count'
        ),
        row=1, col=1
    )

    # Average wage by country
    fig.add_trace(
        go.Bar(
            x=country_wages['country_of_birth'],
            y=country_wages['mean'],
            marker_color='orange',
            text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
            textposition='auto',
            name='Average Wage'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Country Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=1, col=2)
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)

    return fig

def temporal_analysis():
    """Analyze temporal patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Yearly trends
    yearly_stats = facts_df.groupby('fiscal_year').agg({
        'record_id': 'count',
        'wage_amt': 'mean',
        'is_multiple_registration': 'mean'
    }).reset_index()

    yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100

    fig = make_subplots(
        rows=2, cols=1,
        specs=[[{"secondary_y": True}], [{"type": "bar"}]],
        subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
    )

    # Applications count
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['record_id'],
            mode='lines+markers',
            name='Applications',
            line=dict(color='blue', width=3),
            marker=dict(size=8)
        ),
        row=1, col=1
    )

    # Average wage (secondary y-axis)
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['wage_amt'],
            mode='lines+markers',
            name='Average Wage',
            line=dict(color='red', width=3),
            marker=dict(size=8),
            yaxis='y2'
        ),
        row=1, col=1
    )

    # Multiple registration percentage
    fig.add_trace(
        go.Bar(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['multiple_reg_pct'],
            marker_color='green',
            text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
            textposition='auto',
            name='Multiple Registration %'
        ),
        row=2, col=1
    )

    # Update layout
    fig.update_layout(
        height=600,
        title_text="Temporal Analysis from Facts Table"
    )

    # Update y-axes
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
    fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
    fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)

    return fig

def demographic_analysis():
    """Analyze demographic patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Age distribution
    age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
    age_counts = age_bins.value_counts().sort_index()

    # Full-time vs Part-time
    employment_type = facts_df['full_time_ind'].value_counts()
    employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]

    # Same state employment
    same_state = facts_df['employer_worksite_same_state'].value_counts()
    same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]

    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "pie"}, {"type": "histogram"}]],
        subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
    )

    # Age distribution
    fig.add_trace(
        go.Bar(
            x=[str(interval) for interval in age_counts.index],
            y=age_counts.values,
            marker_color='lightblue',
            name='Age Distribution'
        ),
        row=1, col=1
    )

    # Employment type pie chart
    fig.add_trace(
        go.Pie(
            labels=employment_labels,
            values=employment_type.values,
            name="Employment Type"
        ),
        row=1, col=2
    )

    # Same state pie chart
    fig.add_trace(
        go.Pie(
            labels=same_state_labels,
            values=same_state.values,
            name="Location"
        ),
        row=2, col=1
    )

    # Years since application
    years_since = facts_df['years_since_application'].value_counts().sort_index()
    fig.add_trace(
        go.Histogram(
            x=facts_df['years_since_application'],
            nbinsx=10,
            marker_color='lightgreen',
            name='Years Since Application'
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=600,
        title_text="Demographic Analysis from Facts Table",
        showlegend=False
    )

    return fig

def facts_data_table():
    """Display sample of facts table data"""
    if facts_df.empty:
        return pd.DataFrame()

    # Return first 100 rows with key columns
    display_cols = [
        'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
        'wage_amt', 'age_at_application', 'is_multiple_registration',
        'full_time_ind', 'employer_worksite_same_state'
    ]

    sample_data = facts_df[display_cols].head(100).copy()

    # Format wage column
    sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")

    return sample_data

# ---- GRADIO INTERFACE ----

with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
    gr.Markdown("# 📊 H1B Facts Table Analytics Dashboard")
    gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")

    with gr.Tab("📈 Facts Overview"):
        gr.Markdown("### Key Metrics from Facts Table")
        facts_overview_plot = gr.Plot()
        gr.Button("Load Facts Overview", variant="primary").click(
            fn=facts_overview,
            outputs=facts_overview_plot
        )

    with gr.Tab("💰 Wage Analysis"):
        gr.Markdown("### Wage Distribution from Facts Table")
        wage_plot = gr.Plot()
        gr.Button("Analyze Wages", variant="primary").click(
            fn=wage_distribution,
            outputs=wage_plot
        )

    with gr.Tab("🌍 Country Analysis"):
        gr.Markdown("### Country-wise Analysis from Facts Table")
        country_plot = gr.Plot()
        gr.Button("Analyze Countries", variant="primary").click(
            fn=country_analysis,
            outputs=country_plot
        )

    with gr.Tab("📅 Temporal Analysis"):
        gr.Markdown("### Time-based Trends from Facts Table")
        temporal_plot = gr.Plot()
        gr.Button("Analyze Trends", variant="primary").click(
            fn=temporal_analysis,
            outputs=temporal_plot
        )

    with gr.Tab("👥 Demographics"):
        gr.Markdown("### Demographic Patterns from Facts Table")
        demo_plot = gr.Plot()
        gr.Button("Analyze Demographics", variant="primary").click(
            fn=demographic_analysis,
            outputs=demo_plot
        )

    with gr.Tab("📋 Raw Data"):
        gr.Markdown("### Sample Facts Table Data (First 100 rows)")
        data_table = gr.DataFrame()
        gr.Button("Load Sample Data", variant="primary").click(
            fn=facts_data_table,
            outputs=data_table
        )

    # Footer
    gr.Markdown("---")
    gr.Markdown("### Facts Table Schema")
    gr.Markdown("""
    **Table**: `fct_h1b_applications`

    **Key Columns**:
    - `record_id`: Unique identifier for each application
    - `lottery_year`, `fiscal_year`: Temporal dimensions
    - `country_of_birth`: Beneficiary country
    - `wage_amt`: Offered wage amount
    - `age_at_application`: Beneficiary age
    - `is_multiple_registration`: Multiple lottery entries flag
    - `full_time_ind`: Full-time employment indicator
    - `employer_worksite_same_state`: Location alignment flag
    - Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
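A quick way to check the added dashboard without launching the Gradio server is to render its figures headlessly. The sketch below is not part of the commit; it assumes it runs from the repo root (so `import app` picks up the app.py above) and that either ./data/h1bs_analytics.duckdb exists or the in-memory sample-data fallback is used.

# smoke_test_plots.py - hypothetical helper, a minimal sketch only
import app  # importing app.py builds the Blocks UI but does not call demo.launch()

# Render two dashboard figures to standalone HTML files for inspection.
app.facts_overview().write_html("facts_overview.html")
app.wage_distribution().write_html("wage_distribution.html")
print(f"Rendered plots for {len(app.facts_df):,} fact rows")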
model_and_load_toduckdb.py
ADDED
@@ -0,0 +1,973 @@
"""
H1B Data Analytics Pipeline

This module provides a comprehensive ETL pipeline for processing H1B visa application data.
It loads CSV files into DuckDB, creates dimensional models, and performs data quality checks.
"""

import os
import gc
import logging
import hashlib
from datetime import datetime
from typing import List, Optional, Tuple
import traceback

import duckdb
import pandas as pd
import numpy as np
import psutil


class H1BDataPipeline:
    """
    Main pipeline class for processing H1B visa application data.

    This class handles the complete ETL process including:
    - Loading CSV files into DuckDB
    - Creating dimensional models
    - Data quality checks
    - Database persistence
    """

    def __init__(self, db_path: str = ':memory:', log_level: int = logging.INFO):
        """
        Initialize the H1B data pipeline.

        Args:
            db_path: Path to DuckDB database file. Use ':memory:' for in-memory database.
            log_level: Logging level for the pipeline.
        """
        self.db_path = db_path
        self.conn = None
        self.logger = self._setup_logging(log_level)
        self._setup_database()

    def _setup_logging(self, log_level: int) -> logging.Logger:
        """Set up logging configuration for the pipeline."""
        logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=log_level,
            format="{asctime} - {name} - {levelname} - {message}",
            style="{",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        return logger

    def _setup_database(self) -> None:
        """Initialize DuckDB connection."""
        try:
            self.conn = duckdb.connect(self.db_path)
            self.logger.info(f"DuckDB connection established to {self.db_path}")
            self.logger.info(f"DuckDB version: {duckdb.__version__}")

            # Test connection
            test_result = self.conn.execute("SELECT 'Hello DuckDB!' as message").fetchone()
            self.logger.info(f"Connection test: {test_result[0]}")

        except Exception as e:
            self.logger.error(f"Failed to establish database connection: {e}")
            raise

    def __enter__(self):
        """Context manager entry."""
        return self

    def close(self) -> None:
        """Close database connection and cleanup resources."""
        if self.conn:
            self.conn.close()
            self.logger.info("Database connection closed")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""
        self.close()


class MemoryManager:
    """Utility class for monitoring and managing memory usage."""

    @staticmethod
    def check_memory_usage() -> float:
        """
        Check current memory usage of the process.

        Returns:
            Memory usage in MB.
        """
        process = psutil.Process(os.getpid())
        memory_mb = process.memory_info().rss / 1024 / 1024
        print(f"Current memory usage: {memory_mb:.1f} MB")
        return memory_mb

    @staticmethod
    def clear_memory() -> None:
        """Force garbage collection to clear memory."""
        gc.collect()
        print("Memory cleared")


class FileValidator:
    """Utility class for validating file existence and accessibility."""

    @staticmethod
    def validate_files(file_paths: List[str]) -> Tuple[List[str], List[str]]:
        """
        Validate that files exist and are accessible.

        Args:
            file_paths: List of file paths to validate.

        Returns:
            Tuple of (existing_files, missing_files).
        """
        existing_files = []
        missing_files = []

        for file_path in file_paths:
            if os.path.exists(file_path):
                existing_files.append(file_path)
                print(f"✓ Found: {file_path}")
            else:
                missing_files.append(file_path)
                print(f"✗ Missing: {file_path}")

        return existing_files, missing_files


class DataLoader:
    """Handles loading data from various sources into DuckDB."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data loader.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def load_csv_files(self, file_paths: List[str]) -> None:
        """
        Load CSV files directly into DuckDB without loading into pandas first.

        Args:
            file_paths: List of CSV file paths to load.
        """
        self.logger.info("Loading CSV files directly into DuckDB...")

        for file_path in file_paths:
            try:
                self._load_single_csv(file_path)
            except Exception as e:
                self.logger.error(f"Error loading {file_path}: {e}")

    def _load_single_csv(self, file_path: str) -> None:
        """
        Load a single CSV file into DuckDB.

        Args:
            file_path: Path to the CSV file.
        """
        self.logger.info(f"Loading {file_path}")

        # Extract metadata from filename
        filename = file_path.split('/')[-1].replace('.csv', '')
        table_name = f"raw_{filename}"
        fiscal_year = self._extract_fiscal_year(filename)

        # Load CSV directly into DuckDB
        self.conn.execute(f"""
            CREATE TABLE {table_name} AS
            SELECT *,
                   '{file_path}' as source_file,
                   '{fiscal_year}' as fiscal_year
            FROM read_csv_auto('{file_path}', header=true, normalize_names=true, ignore_errors=true)
        """)

        # Clean column names
        self._clean_column_names(table_name)

        # Log success
        count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        self.logger.info(f"Loaded {count:,} records from {file_path} into {table_name}")

    def _extract_fiscal_year(self, filename: str) -> str:
        """Extract fiscal year from filename."""
        import re
        match = re.search(r'FY(\d{4})', filename)
        if match:
            return match.group(1)  # Return only the year digits
        return "unknown"

    def _clean_column_names(self, table_name: str) -> None:
        """
        Clean column names in DuckDB table.

        Args:
            table_name: Name of the table to clean.
        """
        columns_query = f"PRAGMA table_info('{table_name}')"
        columns_info = self.conn.execute(columns_query).fetchall()

        for col_info in columns_info:
            old_name = col_info[1]
            new_name = self._normalize_column_name(old_name)

            if old_name != new_name:
                self.conn.execute(f"""
                    ALTER TABLE {table_name}
                    RENAME COLUMN "{old_name}" TO {new_name}
                """)

    @staticmethod
    def _normalize_column_name(column_name: str) -> str:
        """
        Normalize column name to follow consistent naming convention.

        Args:
            column_name: Original column name.

        Returns:
            Normalized column name.
        """
        import re

        # Remove URLs and other problematic patterns
        normalized = re.sub(r'https?://[^\s]+', '', str(column_name))
        normalized = re.sub(r'[^\w\s]', '_', normalized)  # Replace special chars with underscore
        normalized = re.sub(r'\s+', '_', normalized)  # Replace spaces with underscore
        normalized = re.sub(r'_+', '_', normalized)  # Replace multiple underscores with single
        normalized = normalized.lower().strip('_')  # Lowercase and trim underscores

        # Ensure it starts with letter or underscore
        if normalized and not (normalized[0].isalpha() or normalized[0] == '_'):
            normalized = f'col_{normalized}'

        return normalized if normalized else 'unnamed_column'


class DataTransformer:
    """Handles data transformation and dimensional modeling."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data transformer.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_combined_table(self) -> None:
        """Create a combined table from all raw tables in DuckDB."""
        self.logger.info("Creating combined table in DuckDB...")

        # Get list of raw tables
        raw_tables = self.conn.execute("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_name LIKE 'raw_%'
        """).fetchall()

        if not raw_tables:
            raise ValueError("No raw tables found")

        # Create union query
        union_parts = [f"SELECT * FROM {table_info[0]}" for table_info in raw_tables]
        union_query = " UNION ALL ".join(union_parts)

        # Create combined table
        self.conn.execute(f"""
            CREATE TABLE combined_data AS
            {union_query}
        """)

        count = self.conn.execute("SELECT COUNT(*) FROM combined_data").fetchone()[0]
        self.logger.info(f"Created combined table with {count:,} records")

    def remove_columns_with_missing_data(self, table_name: str, threshold: float = 0.8) -> List[str]:
        """
        Remove columns with high missing data directly in DuckDB.

        Args:
            table_name: Name of the table to clean.
            threshold: Threshold for missing data ratio (0.0 to 1.0).

        Returns:
            List of columns that were kept.
        """
        self.logger.info(f"Removing columns with >{threshold*100}% missing data from {table_name}...")

        total_rows = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        columns_info = self.conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()

        columns_to_keep = []
        columns_removed = []

        for col_info in columns_info:
            col_name = col_info[1]
            col_type = col_info[2]  # Get column type for better handling

            try:
                # Handle different column types appropriately
                if col_type.upper() in ['INTEGER', 'BIGINT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC']:
                    # For numeric columns, only check for NULL
                    non_null_count = self.conn.execute(f"""
                        SELECT COUNT(*)
                        FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                    """).fetchone()[0]
                else:
                    # For text columns, check for NULL and empty strings
                    non_null_count = self.conn.execute(f"""
                        SELECT COUNT(*)
                        FROM {table_name}
                        WHERE "{col_name}" IS NOT NULL
                          AND TRIM(CAST("{col_name}" AS VARCHAR)) != ''
                    """).fetchone()[0]

                missing_ratio = 1 - (non_null_count / total_rows)

                self.logger.debug(f"Column {col_name}: {non_null_count}/{total_rows} non-null ({missing_ratio:.2%} missing)")

                if missing_ratio <= threshold:
                    columns_to_keep.append(col_name)
                else:
                    columns_removed.append(col_name)
                    self.logger.info(f"Removing column {col_name} with {missing_ratio:.2%} missing data")

            except Exception as e:
                self.logger.warning(f"Error processing column {col_name}: {e}")
                # When in doubt, keep the column
                columns_to_keep.append(col_name)

        if columns_removed:
            self.logger.info(f"Removing {len(columns_removed)} columns with high missing data")
            self._recreate_table_with_columns(table_name, columns_to_keep)

        return columns_to_keep

    def _recreate_table_with_columns(self, table_name: str, columns_to_keep: List[str]) -> None:
        """
        Recreate table with only specified columns.

        Args:
            table_name: Original table name.
            columns_to_keep: List of column names to retain.
        """
        columns_str = ', '.join(columns_to_keep)
        self.conn.execute(f"""
            CREATE TABLE {table_name}_clean AS
            SELECT {columns_str}
            FROM {table_name}
        """)

        self.conn.execute(f"DROP TABLE {table_name}")
        self.conn.execute(f"ALTER TABLE {table_name}_clean RENAME TO {table_name}")


class DimensionalModeler:
    """Creates dimensional model tables for analytics."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize dimensional modeler.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_all_dimensions(self) -> None:
        """Create all dimension tables."""
        self._prepare_cleaned_data()
        self._create_beneficiary_dimension()
        self._create_employer_dimension()
        self._create_job_dimension()
        self._create_agent_dimension()
        self._create_status_dimension()
        self._create_date_dimension()

    def _prepare_cleaned_data(self) -> None:
        """Prepare cleaned data table for dimension creation."""
        self.logger.info("Preparing cleaned data...")
        # First, check what columns actually exist in combined_data
        columns_info = self.conn.execute("PRAGMA table_info('combined_data')").fetchall()
        available_columns = [col[1] for col in columns_info]
        self.logger.info(f"Available columns in combined_data: {available_columns}")
        self.conn.execute("""
            CREATE TABLE cleaned_data AS
            SELECT
                ROW_NUMBER() OVER () as original_row_id,
                *
            FROM combined_data
        """)

    def _create_beneficiary_dimension(self) -> None:
        """Create beneficiary dimension table."""
        self.logger.info("Creating dim_beneficiary...")
        self.conn.execute("""
            CREATE TABLE dim_beneficiary AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as beneficiary_key,
                MD5(CONCAT(
                    COALESCE(country_of_birth, ''), '|',
                    COALESCE(country_of_nationality, ''), '|',
                    COALESCE(CAST(ben_year_of_birth AS VARCHAR), ''), '|',
                    COALESCE(gender, '')
                )) as beneficiary_id,
                country_of_birth,
                country_of_nationality,
                ben_year_of_birth,
                gender,
                ben_sex,
                ben_country_of_birth,
                ben_current_class,
                ben_education_code,
                ed_level_definition,
                ben_pfield_of_study
            FROM cleaned_data
            WHERE country_of_birth IS NOT NULL
               OR country_of_nationality IS NOT NULL
               OR ben_year_of_birth IS NOT NULL
               OR gender IS NOT NULL
        """)

    def _create_employer_dimension(self) -> None:
        """Create employer dimension table."""
        self.logger.info("Creating dim_employer...")
        self.conn.execute("""
            CREATE TABLE dim_employer AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as employer_key,
                MD5(CONCAT(
                    COALESCE(employer_name, ''), '|',
                    COALESCE(fein, '')
                )) as employer_id,
                employer_name,
                fein,
                mail_addr,
                city,
                state,
                zip
            FROM cleaned_data
            WHERE employer_name IS NOT NULL OR fein IS NOT NULL
        """)

    def _create_job_dimension(self) -> None:
        """Create job dimension table."""
        self.logger.info("Creating dim_job...")
        self.conn.execute("""
            CREATE TABLE dim_job AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as job_key,
                MD5(CONCAT(
                    COALESCE(job_title, ''), '|',
                    COALESCE(naics_code, '')
                )) as job_id,
                job_title,
                dot_code,
                naics_code,
                wage_amt,
                wage_unit,
                full_time_ind,
                ben_comp_paid,
                worksite_city,
                worksite_state
            FROM cleaned_data
            WHERE job_title IS NOT NULL OR naics_code IS NOT NULL
        """)

    def _create_agent_dimension(self) -> None:
        """Create agent dimension table."""
        self.logger.info("Creating dim_agent...")
        self.conn.execute("""
            CREATE TABLE dim_agent AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as agent_key,
                MD5(CONCAT(
                    COALESCE(agent_first_name, ''), '|',
                    COALESCE(agent_last_name, '')
                )) as agent_id,
                agent_first_name,
                agent_last_name
            FROM cleaned_data
            WHERE agent_first_name IS NOT NULL OR agent_last_name IS NOT NULL
        """)

    def _create_status_dimension(self) -> None:
        """Create status dimension table."""
        self.logger.info("Creating dim_status...")
        self.conn.execute("""
            CREATE TABLE dim_status AS
            SELECT DISTINCT
                ROW_NUMBER() OVER () as status_key,
                status_type,
                first_decision
            FROM cleaned_data
            WHERE status_type IS NOT NULL OR first_decision IS NOT NULL
        """)

    def _create_date_dimension(self) -> None:
        """Create date dimension table."""
        self.logger.info("Creating dim_date...")
        self.conn.execute("""
            CREATE TABLE dim_date AS
            WITH all_dates AS (
                -- Handle MM/DD/YYYY format
                SELECT TRY_STRPTIME(rec_date, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE rec_date IS NOT NULL
                  AND rec_date NOT LIKE '%(%'
                  AND LENGTH(rec_date) >= 8
                  AND rec_date ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(rec_date, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle YYYY-MM-DD format
                SELECT TRY_STRPTIME(rec_date, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE rec_date IS NOT NULL
                  AND rec_date NOT LIKE '%(%'
                  AND LENGTH(rec_date) >= 8
                  AND rec_date ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(rec_date, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle first_decision_date MM/DD/YYYY format
                SELECT TRY_STRPTIME(first_decision_date, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE first_decision_date IS NOT NULL
                  AND first_decision_date NOT LIKE '%(%'
                  AND LENGTH(first_decision_date) >= 8
                  AND first_decision_date ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(first_decision_date, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle first_decision_date YYYY-MM-DD format
                SELECT TRY_STRPTIME(first_decision_date, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE first_decision_date IS NOT NULL
                  AND first_decision_date NOT LIKE '%(%'
                  AND LENGTH(first_decision_date) >= 8
                  AND first_decision_date ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(first_decision_date, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle valid_from MM/DD/YYYY format
                SELECT TRY_STRPTIME(valid_from, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE valid_from IS NOT NULL
                  AND valid_from NOT LIKE '%(%'
                  AND LENGTH(valid_from) >= 8
                  AND valid_from ~ '^[0-9/-]+$'
                  AND TRY_STRPTIME(valid_from, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle valid_from YYYY-MM-DD format
                SELECT TRY_STRPTIME(valid_from, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE valid_from IS NOT NULL
                  AND valid_from NOT LIKE '%(%'
                  AND LENGTH(valid_from) >= 8
                  AND valid_from ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(valid_from, '%Y-%m-%d') IS NOT NULL

                UNION

                -- Handle valid_to MM/DD/YYYY format
                SELECT TRY_STRPTIME(valid_to, '%m/%d/%Y') as date_value
                FROM cleaned_data
                WHERE valid_to IS NOT NULL
                  AND valid_to NOT LIKE '%(%'
                  AND LENGTH(valid_to) >= 8
                  AND valid_to ~ '^[0-9/]+$'
                  AND valid_to LIKE '%/%/%'
                  AND TRY_STRPTIME(valid_to, '%m/%d/%Y') IS NOT NULL

                UNION

                -- Handle valid_to YYYY-MM-DD format
                SELECT TRY_STRPTIME(valid_to, '%Y-%m-%d') as date_value
                FROM cleaned_data
                WHERE valid_to IS NOT NULL
                  AND valid_to NOT LIKE '%(%'
                  AND LENGTH(valid_to) >= 8
                  AND valid_to ~ '^[0-9-]+$'
                  AND TRY_STRPTIME(valid_to, '%Y-%m-%d') IS NOT NULL
            )
            SELECT DISTINCT
                date_value as date,
                EXTRACT(YEAR FROM date_value) as year,
                EXTRACT(MONTH FROM date_value) as month,
                EXTRACT(QUARTER FROM date_value) as quarter,
                EXTRACT(DOW FROM date_value) as day_of_week,
                MONTHNAME(date_value) as month_name,
                'Q' || CAST(EXTRACT(QUARTER FROM date_value) AS VARCHAR) as quarter_name,
                CASE
                    WHEN EXTRACT(MONTH FROM date_value) >= 10
                    THEN EXTRACT(YEAR FROM date_value)
                    ELSE EXTRACT(YEAR FROM date_value) - 1
                END as fiscal_year
            FROM all_dates
            WHERE date_value IS NOT NULL
            ORDER BY date_value
        """)

    def create_fact_table(self) -> None:
        """Create the fact table with foreign keys."""
        self.logger.info("Creating fact table in DuckDB...")

        self.conn.execute("""
            CREATE TABLE fact_h1b_applications AS
            SELECT
                ROW_NUMBER() OVER () as record_id,

                COALESCE(db.beneficiary_key, -1) as beneficiary_key,
                COALESCE(de.employer_key, -1) as employer_key,
                COALESCE(dj.job_key, -1) as job_key,
                COALESCE(da.agent_key, -1) as agent_key,
                COALESCE(ds.status_key, -1) as status_key,

                -- Handle multiple date formats for rec_date
                CASE
                    WHEN cd.rec_date IS NOT NULL AND cd.rec_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.rec_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.rec_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as rec_date_key,

                -- Handle multiple date formats for first_decision_date
                CASE
                    WHEN cd.first_decision_date IS NOT NULL AND cd.first_decision_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as first_decision_date_key,

                cd.lottery_year,
                cd.ben_multi_reg_ind,
                cd.receipt_number,
                cd.source_file,
                cd.fiscal_year

            FROM cleaned_data cd

            LEFT JOIN dim_beneficiary db ON
                cd.original_row_id = db.beneficiary_key AND
                COALESCE(cd.country_of_birth, '') = COALESCE(db.country_of_birth, '') AND
                COALESCE(cd.country_of_nationality, '') = COALESCE(db.country_of_nationality, '') AND
                COALESCE(cd.ben_year_of_birth, '0') = COALESCE(db.ben_year_of_birth, '0') AND
                COALESCE(cd.gender, '') = COALESCE(db.gender, '')

            LEFT JOIN dim_employer de ON
                cd.original_row_id = de.employer_key AND
                COALESCE(cd.employer_name, '') = COALESCE(de.employer_name, '') AND
                COALESCE(cd.fein, '') = COALESCE(de.fein, '')

            LEFT JOIN dim_job dj ON
                cd.original_row_id = dj.job_key AND
                COALESCE(cd.job_title, '') = COALESCE(dj.job_title, '') AND
                COALESCE(cd.naics_code, '') = COALESCE(dj.naics_code, '')

            LEFT JOIN dim_agent da ON
                cd.original_row_id = da.agent_key AND
                COALESCE(cd.agent_first_name, '') = COALESCE(da.agent_first_name, '') AND
                COALESCE(cd.agent_last_name, '') = COALESCE(da.agent_last_name, '')

            LEFT JOIN dim_status ds ON
                cd.original_row_id = ds.status_key AND
                COALESCE(cd.status_type, '') = COALESCE(ds.status_type, '') AND
                COALESCE(cd.first_decision, '') = COALESCE(ds.first_decision, '')
        """)

    def create_lookup_tables(self) -> None:
        """Create lookup tables for reference data."""
        self.logger.info("Creating lookup tables in DuckDB...")

        # Country codes lookup
        self.conn.execute("""
            CREATE TABLE lookup_country_codes AS
            SELECT * FROM VALUES
                ('IND', 'India', 'Asia'),
                ('CHN', 'China', 'Asia'),
                ('KOR', 'South Korea', 'Asia'),
                ('CAN', 'Canada', 'North America'),
                ('NPL', 'Nepal', 'Asia'),
                ('USA', 'United States', 'North America')
            AS t(country_code, country_name, region)
        """)

        # Education levels
        self.conn.execute("""
            CREATE TABLE lookup_education_levels AS
            SELECT * FROM VALUES
                ('A', 'No Diploma', 'Basic'),
                ('B', 'High School', 'Basic'),
                ('C', 'Some College', 'Undergraduate'),
                ('D', 'College No Degree', 'Undergraduate'),
                ('E', 'Associates', 'Undergraduate'),
                ('F', 'Bachelors', 'Undergraduate'),
                ('G', 'Masters', 'Graduate'),
                ('H', 'Professional', 'Graduate'),
                ('I', 'Doctorate', 'Graduate')
            AS t(education_code, education_level, education_category)
        """)

        # Application status types
        self.conn.execute("""
            CREATE TABLE lookup_status_types AS
            SELECT * FROM VALUES
                ('ELIGIBLE', 'Application is eligible for lottery', 'Lottery'),
                ('SELECTED', 'Selected in H-1B lottery', 'Lottery'),
                ('CREATED', 'Application record created', 'Administrative')
            AS t(status_type, status_description, status_category)
        """)


class DatabaseOptimizer:
    """Handles database optimization and indexing."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize database optimizer.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_indexes(self) -> None:
        """Create indexes for better query performance."""
        self.logger.info("Creating indexes in DuckDB...")

        indexes = [
            ("idx_fact_beneficiary", "fact_h1b_applications", "beneficiary_key"),
            ("idx_fact_employer", "fact_h1b_applications", "employer_key"),
            ("idx_fact_job", "fact_h1b_applications", "job_key"),
            ("idx_fact_lottery_year", "fact_h1b_applications", "lottery_year"),
            ("idx_fact_fiscal_year", "fact_h1b_applications", "fiscal_year"),
            ("idx_fact_rec_date", "fact_h1b_applications", "rec_date_key"),
            ("idx_dim_beneficiary_id", "dim_beneficiary", "beneficiary_id"),
            ("idx_dim_employer_id", "dim_employer", "employer_id"),
            ("idx_dim_job_id", "dim_job", "job_id"),
        ]

        for index_name, table_name, column_name in indexes:
            try:
                self.conn.execute(f"CREATE INDEX {index_name} ON {table_name}({column_name})")
            except Exception as e:
                self.logger.warning(f"Could not create index {index_name}: {e}")

        self.logger.info("Indexes created successfully!")


class DataQualityChecker:
    """Performs data quality checks and validation."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data quality checker.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def run_all_checks(self) -> bool:
        """
        Run all data quality checks.

        Returns:
            True if all checks pass, False otherwise.
        """
        self.logger.info("Running data quality checks...")

        try:
            self._check_table_counts()
            self._check_fact_table_integrity()
            return True
        except Exception as e:
            self.logger.error(f"Error in data quality checks: {e}")
            return False

    def _check_table_counts(self) -> None:
        """Check row counts for all tables."""
        tables_query = """
            SELECT table_name, estimated_size as row_count
            FROM duckdb_tables()
            WHERE schema_name = 'main'
            ORDER BY table_name
        """
        tables_info = self.conn.execute(tables_query).fetchall()

        self.logger.info("Table row counts:")
        for table_name, _ in tables_info:
            if not table_name.startswith('raw_'):
                count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                self.logger.info(f"  {table_name}: {count:,} records")

    def _check_fact_table_integrity(self) -> None:
        """Check fact table for duplicates and integrity."""
        dup_check = self.conn.execute("""
            SELECT COUNT(*) as total_records,
                   COUNT(DISTINCT record_id) as unique_records
            FROM fact_h1b_applications
        """).fetchone()

        self.logger.info(f"Fact table: {dup_check[0]:,} total records, {dup_check[1]:,} unique records")


class DatabasePersistence:
    """Handles database persistence operations."""

    def __init__(self, logger: logging.Logger):
        """
        Initialize database persistence handler.

        Args:
            logger: Logger instance for tracking operations.
        """
        self.logger = logger

    def save_to_persistent_database(self, source_conn: duckdb.DuckDBPyConnection,
                                    target_path: str) -> None:
        """
        Save tables to a persistent database file.

        Args:
            source_conn: Source database connection.
            target_path: Path to the target persistent database file.
        """
        self.logger.info(f"Saving to persistent database: {target_path}")

        # Remove existing file if it exists
        if os.path.exists(target_path):
            os.remove(target_path)
            self.logger.info(f"Removed existing database file: {target_path}")

        # Create persistent database connection
        with duckdb.connect(target_path) as persistent_conn:
            # Get tables to copy (exclude temporary tables)
            tables_to_keep = source_conn.execute("""
                SELECT table_name
                FROM information_schema.tables
                WHERE table_name NOT LIKE 'raw_%'
                  AND table_name NOT IN ('combined_data', 'cleaned_data')
                  AND table_schema = 'main'
            """).fetchall()

            # Copy tables
            for table_info in tables_to_keep:
                table_name = table_info[0]
                df = source_conn.execute(f"SELECT * FROM {table_name}").df()
                persistent_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
                self.logger.info(f"Copied table {table_name} to persistent database")

        self.logger.info(f"Persistent database saved to: {target_path}")


# Configuration and Constants
class Config:
    """Configuration class for the H1B data pipeline."""

    CSV_FILES = [
        './data/TRK_13139_FY2021.csv',
        './data/TRK_13139_FY2022.csv',
        './data/TRK_13139_FY2023.csv',
        './data/TRK_13139_FY2024_single_reg.csv',
        './data/TRK_13139_FY2024_multi_reg.csv'
    ]

    XLSX_FILE = './data/TRK_13139_I129_H1B_Registrations_FY21_FY24_FOIA_FIN.xlsx'
    PERSISTENT_DB_PATH = './data/h1bs_analytics.duckdb'
    MISSING_DATA_THRESHOLD = 0.99


def main():
    """Main execution function for the H1B data pipeline."""
    print("Starting H1B Data Analytics Pipeline...")
    print("All imports successful!")

    # Check memory usage
    MemoryManager.check_memory_usage()

    # Validate input files
    existing_files, missing_files = FileValidator.validate_files(Config.CSV_FILES)
    if missing_files:
        print(f"Warning: {len(missing_files)} files are missing")

    # Run the pipeline
    try:
        with H1BDataPipeline() as pipeline:
            # Load data
            data_loader = DataLoader(pipeline.conn, pipeline.logger)
            data_loader.load_csv_files(existing_files)

            # Transform data
            transformer = DataTransformer(pipeline.conn, pipeline.logger)
            transformer.create_combined_table()
            kept_columns = transformer.remove_columns_with_missing_data(
                'combined_data', Config.MISSING_DATA_THRESHOLD
            )
            print(f"Kept {len(kept_columns)} columns after cleaning")

            # Create dimensional model
            modeler = DimensionalModeler(pipeline.conn, pipeline.logger)
            modeler.create_all_dimensions()
            modeler.create_fact_table()
            modeler.create_lookup_tables()

            # Optimize database
            optimizer = DatabaseOptimizer(pipeline.conn, pipeline.logger)
            optimizer.create_indexes()

            # Run quality checks
            quality_checker = DataQualityChecker(pipeline.conn, pipeline.logger)
            quality_checker.run_all_checks()

            # Save to persistent database
            persistence = DatabasePersistence(pipeline.logger)
            persistence.save_to_persistent_database(
                pipeline.conn, Config.PERSISTENT_DB_PATH
            )

            # Final memory check
            MemoryManager.check_memory_usage()
            MemoryManager.clear_memory()

        print("Pipeline completed successfully!")

    except Exception as e:
        print(f"Pipeline failed with error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
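Once main() above has been run and ./data/h1bs_analytics.duckdb exists, the persisted star schema can be queried directly. The following is a minimal sketch, not part of the commit; it assumes the fact_h1b_applications and dim_employer tables created by DimensionalModeler are present with the column names used in the script.

# query_star_schema.py - hypothetical example, a minimal sketch only
import duckdb

with duckdb.connect("./data/h1bs_analytics.duckdb", read_only=True) as conn:
    # Top employers by application count, broken out by lottery year.
    top_employers = conn.execute("""
        SELECT de.employer_name,
               f.lottery_year,
               COUNT(*) AS applications
        FROM fact_h1b_applications f
        JOIN dim_employer de ON f.employer_key = de.employer_key
        GROUP BY de.employer_name, f.lottery_year
        ORDER BY applications DESC
        LIMIT 10
    """).df()
    print(top_employers)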