LeonceNsh committed · verified
Commit b1a9f46 · 1 Parent(s): b19ad01

Update app.py

Files changed (1)
  1. app.py +86 -324
app.py CHANGED
@@ -1,326 +1,88 @@
- import os
- import json
- import openai
- import duckdb
  import gradio as gr
- from functools import lru_cache
- from dotenv import load_dotenv
- from e2b_code_interpreter import Sandbox
-
- # =========================
- # Configuration and Setup
- # =========================
-
- # Load environment variables
- load_dotenv()
-
- # Initialize OpenAI API key
- openai.api_key = os.getenv("OPENAI_API_KEY")
- if not openai.api_key:
-     raise ValueError("Please set the OPENAI_API_KEY environment variable.")
-
- # Initialize the Sandbox
- sbx = Sandbox()  # By default, the sandbox is alive for 5 minutes
-
- # Path to your Parquet dataset
- DATASET_PATH = 'hsas.parquet'  # Update with your Parquet file path
-
- # Define the schema of your dataset
- SCHEMA = [
-     {"column_name": "total_charges", "column_type": "BIGINT"},
-     {"column_name": "medicare_prov_num", "column_type": "BIGINT"},
-     {"column_name": "zip_cd_of_residence", "column_type": "VARCHAR"},
-     {"column_name": "total_days_of_care", "column_type": "BIGINT"},
-     {"column_name": "total_cases", "column_type": "BIGINT"},
- ]
-
- @lru_cache(maxsize=1)
- def get_schema():
-     """Returns the schema of the dataset."""
-     return SCHEMA
-
- COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
-
- # =========================
- # OpenAI API Integration
- # =========================
-
- def parse_query(nl_query):
-     """
-     Converts a natural language query into an SQL query using OpenAI's GPT model.
-
-     Args:
-         nl_query (str): The natural language query.
-
-     Returns:
-         tuple: A tuple containing the SQL query and an error message (if any).
-     """
-     messages = [
-         {
-             "role": "system",
-             "content": (
-                 "You are an assistant that converts natural language queries into SQL queries for the 'hsa_data' table. "
-                 "Ensure the SQL query is syntactically correct and uses only the columns provided in the schema."
-             ),
-         },
-         {
-             "role": "user",
-             "content": f"Schema:\n{json.dumps(get_schema(), indent=2)}\n\nQuery:\n\"{nl_query}\"\n\nSQL:",
-         },
-     ]
-
-     try:
-         response = openai.chat.completions.create(
-             model="gpt-4o-mini",  # Use a valid and accessible model
-             messages=messages,
-             temperature=0,
-             max_tokens=150,
-         )
-         sql_query = response.choices[0].message.content.strip()
-         return sql_query, ""
-     except Exception as e:
-         return "", f"Error generating SQL query: {e}"
-
- # =========================
- # Database Interaction
- # =========================
-
- def init_db():
-     """
-     Initializes the DuckDB in-memory database and loads the dataset.
-
-     Returns:
-         duckdb.DuckDBPyConnection: The DuckDB connection object.
-     """
-     try:
-         con = duckdb.connect(database=':memory:')
-         con.execute(f"CREATE OR REPLACE VIEW hsa_data AS SELECT * FROM read_parquet('{DATASET_PATH}')")
-         return con
-     except Exception as e:
-         raise RuntimeError(f"Failed to initialize DuckDB: {e}")
-
- # Initialize the database connection once
- db_connection = init_db()
-
- def execute_sql_query(sql_query):
-     """
-     Executes an SQL query against the DuckDB database.
-
-     Args:
-         sql_query (str): The SQL query to execute.
-
-     Returns:
-         tuple: A tuple containing the result dataframe and an error message (if any).
-     """
-     try:
-         result_df = db_connection.execute(sql_query).fetchdf()
-         return result_df, ""
-     except Exception as e:
-         return None, f"Error executing query: {e}"
-
- # =========================
- # Gradio Application UI
- # =========================
-
- with gr.Blocks(css="""
-     .error-message {
-         color: red;
-         font-weight: bold;
-     }
-     .gradio-container {
-         max-width: 1200px;
-         margin: auto;
-         font-family: -apple-system, BlinkMacSystemFont, 'San Francisco', 'Helvetica Neue', Helvetica, Arial, sans-serif;
-     }
-     .header {
-         text-align: center;
-         padding: 30px 0;
-     }
-     .instructions {
-         margin: 20px 0;
-         font-size: 18px;
-         line-height: 1.6;
-     }
-     .example-queries {
-         margin-bottom: 20px;
-     }
-     .button-row {
-         margin-top: 20px;
-     }
-     .input-area {
-         margin-bottom: 20px;
-     }
-     .schema-tab {
-         padding: 20px;
-     }
-     .results {
-         margin-top: 20px;
-     }
-     .copy-button {
-         margin-top: 10px;
-     }
- """) as demo:
-     # Header
-     gr.Markdown("""
-     # 🏥 Text-to-SQL Healthcare Data Analyst Agent
-
-     Analyze data from the U.S. Center of Medicare and Medicaid using natural language queries.
-
-     """, elem_classes="header")
-
-     # Instructions
-     gr.Markdown("""
-     ### Instructions
-
-     1. **Describe the data you want**: e.g., *"Show total days of care by zip code"*
-     2. **Use Example Queries**: Click on any example query button below to execute
-     3. **Generate SQL**: Or, enter your own query and click **Generate SQL Query**
-     4. **Execute the Query**: Review the generated SQL and click **Execute Query** to see the results
-
-     """, elem_classes="instructions")
-
-     with gr.Row():
-         with gr.Column(scale=1, min_width=350):
-             gr.Markdown("### 💡 Example Queries", elem_classes="example-queries")
-             query_buttons = [
-                 "Calculate the average total_charges by zip_cd_of_residence",
-                 "For each zip_cd_of_residence, calculate the sum of total_charges",
-                 "SELECT * FROM hsa_data WHERE total_days_of_care > 40 LIMIT 30;",
-             ]
-             btn_queries = [gr.Button(q, variant="secondary") for q in query_buttons]
-
-             query_input = gr.Textbox(
-                 label="🔍 Your Query",
-                 placeholder='e.g., "Show total charges over 1M by zip code"',
-                 lines=2,
-                 interactive=True,
-                 elem_classes="input-area"
-             )
-
-             with gr.Row(elem_classes="button-row"):
-                 btn_generate_sql = gr.Button("Generate SQL Query", variant="primary")
-                 btn_execute_query = gr.Button("Execute Query", variant="primary")
-
-             sql_query_out = gr.Code(label="📝 Generated SQL Query", language="sql")
-             error_out = gr.HTML(elem_classes="error-message", visible=False)
-
-         with gr.Column(scale=2, min_width=650):
-             gr.Markdown("### 📊 Query Results", elem_classes="results")
-             results_out = gr.Dataframe(label="Query Results", interactive=False)
-
-             # Copy to Clipboard Button
-             btn_copy_results = gr.Button("Copy Results to Clipboard", variant="secondary", elem_classes="copy-button")
-
-             # JavaScript for copying to clipboard
-             copy_script = gr.HTML("""
-             <script>
-             function copyToClipboard() {
-                 const resultsContainer = document.querySelector('div[data-testid="dataframe"] table');
-                 if (resultsContainer) {
-                     const text = Array.from(resultsContainer.rows)
-                         .map(row => Array.from(row.cells)
-                             .map(cell => cell.innerText).join("\\t"))
-                         .join("\\n");
-                     navigator.clipboard.writeText(text).then(function() {
-                         alert("Copied results to clipboard!");
-                     }, function(err) {
-                         alert("Failed to copy results: " + err);
-                     });
-                 } else {
-                     alert("No results to copy!");
-                 }
-             }
-
-             // Attach the copy function to the button
-             document.addEventListener('DOMContentLoaded', function() {
-                 const copyButton = document.querySelector('.copy-button button');
-                 if (copyButton) {
-                     copyButton.addEventListener('click', copyToClipboard);
-                 }
-             });
-             </script>
-             """)
-
-             # Include the JavaScript in the app
-             copy_script
-
-     # Dataset Schema Tab
-     with gr.Tab("📋 Dataset Schema", elem_classes="schema-tab"):
-         gr.Markdown("### Dataset Schema")
-         schema_display = gr.JSON(label="Schema", value=get_schema())
-
-     # =========================
-     # Event Functions
-     # =========================
-
-     def generate_sql(nl_query):
-         if not nl_query.strip():
-             return "", "<p>Please enter a query.</p>", gr.update(visible=True)
-         sql_query, error = parse_query(nl_query)
-         if error:
-             return sql_query, f"<p>{error}</p>", gr.update(visible=True)
-         else:
-             return sql_query, "", gr.update(visible=False)
-
-     def execute_query(sql_query):
-         if not sql_query.strip():
-             return None, "<p>No SQL query to execute.</p>", gr.update(visible=True)
-         result_df, error = execute_sql_query(sql_query)
-         if error:
-             return None, f"<p>{error}</p>", gr.update(visible=True)
-         else:
-             return result_df, "", gr.update(visible=False)
-
-     def handle_example_click(example_query):
-         if example_query.strip().upper().startswith("SELECT"):
-             sql_query = example_query
-             result_df, error = execute_sql_query(sql_query)
-             if error:
-                 return sql_query, f"<p>{error}</p>", None, gr.update(visible=True)
-             else:
-                 return sql_query, "", result_df, gr.update(visible=False)
-         else:
-             sql_query, error = parse_query(example_query)
-             if error:
-                 return sql_query, f"<p>{error}</p>", None, gr.update(visible=True)
-             result_df, exec_error = execute_sql_query(sql_query)
-             if exec_error:
-                 return sql_query, f"<p>{exec_error}</p>", None, gr.update(visible=True)
-             else:
-                 return sql_query, "", result_df, gr.update(visible=False)
-
-     # Button Click Event Handlers
-     btn_generate_sql.click(
-         fn=generate_sql,
-         inputs=query_input,
-         outputs=[sql_query_out, error_out, error_out],
-     )
-
-     btn_execute_query.click(
-         fn=execute_query,
-         inputs=sql_query_out,
-         outputs=[results_out, error_out, error_out],
-     )
-
-     for btn in btn_queries:
-         btn.click(
-             fn=lambda q=btn.value: handle_example_click(q),
-             inputs=None,
-             outputs=[sql_query_out, error_out, results_out, error_out],
-         )
-
-     # Hide error message when inputs change
-     query_input.change(fn=lambda: gr.update(visible=False), inputs=None, outputs=[error_out])
-     sql_query_out.change(fn=lambda: gr.update(visible=False), inputs=None, outputs=[error_out])
-
- # =========================
- # Launch the Gradio App
- # =========================
-
  if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         debug=True,
-     )
  import gradio as gr
+ import pandas as pd
+ import duckdb
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.feature_selection import SelectKBest, f_regression
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ # Function to load data from a Parquet file into a DuckDB in-memory database
+ def load_data(parquet_file):
+     con = duckdb.connect(database=':memory:')
+     con.execute(f"CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')")
+     df = con.execute("SELECT * FROM data").fetchdf()
+     return df
+
+ # Function to preprocess data and perform PCA
+ def preprocess_and_pca(df, target_column, n_components=5):
+     # Drop non-numeric columns
+     X = df.select_dtypes(include=[float, int])
+     y = df[target_column]
+
+     # Replace infinity values with NaN
+     X.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+     # Handle missing values by imputing with the median
+     X = X.fillna(X.median())
+     y = y.fillna(y.median())
+
+     # Standardize the data
+     scaler = StandardScaler()
+     X_scaled = scaler.fit_transform(X)
+
+     # Apply PCA
+     pca = PCA(n_components=n_components)
+     X_pca = pca.fit_transform(X_scaled)
+
+     return X_pca, y
+
+ # Function to visualize the PCA components
+ def visualize_pca(X_pca, y):
+     # Visualize the first two principal components
+     plt.figure(figsize=(10, 6))
+     plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
+     plt.xlabel('Principal Component 1')
+     plt.ylabel('Principal Component 2')
+     plt.title('PCA - First Two Principal Components')
+     plt.colorbar(label='Median Income Household')
+     plt.savefig('pca_scatter.png')
+     plt.close()
+
+     # Create a DataFrame with the first few principal components for pair plot
+     pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
+     pca_df['Median_Income_Household'] = y
+
+     # Pair plot of the first few principal components
+     sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(5)], hue='Median_Income_Household', palette='viridis')
+     plt.suptitle('Pair Plot of Principal Components', y=1.02)
+     plt.savefig('pca_pairplot.png')
+     plt.close()
+
+     return 'pca_scatter.png', 'pca_pairplot.png'
+
+ # Gradio interface function
+ def gradio_interface(target_column):
+     df = load_data('df_usa_health_features.parquet')
+     X_pca, y = preprocess_and_pca(df, target_column)
+     scatter_plot, pair_plot = visualize_pca(X_pca, y)
+     return scatter_plot, pair_plot
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.inputs.Textbox(label="Target Column")
+     ],
+     outputs=[
+         gr.outputs.Image(type="file", label="PCA Scatter Plot"),
+         gr.outputs.Image(type="file", label="PCA Pair Plot")
+     ],
+     title="PCA Visualization with DuckDB and Gradio",
+     description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file."
+ )
+
+ # Launch the Gradio app
  if __name__ == "__main__":
+     iface.launch()
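
Note: the interface added in this commit is built through the legacy gr.inputs / gr.outputs namespaces, which newer Gradio releases no longer ship. The snippet below is a minimal sketch, not part of commit b1a9f46, of the same Interface against the current component API; it assumes a Gradio 4.x runtime and that it replaces the committed construction at the bottom of the new app.py, so that load_data, preprocess_and_pca, visualize_pca, and gradio_interface are already defined in the module.

# Minimal sketch, assuming Gradio 4.x; not part of commit b1a9f46.
# Relies on gradio_interface (defined above in the committed app.py),
# which returns the saved PNG paths, hence gr.Image(type="filepath").
import gradio as gr

iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Target Column"),
    outputs=[
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file.",
)

if __name__ == "__main__":
    iface.launch()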