File size: 5,589 Bytes
eef9e83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import pandas as pd
from pygwalker.api.streamlit import StreamlitRenderer
from io import BytesIO
import requests
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import json

# Load environment variables
# Reads MONGO_URI, DB_NAME and COLLECTION_NAME from a .env file (or the
# process environment). Note: if any of these are unset, the values below
# are None and MongoClient/db lookup will fail at first use.
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Module-level MongoDB handle shared by the page functions below.
# MongoClient connects lazily, so an unreachable server surfaces only
# when the collection is first queried.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]




# Load the CSV from a URL (replace with actual CSV download from S3)
def load_csv_from_url(object_url, timeout=30):
    """Download a CSV file from a URL and parse it into a DataFrame.

    Args:
        object_url: Direct URL to the CSV object (e.g. an S3 object URL).
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout, requests can block indefinitely on a
            stalled connection.

    Returns:
        pandas.DataFrame parsed from the downloaded CSV content.

    Raises:
        requests.HTTPError: if the server returns a 4xx/5xx status.
        requests.Timeout: if the request exceeds `timeout` seconds.
    """
    response = requests.get(object_url, timeout=timeout)
    response.raise_for_status()  # Ensure the request was successful
    csv_data = pd.read_csv(BytesIO(response.content))
    return csv_data


# Analyzing each column based on data type
def analyze_column_data(df):
    """Compute a per-column summary of *df*.

    Numeric columns get mean/median/mode plus unique and null counts;
    all other columns get unique/null counts and their five most
    frequent values.

    Args:
        df: pandas.DataFrame to summarize.

    Returns:
        dict mapping column name -> dict of summary statistics.
    """
    summary = {}
    for column in df.columns:
        series = df[column]
        if pd.api.types.is_numeric_dtype(series):
            modes = series.mode()
            summary[column] = {
                "Mean": series.mean(),
                "Median": series.median(),
                # mode() is empty for an all-null/empty column
                "Mode": modes.iloc[0] if not modes.empty else None,
                "Unique Values": series.nunique(),
                "Null Values": series.isnull().sum(),
            }
        else:
            summary[column] = {
                "Unique Values": series.nunique(),
                "Null Values": series.isnull().sum(),
                "Top Categories": series.value_counts().head(5).to_dict(),
            }
    return summary


# Main function to render the View Table Analysis page
# Main function to render the View Table Analysis page
def view_table_analysis_page(url):
    """Render the Table Analysis page for the image stored at *url*.

    Looks up the MongoDB document by its ``object_url``, downloads the
    associated CSV, and renders: a preview table, an Excel download
    button, the stored table/column descriptions, per-column statistics,
    and an interactive Pygwalker explorer.

    Args:
        url: The ``object_url`` of the image document in MongoDB.
    """
    if st.button("Back", key="back_button"):
        st.session_state.page = "view_image"
        st.rerun()

    image = collection.find_one({"object_url": url})
    # Guard: find_one returns None when no document matches; without this
    # the .get() calls below raise AttributeError.
    if image is None:
        st.error("No record found for this image.")
        return

    csv_url = image.get("csv_object_url")
    if not csv_url:
        st.error("No CSV file is associated with this image.")
        return

    # Load CSV data
    df = load_csv_from_url(csv_url)

    # Drop a trailing summary row: if any cell of the last row contains
    # the word "total" (case-insensitive), remove that row. Guarded so an
    # empty CSV does not raise IndexError on iloc[-1].
    if not df.empty and df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
        df = df.iloc[:-1]

    # Cache the nested metadata dict once; the original re-fetched it for
    # every field and crashed if it was missing entirely.
    table_data = image.get("table_data") or {}

    # Page title
    st.title("Table Analysis")

    # CSV Preview
    st.subheader("CSV Preview")
    st.write("Below is a preview of the uploaded CSV file:")
    st.dataframe(df)  # Interactive, scrollable table

    # Build an in-memory Excel workbook for the download button.
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
    excel_buffer.seek(0)  # Reset buffer position

    # Download Button
    st.download_button(
        label="Download Full Excel Sheet",
        data=excel_buffer,
        file_name="table_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    st.markdown("<hr>", unsafe_allow_html=True)

    table_description = table_data.get("description", None)
    if table_description:
        # Table Description
        st.subheader("Table Description")
        st.write(table_description)

    # Column Summary
    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Column Summary")
    with st.container(height=400, border=False):

        column_summary = table_data.get("column_summary", None)

        if column_summary:
            # Column-level descriptions and analysis
            column_analysis = analyze_column_data(df)

            # Lay descriptions out two-up: even indices in the left
            # column, odd indices in the right.
            col1, col2 = st.columns(2)
            for idx, (col_name, col_description) in enumerate(column_summary.items()):
                with col1 if idx % 2 == 0 else col2:
                    st.markdown(f"Column Name : **{col_name}**")
                    st.write(f"Column Description : {col_description}")

                    # Display basic analysis
                    analysis = column_analysis.get(col_name, {})
                    if col_name in df.columns and pd.api.types.is_numeric_dtype(df[col_name]):
                        # Numeric column analysis
                        st.write({
                            "Mean": analysis.get("Mean"),
                            "Median": analysis.get("Median"),
                            "Mode": analysis.get("Mode"),
                            "Unique Values": analysis.get("Unique Values"),
                            "Null Values": analysis.get("Null Values")
                        })
                    else:
                        # Categorical column analysis
                        st.write({
                            "Unique Values": analysis.get("Unique Values"),
                            "Null Values": analysis.get("Null Values"),
                            "Top Categories": analysis.get("Top Categories")
                        })

    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Graphical Analysis of Table")

    # Default configuration for initial visualization: a bar chart using
    # the two columns the upstream pipeline flagged as most informative.
    best_col1 = table_data.get("best_col1")
    best_col2 = table_data.get("best_col2")
    default_chart_config = {
        "mark": "bar",
        "encoding": {
            "x": {"field": best_col1, "type": "nominal"},
            "y": {"field": best_col2, "type": "quantitative"}
        }
    }

    # Convert default_chart_config to JSON string for Pygwalker spec parameter
    pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
    pyg_app.explorer()