# MOSPI_analysis_tool / table_analysis_for_image.py
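# Streamlit page that previews an extracted table (CSV), summarises each column,
# and renders an interactive Pygwalker chart for the image selected in the app.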
import pandas as pd
from pygwalker.api.streamlit import StreamlitRenderer
from io import BytesIO
import requests
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import json
# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
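
# Expected .env entries (variable names taken from the os.getenv calls above);
# the values shown are placeholders, not real deployment settings:
#   MONGO_URI=mongodb://localhost:27017
#   DB_NAME=<database name>
#   COLLECTION_NAME=<collection name>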
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
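
# Each document in the collection is expected to provide (as read later in this module):
#   object_url      - URL of the source image
#   csv_object_url  - URL of the CSV extracted from the image's table
#   table_data      - dict with "description", "column_summary" (column name -> description),
#                     and "best_col1" / "best_col2" (default chart axes)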
# Load the CSV referenced by its object URL (e.g. the S3 object URL stored in MongoDB)
def load_csv_from_url(object_url):
    # Fetch the file over HTTP; a timeout keeps the page from hanging on a dead link.
    response = requests.get(object_url, timeout=30)
    response.raise_for_status()  # Ensure the request was successful
    csv_data = pd.read_csv(BytesIO(response.content))
    return csv_data
# Analyzing each column based on data type
def analyze_column_data(df):
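    """Return a per-column summary: numeric columns get mean, median, mode plus
    unique/null counts; all other columns get unique/null counts and top categories."""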
analysis = {}
for col in df.columns:
if pd.api.types.is_numeric_dtype(df[col]):
analysis[col] = {
"Mean": df[col].mean(),
"Median": df[col].median(),
"Mode": df[col].mode()[0] if not df[col].mode().empty else None,
"Unique Values": df[col].nunique(),
"Null Values": df[col].isnull().sum()
}
else:
analysis[col] = {
"Unique Values": df[col].nunique(),
"Null Values": df[col].isnull().sum(),
"Top Categories": df[col].value_counts().head(5).to_dict()
}
return analysis
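
# Illustrative shape of the dict returned above (the column names are hypothetical):
#   {
#       "Year":  {"Mean": ..., "Median": ..., "Mode": ..., "Unique Values": ..., "Null Values": ...},
#       "State": {"Unique Values": ..., "Null Values": ..., "Top Categories": {...}},
#   }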
# Main function to render the View Table Analysis page
def view_table_analysis_page(url):
    if st.button("Back", key="back_button"):
        st.session_state.page = "view_image"
        st.rerun()

    # Look up the image document by its object URL and resolve the CSV it points to.
    image = collection.find_one({"object_url": url})
    if not image or not image.get("csv_object_url"):
        st.error("No CSV data is linked to this image.")
        return
    csv_url = image.get("csv_object_url")

    # Load CSV data
    df = load_csv_from_url(csv_url)
# Check if the last row has any cell containing the word "total" (case-insensitive)
if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
df = df.iloc[:-1] # Drop the last row if "total" is found in any cell
# Page title
st.title("Table Analysis")
# CSV Preview
st.subheader("CSV Preview")
st.write("Below is a preview of the uploaded CSV file:")
st.dataframe(df) # Interactive, scrollable table
    # Build an in-memory Excel copy of the table for download (uses the openpyxl engine)
excel_buffer = BytesIO()
with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name="Sheet1")
excel_buffer.seek(0) # Reset buffer position
# Download Button
st.download_button(
label="Download Full Excel Sheet",
data=excel_buffer,
file_name="table_data.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
st.markdown("<hr>", unsafe_allow_html=True)
    table_description = image.get("table_data", {}).get("description")
if table_description:
# Table Description
st.subheader("Table Description")
st.write(table_description)
# Column Summary
st.markdown("<hr>",unsafe_allow_html=True)
st.subheader("Column Summary")
with st.container(height=400, border=False):
        column_summary = image.get("table_data", {}).get("column_summary")
if column_summary:
# Column-level descriptions and analysis
column_analysis = analyze_column_data(df)
col1, col2 = st.columns(2)
for idx, (col_name, col_description) in enumerate(column_summary.items()):
# Determine which column to use based on the index
with col1 if idx % 2 == 0 else col2:
st.markdown(f"Column Name : **{col_name}**")
st.write(f"Column Description : {col_description}")
# Display basic analysis
analysis = column_analysis.get(col_name, {})
if pd.api.types.is_numeric_dtype(df[col_name]):
# Numeric column analysis
st.write({
"Mean": analysis.get("Mean"),
"Median": analysis.get("Median"),
"Mode": analysis.get("Mode"),
"Unique Values": analysis.get("Unique Values"),
"Null Values": analysis.get("Null Values")
})
else:
# Categorical column analysis
st.write({
"Unique Values": analysis.get("Unique Values"),
"Null Values": analysis.get("Null Values"),
"Top Categories": analysis.get("Top Categories")
})
st.markdown("<hr>", unsafe_allow_html=True)
st.subheader("Graphical Analysis of Table")
    # Default configuration for the initial visualization, built from the two columns
    # flagged in the MongoDB document as the best chart axes.
    best_col1 = image.get("table_data", {}).get("best_col1")
    best_col2 = image.get("table_data", {}).get("best_col2")
default_chart_config = {
"mark": "bar",
"encoding": {
"x": {"field": best_col1, "type": "nominal"},
"y": {"field": best_col2, "type": "quantitative"}
}
}
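    # Note (assumption): pygwalker's `spec` argument normally takes a chart config exported
    # from the Pygwalker UI; whether this Vega-Lite-style dict is honoured as the default
    # chart may depend on the installed pygwalker version.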
# Convert default_chart_config to JSON string for Pygwalker spec parameter
pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
pyg_app.explorer()
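
# Illustrative wiring only (not part of this module): the Back button sets
# st.session_state.page = "view_image", which suggests a session-state router in the
# app entry point along these lines (the page/key names other than "view_image" are
# assumptions for illustration):
#
#     if st.session_state.get("page") == "view_table_analysis":
#         view_table_analysis_page(st.session_state["selected_image_url"])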