import pandas as pd from io import BytesIO import requests import streamlit as st from pymongo import MongoClient import os from dotenv import load_dotenv import json from pygwalker.api.streamlit import StreamlitRenderer # Load environment variables load_dotenv() MONGO_URI = os.getenv("MONGO_URI") DB_NAME = os.getenv("DB_NAME") COLLECTION_NAME = os.getenv("COLLECTION_NAME") mongo_client = MongoClient(MONGO_URI) db = mongo_client[DB_NAME] collection = db[COLLECTION_NAME] # Load CSV from S3 URL def load_csv_from_url(object_url): response = requests.get(object_url) response.raise_for_status() return pd.read_csv(BytesIO(response.content)) # Analyze column data def analyze_column_data(df): analysis = {} for col in df.columns: if pd.api.types.is_numeric_dtype(df[col]): analysis[col] = { "Mean": df[col].mean(), "Median": df[col].median(), "Mode": df[col].mode()[0] if not df[col].mode().empty else None, "Unique Values": df[col].nunique(), "Null Values": df[col].isnull().sum() } else: analysis[col] = { "Unique Values": df[col].nunique(), "Null Values": df[col].isnull().sum(), "Top Categories": df[col].value_counts().head(5).to_dict() } return analysis # Display analysis for a selected table def display_table_analysis(table): # Load CSV data df = load_csv_from_url(table['csv_object_url']) # Check for "total" row if df.iloc[-1].astype(str).str.contains("total", case=False).any(): df = df.iloc[:-1] # Drop last row if "total" found # Table preview st.subheader("CSV Preview") st.dataframe(df, height=300) # Download Button st.download_button( label="Download CSV", data=requests.get(table['csv_object_url']).content, file_name="table_data.csv", mime="text/csv" ) # Table Description if 'description' in table: st.subheader("Table Description") st.write(table['description']) # Column Summary st.subheader("Column Summary") column_summary = table.get('column_summary', {}) column_analysis = analyze_column_data(df) col1, col2 = st.columns(2) for idx, (col_name, col_description) in enumerate(column_summary.items()): with col1 if idx % 2 == 0 else col2: st.markdown(f"Column Name: **{col_name}**") st.write(f"Description: {col_description}") analysis = column_analysis.get(col_name, {}) if pd.api.types.is_numeric_dtype(df[col_name]): st.write({ "Mean": analysis.get("Mean"), "Median": analysis.get("Median"), "Mode": analysis.get("Mode"), "Unique Values": analysis.get("Unique Values"), "Null Values": analysis.get("Null Values") }) else: st.write({ "Unique Values": analysis.get("Unique Values"), "Null Values": analysis.get("Null Values"), "Top Categories": analysis.get("Top Categories") }) # Graphical Analysis using Pygwalker st.subheader("Graphical Analysis of Table") pyg_app = StreamlitRenderer(df) pyg_app.explorer() # Main function to render the View Table Analysis page for PDF tables def view_pdf_table_analysis_page(url): if st.button("Back", key="back_button"): st.session_state.page = "view_pdf" st.rerun() # Retrieve table data for the PDF pdf_data = collection.find_one({"object_url": url}) tables = pdf_data.get("table_data", []) # Display the total number of tables st.title("PDF Table Analysis") st.write(f"Total tables found: {len(tables)}") if "selected_table" not in st.session_state or st.session_state.selected_table is None or st.session_state.selected_table >= len(tables): st.session_state.selected_table = 0 selected_table_idx = st.radio( "Select a table to analyze", options=range(len(tables)), format_func=lambda x: f"Analyze Table {x + 1}", index=st.session_state.selected_table # Safely use the default if uninitialized ) st.session_state.selected_table = selected_table_idx if st.session_state.selected_table is not None: selected_table_data = tables[st.session_state.selected_table] st.subheader(f"Analysis for Table {st.session_state.selected_table + 1}") csv_url = selected_table_data['csv_object_url'] df = load_csv_from_url(csv_url) if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any(): df = df.iloc[:-1] st.dataframe(df) # Interactive, scrollable table excel_buffer = BytesIO() with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer: df.to_excel(writer, index=False, sheet_name="Sheet1") excel_buffer.seek(0) # Reset buffer position # Download Button st.download_button( label="Download Full Excel Sheet", data=excel_buffer, file_name="table_data.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) st.markdown("
", unsafe_allow_html=True) table_description = selected_table_data.get("description", None) if table_description: # Table Description st.subheader("Table Description") st.write(table_description) # Column Summary st.markdown("
", unsafe_allow_html=True) st.subheader("Column Summary") with st.container(height=400, border=False): column_summary = selected_table_data.get("column_summary", None) if column_summary: # Column-level descriptions and analysis column_analysis = analyze_column_data(df) col1, col2 = st.columns(2) for idx, (col_name, col_description) in enumerate(column_summary.items()): # Determine which column to use based on the index with col1 if idx % 2 == 0 else col2: st.markdown(f"Column Name : **{col_name}**") st.write(f"Column Description : {col_description}") # Display basic analysis analysis = column_analysis.get(col_name, {}) if pd.api.types.is_numeric_dtype(df[col_name]): # Numeric column analysis st.write({ "Mean": analysis.get("Mean"), "Median": analysis.get("Median"), "Mode": analysis.get("Mode"), "Unique Values": analysis.get("Unique Values"), "Null Values": analysis.get("Null Values") }) else: # Categorical column analysis st.write({ "Unique Values": analysis.get("Unique Values"), "Null Values": analysis.get("Null Values"), "Top Categories": analysis.get("Top Categories") }) st.markdown("
", unsafe_allow_html=True) st.subheader("Graphical Analysis of Table") best_col1 = selected_table_data.get("best_col1") best_col2 = selected_table_data .get("best_col2") default_chart_config = { "mark": "bar", "encoding": { "x": {"field": best_col1, "type": "nominal"}, "y": {"field": best_col2, "type": "quantitative"} } } # Convert default_chart_config to JSON string for Pygwalker spec parameter pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config)) pyg_app.explorer()