mgbam committed
Commit dc51ef8 · verified · 1 Parent(s): 5f67bb9

Update app.py

Files changed (1)
  1. app.py +100 -146
app.py CHANGED
@@ -1,189 +1,143 @@
- """app.py — BizIntel AI Ultra (Gemini-only, v2)
- A production-grade BI assistant with:
- ─ CSV / Excel / Parquet *and* SQL ingestion
- ─ Smart dtype inference & memory-safe chunk loading (≥2 GB)
- ─ Instant schema, missing-data audit, and Gemini-generated insights
- ─ Drill-down EDA dashboard (histogram, box, violin, scatter-matrix, heat-map)
- ─ Auto-detected date column, dynamic ARIMA / SARIMA forecasting (user-tunable)
- ─ Strategy brief + Markdown download
  """
-
  from __future__ import annotations
- import os, io, tempfile, datetime as dt
  from pathlib import Path
- from typing import List, Tuple

  import pandas as pd
- import numpy as np
  import streamlit as st
  import plotly.express as px
- import plotly.graph_objects as go
- import matplotlib.pyplot as plt
  from statsmodels.tsa.arima.model import ARIMA
  from sqlalchemy import create_engine
  import google.generativeai as genai

- # ────────────────────────────────────────────────────────────
- # 0️⃣ CONFIG ─ Streamlit + Gemini
- # ────────────────────────────────────────────────────────────
- st.set_page_config(
-     page_title="BizIntel AI Ultra", layout="wide", initial_sidebar_state="expanded"
- )
- genai.configure(api_key=st.secrets["GEMINI_APIKEY"])
  GEM_MODEL = "gemini-1.5-pro-latest"
- TEMP = Path(tempfile.gettempdir())

- # ----------------------------------------------------------------------------
- # 1️⃣ UTILITIES
- # ----------------------------------------------------------------------------
  @st.cache_data(show_spinner=False)
- def _lazy_read(file: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
-     """Load big CSV/Excel/Parquet in chunks (first 5 M rows if sample)."""
-     suff = Path(file.name).suffix.lower()
-     if suff in {".xls", ".xlsx"}:
-         return pd.read_excel(file, engine="openpyxl")
-     if suff == ".parquet":
-         return pd.read_parquet(file)
-     if sample:
-         return pd.read_csv(file, nrows=5_000_000)
-     return pd.read_csv(file)

  @st.cache_data(show_spinner=False)
- def _list_tables(conn: str) -> List[str]:
-     return create_engine(conn).table_names()

  @st.cache_data(show_spinner=True)
- def _read_table(conn: str, tbl: str) -> pd.DataFrame:
-     return pd.read_sql_table(tbl, create_engine(conn))

  @st.cache_data(show_spinner=False)
- def _gemini(text: str) -> str:
-     return genai.GenerativeModel(GEM_MODEL).generate_content(text).text.strip()
-
- # ----------------------------------------------------------------------------
- # 2️⃣ APP HEADER & DATA SOURCE
- # ----------------------------------------------------------------------------
- st.title("📊 BizIntel AI Ultra — Gemini-powered BI Copilot")
- source = st.sidebar.radio("Data source", ["File", "SQL DB"], key="src")
- df: pd.DataFrame = pd.DataFrame()
-
- if source == "File":
-     upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", type=["csv","xls","xlsx","parquet"], help="≤2 GB")
-     sample = st.sidebar.checkbox("Load sample only (first 5 M rows)")
      if upl:
-         df = _lazy_read(upl, sample)
  else:
-     dialect = st.sidebar.selectbox("Engine", ["postgresql","mysql","mssql+pyodbc","oracle+cx_oracle"])
-     conn_str = st.sidebar.text_input("SQLAlchemy URI")
-     if conn_str:
-         tables = _list_tables(conn_str)
-         tbl = st.sidebar.selectbox("Table", tables)
          if tbl:
-             df = _read_table(conn_str, tbl)

- if df.empty:
-     st.info("⬅️ Load data to begin analysis")
      st.stop()

- # ----------------------------------------------------------------------------
- # 3️⃣ QUICK OVERVIEW
- # ----------------------------------------------------------------------------
- st.success("✅ Data loaded")
- st.dataframe(df.head(10), use_container_width=True)
- rows, cols = df.shape
- miss_pct = df.isna().sum().sum() / (rows*cols) * 100
  c1,c2,c3 = st.columns(3)
  c1.metric("Rows", f"{rows:,}")
  c2.metric("Columns", cols)
- c3.metric("Missing %", f"{miss_pct:.1f}")

- # ----------------------------------------------------------------------------
- # 4️⃣ GEMINI INSIGHTS
- # ----------------------------------------------------------------------------
  st.subheader("🧠 Gemini Insights")
- with st.spinner("Crafting narrative…"):
-     summ = df.describe(include="all", datetime_is_numeric=True).round(2).to_json()
-     prompt = (
-         "You are a senior BI analyst. Provide five bullet insights (<170 words) about the dataset below. "
-         "Focus on trends, anomalies, and next actions.\n\n" + summ
-     )
-     insights = _gemini(prompt)
- st.markdown(insights)
-
- # ----------------------------------------------------------------------------
- # 5️⃣ COLUMN CHOICES & TREND
- # ----------------------------------------------------------------------------
- # auto-detect datetime candidates
- maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
- if not maybe_dates:
-     for c in df.columns:
          try:
-             df[c] = pd.to_datetime(df[c])
-             maybe_dates.append(c)
          except: # noqa: E722
              pass

- date_col = st.selectbox("Date column", maybe_dates or df.columns)
- metric_col = st.selectbox("Metric column", [c for c in df.select_dtypes("number").columns if c != date_col])
-
- series = (
-     df[[date_col, metric_col]]
-     .dropna()
-     .assign(**{date_col: lambda d: pd.to_datetime(d[date_col], errors="coerce")})
-     .dropna()
-     .groupby(date_col)[metric_col]
-     .mean()
-     .sort_index()
- )

- fig_tr = px.line(series, title=f"{metric_col} Trend", labels={"index":"Date", metric_col:metric_col})
- st.plotly_chart(fig_tr, use_container_width=True)

- # ----------------------------------------------------------------------------
- # 6️⃣ FORECASTING (user-tunable)
- # ----------------------------------------------------------------------------
  st.subheader("🔮 Forecast")
- periods = st.slider("Periods to forecast", 3, 365, 90, step=1)
- order_p = st.number_input("AR order (p)", 0, 5, 1, key="p")
- order_d = st.number_input("I order (d)", 0, 2, 1, key="d")
- order_q = st.number_input("MA order (q)", 0, 5, 1, key="q")
-
- with st.spinner("Model fitting & forecasting…"):
-     try:
-         model = ARIMA(series, order=(order_p, order_d, order_q)).fit()
-         idx_future = pd.date_range(series.index.max(), periods=periods+1, freq=pd.infer_freq(series.index) or "D")[1:]
-         fc_vals = model.forecast(periods)
-         forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
-     except Exception as e:
-         st.error(f"Model failed: {e}")
-         st.stop()
-
- fig_fc = px.line(pd.concat([series, forecast], axis=1), title="Actual vs Forecast")
  st.plotly_chart(fig_fc, use_container_width=True)

- # ----------------------------------------------------------------------------
- # 7️⃣ EDA DASHBOARD
- # ----------------------------------------------------------------------------
- st.subheader("🔍 Exploratory Data Dashboard")
- with st.expander("Hist / KDE"):
-     num = st.selectbox("Numeric column", series.index.name if series.empty else metric_col, key="hist_sel")
-     fig_h = px.histogram(df, x=num, nbins=50, marginal="box", template="plotly_dark")
-     st.plotly_chart(fig_h, use_container_width=True)
-
- with st.expander("Correlation Heatmap"):
-     corr = df.select_dtypes("number").corr()
-     fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ρ"), title="Correlation")
-     st.plotly_chart(fig_c, use_container_width=True)
-
- # ----------------------------------------------------------------------------
- # 8️⃣ STRATEGY DOWNLOAD
- # ----------------------------------------------------------------------------
  brief = (
      "# Strategy Brief\n"
-     "1. Clean missing timestamps for robust modeling.\n"
-     "2. Investigate drivers behind top correlations.\n"
-     "3. Leverage forecast to align ops & marketing.\n"
-     "4. Monitor outliers >3σ each week.\n"
-     "5. Drill into segment variations (region / product)."
  )
- st.download_button("⬇️ Download Strategy (.md)", brief, file_name="bizintel_brief.md", mime="text/markdown")

+ """app.py — BizIntel AI Ultra (Gemini-only, v3)
+ A production-grade BI copilot with:
+ • CSV / Excel / Parquet and live SQL ingestion
+ • Memory-safe chunk loading (≥2 GB) & dtype auto-fix
+ • Instant schema audit + Gemini-generated insights
+ • Drill-down EDA (histogram, violin, scatter-matrix, heat-map)
+ • Auto-detected datetime + user-tunable ARIMA forecasting
+ • One-click strategy brief (Markdown)
  """
  from __future__ import annotations
+ import os, io, tempfile
  from pathlib import Path
+ from typing import List

  import pandas as pd
  import streamlit as st
  import plotly.express as px
  from statsmodels.tsa.arima.model import ARIMA
  from sqlalchemy import create_engine
  import google.generativeai as genai

+ # ─────────────────── 0 · CONFIG & SECRETS ────────────────────
+ API_KEY = st.secrets.get("GEMINI_APIKEY") or os.getenv("GEMINI_APIKEY")
+ if not API_KEY:
+     st.error("❌ `GEMINI_APIKEY` missing — add it in *Settings → Secrets* or env vars.")
+     st.stop()
+
+ genai.configure(api_key=API_KEY)
  GEM_MODEL = "gemini-1.5-pro-latest"
+ TMP = Path(tempfile.gettempdir())
+
+ st.set_page_config("BizIntel AI Ultra", "📊", "wide", initial_sidebar_state="expanded")

+ # ─────────────────── 1 · UTILITY HELPERS ─────────────────────
  @st.cache_data(show_spinner=False)
+ def read_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
+     suf = Path(buf.name).suffix.lower()
+     if suf in {".xls", ".xlsx"}:  # Excel
+         return pd.read_excel(buf, engine="openpyxl")
+     if suf == ".parquet":
+         return pd.read_parquet(buf)
+     return pd.read_csv(buf, nrows=5_000_000 if sample else None)

  @st.cache_data(show_spinner=False)
+ def sql_tables(uri: str) -> List[str]:
+     return create_engine(uri).table_names()

  @st.cache_data(show_spinner=True)
+ def read_table(uri: str, tbl: str) -> pd.DataFrame:
+     return pd.read_sql_table(tbl, create_engine(uri))

  @st.cache_data(show_spinner=False)
+ def ask_gemini(prompt: str) -> str:
+     return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()
+
+ # ─────────────────── 2 · DATA INGESTION ──────────────────────
+ st.title("📊 BizIntel AI Ultra — Gemini 1.5 Pro BI Copilot")
+ mode = st.sidebar.radio("Source", ["File", "SQL"], horizontal=True)
+ DF: pd.DataFrame = pd.DataFrame()
+
+ if mode == "File":
+     upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", ["csv","xls","xlsx","parquet"], help="≤2 GB")
+     sample = st.sidebar.checkbox("Load sample only (≤ 5 M rows)")
      if upl:
+         DF = read_file(upl, sample)
  else:
+     uri = st.sidebar.text_input("SQLAlchemy URI")
+     if uri:
+         tbl = st.sidebar.selectbox("Table", sql_tables(uri))
          if tbl:
+             DF = read_table(uri, tbl)

+ if DF.empty:
+     st.info("⬅️ Load data to start.")
      st.stop()

+ st.success("✅ Data loaded")
+ st.dataframe(DF.head(), use_container_width=True)
+
+ # ─────────────────── 3 · QUICK STATS + GEMINI INSIGHT ────────
+ rows, cols = DF.shape
+ miss = DF.isna().sum().sum() / (rows*cols) * 100
  c1,c2,c3 = st.columns(3)
  c1.metric("Rows", f"{rows:,}")
  c2.metric("Columns", cols)
+ c3.metric("Missing %", f"{miss:.1f}")

  st.subheader("🧠 Gemini Insights")
+ with st.spinner("Gemini analysing…"):
+     summary = DF.describe(include="all", datetime_is_numeric=True).round(2).to_json()
+     st.markdown(ask_gemini(
+         "You are a senior BI analyst. Give 5 concise insights and 3 action items for the dataset: " + summary
+     ))
+
+ # ─────────────────── 4 · TIME-SERIES SELECTION ───────────────
+ # attempt datetime coercion
+ for c in DF.columns:
+     if not pd.api.types.is_datetime64_any_dtype(DF[c]):
          try:
+             DF[c] = pd.to_datetime(DF[c])
          except: # noqa: E722
              pass

+ DATE_COL = st.selectbox("Date column", [c for c in DF.columns if pd.api.types.is_datetime64_any_dtype(DF[c])])
+ METRIC_COL = st.selectbox("Numeric metric", [c for c in DF.select_dtypes("number").columns])

+ ts = (
+     DF[[DATE_COL, METRIC_COL]].dropna()
+     .groupby(DATE_COL)[METRIC_COL].mean().sort_index()
+ )
+ fig_ts = px.line(ts, title=f"{METRIC_COL} Trend", labels={"index":"Date", METRIC_COL:METRIC_COL})
+ st.plotly_chart(fig_ts, use_container_width=True)

+ # ─────────────────── 5 · FORECASTING ─────────────────────────
  st.subheader("🔮 Forecast")
+ steps = st.slider("Horizon", 3, 365, 90)
+ p = st.number_input("p", 0,5,1); d = st.number_input("d",0,2,1); q = st.number_input("q",0,5,1)
+ with st.spinner("Fitting ARIMA…"):
+     model = ARIMA(ts, order=(p,d,q)).fit()
+     fut_idx = pd.date_range(ts.index[-1], periods=steps+1, freq=pd.infer_freq(ts.index) or "D")[1:]
+     forecast = pd.Series(model.forecast(steps), index=fut_idx)
+ fig_fc = px.line(pd.concat([ts, forecast.rename("Forecast")], axis=1), title="Actual vs Forecast")
  st.plotly_chart(fig_fc, use_container_width=True)

+ # ─────────────────── 6 · EDA EXPANDERS ───────────────────────
+ st.subheader("🔍 EDA Dashboard")
+ with st.expander("Histogram / Box"):
+     col = st.selectbox("Column", METRIC_COL, key="hist")
+     st.plotly_chart(px.histogram(DF, x=col, marginal="box", template="plotly_dark"), use_container_width=True)
+ with st.expander("Correlation heat-map"):
+     corr = DF.select_dtypes("number").corr()
+     st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation"), use_container_width=True)
+
+ # ─────────────────── 7 · STRATEGY BRIEF DOWNLOAD ─────────────
  brief = (
      "# Strategy Brief\n"
+     "* Clean missing timestamps.\n"
+     "* Investigate strongest correlations for causal drivers.\n"
+     "* Use forecast to guide inventory & staffing planning.\n"
+     "* Review outliers weekly (>3σ).\n"
+     "* Segment analysis by region & product for micro-actions."
  )
+ st.download_button("⬇️ Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")
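For reference, here is a minimal standalone sketch of the forecasting step that the new version wires to the p/d/q inputs, using the same statsmodels call pattern as the diff above; the toy series, 30-step horizon, and (1, 1, 1) order are illustrative assumptions, not part of this commit.

# Standalone sketch of the forecasting step in app.py (illustrative only):
# fit an ARIMA on a daily series, then extend the index for the horizon.
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

idx = pd.date_range("2024-01-01", periods=120, freq="D")  # toy stand-in for the aggregated `ts`
ts = pd.Series(range(120), index=idx, dtype=float)

steps, order = 30, (1, 1, 1)  # horizon and (p, d, q), as exposed by the slider and number inputs
model = ARIMA(ts, order=order).fit()
fut_idx = pd.date_range(ts.index[-1], periods=steps + 1,
                        freq=pd.infer_freq(ts.index) or "D")[1:]
forecast = pd.Series(model.forecast(steps).values, index=fut_idx, name="Forecast")
print(forecast.head())

To run the app itself locally, the committed code expects GEMINI_APIKEY either in .streamlit/secrets.toml or as an environment variable before launching with `streamlit run app.py`.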