Spaces:
Sleeping
Sleeping
"""app.py — BizIntel AI Ultra (Gemini‑only, v2) | |
A production‑grade BI assistant with: | |
─ CSV / Excel / Parquet *and* SQL ingestion | |
─ Smart dtype inference & memory‑safe chunk loading (≥2 GB) | |
─ Instant schema, missing‑data audit, and Gemini‑generated insights | |
─ Drill‑down EDA dashboard (histogram, box, violin, scatter‑matrix, heat‑map) | |
─ Auto‑detected date column, dynamic ARIMA / SARIMA forecasting (user‑tunable) | |
─ Strategy brief + Markdown download | |
""" | |
from __future__ import annotations | |
import os, io, tempfile, datetime as dt | |
from pathlib import Path | |
from typing import List, Tuple | |
import pandas as pd | |
import numpy as np | |
import streamlit as st | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import matplotlib.pyplot as plt | |
from statsmodels.tsa.arima.model import ARIMA | |
from sqlalchemy import create_engine | |
import google.generativeai as genai | |
# ────────────────────────────────────────────────────────────── | |
# 0️⃣ CONFIG ─ Streamlit + Gemini | |
# ────────────────────────────────────────────────────────────── | |
# Page title and layout for the Streamlit app.
st.set_page_config(
    page_title="BizIntel AI Ultra",
    layout="wide",
    initial_sidebar_state="expanded",
)

# The Gemini API key is read from Streamlit secrets under "GEMINI_APIKEY".
genai.configure(api_key=st.secrets["GEMINI_APIKEY"])

# Model name handed to genai.GenerativeModel.
GEM_MODEL = "gemini-1.5-pro-latest"

# System temporary directory as a Path.
TEMP = Path(tempfile.gettempdir())
# ---------------------------------------------------------------------------- | |
# 1️⃣ UTILITIES | |
# ---------------------------------------------------------------------------- | |
def _lazy_read(
    file: io.BufferedReader,
    sample: bool = False,
    *,
    sample_rows: int = 5_000_000,
) -> pd.DataFrame:
    """Load an uploaded CSV, Excel, or Parquet file into a DataFrame.

    The reader is chosen from the (case-insensitive) suffix of ``file.name``.
    Any suffix other than ``.xls``, ``.xlsx`` or ``.parquet`` is read as CSV.

    Args:
        file: Binary file-like object exposing a ``name`` attribute.
        sample: When true, return at most ``sample_rows`` rows.
        sample_rows: Row cap applied when ``sample`` is true.

    Returns:
        The loaded data as a ``pandas.DataFrame``.
    """
    suffix = Path(file.name).suffix.lower()
    # ``nrows=None`` means "read everything" for the pandas readers below.
    row_limit = sample_rows if sample else None

    if suffix == ".xlsx":
        return pd.read_excel(file, engine="openpyxl", nrows=row_limit)
    if suffix == ".xls":
        # openpyxl cannot open the legacy binary .xls format, so let pandas
        # pick the engine for it instead of forcing openpyxl.
        return pd.read_excel(file, nrows=row_limit)
    if suffix == ".parquet":
        frame = pd.read_parquet(file)
        # read_parquet has no row-limit argument; truncate after loading so
        # the "sample" option behaves the same for every format.
        return frame if row_limit is None else frame.head(row_limit)
    return pd.read_csv(file, nrows=row_limit)
def _list_tables(conn: str) -> List[str]:
    """Return the table names visible through the SQLAlchemy URI *conn*.

    Args:
        conn: SQLAlchemy connection URI.

    Returns:
        Table names reported by the database inspector.
    """
    # Local import keeps this block self-contained; ``Engine.table_names()``
    # was deprecated in SQLAlchemy 1.4 and removed in 2.0, the inspector is
    # the supported replacement.
    from sqlalchemy import inspect

    engine = create_engine(conn)
    try:
        return inspect(engine).get_table_names()
    finally:
        # Release the connection pool created for this one-off lookup.
        engine.dispose()
def _read_table(conn: str, tbl: str) -> pd.DataFrame:
    """Read the whole table *tbl* from the database at *conn*.

    Args:
        conn: SQLAlchemy connection URI.
        tbl: Name of the table to load.

    Returns:
        The table contents as a ``pandas.DataFrame``.
    """
    engine = create_engine(conn)
    try:
        return pd.read_sql_table(tbl, engine)
    finally:
        # A fresh engine is built on every call, so dispose it to avoid
        # accumulating connection pools across calls.
        engine.dispose()
def _gemini(text: str) -> str:
    """Send *text* to the GEM_MODEL Gemini model and return the reply text, stripped."""
    model = genai.GenerativeModel(GEM_MODEL)
    response = model.generate_content(text)
    return response.text.strip()
# ---------------------------------------------------------------------------- | |
# 2️⃣ APP HEADER & DATA SOURCE | |
# ---------------------------------------------------------------------------- | |
st.title("📊 BizIntel AI Ultra — Gemini‑powered BI Copilot")
source = st.sidebar.radio("Data source", ["File", "SQL DB"], key="src")
df: pd.DataFrame = pd.DataFrame()

if source == "File":
    upl = st.sidebar.file_uploader(
        "Upload CSV / Excel / Parquet",
        type=["csv", "xls", "xlsx", "parquet"],
        help="≤2 GB",
    )
    sample = st.sidebar.checkbox("Load sample only (first 5 M rows)")
    if upl:
        try:
            df = _lazy_read(upl, sample)
        except Exception as exc:  # UI boundary: report instead of a raw traceback
            st.error(f"Could not read the uploaded file: {exc}")
            st.stop()
else:
    # NOTE: the selected engine is only shown in the sidebar; the connection
    # itself is driven entirely by the URI entered below.
    dialect = st.sidebar.selectbox(
        "Engine", ["postgresql", "mysql", "mssql+pyodbc", "oracle+cx_oracle"]
    )
    # Connection URIs usually embed credentials, so mask the input.
    conn_str = st.sidebar.text_input("SQLAlchemy URI", type="password")
    if conn_str:
        try:
            tables = _list_tables(conn_str)
        except Exception as exc:  # UI boundary: bad URI, missing driver, ...
            st.error(f"Could not list tables: {exc}")
            st.stop()
        tbl = st.sidebar.selectbox("Table", tables)
        if tbl:
            try:
                df = _read_table(conn_str, tbl)
            except Exception as exc:  # UI boundary
                st.error(f"Could not read table {tbl!r}: {exc}")
                st.stop()

# Nothing loaded yet (or the source was empty): prompt the user and halt.
if df.empty:
    st.info("⬅️ Load data to begin analysis")
    st.stop()
# ---------------------------------------------------------------------------- | |
# 3️⃣ QUICK OVERVIEW | |
# ---------------------------------------------------------------------------- | |
st.success("✅ Data loaded")
st.dataframe(df.head(10), use_container_width=True)

# Headline figures: shape of the frame and share of missing cells.
rows, cols = df.shape
missing_cells = df.isna().sum().sum()
miss_pct = missing_cells / (rows * cols) * 100

c1, c2, c3 = st.columns(3)
headline = (
    (c1, "Rows", f"{rows:,}"),
    (c2, "Columns", cols),
    (c3, "Missing %", f"{miss_pct:.1f}"),
)
for slot, label, value in headline:
    slot.metric(label, value)
# ---------------------------------------------------------------------------- | |
# 4️⃣ GEMINI INSIGHTS | |
# ---------------------------------------------------------------------------- | |
st.subheader("🧠 Gemini Insights")
with st.spinner("Crafting narrative…"):
    try:
        stats = df.describe(include="all", datetime_is_numeric=True)
    except TypeError:
        # pandas 2.0 removed the ``datetime_is_numeric`` keyword (datetimes
        # are always summarised numerically there), so retry without it.
        stats = df.describe(include="all")
    # Summary statistics, rounded and serialised, form the prompt payload.
    summ = stats.round(2).to_json()
    prompt = (
        "You are a senior BI analyst. Provide five bullet insights (<170 words) about the dataset below. "
        "Focus on trends, anomalies, and next actions.\n\n" + summ
    )
    insights = _gemini(prompt)
st.markdown(insights)
# ---------------------------------------------------------------------------- | |
# 5️⃣ COLUMN CHOICES & TREND | |
# ---------------------------------------------------------------------------- | |
# auto‑detect datetime candidates | |
maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
if not maybe_dates:
    for c in df.columns:
        # Numeric columns "parse" successfully as epoch offsets, which would
        # overwrite every metric column with timestamps — never try them.
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        try:
            parsed = pd.to_datetime(df[c])
        except (ValueError, TypeError, OverflowError):
            continue  # not a date-like column
        # Only adopt the conversion when at least one real timestamp resulted.
        if parsed.notna().any():
            df[c] = parsed
            maybe_dates.append(c)

date_col = st.selectbox("Date column", maybe_dates or df.columns)

numeric_cols = [c for c in df.select_dtypes("number").columns if c != date_col]
if not numeric_cols:
    st.warning("No numeric column is available to plot or forecast.")
    st.stop()
metric_col = st.selectbox("Metric column", numeric_cols)

# Build the time series: drop incomplete rows, coerce the date column,
# drop rows whose date could not be parsed, then average per timestamp.
pairs = df[[date_col, metric_col]].dropna().copy()
pairs[date_col] = pd.to_datetime(pairs[date_col], errors="coerce")
series = (
    pairs.dropna()
    .groupby(date_col)[metric_col]
    .mean()
    .sort_index()
)
if series.empty:
    st.warning("The selected date column contains no parseable dates.")
    st.stop()

fig_tr = px.line(series, title=f"{metric_col} Trend", labels={"index": "Date", metric_col: metric_col})
st.plotly_chart(fig_tr, use_container_width=True)
# ---------------------------------------------------------------------------- | |
# 6️⃣ FORECASTING (user‑tunable) | |
# ---------------------------------------------------------------------------- | |
st.subheader("🔮 Forecast")

# User-tunable horizon and ARIMA (p, d, q) order.
periods = st.slider("Periods to forecast", 3, 365, 90, step=1)
order_p = st.number_input("AR order (p)", 0, 5, 1, key="p")
order_d = st.number_input("I order (d)", 0, 2, 1, key="d")
order_q = st.number_input("MA order (q)", 0, 5, 1, key="q")

with st.spinner("Model fitting & forecasting…"):
    try:
        arima_order = (order_p, order_d, order_q)
        model = ARIMA(series, order=arima_order).fit()

        # Future index: continue from the last observed date at the inferred
        # frequency (daily when none can be inferred), skipping the first
        # element because it is the last observed date itself.
        step = pd.infer_freq(series.index) or "D"
        horizon = pd.date_range(series.index.max(), periods=periods + 1, freq=step)
        idx_future = horizon[1:]

        fc_vals = model.forecast(periods)
        forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
    except Exception as e:
        st.error(f"Model failed: {e}")
        st.stop()

actual_vs_forecast = pd.concat([series, forecast], axis=1)
fig_fc = px.line(actual_vs_forecast, title="Actual vs Forecast")
st.plotly_chart(fig_fc, use_container_width=True)
# ---------------------------------------------------------------------------- | |
# 7️⃣ EDA DASHBOARD | |
# ---------------------------------------------------------------------------- | |
st.subheader("🔍 Exploratory Data Dashboard")

with st.expander("Hist / KDE"):
    # Offer every numeric column (the selectbox needs a list of options, not
    # a single column name) and preselect the metric chosen above.
    numeric_options = list(df.select_dtypes("number").columns)
    if metric_col not in numeric_options:
        numeric_options.insert(0, metric_col)
    num = st.selectbox(
        "Numeric column",
        numeric_options,
        index=numeric_options.index(metric_col),
        key="hist_sel",
    )
    fig_h = px.histogram(df, x=num, nbins=50, marginal="box", template="plotly_dark")
    st.plotly_chart(fig_h, use_container_width=True)

with st.expander("Correlation Heatmap"):
    # Pairwise correlation of the numeric columns only.
    corr = df.select_dtypes("number").corr()
    fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ρ"), title="Correlation")
    st.plotly_chart(fig_c, use_container_width=True)
# ---------------------------------------------------------------------------- | |
# 8️⃣ STRATEGY DOWNLOAD | |
# ---------------------------------------------------------------------------- | |
# Static Markdown brief offered as a download: a heading plus five actions.
brief_lines = [
    "# Strategy Brief",
    "1. Clean missing timestamps for robust modeling.",
    "2. Investigate drivers behind top correlations.",
    "3. Leverage forecast to align ops & marketing.",
    "4. Monitor outliers >3σ each week.",
    "5. Drill into segment variations (region / product).",
]
brief = "\n".join(brief_lines)

st.download_button(
    "⬇️ Download Strategy (.md)",
    brief,
    file_name="bizintel_brief.md",
    mime="text/markdown",
)