# BizIntel_AI / app.py
"""app.py โ€” BizIntel AI Ultra (Geminiโ€‘only, v4)
A productionโ€‘grade BI copilot with:
โ€ข CSVโ€ฏ/โ€ฏExcelโ€ฏ/โ€ฏParquet and SQL ingestion
โ€ข Smart sampling + memoryโ€‘safe loading for large files
โ€ข Schema + missing-data audit with Gemini-generated insights
โ€ข Drill-down EDA (histogram, violin, scatter-matrix, heatmap)
โ€ข Autoโ€‘detected date column, tunable ARIMA forecasting
โ€ข One-click strategy brief download (Markdown)
"""
from __future__ import annotations
import os, io, tempfile
from pathlib import Path
from typing import List
import pandas as pd
import streamlit as st
import plotly.express as px
from statsmodels.tsa.arima.model import ARIMA
from sqlalchemy import create_engine, inspect
import google.generativeai as genai
# ───────────────────────── 0 · CONFIGURATION ─────────────────────────
API_KEY = st.secrets.get("GEMINI_APIKEY") or os.getenv("GEMINI_APIKEY")
if not API_KEY:
    st.error("❌ Missing `GEMINI_APIKEY` - add it in Settings → Secrets or set the env variable.")
    st.stop()
st.set_page_config(page_title="BizIntel AI Ultra", page_icon="📊", layout="wide", initial_sidebar_state="expanded")
genai.configure(api_key=API_KEY)
GEM_MODEL = "gemini-1.5-pro-latest"
TMP = Path(tempfile.gettempdir())
# ───────────────────────── 1 · UTILITY HELPERS ─────────────────────────
@st.cache_data(show_spinner=False)
def read_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
    """Load an uploaded CSV / Excel / Parquet file; optionally cap CSVs at 5M rows."""
    suf = Path(buf.name).suffix.lower()
    if suf in {".xls", ".xlsx"}:
        # openpyxl only reads .xlsx; let pandas pick the engine so legacy .xls still works
        return pd.read_excel(buf)
    if suf == ".parquet":
        return pd.read_parquet(buf)
    return pd.read_csv(buf, nrows=5_000_000 if sample else None)
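# Memory-saving sketch (an assumption, not part of the original loader): downcasting wide
# numeric columns keeps very large frames closer to the docstring's "memory-safe loading"
# promise. If RAM is tight, call df = downcast_numeric(df) right after loading.
def downcast_numeric(frame: pd.DataFrame) -> pd.DataFrame:
    for col in frame.select_dtypes("integer").columns:
        frame[col] = pd.to_numeric(frame[col], downcast="integer")
    for col in frame.select_dtypes("float").columns:
        frame[col] = pd.to_numeric(frame[col], downcast="float")
    return frame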
@st.cache_data(show_spinner=False)
def sql_tables(uri: str) -> List[str]:
    # Engine.table_names() was removed in SQLAlchemy 1.4/2.0; use the inspector instead
    return inspect(create_engine(uri)).get_table_names()
@st.cache_data(show_spinner=True)
def read_table(uri: str, tbl: str) -> pd.DataFrame:
    return pd.read_sql_table(tbl, create_engine(uri))
@st.cache_data(show_spinner=False)
def ask_gemini(prompt: str) -> str:
    return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()
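# Defensive variant (a sketch, not in the original script): the Gemini client can raise on
# quota limits or blocked responses; swapping this in for ask_gemini() in section 3 would
# surface the error in the UI instead of crashing the app.
def ask_gemini_safe(prompt: str) -> str:
    try:
        return ask_gemini(prompt)
    except Exception as exc:  # show any client/API error as Markdown rather than a traceback
        return f"⚠️ Gemini request failed: {exc}"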
# ───────────────────────── 2 · DATA INGESTION ─────────────────────────
st.title("📊 BizIntel AI Ultra - Gemini 1.5 Pro BI Copilot")
mode = st.sidebar.radio("Select Data Source", ["Upload File", "SQL Database"], horizontal=True)
df: pd.DataFrame = pd.DataFrame()
if mode == "Upload File":
    upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", ["csv", "xls", "xlsx", "parquet"], help="≤ 2 GB")
    sample = st.sidebar.checkbox("Load sample (≤ 5M rows)")
    if upl:
        df = read_file(upl, sample)
else:
    uri = st.sidebar.text_input("SQLAlchemy URI")
    if uri:
        tbl = st.sidebar.selectbox("Choose Table", sql_tables(uri))
        if tbl:
            df = read_table(uri, tbl)
if df.empty:
    st.info("⬅️ Load a dataset to get started.")
    st.stop()
st.success("✅ Data loaded")
st.dataframe(df.head(), use_container_width=True)
# ───────────────────────── 3 · SUMMARY + GEMINI ─────────────────────────
rows, cols = df.shape
miss_pct = df.isna().sum().sum() / (rows * cols) * 100
c1, c2, c3 = st.columns(3)
c1.metric("Rows", f"{rows:,}")
c2.metric("Columns", cols)
c3.metric("Missing %", f"{miss_pct:.1f}")
st.subheader("🧠 Gemini Insights")
with st.spinner("Generating analysis..."):
    # describe() treats datetimes as numeric by default in pandas >= 2.0, where the old
    # datetime_is_numeric flag no longer exists
    summary = df.describe(include="all").round(2).to_json()
    st.markdown(ask_gemini(
        "You are a senior BI analyst. List 5 key insights and 3 action items based on this dataset: " + summary
    ))
# ───────────────────────── 4 · TIME SERIES SETUP ─────────────────────────
# Attempt datetime parsing only on text-like columns so numeric metrics stay numeric
# (coercing integer columns would silently turn them into epoch timestamps).
for c in df.select_dtypes(include="object").columns:
    try:
        df[c] = pd.to_datetime(df[c])
    except (ValueError, TypeError):
        pass
date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
numeric_cols = df.select_dtypes("number").columns.tolist()
if not date_cols or not numeric_cols:
    st.warning("Need at least one datetime column and one numeric column for trend analysis.")
    st.stop()
DATE_COL = st.selectbox("Date column", date_cols)
METRIC_COL = st.selectbox("Numeric metric", numeric_cols)
series = (
    df[[DATE_COL, METRIC_COL]]
    .dropna()
    .groupby(DATE_COL)[METRIC_COL]
    .mean()
    .sort_index()
)
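# Regularisation note (an assumption, not in the original script): ARIMA behaves better on an
# evenly spaced index, so the series could be resampled before fitting, e.g.
# series = series.resample(pd.infer_freq(series.index) or "D").mean().interpolate()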
fig_ts = px.line(series, title=f"{METRIC_COL} Trend", labels={"index": "Date", "value": METRIC_COL})
st.plotly_chart(fig_ts, use_container_width=True)
# ───────────────────────── 5 · ARIMA FORECASTING ─────────────────────────
st.subheader("🔮 Forecast")
steps = st.slider("Forecast Horizon", 3, 365, 90)
p = st.number_input("AR Order (p)", 0, 5, 1)
d = st.number_input("Diff Order (d)", 0, 2, 1)
q = st.number_input("MA Order (q)", 0, 5, 1)
with st.spinner("Training ARIMA model..."):
    model = ARIMA(series, order=(p, d, q)).fit()
    fut_idx = pd.date_range(series.index[-1], periods=steps + 1, freq=pd.infer_freq(series.index) or "D")[1:]
    # re-index onto real future dates; forecast() may come back on an integer index
    forecast = pd.Series(model.forecast(steps).to_numpy(), index=fut_idx)
fig_fc = px.line(pd.concat([series, forecast.rename("Forecast")], axis=1), title="Actual vs Forecast")
st.plotly_chart(fig_fc, use_container_width=True)
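# Suggested overlay (a sketch, not part of the original app): statsmodels can also supply a
# 95% confidence band via get_forecast(); the traces below would be added to fig_fc before
# the st.plotly_chart call above.
# ci = model.get_forecast(steps).conf_int()
# fig_fc.add_scatter(x=fut_idx, y=ci.iloc[:, 0], name="Lower 95%", line=dict(dash="dot"))
# fig_fc.add_scatter(x=fut_idx, y=ci.iloc[:, 1], name="Upper 95%", line=dict(dash="dot"))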
# ───────────────────────── 6 · EDA TOOLS ─────────────────────────
st.subheader("🔍 Exploratory Data Dashboard")
with st.expander("Histogram + Box"):
    # selectbox needs a list of options; passing the METRIC_COL string would list its characters
    col = st.selectbox("Metric column", df.select_dtypes("number").columns, key="hist")
    st.plotly_chart(px.histogram(df, x=col, marginal="box", template="plotly_dark"), use_container_width=True)
with st.expander("Correlation Heatmap"):
    corr = df.select_dtypes("number").corr()
    st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation Matrix"), use_container_width=True)
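# The module docstring promises violin and scatter-matrix views; the expander below is a
# minimal sketch of that wiring (an addition, not present in the original script).
with st.expander("Violin + Scatter Matrix"):
    num_cols = df.select_dtypes("number").columns.tolist()
    v_col = st.selectbox("Violin column", num_cols, key="violin")
    st.plotly_chart(px.violin(df, y=v_col, box=True, points="outliers"), use_container_width=True)
    sm_cols = st.multiselect("Scatter-matrix columns", num_cols, default=num_cols[:4], key="smatrix")
    if sm_cols:
        st.plotly_chart(px.scatter_matrix(df[sm_cols]), use_container_width=True)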
# ───────────────────────── 7 · STRATEGY DOWNLOAD ─────────────────────────
brief = (
    "# Strategy Brief\n"
    "* Clean missing date values for better time modeling.\n"
    "* Investigate top correlations for potential drivers.\n"
    "* Leverage forecast for inventory and staff planning.\n"
    "* Watch for outliers >3σ weekly.\n"
    "* Segment by region and product for precise actions."
)
st.download_button("⬇️ Download Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")