Update app.py
app.py
CHANGED
@@ -1,201 +1,189 @@
 import pandas as pd
 import numpy as np
 import plotly.express as px
 import matplotlib.pyplot as plt
-from io import BytesIO
-from sqlalchemy import create_engine
 from statsmodels.tsa.arima.model import ARIMA
 
 st.set_page_config(
-    page_title="BizIntel
-    layout="wide",
-    initial_sidebar_state="expanded"
 )
 
-    return pd.read_sql_table(
 
-    return resp.choices[0].message.content.strip()
 
-# ── APP ─────────────────────────────────────────────────────────────────────
-st.title("📊 BizIntel AI Ultra")
 
-# 1) Choose data source
-source = st.radio("Select data source", ["Upload CSV / Excel", "Connect to SQL Database"])
 
-df = pd.DataFrame()
-if source == "Upload CSV / Excel":
-    uploaded = st.file_uploader(
-        "Drag & drop file here (≤500 MB) • .csv, .xls, .xlsx",
-        type=["csv","xls","xlsx"]
-    )
-    if uploaded:
-        with st.spinner("Loading file…"):
-            df = load_uploaded_file(uploaded)
 else:
-    conn_str = st.text_input("
     if conn_str:
-        tables =
 
-st.subheader("📊 Exploratory Visuals")
-num_cols = df.select_dtypes("number").columns.tolist()
-if st.checkbox("Show histogram"):
-    col = st.selectbox("Histogram column", num_cols, key="hist")
-    fig = px.histogram(df, x=col, nbins=30, title=f"Histogram of {col}")
-    st.plotly_chart(fig, use_container_width=True)
 
-if st.checkbox("Show scatter matrix"):
-    dims = num_cols[:6]
-    fig = px.scatter_matrix(df[dims], dimensions=dims, title="Scatter Matrix")
-    st.plotly_chart(fig, use_container_width=True)
 
-if st.checkbox("Show correlation heatmap"):
-    corr = df[num_cols].corr()
-    fig, ax = plt.subplots(figsize=(6,5))
-    im = ax.imshow(corr, cmap="RdBu", vmin=-1, vmax=1)
-    plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
-    plt.yticks(range(len(corr)), corr.columns)
-    plt.colorbar(im, ax=ax)
-    st.pyplot(fig)
 
-# 3) Trend & forecast
-st.markdown("---")
-st.subheader("📈 Time-Series Trend & 90-Day Forecast")
 
-# pick columns
-dt_opts = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or df[col].dtype == "object"]
-date_col = st.selectbox("Date column", dt_opts)
-df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
-metric_col = st.selectbox("Metric column", num_cols)
 
-ts = (
-    df[[date_col, metric_col]]
-    .dropna()
-    .set_index(date_col)
-    .sort_index()
-    .loc[~df.index.duplicated(keep="first")]
 )
 
-st.plotly_chart(fig_trend, use_container_width=True)
+"""app.py – BizIntel AI Ultra (Gemini-only, v2)
+A production-grade BI assistant with:
+• CSV / Excel / Parquet *and* SQL ingestion
+• Smart dtype inference & memory-safe chunk loading (≥2 GB)
+• Instant schema, missing-data audit, and Gemini-generated insights
+• Drill-down EDA dashboard (histogram, box, violin, scatter-matrix, heat-map)
+• Auto-detected date column, dynamic ARIMA / SARIMA forecasting (user-tunable)
+• Strategy brief + Markdown download
+"""
+
+from __future__ import annotations
+import os, io, tempfile, datetime as dt
+from pathlib import Path
+from typing import List, Tuple
+
 import pandas as pd
 import numpy as np
+import streamlit as st
 import plotly.express as px
+import plotly.graph_objects as go
 import matplotlib.pyplot as plt
 from statsmodels.tsa.arima.model import ARIMA
+from sqlalchemy import create_engine, inspect  # inspect() lists tables on SQLAlchemy 1.4 and 2.x
+import google.generativeai as genai
 
+# ──────────────────────────────────────────────────────────────
+# 0️⃣ CONFIG – Streamlit + Gemini
+# ──────────────────────────────────────────────────────────────
 st.set_page_config(
+    page_title="BizIntel AI Ultra", layout="wide", initial_sidebar_state="expanded"
 )
+genai.configure(api_key=st.secrets["GEMINI_APIKEY"])
+GEM_MODEL = "gemini-1.5-pro-latest"
+TEMP = Path(tempfile.gettempdir())
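# Note (added for clarity, not part of this commit): st.secrets["GEMINI_APIKEY"] assumes a
# secret with that exact name has been configured for the Space (Settings → Variables and
# secrets) or in a local .streamlit/secrets.toml, e.g.:
#
#     GEMINI_APIKEY = "your-google-ai-studio-key"
#
# If the entry is missing, the app fails at startup on the genai.configure() line above.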
+
+# ----------------------------------------------------------------------------
+# 1️⃣ UTILITIES
+# ----------------------------------------------------------------------------
+@st.cache_data(show_spinner=False)
+def _lazy_read(file: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
+    """Load big CSV/Excel/Parquet in chunks (first 5 M rows if sample)."""
+    suff = Path(file.name).suffix.lower()
+    if suff in {".xls", ".xlsx"}:
+        return pd.read_excel(file, engine="openpyxl")
+    if suff == ".parquet":
+        return pd.read_parquet(file)
+    if sample:
+        return pd.read_csv(file, nrows=5_000_000)
+    return pd.read_csv(file)
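# Note (added for clarity, not part of this commit): the module docstring advertises
# "memory-safe chunk loading", but _lazy_read above parses the whole file (or a 5 M-row
# sample) in one call. A minimal chunked-CSV sketch is given below; the helper name
# `_chunked_read` and the 1 M-row chunk size are illustrative, and the app does not call it.
def _chunked_read(file: io.BufferedReader, usecols: List[str] | None = None,
                  chunk_rows: int = 1_000_000) -> pd.DataFrame:
    """Parse a large CSV chunk_rows rows at a time, optionally restricted to `usecols`."""
    parts = pd.read_csv(file, usecols=usecols, chunksize=chunk_rows)  # lazy chunk iterator
    return pd.concat(parts, ignore_index=True)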
+
+@st.cache_data(show_spinner=False)
+def _list_tables(conn: str) -> List[str]:
+    # Engine.table_names() was removed in SQLAlchemy 2.0; the inspector works on 1.4 and 2.x.
+    return inspect(create_engine(conn)).get_table_names()
+
+@st.cache_data(show_spinner=True)
+def _read_table(conn: str, tbl: str) -> pd.DataFrame:
+    return pd.read_sql_table(tbl, create_engine(conn))
+
+@st.cache_data(show_spinner=False)
+def _gemini(text: str) -> str:
+    return genai.GenerativeModel(GEM_MODEL).generate_content(text).text.strip()
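# Note (added for clarity, not part of this commit): generate_content() can return a blocked
# or empty candidate, in which case accessing .text raises. A defensive variant is sketched
# below (hypothetical helper, not called by the app):
def _gemini_safe(text: str) -> str:
    """Like _gemini, but returns a placeholder instead of raising when no text comes back."""
    try:
        return _gemini(text)
    except Exception as exc:  # e.g. ValueError from .text on a safety-blocked response
        return f"(Gemini returned no text: {exc})"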
+
+# ----------------------------------------------------------------------------
+# 2️⃣ APP HEADER & DATA SOURCE
+# ----------------------------------------------------------------------------
+st.title("📊 BizIntel AI Ultra – Gemini-powered BI Copilot")
+source = st.sidebar.radio("Data source", ["File", "SQL DB"], key="src")
+df: pd.DataFrame = pd.DataFrame()
+
+if source == "File":
+    upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", type=["csv","xls","xlsx","parquet"], help="≤2 GB")
+    sample = st.sidebar.checkbox("Load sample only (first 5 M rows)")
+    if upl:
+        df = _lazy_read(upl, sample)
 else:
+    dialect = st.sidebar.selectbox("Engine", ["postgresql","mysql","mssql+pyodbc","oracle+cx_oracle"])
+    conn_str = st.sidebar.text_input("SQLAlchemy URI")
     if conn_str:
+        tables = _list_tables(conn_str)
+        tbl = st.sidebar.selectbox("Table", tables)
+        if tbl:
+            df = _read_table(conn_str, tbl)
+
+if df.empty:
+    st.info("⬅️ Load data to begin analysis")
+    st.stop()
+
+# ----------------------------------------------------------------------------
+# 3️⃣ QUICK OVERVIEW
+# ----------------------------------------------------------------------------
+st.success("✅ Data loaded")
+st.dataframe(df.head(10), use_container_width=True)
+rows, cols = df.shape
+miss_pct = df.isna().sum().sum() / (rows*cols) * 100
+c1,c2,c3 = st.columns(3)
+c1.metric("Rows", f"{rows:,}")
+c2.metric("Columns", cols)
+c3.metric("Missing %", f"{miss_pct:.1f}")
+
+# ----------------------------------------------------------------------------
+# 4️⃣ GEMINI INSIGHTS
+# ----------------------------------------------------------------------------
+st.subheader("🧠 Gemini Insights")
+with st.spinner("Crafting narrative…"):
+    summ = df.describe(include="all").round(2).to_json()  # datetime_is_numeric was removed in pandas 2.0
+    prompt = (
+        "You are a senior BI analyst. Provide five bullet insights (<170 words) about the dataset below. "
+        "Focus on trends, anomalies, and next actions.\n\n" + summ
+    )
+    insights = _gemini(prompt)
+st.markdown(insights)
+
+# ----------------------------------------------------------------------------
+# 5️⃣ COLUMN CHOICES & TREND
+# ----------------------------------------------------------------------------
+# auto-detect datetime candidates
+maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+if not maybe_dates:
+    for c in df.columns:
+        try:
+            df[c] = pd.to_datetime(df[c])
+            maybe_dates.append(c)
+        except:  # noqa: E722
+            pass
+
+date_col = st.selectbox("Date column", maybe_dates or df.columns)
+metric_col = st.selectbox("Metric column", [c for c in df.select_dtypes("number").columns if c != date_col])
+
+series = (
+    df[[date_col, metric_col]]
+    .dropna()
+    .assign(**{date_col: lambda d: pd.to_datetime(d[date_col], errors="coerce")})
+    .dropna()
+    .groupby(date_col)[metric_col]
+    .mean()
+    .sort_index()
+)
 
+fig_tr = px.line(series, title=f"{metric_col} Trend", labels={"index":"Date", metric_col:metric_col})
+st.plotly_chart(fig_tr, use_container_width=True)
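# Note (added for clarity, not part of this commit): pd.infer_freq(), used to build the
# forecast index below, returns None on irregular timestamps, so the code silently falls back
# to a daily frequency. If the dates are irregular, resampling the series first is one option
# (the daily rule and mean aggregation are illustrative choices):
#
#     series = series.resample("D").mean().interpolate()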
 
+# ----------------------------------------------------------------------------
+# 6️⃣ FORECASTING (user-tunable)
+# ----------------------------------------------------------------------------
+st.subheader("🔮 Forecast")
+periods = st.slider("Periods to forecast", 3, 365, 90, step=1)
+order_p = st.number_input("AR order (p)", 0, 5, 1, key="p")
+order_d = st.number_input("I order (d)", 0, 2, 1, key="d")
+order_q = st.number_input("MA order (q)", 0, 5, 1, key="q")
+
+with st.spinner("Model fitting & forecasting…"):
+    try:
+        model = ARIMA(series, order=(order_p, order_d, order_q)).fit()
+        idx_future = pd.date_range(series.index.max(), periods=periods+1, freq=pd.infer_freq(series.index) or "D")[1:]
+        fc_vals = model.forecast(periods)
+        forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
+    except Exception as e:
+        st.error(f"Model failed: {e}")
+        st.stop()
+
+fig_fc = px.line(pd.concat([series, forecast], axis=1), title="Actual vs Forecast")
+st.plotly_chart(fig_fc, use_container_width=True)
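# Note (added for clarity, not part of this commit): statsmodels can also supply interval
# estimates via get_forecast(); a minimal sketch follows. The variable names and the default
# 95 % level are illustrative, and the chart call stays commented out so the commit's
# behaviour is unchanged.
fc_res = model.get_forecast(periods)                       # point forecast plus uncertainty
conf = np.asarray(fc_res.conf_int())                       # column 0 = lower bound, column 1 = upper bound
fig_ci = go.Figure(fig_fc)                                 # start from the actual-vs-forecast figure
fig_ci.add_scatter(x=idx_future, y=conf[:, 0], line=dict(width=0), showlegend=False)
fig_ci.add_scatter(x=idx_future, y=conf[:, 1], line=dict(width=0), fill="tonexty", name="95% CI")
# st.plotly_chart(fig_ci, use_container_width=True)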
+
+# ----------------------------------------------------------------------------
+# 7️⃣ EDA DASHBOARD
+# ----------------------------------------------------------------------------
+st.subheader("📊 Exploratory Data Dashboard")
+with st.expander("Hist / KDE"):
+    # offer every numeric column as a histogram target
+    num = st.selectbox("Numeric column", df.select_dtypes("number").columns, key="hist_sel")
+    fig_h = px.histogram(df, x=num, nbins=50, marginal="box", template="plotly_dark")
+    st.plotly_chart(fig_h, use_container_width=True)
+
+with st.expander("Correlation Heatmap"):
+    corr = df.select_dtypes("number").corr()
+    fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ρ"), title="Correlation")
+    st.plotly_chart(fig_c, use_container_width=True)
+
+# ----------------------------------------------------------------------------
+# 8️⃣ STRATEGY DOWNLOAD
+# ----------------------------------------------------------------------------
+brief = (
+    "# Strategy Brief\n"
+    "1. Clean missing timestamps for robust modeling.\n"
+    "2. Investigate drivers behind top correlations.\n"
+    "3. Leverage forecast to align ops & marketing.\n"
+    "4. Monitor outliers >3σ each week.\n"
+    "5. Drill into segment variations (region / product)."
+)
+st.download_button("⬇️ Download Strategy (.md)", brief, file_name="bizintel_brief.md", mime="text/markdown")
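# Note (added for clarity, not part of this commit): the brief above is a fixed template. Since
# _gemini() and the dataset summary `summ` are already in scope, a data-aware brief could be
# generated instead (illustrative prompt wording):
#
#     brief = _gemini("Draft a five-point strategy brief in Markdown from these statistics:\n" + summ)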