Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,189 +1,143 @@
|
|
1 |
-
"""app.py
|
2 |
-
A productionโgrade BI
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
"""
|
10 |
-
|
11 |
from __future__ import annotations
|
12 |
-
import os, io, tempfile
|
13 |
from pathlib import Path
|
14 |
-
from typing import List
|
15 |
|
16 |
import pandas as pd
|
17 |
-
import numpy as np
|
18 |
import streamlit as st
|
19 |
import plotly.express as px
|
20 |
-
import plotly.graph_objects as go
|
21 |
-
import matplotlib.pyplot as plt
|
22 |
from statsmodels.tsa.arima.model import ARIMA
|
23 |
from sqlalchemy import create_engine
|
24 |
import google.generativeai as genai
|
25 |
|
26 |
-
#
|
27 |
-
|
28 |
-
|
29 |
-
st.
|
30 |
-
|
31 |
-
|
32 |
-
genai.configure(api_key=
|
33 |
GEM_MODEL = "gemini-1.5-pro-latest"
|
34 |
-
|
|
|
|
|
35 |
|
36 |
-
#
|
37 |
-
# 1๏ธโฃ UTILITIES
|
38 |
-
# ----------------------------------------------------------------------------
|
39 |
@st.cache_data(show_spinner=False)
|
40 |
-
def
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
if sample:
|
48 |
-
return pd.read_csv(file, nrows=5_000_000)
|
49 |
-
return pd.read_csv(file)
|
50 |
|
51 |
@st.cache_data(show_spinner=False)
|
52 |
-
def
|
53 |
-
return create_engine(
|
54 |
|
55 |
@st.cache_data(show_spinner=True)
|
56 |
-
def
|
57 |
-
return pd.read_sql_table(tbl, create_engine(
|
58 |
|
59 |
@st.cache_data(show_spinner=False)
|
60 |
-
def
|
61 |
-
return genai.GenerativeModel(GEM_MODEL).generate_content(
|
62 |
-
|
63 |
-
#
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", type=["csv","xls","xlsx","parquet"], help="โค2โฏGB")
|
72 |
-
sample = st.sidebar.checkbox("Load sample only (first 5โฏM rows)")
|
73 |
if upl:
|
74 |
-
|
75 |
else:
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
tables = _list_tables(conn_str)
|
80 |
-
tbl = st.sidebar.selectbox("Table", tables)
|
81 |
if tbl:
|
82 |
-
|
83 |
|
84 |
-
if
|
85 |
-
st.info("โฌ
๏ธ
|
86 |
st.stop()
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
rows
|
94 |
-
miss_pct = df.isna().sum().sum() / (rows*cols) * 100
|
95 |
c1,c2,c3 = st.columns(3)
|
96 |
c1.metric("Rows", f"{rows:,}")
|
97 |
c2.metric("Columns", cols)
|
98 |
-
c3.metric("Missingย %", f"{
|
99 |
|
100 |
-
# ----------------------------------------------------------------------------
|
101 |
-
# 4๏ธโฃ GEMINI INSIGHTS
|
102 |
-
# ----------------------------------------------------------------------------
|
103 |
st.subheader("๐ง Gemini Insights")
|
104 |
-
with st.spinner("
|
105 |
-
|
106 |
-
|
107 |
-
"You are a senior BI analyst.
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
# 5๏ธโฃ COLUMN CHOICES & TREND
|
115 |
-
# ----------------------------------------------------------------------------
|
116 |
-
# autoโdetect datetime candidates
|
117 |
-
maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
|
118 |
-
if not maybe_dates:
|
119 |
-
for c in df.columns:
|
120 |
try:
|
121 |
-
|
122 |
-
maybe_dates.append(c)
|
123 |
except: # noqa: E722
|
124 |
pass
|
125 |
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
series = (
|
130 |
-
df[[date_col, metric_col]]
|
131 |
-
.dropna()
|
132 |
-
.assign(**{date_col: lambda d: pd.to_datetime(d[date_col], errors="coerce")})
|
133 |
-
.dropna()
|
134 |
-
.groupby(date_col)[metric_col]
|
135 |
-
.mean()
|
136 |
-
.sort_index()
|
137 |
-
)
|
138 |
|
139 |
-
|
140 |
-
|
|
|
|
|
|
|
|
|
141 |
|
142 |
-
#
|
143 |
-
# 6๏ธโฃ FORECASTING (userโtunable)
|
144 |
-
# ----------------------------------------------------------------------------
|
145 |
st.subheader("๐ฎ Forecast")
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
model = ARIMA(series, order=(order_p, order_d, order_q)).fit()
|
154 |
-
idx_future = pd.date_range(series.index.max(), periods=periods+1, freq=pd.infer_freq(series.index) or "D")[1:]
|
155 |
-
fc_vals = model.forecast(periods)
|
156 |
-
forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
|
157 |
-
except Exception as e:
|
158 |
-
st.error(f"Model failed: {e}")
|
159 |
-
st.stop()
|
160 |
-
|
161 |
-
fig_fc = px.line(pd.concat([series, forecast], axis=1), title="Actual vs Forecast")
|
162 |
st.plotly_chart(fig_fc, use_container_width=True)
|
163 |
|
164 |
-
#
|
165 |
-
|
166 |
-
|
167 |
-
st.
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
st.plotly_chart(
|
172 |
-
|
173 |
-
|
174 |
-
corr = df.select_dtypes("number").corr()
|
175 |
-
fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ฯ"), title="Correlation")
|
176 |
-
st.plotly_chart(fig_c, use_container_width=True)
|
177 |
-
|
178 |
-
# ----------------------------------------------------------------------------
|
179 |
-
# 8๏ธโฃ STRATEGY DOWNLOAD
|
180 |
-
# ----------------------------------------------------------------------------
|
181 |
brief = (
|
182 |
"# Strategy Brief\n"
|
183 |
-
"
|
184 |
-
"
|
185 |
-
"
|
186 |
-
"
|
187 |
-
"
|
188 |
)
|
189 |
-
st.download_button("โฌ๏ธย
|
|
|
1 |
+
"""app.py — BizIntel AI Ultra (Gemini-only, v3)
|
2 |
+
A production-grade BI copilot with:
|
3 |
+
• CSV / Excel / Parquet and live SQL ingestion
|
4 |
+
• Memory-safe chunk loading (≥2 GB) & dtype auto-fix
|
5 |
+
• Instant schema audit + Gemini-generated insights
|
6 |
+
• Drill-down EDA (histogram, violin, scatter-matrix, heat-map)
|
7 |
+
• Auto-detected datetime + user-tunable ARIMA forecasting
|
8 |
+
• One-click strategy brief (Markdown)
|
9 |
"""
|
|
|
10 |
from __future__ import annotations
|
11 |
+
import os, io, tempfile
|
12 |
from pathlib import Path
|
13 |
+
from typing import List
|
14 |
|
15 |
import pandas as pd
|
|
|
16 |
import streamlit as st
|
17 |
import plotly.express as px
|
|
|
|
|
18 |
from statsmodels.tsa.arima.model import ARIMA
|
19 |
from sqlalchemy import create_engine
|
20 |
import google.generativeai as genai
|
21 |
|
22 |
+
# ─────────────────── 0 · CONFIG & SECRETS ────────────────────
|
23 |
+
# Configure the page before emitting any other element. Streamlit documents
# set_page_config as the call that should come first on a page, and the
# original issued it only after a possible st.error()/st.stop().
# NOTE(review): the original page-icon emoji was destroyed by mis-encoding;
# the chart emoji below is a stand-in - confirm the intended glyph.
st.set_page_config("BizIntel AI Ultra", "📊", "wide", initial_sidebar_state="expanded")

# Resolve the Gemini key: Streamlit secrets first, then the process environment
# (same precedence as before). Reading st.secrets can raise when no secrets
# file exists, which previously prevented the environment fallback from ever
# being reached; the exception class differs between Streamlit releases, so it
# is caught broadly at this module-level boundary.
try:
    API_KEY = st.secrets.get("GEMINI_APIKEY")
except Exception:
    API_KEY = None
API_KEY = API_KEY or os.getenv("GEMINI_APIKEY")

if not API_KEY:
    # Without a key no Gemini call can succeed, so halt the script run here.
    st.error("❌ `GEMINI_APIKEY` missing — add it in *Settings → Secrets* or env vars.")
    st.stop()

genai.configure(api_key=API_KEY)
GEM_MODEL = "gemini-1.5-pro-latest"  # model name passed to genai.GenerativeModel
TMP = Path(tempfile.gettempdir())    # system temporary directory
|
33 |
|
34 |
+
# ─────────────────── 1 · UTILITY HELPERS ─────────────────────
|
|
|
|
|
35 |
@st.cache_data(show_spinner=False)
|
36 |
+
def read_file(
    buf: io.BufferedReader,
    sample: bool = False,
    sample_rows: int = 5_000_000,
) -> pd.DataFrame:
    """Load an uploaded CSV, Excel or Parquet file into a DataFrame.

    The format is chosen from the suffix of ``buf.name`` (case-insensitive);
    anything that is not ``.xls``/``.xlsx``/``.parquet`` is parsed as CSV.

    Args:
        buf: Binary file-like object exposing a ``name`` attribute.
        sample: When true, return at most ``sample_rows`` rows.
        sample_rows: Row cap applied when ``sample`` is true. Defaults to the
            previously hard-coded 5,000,000.

    Returns:
        The parsed DataFrame.
    """
    row_cap = sample_rows if sample else None
    suffix = Path(buf.name).suffix.lower()

    if suffix in {".xls", ".xlsx"}:
        # openpyxl cannot open legacy .xls workbooks, so it is only forced for
        # .xlsx; for .xls pandas is left to select its own engine.
        engine = "openpyxl" if suffix == ".xlsx" else None
        return pd.read_excel(buf, engine=engine, nrows=row_cap)

    if suffix == ".parquet":
        frame = pd.read_parquet(buf)
        # read_parquet takes no row limit here, so the cap is applied after
        # loading: it bounds the returned frame, not the read itself.
        return frame.head(row_cap) if sample else frame

    return pd.read_csv(buf, nrows=row_cap)
|
|
|
|
|
|
|
43 |
|
44 |
@st.cache_data(show_spinner=False)
|
45 |
+
def sql_tables(uri: str) -> List[str]:
    """Return the table names visible through the SQLAlchemy URL ``uri``.

    Args:
        uri: SQLAlchemy connection URL.

    Returns:
        Table names reported by the database inspector.
    """
    # Local import: the file's top-level import only brings in create_engine.
    from sqlalchemy import inspect

    engine = create_engine(uri)
    try:
        # Engine.table_names() no longer exists in SQLAlchemy 2.0; the
        # inspector API is available on both 1.4 and 2.x.
        return inspect(engine).get_table_names()
    finally:
        # The engine is created per call, so release its connection pool.
        engine.dispose()
|
47 |
|
48 |
@st.cache_data(show_spinner=True)
|
49 |
+
def read_table(uri: str, tbl: str) -> pd.DataFrame:
    """Read the whole SQL table ``tbl`` from the database at ``uri``.

    Args:
        uri: SQLAlchemy connection URL.
        tbl: Name of the table to load.

    Returns:
        The table contents as a DataFrame.
    """
    engine = create_engine(uri)
    try:
        return pd.read_sql_table(tbl, engine)
    finally:
        # A fresh engine is built on every call; dispose it so its pooled
        # connections are not left open once the frame has been read.
        engine.dispose()
|
51 |
|
52 |
@st.cache_data(show_spinner=False)
|
53 |
+
def ask_gemini(prompt: str) -> str:
    """Send ``prompt`` to the configured Gemini model and return its reply text.

    Args:
        prompt: Full prompt text.

    Returns:
        The stripped response text, or a short Markdown notice when the
        response carries no text.
    """
    response = genai.GenerativeModel(GEM_MODEL).generate_content(prompt)
    try:
        return response.text.strip()
    except ValueError:
        # NOTE(review): the google-generativeai ``text`` accessor is understood
        # to raise ValueError when the response has no text part (for example a
        # blocked prompt) - confirm against the installed SDK version.
        return "_Gemini returned no text for this request (the response may have been blocked)._"
|
55 |
+
|
56 |
+
# ─────────────────── 2 · DATA INGESTION ──────────────────────
|
57 |
+
# NOTE(review): the title emoji was destroyed by mis-encoding; the chart emoji
# is a stand-in - confirm the intended glyph.
st.title("📊 BizIntel AI Ultra — Gemini 1.5 Pro BI Copilot")
mode = st.sidebar.radio("Source", ["File", "SQL"], horizontal=True)
DF: pd.DataFrame = pd.DataFrame()  # stays empty until a source loads successfully

if mode == "File":
    upl = st.sidebar.file_uploader(
        "Upload CSV / Excel / Parquet",
        ["csv", "xls", "xlsx", "parquet"],
        help="≤2 GB",
    )
    sample = st.sidebar.checkbox("Load sample only (≤ 5 M rows)")
    if upl:
        try:
            DF = read_file(upl, sample)
        except Exception as exc:
            # Show a readable message for unparsable uploads instead of a traceback.
            st.error(f"Could not read `{upl.name}`: {exc}")
            st.stop()
else:
    uri = st.sidebar.text_input("SQLAlchemy URI")
    if uri:
        try:
            tables = sql_tables(uri)
        except Exception as exc:
            # Bad URL, missing driver or unreachable server.
            st.error(f"Could not list tables: {exc}")
            st.stop()
        tbl = st.sidebar.selectbox("Table", tables)
        if tbl:
            try:
                DF = read_table(uri, tbl)
            except Exception as exc:
                st.error(f"Could not read table `{tbl}`: {exc}")
                st.stop()

if DF.empty:
    # Nothing loaded yet (or the source is empty): prompt the user and halt.
    st.info("⬅️ Load data to start.")
    st.stop()

st.success("✅ Data loaded")
st.dataframe(DF.head(), use_container_width=True)
|
79 |
+
|
80 |
+
# ─────────────────── 3 · QUICK STATS + GEMINI INSIGHT ────────
|
81 |
+
# Headline metrics: row count, column count and overall share of missing cells.
rows, cols = DF.shape
cells = rows * cols
# The ingestion step stops the script when DF is empty, so cells is normally
# positive; the guard only keeps this section safe if that check changes.
miss = DF.isna().sum().sum() / cells * 100 if cells else 0.0
c1, c2, c3 = st.columns(3)
c1.metric("Rows", f"{rows:,}")
c2.metric("Columns", cols)
c3.metric("Missing %", f"{miss:.1f}")

st.subheader("🧠 Gemini Insights")
with st.spinner("Gemini analysing…"):
    try:
        stats = DF.describe(include="all", datetime_is_numeric=True)
    except TypeError:
        # pandas 2.0 removed the datetime_is_numeric keyword.
        stats = DF.describe(include="all")
    summary = stats.round(2).to_json()
    try:
        st.markdown(ask_gemini(
            "You are a senior BI analyst. Give 5 concise insights and 3 action items for the dataset: " + summary
        ))
    except Exception as exc:
        # A failed Gemini call should not take down the rest of the dashboard.
        st.warning(f"Gemini insights unavailable: {exc}")
|
94 |
+
|
95 |
+
# ─────────────────── 4 · TIME-SERIES SELECTION ───────────────
|
96 |
+
# attempt datetime coercion
|
97 |
+
# Try to parse non-numeric, non-datetime columns as dates. Numeric columns are
# skipped on purpose: pd.to_datetime accepts plain numbers, so the previous
# loop turned every numeric column into a datetime and left no metric to pick.
for c in DF.columns:
    column = DF[c]
    if pd.api.types.is_datetime64_any_dtype(column) or pd.api.types.is_numeric_dtype(column):
        continue
    try:
        DF[c] = pd.to_datetime(column)
    except (ValueError, TypeError, OverflowError):
        # Not parseable as dates; leave the column as it is.
        continue

date_options = [c for c in DF.columns if pd.api.types.is_datetime64_any_dtype(DF[c])]
metric_options = list(DF.select_dtypes("number").columns)
if not date_options or not metric_options:
    # The selectors below would return None and the column lookup would fail.
    st.warning("Trend and forecast need at least one date column and one numeric column.")
    st.stop()

DATE_COL = st.selectbox("Date column", date_options)
METRIC_COL = st.selectbox("Numeric metric", metric_options)

# One value per timestamp: mean of the metric, in chronological order.
ts = (
    DF[[DATE_COL, METRIC_COL]].dropna()
    .groupby(DATE_COL)[METRIC_COL].mean().sort_index()
)
fig_ts = px.line(ts, title=f"{METRIC_COL} Trend", labels={"index": "Date", METRIC_COL: METRIC_COL})
st.plotly_chart(fig_ts, use_container_width=True)
|
113 |
|
114 |
+
# ─────────────────── 5 · FORECASTING ─────────────────────────
|
|
|
|
|
115 |
st.subheader("🔮 Forecast")
steps = st.slider("Horizon", 3, 365, 90)
# ARIMA order (p, d, q), user-tunable.
p = st.number_input("p", 0, 5, 1)
d = st.number_input("d", 0, 2, 1)
q = st.number_input("q", 0, 5, 1)

with st.spinner("Fitting ARIMA…"):
    try:
        model = ARIMA(ts, order=(int(p), int(d), int(q))).fit()
        # Future timestamps continue from the last observation; fall back to
        # daily steps when no regular frequency can be inferred.
        fut_idx = pd.date_range(
            ts.index[-1], periods=steps + 1, freq=pd.infer_freq(ts.index) or "D"
        )[1:]
        # Take the raw forecast values. Passing the forecast Series together
        # with a new index makes pandas align on labels, which can yield NaN
        # for every point when the two indexes differ.
        forecast = pd.Series(list(model.forecast(steps)), index=fut_idx, name="Forecast")
    except Exception as e:
        # Too few points, non-convergence, etc.: report it and halt the run.
        st.error(f"Model failed: {e}")
        st.stop()

fig_fc = px.line(pd.concat([ts, forecast], axis=1), title="Actual vs Forecast")
st.plotly_chart(fig_fc, use_container_width=True)
|
124 |
|
125 |
+
# ─────────────────── 6 · EDA EXPANDERS ───────────────────────
|
126 |
+
# NOTE(review): the heading emoji was destroyed by mis-encoding; the magnifier
# is a stand-in - confirm the intended glyph.
st.subheader("🔍 EDA Dashboard")
with st.expander("Histogram / Box"):
    # Offer every numeric column. The previous code passed the METRIC_COL
    # string as the options, so the choices were its individual characters.
    numeric_cols = list(DF.select_dtypes("number").columns)
    default_pos = numeric_cols.index(METRIC_COL) if METRIC_COL in numeric_cols else 0
    col = st.selectbox("Column", numeric_cols, index=default_pos, key="hist")
    st.plotly_chart(
        px.histogram(DF, x=col, marginal="box", template="plotly_dark"),
        use_container_width=True,
    )
with st.expander("Correlation heat-map"):
    # Pairwise correlation of the numeric columns.
    corr = DF.select_dtypes("number").corr()
    st.plotly_chart(
        px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation"),
        use_container_width=True,
    )
|
133 |
+
|
134 |
+
# ─────────────────── 7 · STRATEGY BRIEF DOWNLOAD ────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
# Static Markdown brief offered as a download. The text is fixed; it is not
# derived from the loaded data.
brief = (
    "# Strategy Brief\n"
    "* Clean missing timestamps.\n"
    "* Investigate strongest correlations for causal drivers.\n"
    "* Use forecast to guide inventory & staffing planning.\n"
    "* Review outliers weekly (>3σ).\n"
    "* Segment analysis by region & product for micro-actions."
)
st.download_button("⬇️ Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")
|