mgbam committed on
Commit
b5d6aaa
·
verified ·
1 Parent(s): ca74caa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -70
app.py CHANGED
@@ -1,12 +1,13 @@
1
- """app.py — BizIntel AI Ultra (Gemini-only, v3)
2
  A production-grade BI copilot with:
3
- • CSV / Excel / Parquet and live SQL ingestion
4
- • Memory-safe chunk loading (≥2 GB) & dtype auto-fix
5
- • Instant schema audit + Gemini-generated insights
6
- • Drill-down EDA (histogram, violin, scatter-matrix, heat-map)
7
- • Auto-detected datetime + user-tunable ARIMA forecasting
8
- • One-click strategy brief (Markdown)
9
  """
 
10
  from __future__ import annotations
11
  import os, io, tempfile
12
  from pathlib import Path
@@ -19,23 +20,22 @@ from statsmodels.tsa.arima.model import ARIMA
19
  from sqlalchemy import create_engine
20
  import google.generativeai as genai
21
 
22
- # ─────────────────── 0 · CONFIG & SECRETS ────────────────────
23
  API_KEY = st.secrets.get("GEMINI_APIKEY") or os.getenv("GEMINI_APIKEY")
24
  if not API_KEY:
25
- st.error("โŒย `GEMINI_APIKEY` missing โ€” add it in *Settings โ†’ Secrets* or env vars.")
26
  st.stop()
27
 
 
28
  genai.configure(api_key=API_KEY)
29
  GEM_MODEL = "gemini-1.5-pro-latest"
30
  TMP = Path(tempfile.gettempdir())
31
 
32
- st.set_page_config("BizIntelย AIย Ultra", "๐Ÿ“Š", "wide", initial_sidebar_state="expanded")
33
-
34
- # ─────────────────── 1 · UTILITY HELPERS ─────────────────────
35
  @st.cache_data(show_spinner=False)
36
  def read_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
37
  suf = Path(buf.name).suffix.lower()
38
- if suf in {".xls", ".xlsx"}: # Excel
39
  return pd.read_excel(buf, engine="openpyxl")
40
  if suf == ".parquet":
41
  return pd.read_parquet(buf)
@@ -53,92 +53,94 @@ def read_table(uri: str, tbl: str) -> pd.DataFrame:
53
  def ask_gemini(prompt: str) -> str:
54
  return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()
55
 
56
- # ─────────────────── 2 · DATA INGESTION ──────────────────────
57
- st.title("๐Ÿ“Š BizIntelย AIย Ultra โ€” Geminiย 1.5ย Pro BI Copilot")
58
- mode = st.sidebar.radio("Source", ["File", "SQL"], horizontal=True)
59
- DF: pd.DataFrame = pd.DataFrame()
60
 
61
- if mode == "File":
62
- upl = st.sidebar.file_uploader("Upload CSVย /ย Excelย /ย Parquet", ["csv","xls","xlsx","parquet"], help="โ‰ค2โ€ฏGB")
63
- sample = st.sidebar.checkbox("Load sample only (โ‰คโ€ฏ5โ€ฏM rows)")
64
  if upl:
65
- DF = read_file(upl, sample)
66
  else:
67
  uri = st.sidebar.text_input("SQLAlchemy URI")
68
  if uri:
69
- tbl = st.sidebar.selectbox("Table", sql_tables(uri))
70
  if tbl:
71
- DF = read_table(uri, tbl)
72
 
73
- if DF.empty:
74
- st.info("โฌ…๏ธย Load data to start.")
75
  st.stop()
76
 
77
- st.success("โœ…ย Data loaded")
78
- st.dataframe(DF.head(), use_container_width=True)
79
 
80
- # ─────────────────── 3 · QUICK STATS + GEMINI INSIGHT ────────
81
- rows, cols = DF.shape
82
- miss = DF.isna().sum().sum() / (rows*cols) * 100
83
- c1,c2,c3 = st.columns(3)
84
  c1.metric("Rows", f"{rows:,}")
85
  c2.metric("Columns", cols)
86
- c3.metric("Missingย %", f"{miss:.1f}")
87
 
88
  st.subheader("๐Ÿง  Gemini Insights")
89
- with st.spinner("Gemini analysingโ€ฆ"):
90
- summary = DF.describe(include="all", datetime_is_numeric=True).round(2).to_json()
91
  st.markdown(ask_gemini(
92
- "You are a senior BI analyst. Give 5 concise insights and 3 action items for the dataset: " + summary
93
  ))
94
 
95
- # ─────────────────── 4 · TIME-SERIES SELECTION ───────────────
96
- # attempt datetime coercion
97
- for c in DF.columns:
98
- if not pd.api.types.is_datetime64_any_dtype(DF[c]):
99
  try:
100
- DF[c] = pd.to_datetime(DF[c])
101
- except: # noqa: E722
102
- pass
103
 
104
- DATE_COL = st.selectbox("Date column", [c for c in DF.columns if pd.api.types.is_datetime64_any_dtype(DF[c])])
105
- METRIC_COL = st.selectbox("Numeric metric", [c for c in DF.select_dtypes("number").columns])
106
 
107
- ts = (
108
- DF[[DATE_COL, METRIC_COL]].dropna()
109
  .groupby(DATE_COL)[METRIC_COL].mean().sort_index()
110
  )
111
- fig_ts = px.line(ts, title=f"{METRIC_COL} Trend", labels={"index":"Date", METRIC_COL:METRIC_COL})
112
  st.plotly_chart(fig_ts, use_container_width=True)
113
 
114
- # ─────────────────── 5 · FORECASTING ─────────────────────────
115
  st.subheader("๐Ÿ”ฎ Forecast")
116
- steps = st.slider("Horizon", 3, 365, 90)
117
- p = st.number_input("p", 0,5,1); d = st.number_input("d",0,2,1); q = st.number_input("q",0,5,1)
118
- with st.spinner("Fitting ARIMAโ€ฆ"):
119
- model = ARIMA(ts, order=(p,d,q)).fit()
120
- fut_idx = pd.date_range(ts.index[-1], periods=steps+1, freq=pd.infer_freq(ts.index) or "D")[1:]
 
 
 
121
  forecast = pd.Series(model.forecast(steps), index=fut_idx)
122
- fig_fc = px.line(pd.concat([ts, forecast.rename("Forecast")], axis=1), title="Actual vs Forecast")
123
  st.plotly_chart(fig_fc, use_container_width=True)
124
 
125
- # ─────────────────── 6 · EDA EXPANDERS ───────────────────────
126
- st.subheader("๐Ÿ” EDA Dashboard")
127
- with st.expander("Histogram / Box"):
128
- col = st.selectbox("Column", METRIC_COL, key="hist")
129
- st.plotly_chart(px.histogram(DF, x=col, marginal="box", template="plotly_dark"), use_container_width=True)
130
- with st.expander("Correlation heatโ€‘map"):
131
- corr = DF.select_dtypes("number").corr()
132
- st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation"), use_container_width=True)
 
133
 
134
- # ─────────────────── 7 · STRATEGY BRIEF DOWNLOAD ────────────
135
  brief = (
136
  "# Strategy Brief\n"
137
- "* Clean missing timestamps.\n"
138
- "* Investigate strongest correlations for causal drivers.\n"
139
- "* Use forecast to guide inventory & staffing planning.\n"
140
- "* Review outliers weekly (>3ฯƒ).\n"
141
- "* Segment analysis by region & product for microโ€‘actions."
142
  )
143
- st.download_button("โฌ‡๏ธย Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")
144
-
 
1
+ """app.py — BizIntel AI Ultra (Gemini-only, v4)
2
  A production-grade BI copilot with:
3
+ • CSV / Excel / Parquet and SQL ingestion
4
+ • Smart sampling + memory-safe loading for large files
5
+ • Schema + missing-data audit with Gemini-generated insights
6
+ • Drill-down EDA (histogram, violin, scatter-matrix, heatmap)
7
+ • Auto-detected date column, tunable ARIMA forecasting
8
+ • One-click strategy brief download (Markdown)
9
  """
10
+
11
  from __future__ import annotations
12
  import os, io, tempfile
13
  from pathlib import Path
 
20
  from sqlalchemy import create_engine
21
  import google.generativeai as genai
22
 
23
+ # ───────────────────── 0 · CONFIGURATION ─────────────────────
24
  API_KEY = st.secrets.get("GEMINI_APIKEY") or os.getenv("GEMINI_APIKEY")
25
  if not API_KEY:
26
+ st.error("โŒ Missing `GEMINI_APIKEY` โ€” add it in Settings โ†’ Secrets or set env variable.")
27
  st.stop()
28
 
29
+ st.set_page_config("BizIntelย AIย Ultra", "๐Ÿ“Š", "wide", initial_sidebar_state="expanded")
30
  genai.configure(api_key=API_KEY)
31
  GEM_MODEL = "gemini-1.5-pro-latest"
32
  TMP = Path(tempfile.gettempdir())
33
 
34
+ # ───────────────────── 1 · UTILITY HELPERS ───────────────────
 
 
35
  @st.cache_data(show_spinner=False)
36
  def read_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
37
  suf = Path(buf.name).suffix.lower()
38
+ if suf in {".xls", ".xlsx"}:
39
  return pd.read_excel(buf, engine="openpyxl")
40
  if suf == ".parquet":
41
  return pd.read_parquet(buf)
 
53
  def ask_gemini(prompt: str) -> str:
54
  return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()
55
 
56
+ # ───────────────────── 2 · DATA INGESTION ────────────────────
57
+ st.title("๐Ÿ“Š BizIntel AI Ultra โ€” Gemini 1.5 Pro BI Copilot")
58
+ mode = st.sidebar.radio("Select Data Source", ["Upload File", "SQL Database"], horizontal=True)
59
+ df: pd.DataFrame = pd.DataFrame()
60
 
61
+ if mode == "Upload File":
62
+ upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", ["csv", "xls", "xlsx", "parquet"], help="โ‰ค2โ€ฏGB")
63
+ sample = st.sidebar.checkbox("Load sample (โ‰ค 5M rows)")
64
  if upl:
65
+ df = read_file(upl, sample)
66
  else:
67
  uri = st.sidebar.text_input("SQLAlchemy URI")
68
  if uri:
69
+ tbl = st.sidebar.selectbox("Choose Table", sql_tables(uri))
70
  if tbl:
71
+ df = read_table(uri, tbl)
72
 
73
+ if df.empty:
74
+ st.info("โฌ…๏ธ Load a dataset to get started.")
75
  st.stop()
76
 
77
+ st.success("โœ… Data loaded")
78
+ st.dataframe(df.head(), use_container_width=True)
79
 
80
+ # ───────────────────── 3 · SUMMARY + GEMINI ───────────────────
81
+ rows, cols = df.shape
82
+ miss_pct = df.isna().sum().sum() / (rows * cols) * 100
83
+ c1, c2, c3 = st.columns(3)
84
  c1.metric("Rows", f"{rows:,}")
85
  c2.metric("Columns", cols)
86
+ c3.metric("Missing %", f"{miss_pct:.1f}")
87
 
88
  st.subheader("๐Ÿง  Gemini Insights")
89
+ with st.spinner("Generating analysis..."):
90
+ summary = df.describe(include="all", datetime_is_numeric=True).round(2).to_json()
91
  st.markdown(ask_gemini(
92
+ "You are a senior BI analyst. List 5 key insights and 3 action items based on this dataset: " + summary
93
  ))
94
 
95
+ # ───────────────────── 4 · TIME SERIES SETUP ─────────────────
96
+ # try datetime coercion
97
+ for c in df.columns:
98
+ if not pd.api.types.is_datetime64_any_dtype(df[c]):
99
  try:
100
+ df[c] = pd.to_datetime(df[c])
101
+ except: pass
 
102
 
103
+ DATE_COL = st.selectbox("Date column", [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])])
104
+ METRIC_COL = st.selectbox("Numeric metric", [c for c in df.select_dtypes("number").columns])
105
 
106
+ series = (
107
+ df[[DATE_COL, METRIC_COL]].dropna()
108
  .groupby(DATE_COL)[METRIC_COL].mean().sort_index()
109
  )
110
+ fig_ts = px.line(series, title=f"{METRIC_COL} Trend", labels={"index": "Date", METRIC_COL: METRIC_COL})
111
  st.plotly_chart(fig_ts, use_container_width=True)
112
 
113
+ # ───────────────────── 5 · ARIMA FORECASTING ─────────────────
114
  st.subheader("๐Ÿ”ฎ Forecast")
115
+ steps = st.slider("Forecast Horizon", 3, 365, 90)
116
+ p = st.number_input("AR Order (p)", 0, 5, 1)
117
+ d = st.number_input("Diff Order (d)", 0, 2, 1)
118
+ q = st.number_input("MA Order (q)", 0, 5, 1)
119
+
120
+ with st.spinner("Training ARIMA model..."):
121
+ model = ARIMA(series, order=(p, d, q)).fit()
122
+ fut_idx = pd.date_range(series.index[-1], periods=steps + 1, freq=pd.infer_freq(series.index) or "D")[1:]
123
  forecast = pd.Series(model.forecast(steps), index=fut_idx)
124
+ fig_fc = px.line(pd.concat([series, forecast.rename("Forecast")], axis=1), title="Actual vs Forecast")
125
  st.plotly_chart(fig_fc, use_container_width=True)
126
 
127
+ # ───────────────────── 6 · EDA TOOLS ─────────────────────────
128
+ st.subheader("๐Ÿ” Exploratory Data Dashboard")
129
+ with st.expander("Histogram + Box"):
130
+ col = st.selectbox("Metric column", METRIC_COL, key="hist")
131
+ st.plotly_chart(px.histogram(df, x=col, marginal="box", template="plotly_dark"), use_container_width=True)
132
+
133
+ with st.expander("Correlation Heatmap"):
134
+ corr = df.select_dtypes("number").corr()
135
+ st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation Matrix"), use_container_width=True)
136
 
137
+ # ───────────────────── 7 · STRATEGY DOWNLOAD ────────────────
138
  brief = (
139
  "# Strategy Brief\n"
140
+ "* Clean missing date values for better time modeling.\n"
141
+ "* Investigate top correlations for potential drivers.\n"
142
+ "* Leverage forecast for inventory and staff planning.\n"
143
+ "* Watch for outliers >3ฯƒ weekly.\n"
144
+ "* Segment by region and product for precise actions."
145
  )
146
+ st.download_button("โฌ‡๏ธ Download Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")