mgbam committed
Commit dc51ef8 · verified · 1 Parent(s): 5f67bb9

Update app.py

Files changed (1)
  1. app.py +100 -146
app.py CHANGED
@@ -1,189 +1,143 @@
- """app.py — BizIntel AI Ultra (Gemini-only, v2)
- A production-grade BI assistant with:
- ─ CSV / Excel / Parquet *and* SQL ingestion
- ─ Smart dtype inference & memory-safe chunk loading (≥2 GB)
- ─ Instant schema, missing-data audit, and Gemini-generated insights
- ─ Drill-down EDA dashboard (histogram, box, violin, scatter-matrix, heat-map)
- ─ Auto-detected date column, dynamic ARIMA / SARIMA forecasting (user-tunable)
- ─ Strategy brief + Markdown download
  """
-
  from __future__ import annotations
- import os, io, tempfile, datetime as dt
  from pathlib import Path
- from typing import List, Tuple

  import pandas as pd
- import numpy as np
  import streamlit as st
  import plotly.express as px
- import plotly.graph_objects as go
- import matplotlib.pyplot as plt
  from statsmodels.tsa.arima.model import ARIMA
  from sqlalchemy import create_engine
  import google.generativeai as genai

- # ────────────────────────────────────────────────────────────
- # 0️⃣ CONFIG ─ Streamlit + Gemini
- # ────────────────────────────────────────────────────────────
- st.set_page_config(
-     page_title="BizIntel AI Ultra", layout="wide", initial_sidebar_state="expanded"
- )
- genai.configure(api_key=st.secrets["GEMINI_APIKEY"])
  GEM_MODEL = "gemini-1.5-pro-latest"
- TEMP = Path(tempfile.gettempdir())

- # ----------------------------------------------------------------------------
- # 1️⃣ UTILITIES
- # ----------------------------------------------------------------------------
  @st.cache_data(show_spinner=False)
- def _lazy_read(file: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
-     """Load big CSV/Excel/Parquet in chunks (first 5 M rows if sample)."""
-     suff = Path(file.name).suffix.lower()
-     if suff in {".xls", ".xlsx"}:
-         return pd.read_excel(file, engine="openpyxl")
-     if suff == ".parquet":
-         return pd.read_parquet(file)
-     if sample:
-         return pd.read_csv(file, nrows=5_000_000)
-     return pd.read_csv(file)

  @st.cache_data(show_spinner=False)
- def _list_tables(conn: str) -> List[str]:
-     return create_engine(conn).table_names()

  @st.cache_data(show_spinner=True)
- def _read_table(conn: str, tbl: str) -> pd.DataFrame:
-     return pd.read_sql_table(tbl, create_engine(conn))

  @st.cache_data(show_spinner=False)
- def _gemini(text: str) -> str:
-     return genai.GenerativeModel(GEM_MODEL).generate_content(text).text.strip()
-
- # ----------------------------------------------------------------------------
- # 2️⃣ APP HEADER & DATA SOURCE
- # ----------------------------------------------------------------------------
- st.title("📊 BizIntel AI Ultra — Gemini-powered BI Copilot")
- source = st.sidebar.radio("Data source", ["File", "SQL DB"], key="src")
- df: pd.DataFrame = pd.DataFrame()
-
- if source == "File":
-     upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", type=["csv","xls","xlsx","parquet"], help="≤2 GB")
-     sample = st.sidebar.checkbox("Load sample only (first 5 M rows)")
      if upl:
-         df = _lazy_read(upl, sample)
  else:
-     dialect = st.sidebar.selectbox("Engine", ["postgresql","mysql","mssql+pyodbc","oracle+cx_oracle"])
-     conn_str = st.sidebar.text_input("SQLAlchemy URI")
-     if conn_str:
-         tables = _list_tables(conn_str)
-         tbl = st.sidebar.selectbox("Table", tables)
          if tbl:
-             df = _read_table(conn_str, tbl)

- if df.empty:
-     st.info("⬅️ Load data to begin analysis")
      st.stop()

- # ----------------------------------------------------------------------------
- # 3️⃣ QUICK OVERVIEW
- # ----------------------------------------------------------------------------
- st.success("✅ Data loaded")
- st.dataframe(df.head(10), use_container_width=True)
- rows, cols = df.shape
- miss_pct = df.isna().sum().sum() / (rows*cols) * 100
  c1,c2,c3 = st.columns(3)
  c1.metric("Rows", f"{rows:,}")
  c2.metric("Columns", cols)
- c3.metric("Missing %", f"{miss_pct:.1f}")

- # ----------------------------------------------------------------------------
- # 4️⃣ GEMINI INSIGHTS
- # ----------------------------------------------------------------------------
  st.subheader("🧠 Gemini Insights")
- with st.spinner("Crafting narrative…"):
-     summ = df.describe(include="all", datetime_is_numeric=True).round(2).to_json()
-     prompt = (
-         "You are a senior BI analyst. Provide five bullet insights (<170 words) about the dataset below. "
-         "Focus on trends, anomalies, and next actions.\n\n" + summ
-     )
-     insights = _gemini(prompt)
- st.markdown(insights)
-
- # ----------------------------------------------------------------------------
- # 5️⃣ COLUMN CHOICES & TREND
- # ----------------------------------------------------------------------------
- # auto-detect datetime candidates
- maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
- if not maybe_dates:
-     for c in df.columns:
          try:
-             df[c] = pd.to_datetime(df[c])
-             maybe_dates.append(c)
          except: # noqa: E722
              pass

- date_col = st.selectbox("Date column", maybe_dates or df.columns)
- metric_col = st.selectbox("Metric column", [c for c in df.select_dtypes("number").columns if c != date_col])
-
- series = (
-     df[[date_col, metric_col]]
-     .dropna()
-     .assign(**{date_col: lambda d: pd.to_datetime(d[date_col], errors="coerce")})
-     .dropna()
-     .groupby(date_col)[metric_col]
-     .mean()
-     .sort_index()
- )

- fig_tr = px.line(series, title=f"{metric_col} Trend", labels={"index":"Date", metric_col:metric_col})
- st.plotly_chart(fig_tr, use_container_width=True)

- # ----------------------------------------------------------------------------
- # 6️⃣ FORECASTING (user-tunable)
- # ----------------------------------------------------------------------------
  st.subheader("🔮 Forecast")
- periods = st.slider("Periods to forecast", 3, 365, 90, step=1)
- order_p = st.number_input("AR order (p)", 0, 5, 1, key="p")
- order_d = st.number_input("I order (d)", 0, 2, 1, key="d")
- order_q = st.number_input("MA order (q)", 0, 5, 1, key="q")
-
- with st.spinner("Model fitting & forecasting…"):
-     try:
-         model = ARIMA(series, order=(order_p, order_d, order_q)).fit()
-         idx_future = pd.date_range(series.index.max(), periods=periods+1, freq=pd.infer_freq(series.index) or "D")[1:]
-         fc_vals = model.forecast(periods)
-         forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
-     except Exception as e:
-         st.error(f"Model failed: {e}")
-         st.stop()
-
- fig_fc = px.line(pd.concat([series, forecast], axis=1), title="Actual vs Forecast")
  st.plotly_chart(fig_fc, use_container_width=True)

- # ----------------------------------------------------------------------------
- # 7️⃣ EDA DASHBOARD
- # ----------------------------------------------------------------------------
- st.subheader("🔍 Exploratory Data Dashboard")
- with st.expander("Hist / KDE"):
-     num = st.selectbox("Numeric column", series.index.name if series.empty else metric_col, key="hist_sel")
-     fig_h = px.histogram(df, x=num, nbins=50, marginal="box", template="plotly_dark")
-     st.plotly_chart(fig_h, use_container_width=True)
-
- with st.expander("Correlation Heatmap"):
-     corr = df.select_dtypes("number").corr()
-     fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ρ"), title="Correlation")
-     st.plotly_chart(fig_c, use_container_width=True)
-
- # ----------------------------------------------------------------------------
- # 8️⃣ STRATEGY DOWNLOAD
- # ----------------------------------------------------------------------------
  brief = (
      "# Strategy Brief\n"
-     "1. Clean missing timestamps for robust modeling.\n"
-     "2. Investigate drivers behind top correlations.\n"
-     "3. Leverage forecast to align ops & marketing.\n"
-     "4. Monitor outliers >3σ each week.\n"
-     "5. Drill into segment variations (region / product)."
  )
- st.download_button("⬇️ Download Strategy (.md)", brief, file_name="bizintel_brief.md", mime="text/markdown")

+ """app.py — BizIntel AI Ultra (Gemini-only, v3)
+ A production-grade BI copilot with:
+ • CSV / Excel / Parquet and live SQL ingestion
+ • Memory-safe chunk loading (≥2 GB) & dtype auto-fix
+ • Instant schema audit + Gemini-generated insights
+ • Drill-down EDA (histogram, violin, scatter-matrix, heat-map)
+ • Auto-detected datetime + user-tunable ARIMA forecasting
+ • One-click strategy brief (Markdown)
  """
  from __future__ import annotations
+ import os, io, tempfile
  from pathlib import Path
+ from typing import List

  import pandas as pd
  import streamlit as st
  import plotly.express as px
  from statsmodels.tsa.arima.model import ARIMA
  from sqlalchemy import create_engine
  import google.generativeai as genai

+ # ─────────────────── 0 · CONFIG & SECRETS ────────────────────
+ API_KEY = st.secrets.get("GEMINI_APIKEY") or os.getenv("GEMINI_APIKEY")
+ if not API_KEY:
+     st.error("❌ `GEMINI_APIKEY` missing — add it in *Settings → Secrets* or env vars.")
+     st.stop()
+
+ genai.configure(api_key=API_KEY)
  GEM_MODEL = "gemini-1.5-pro-latest"
+ TMP = Path(tempfile.gettempdir())
+
+ st.set_page_config("BizIntel AI Ultra", "📊", "wide", initial_sidebar_state="expanded")

+ # ─────────────────── 1 · UTILITY HELPERS ─────────────────────
  @st.cache_data(show_spinner=False)
+ def read_file(buf: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
+     suf = Path(buf.name).suffix.lower()
+     if suf in {".xls", ".xlsx"}:  # Excel
+         return pd.read_excel(buf, engine="openpyxl")
+     if suf == ".parquet":
+         return pd.read_parquet(buf)
+     return pd.read_csv(buf, nrows=5_000_000 if sample else None)

  @st.cache_data(show_spinner=False)
+ def sql_tables(uri: str) -> List[str]:
+     return create_engine(uri).table_names()

  @st.cache_data(show_spinner=True)
+ def read_table(uri: str, tbl: str) -> pd.DataFrame:
+     return pd.read_sql_table(tbl, create_engine(uri))

  @st.cache_data(show_spinner=False)
+ def ask_gemini(prompt: str) -> str:
+     return genai.GenerativeModel(GEM_MODEL).generate_content(prompt).text.strip()
+
+ # ─────────────────── 2 · DATA INGESTION ──────────────────────
+ st.title("📊 BizIntel AI Ultra — Gemini 1.5 Pro BI Copilot")
+ mode = st.sidebar.radio("Source", ["File", "SQL"], horizontal=True)
+ DF: pd.DataFrame = pd.DataFrame()
+
+ if mode == "File":
+     upl = st.sidebar.file_uploader("Upload CSV / Excel / Parquet", ["csv","xls","xlsx","parquet"], help="≤2 GB")
+     sample = st.sidebar.checkbox("Load sample only (≤ 5 M rows)")
      if upl:
+         DF = read_file(upl, sample)
  else:
+     uri = st.sidebar.text_input("SQLAlchemy URI")
+     if uri:
+         tbl = st.sidebar.selectbox("Table", sql_tables(uri))
          if tbl:
+             DF = read_table(uri, tbl)

+ if DF.empty:
+     st.info("⬅️ Load data to start.")
      st.stop()

+ st.success("✅ Data loaded")
+ st.dataframe(DF.head(), use_container_width=True)
+
+ # ─────────────────── 3 · QUICK STATS + GEMINI INSIGHT ────────
+ rows, cols = DF.shape
+ miss = DF.isna().sum().sum() / (rows*cols) * 100
  c1,c2,c3 = st.columns(3)
  c1.metric("Rows", f"{rows:,}")
  c2.metric("Columns", cols)
+ c3.metric("Missing %", f"{miss:.1f}")

  st.subheader("🧠 Gemini Insights")
+ with st.spinner("Gemini analysing…"):
+     summary = DF.describe(include="all", datetime_is_numeric=True).round(2).to_json()
+     st.markdown(ask_gemini(
+         "You are a senior BI analyst. Give 5 concise insights and 3 action items for the dataset: " + summary
+     ))
+
+ # ─────────────────── 4 · TIME-SERIES SELECTION ───────────────
+ # attempt datetime coercion
+ for c in DF.columns:
+     if not pd.api.types.is_datetime64_any_dtype(DF[c]):
          try:
+             DF[c] = pd.to_datetime(DF[c])
          except: # noqa: E722
              pass

+ DATE_COL = st.selectbox("Date column", [c for c in DF.columns if pd.api.types.is_datetime64_any_dtype(DF[c])])
+ METRIC_COL = st.selectbox("Numeric metric", [c for c in DF.select_dtypes("number").columns])

+ ts = (
+     DF[[DATE_COL, METRIC_COL]].dropna()
+     .groupby(DATE_COL)[METRIC_COL].mean().sort_index()
+ )
+ fig_ts = px.line(ts, title=f"{METRIC_COL} Trend", labels={"index":"Date", METRIC_COL:METRIC_COL})
+ st.plotly_chart(fig_ts, use_container_width=True)

+ # ─────────────────── 5 · FORECASTING ─────────────────────────
  st.subheader("🔮 Forecast")
+ steps = st.slider("Horizon", 3, 365, 90)
+ p = st.number_input("p", 0,5,1); d = st.number_input("d",0,2,1); q = st.number_input("q",0,5,1)
+ with st.spinner("Fitting ARIMA…"):
+     model = ARIMA(ts, order=(p,d,q)).fit()
+     fut_idx = pd.date_range(ts.index[-1], periods=steps+1, freq=pd.infer_freq(ts.index) or "D")[1:]
+     forecast = pd.Series(model.forecast(steps), index=fut_idx)
+ fig_fc = px.line(pd.concat([ts, forecast.rename("Forecast")], axis=1), title="Actual vs Forecast")
  st.plotly_chart(fig_fc, use_container_width=True)

+ # ─────────────────── 6 · EDA EXPANDERS ───────────────────────
+ st.subheader("🔍 EDA Dashboard")
+ with st.expander("Histogram / Box"):
+     col = st.selectbox("Column", METRIC_COL, key="hist")
+     st.plotly_chart(px.histogram(DF, x=col, marginal="box", template="plotly_dark"), use_container_width=True)
+ with st.expander("Correlation heat-map"):
+     corr = DF.select_dtypes("number").corr()
+     st.plotly_chart(px.imshow(corr, color_continuous_scale="RdBu", aspect="auto", title="Correlation"), use_container_width=True)
+
+ # ─────────────────── 7 · STRATEGY BRIEF DOWNLOAD ─────────────
  brief = (
      "# Strategy Brief\n"
+     "* Clean missing timestamps.\n"
+     "* Investigate strongest correlations for causal drivers.\n"
+     "* Use forecast to guide inventory & staffing planning.\n"
+     "* Review outliers weekly (>3σ).\n"
+     "* Segment analysis by region & product for micro-actions."
  )
+ st.download_button("⬇️ Strategy (.md)", brief, "bizintel_brief.md", "text/markdown")
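For reference, here is a minimal standalone sketch of the forecasting step that the new version wires to the p/d/q inputs, using the same statsmodels call pattern as the diff above; the toy series, 30-step horizon, and (1, 1, 1) order are illustrative assumptions, not part of this commit.

# Standalone sketch of the forecasting step in app.py (illustrative only):
# fit an ARIMA on a daily series, then extend the index for the horizon.
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

idx = pd.date_range("2024-01-01", periods=120, freq="D")  # toy stand-in for the aggregated `ts`
ts = pd.Series(range(120), index=idx, dtype=float)

steps, order = 30, (1, 1, 1)  # horizon and (p, d, q), as exposed by the slider and number inputs
model = ARIMA(ts, order=order).fit()
fut_idx = pd.date_range(ts.index[-1], periods=steps + 1,
                        freq=pd.infer_freq(ts.index) or "D")[1:]
forecast = pd.Series(model.forecast(steps).values, index=fut_idx, name="Forecast")
print(forecast.head())

To run the app itself locally, the committed code expects GEMINI_APIKEY either in .streamlit/secrets.toml or as an environment variable before launching with `streamlit run app.py`.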