mgbam committed on
Commit
5f67bb9
·
verified ·
1 Parent(s): 22cd17a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -188
app.py CHANGED
@@ -1,201 +1,189 @@
1
- import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
 
4
  import plotly.express as px
 
5
  import matplotlib.pyplot as plt
6
- from io import BytesIO
7
- from sqlalchemy import create_engine
8
  from statsmodels.tsa.arima.model import ARIMA
 
 
9
 
10
-
11
- # โ”€โ”€ CONFIG โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
12
  st.set_page_config(
13
- page_title="BizIntel AI Ultra",
14
- layout="wide",
15
- initial_sidebar_state="expanded"
16
  )
17
-
18
- # You must set OPENAI_API_KEY in your Streamlit Secrets
19
- openai.api_key = st.secrets["OPENAI_API_KEY"]
20
-
21
- # โ”€โ”€ CACHEABLE HELPERS โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
22
- @st.cache_data
23
- def load_uploaded_file(uploaded):
24
- """Load CSV or Excel from memory into a DataFrame."""
25
- try:
26
- if uploaded.name.lower().endswith((".xls", ".xlsx")):
27
- return pd.read_excel(uploaded, engine="openpyxl")
28
- else:
29
- return pd.read_csv(uploaded)
30
- except Exception as e:
31
- st.error(f"โš ๏ธ File parsing failed: {e}")
32
- return pd.DataFrame()
33
-
34
- @st.cache_data
35
- def list_db_tables(conn_str):
36
- engine = create_engine(conn_str)
37
- return engine.table_names()
38
-
39
- @st.cache_data
40
- def fetch_db_table(conn_str, table):
41
- engine = create_engine(conn_str)
42
- return pd.read_sql_table(table, engine)
43
-
44
- # โ”€โ”€ DATA NARRATIVE VIA OPENAI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
45
- def generate_data_narrative(df: pd.DataFrame) -> str:
46
- """Send a summary of df to OpenAI and return a polished narrative."""
47
- summary = df.describe(include="all").transpose().round(2).to_dict()
48
- prompt = (
49
- "You are a world-class data analyst. "
50
- "Below is a JSON summary of a dataset. "
51
- "Write a concise, professional narrative highlighting the top 5 business-critical insights, "
52
- "in bullet format:\n\n"
53
- f"{summary}\n\n"
54
- )
55
- resp = openai.ChatCompletion.create(
56
- model="gpt-4o-mini", # or "gpt-4o", "gpt-4o-mini-high"
57
- messages=[{"role":"user","content":prompt}],
58
- temperature=0.3,
59
- )
60
- return resp.choices[0].message.content.strip()
61
-
62
- # โ”€โ”€ APP โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
63
- st.title("๐Ÿ“Š BizIntel AI Ultra")
64
-
65
- # 1) Choose data source
66
- source = st.radio("Select data source", ["Upload CSV / Excel", "Connect to SQL Database"])
67
-
68
- df = pd.DataFrame()
69
- if source == "Upload CSV / Excel":
70
- uploaded = st.file_uploader(
71
- "Drag & drop file here (โ‰ค500 MB) โ€ข .csv, .xls, .xlsx",
72
- type=["csv","xls","xlsx"]
73
- )
74
- if uploaded:
75
- with st.spinner("Loading fileโ€ฆ"):
76
- df = load_uploaded_file(uploaded)
77
-
78
  else:
79
- engine = st.selectbox("DB engine", ["postgresql","mysql","mssql+pyodbc","oracle+cx_oracle"])
80
- conn_str = st.text_input("Connection string", placeholder="dialect+driver://user:pass@host/db")
81
  if conn_str:
82
- tables = list_db_tables(conn_str)
83
- table = st.selectbox("Choose table", tables)
84
- if table:
85
- with st.spinner(f"Fetching `{table}`โ€ฆ"):
86
- df = fetch_db_table(conn_str, table)
87
-
88
- # 2) If we have dataโ€ฆ
89
- if not df.empty:
90
- st.success("โœ… Data loaded!")
91
- st.markdown("---")
92
-
93
- # 2a) Preview & summary metrics
94
- st.subheader("๐Ÿ—‚ Data Preview & Overview")
95
- st.dataframe(df.head(5), use_container_width=True)
96
-
97
- r, c = df.shape
98
- missing_pct = (df.isna().sum().sum() / (r*c) * 100).round(1)
99
- col1, col2, col3 = st.columns(3)
100
- col1.metric("Rows", f"{r:,}")
101
- col2.metric("Cols", f"{c:,}")
102
- col3.metric("Missing %", f"{missing_pct}%")
103
- st.markdown("---")
104
-
105
- # 2b) Automated data narrative
106
- st.subheader("๐Ÿ“ Data Narrative")
107
- with st.spinner("Generating insightsโ€ฆ"):
108
- narrative = generate_data_narrative(df)
109
- st.markdown(narrative)
110
-
111
- # 2c) Optional EDA visuals
112
- st.subheader("๐Ÿ”Ž Exploratory Visuals")
113
- num_cols = df.select_dtypes("number").columns.tolist()
114
- if st.checkbox("Show histogram"):
115
- col = st.selectbox("Histogram column", num_cols, key="hist")
116
- fig = px.histogram(df, x=col, nbins=30, title=f"Histogram of {col}")
117
- st.plotly_chart(fig, use_container_width=True)
118
-
119
- if st.checkbox("Show scatter matrix"):
120
- dims = num_cols[:6]
121
- fig = px.scatter_matrix(df[dims], dimensions=dims, title="Scatter Matrix")
122
- st.plotly_chart(fig, use_container_width=True)
123
-
124
- if st.checkbox("Show correlation heatmap"):
125
- corr = df[num_cols].corr()
126
- fig, ax = plt.subplots(figsize=(6,5))
127
- im = ax.imshow(corr, cmap="RdBu", vmin=-1, vmax=1)
128
- plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
129
- plt.yticks(range(len(corr)), corr.columns)
130
- plt.colorbar(im, ax=ax)
131
- st.pyplot(fig)
132
-
133
- # 3) Trend & forecast
134
- st.markdown("---")
135
- st.subheader("๐Ÿ“ˆ Time-Series Trend & 90-Day Forecast")
136
-
137
- # pick columns
138
- dt_opts = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or df[col].dtype == "object"]
139
- date_col = st.selectbox("Date column", dt_opts)
140
- df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
141
- metric_col = st.selectbox("Metric column", num_cols)
142
-
143
- ts = (
144
- df[[date_col, metric_col]]
145
- .dropna()
146
- .set_index(date_col)
147
- .sort_index()
148
- .loc[~df.index.duplicated(keep="first")]
149
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- # plot trend
152
- fig_trend = px.line(ts, y=metric_col, title=f"{metric_col} over Time", labels={"index":"Date"})
153
- st.plotly_chart(fig_trend, use_container_width=True)
154
 
155
- # forecast
156
- with st.spinner("Running ARIMAโ€ฆ"):
157
- try:
158
- model = ARIMA(ts, order=(1,1,1)).fit()
159
- future_idx = pd.date_range(start=ts.index.max(), periods=91, freq="D")[1:]
160
- pred = model.get_forecast(90).predicted_mean
161
- df_pred = pd.Series(pred.values, index=future_idx, name="Forecast")
162
-
163
- combo = pd.concat([ts[metric_col], df_pred], axis=1)
164
- fig_fc = px.line(
165
- combo,
166
- labels={metric_col:metric_col, "Forecast":"Forecast"},
167
- title=f"{metric_col} & 90-Day Forecast"
168
- )
169
- st.plotly_chart(fig_fc, use_container_width=True)
170
-
171
- except Exception as e:
172
- st.error(f"Forecast failed: {e}")
173
-
174
- # 4) Strategy download
175
- st.markdown("---")
176
- st.subheader("๐Ÿš€ Actionable Strategy Brief")
177
- strategy_md = """
178
- # BizIntel AI Ultra โ€“ Strategy Brief
179
-
180
- **1. Data Quality First**
181
- Ensure all dates are parsed correctlyโ€”critical for any time-series modeling.
182
-
183
- **2. Trend & Seasonality**
184
- Investigate the underlying patterns and adjust your operations calendar.
185
-
186
- **3. Outlier Management**
187
- Flag and validate extreme observations to avoid skewed forecasts.
188
-
189
- **4. Segment-Level Insights**
190
- Drill into regions or product lines for targeted interventions.
191
-
192
- **5. Predict & Act**
193
- Leverage your 90-day projections for inventory, staffing, and marketing plans.
194
- """.strip()
195
-
196
- st.download_button(
197
- "๐Ÿ“ฅ Download Strategy (.md)",
198
- data=strategy_md,
199
- file_name="bizintel_strategy.md",
200
- mime="text/markdown"
201
- )
 
 
1
+ """app.py — BizIntel AI Ultra (Gemini-only, v2)
2
+ A production-grade BI assistant with:
3
+ ─ CSV / Excel / Parquet *and* SQL ingestion
4
+ ─ Smart dtype inference & memory-safe chunk loading (≥2 GB)
5
+ ─ Instant schema, missing-data audit, and Gemini-generated insights
6
+ ─ Drill-down EDA dashboard (histogram, box, violin, scatter-matrix, heat-map)
7
+ ─ Auto-detected date column, dynamic ARIMA / SARIMA forecasting (user-tunable)
8
+ ─ Strategy brief + Markdown download
9
+ """
10
+
11
+ from __future__ import annotations
12
+ import os, io, tempfile, datetime as dt
13
+ from pathlib import Path
14
+ from typing import List, Tuple
15
+
16
  import pandas as pd
17
  import numpy as np
18
+ import streamlit as st
19
  import plotly.express as px
20
+ import plotly.graph_objects as go
21
  import matplotlib.pyplot as plt
 
 
22
  from statsmodels.tsa.arima.model import ARIMA
23
+ from sqlalchemy import create_engine
24
+ import google.generativeai as genai
25
 
26
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
27
+ # 0๏ธโƒฃ CONFIG โ”€ Streamlit + Gemini
28
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
29
# Plain constants first: the Gemini model identifier and the system temp dir.
GEM_MODEL = "gemini-1.5-pro-latest"
TEMP = Path(tempfile.gettempdir())

# Page chrome: browser-tab title, wide layout, sidebar expanded on load.
st.set_page_config(
    page_title="BizIntelย AIย Ultra",
    layout="wide",
    initial_sidebar_state="expanded",
)

# The Gemini API key is read from Streamlit secrets under "GEMINI_APIKEY".
genai.configure(api_key=st.secrets["GEMINI_APIKEY"])
35
+
36
+ # ----------------------------------------------------------------------------
37
+ # 1๏ธโƒฃ UTILITIES
38
+ # ----------------------------------------------------------------------------
39
@st.cache_data(show_spinner=False)
def _lazy_read(file: io.BufferedReader, sample: bool = False) -> pd.DataFrame:
    """Read an uploaded CSV, Excel or Parquet file into a DataFrame.

    The format is chosen from the suffix of ``file.name``; anything that is
    not ``.xls``/``.xlsx``/``.parquet`` is parsed as CSV. The file is read in
    one pass (no chunked streaming).

    Args:
        file: File-like object exposing a ``name`` attribute.
        sample: When true, keep at most the first 5,000,000 rows.

    Returns:
        The parsed DataFrame.
    """
    row_cap = 5_000_000 if sample else None
    suff = Path(file.name).suffix.lower()

    if suff in {".xls", ".xlsx"}:
        # openpyxl only understands the XML-based .xlsx format; for legacy
        # .xls let pandas pick a suitable engine instead of failing outright.
        engine = "openpyxl" if suff == ".xlsx" else None
        return pd.read_excel(file, engine=engine, nrows=row_cap)

    if suff == ".parquet":
        frame = pd.read_parquet(file)
        # read_parquet has no row limit, so the cap is applied after loading.
        return frame if row_cap is None else frame.head(row_cap)

    return pd.read_csv(file, nrows=row_cap)
50
+
51
@st.cache_data(show_spinner=False)
def _list_tables(conn: str) -> List[str]:
    """Return the table names reachable through the SQLAlchemy URI ``conn``.

    ``Engine.table_names()`` was deprecated in SQLAlchemy 1.4 and removed in
    2.0, so the names are obtained from the runtime inspector instead.
    """
    # Local import: the file-level import only brings in ``create_engine``.
    from sqlalchemy import inspect

    engine = create_engine(conn)
    try:
        return inspect(engine).get_table_names()
    finally:
        # The engine is created per call and never reused; free its pool.
        engine.dispose()
54
+
55
@st.cache_data(show_spinner=True)
def _read_table(conn: str, tbl: str) -> pd.DataFrame:
    """Load the full contents of table ``tbl`` via the SQLAlchemy URI ``conn``."""
    engine = create_engine(conn)
    frame = pd.read_sql_table(tbl, engine)
    return frame
58
+
59
@st.cache_data(show_spinner=False)
def _gemini(text: str) -> str:
    """Send ``text`` to the ``GEM_MODEL`` Gemini model and return its reply, stripped."""
    model = genai.GenerativeModel(GEM_MODEL)
    reply = model.generate_content(text)
    return reply.text.strip()
62
+
63
+ # ----------------------------------------------------------------------------
64
+ # 2๏ธโƒฃ APP HEADER & DATA SOURCE
65
+ # ----------------------------------------------------------------------------
66
st.title("๐Ÿ“Š BizIntelย AIย Ultra โ€” Geminiโ€‘powered BI Copilot")
source = st.sidebar.radio("Data source", ["File", "SQLย DB"], key="src")
df: pd.DataFrame = pd.DataFrame()

# Loading is the one step that depends entirely on user-supplied input, so any
# failure is reported in the page and the run is halted instead of surfacing a
# raw traceback. (Streamlit's stop/rerun signals are not ``Exception``
# subclasses, so the broad handlers below do not intercept them.)
if source == "File":
    upl = st.sidebar.file_uploader(
        "Upload CSV / Excel / Parquet",
        type=["csv", "xls", "xlsx", "parquet"],
        help="โ‰ค2โ€ฏGB",
    )
    sample = st.sidebar.checkbox("Load sample only (first 5โ€ฏM rows)")
    if upl:
        try:
            df = _lazy_read(upl, sample)
        except Exception as exc:  # UI boundary: show the parser error and halt
            st.error(f"File parsing failed: {exc}")
            st.stop()
else:
    # NOTE(review): the selected dialect is displayed but never combined with
    # the URI below; the user must type the full SQLAlchemy URL themselves.
    dialect = st.sidebar.selectbox(
        "Engine", ["postgresql", "mysql", "mssql+pyodbc", "oracle+cx_oracle"]
    )
    conn_str = st.sidebar.text_input("SQLAlchemy URI")
    if conn_str:
        try:
            tables = _list_tables(conn_str)
        except Exception as exc:  # bad URI, missing driver, unreachable host, ...
            st.error(f"Database connection failed: {exc}")
            st.stop()
        tbl = st.sidebar.selectbox("Table", tables)
        if tbl:
            try:
                df = _read_table(conn_str, tbl)
            except Exception as exc:
                st.error(f"Could not read table `{tbl}`: {exc}")
                st.stop()

# Nothing below makes sense without data; prompt the user and end this run.
if df.empty:
    st.info("โฌ…๏ธ Load data to begin analysis")
    st.stop()
87
+
88
+ # ----------------------------------------------------------------------------
89
+ # 3๏ธโƒฃ QUICK OVERVIEW
90
+ # ----------------------------------------------------------------------------
91
st.success("โœ… Data loaded")
st.dataframe(df.head(10), use_container_width=True)

# Headline numbers: frame dimensions and the share of missing cells (percent).
rows, cols = df.shape
missing_cells = df.isna().sum().sum()
miss_pct = missing_cells / (rows * cols) * 100

overview = (
    ("Rows", f"{rows:,}"),
    ("Columns", cols),
    ("Missingย %", f"{miss_pct:.1f}"),
)
# One metric tile per entry, laid out left to right in three equal columns.
for slot, (label, value) in zip(st.columns(3), overview):
    slot.metric(label, value)
99
+
100
+ # ----------------------------------------------------------------------------
101
+ # 4๏ธโƒฃ GEMINI INSIGHTS
102
+ # ----------------------------------------------------------------------------
103
st.subheader("๐Ÿง  Gemini Insights")
insights = None
with st.spinner("Crafting narrativeโ€ฆ"):
    try:
        stats = df.describe(include="all", datetime_is_numeric=True)
    except TypeError:
        # pandas >= 2.0 removed ``datetime_is_numeric``; datetimes are always
        # summarised numerically there, so the plain call is equivalent.
        stats = df.describe(include="all")
    summ = stats.round(2).to_json()
    prompt = (
        "You are a senior BI analyst. Provide five bullet insights (<170 words) about the dataset below. "
        "Focus on trends, anomalies, and next actions.\n\n" + summ
    )
    try:
        insights = _gemini(prompt)
    except Exception as exc:  # network, quota or key problems must not kill the page
        st.warning(f"Gemini insights unavailable: {exc}")

# The narrative is optional; charts and forecast below do not depend on it.
if insights:
    st.markdown(insights)
112
+
113
+ # ----------------------------------------------------------------------------
114
+ # 5๏ธโƒฃ COLUMN CHOICES & TREND
115
+ # ----------------------------------------------------------------------------
116
+ # autoโ€‘detect datetime candidates
117
maybe_dates = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
if not maybe_dates:
    for c in df.columns:
        # Numeric (and boolean) columns always "parse" as epoch offsets, which
        # would silently turn every metric into a timestamp; only try the rest.
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        try:
            parsed = pd.to_datetime(df[c])
        except (ValueError, TypeError, OverflowError):
            continue  # not a date-like column
        df[c] = parsed
        maybe_dates.append(c)

# Fall back to every column when nothing looks like a date, so the user can
# still pick one manually.
date_col = st.selectbox("Date column", maybe_dates if maybe_dates else list(df.columns))

metric_options = [c for c in df.select_dtypes("number").columns if c != date_col]
if not metric_options:
    st.warning("No numeric column available to chart or forecast.")
    st.stop()
metric_col = st.selectbox("Metric column", metric_options)

# Build one value per timestamp: drop incomplete rows, coerce the date column
# (unparseable values become NaT and are dropped), then average duplicates.
pairs = df[[date_col, metric_col]].dropna().copy()
pairs[date_col] = pd.to_datetime(pairs[date_col], errors="coerce")
series = (
    pairs
    .dropna()
    .groupby(date_col)[metric_col]
    .mean()
    .sort_index()
)

fig_tr = px.line(series, title=f"{metric_col} Trend", labels={"index": "Date", metric_col: metric_col})
st.plotly_chart(fig_tr, use_container_width=True)
 
141
 
142
+ # ----------------------------------------------------------------------------
143
+ # 6๏ธโƒฃ FORECASTING (userโ€‘tunable)
144
+ # ----------------------------------------------------------------------------
145
st.subheader("๐Ÿ”ฎ Forecast")
periods = st.slider("Periods to forecast", 3, 365, 90, step=1)
order_p = st.number_input("AR order (p)", 0, 5, 1, key="p")
order_d = st.number_input("I order (d)", 0, 2, 1, key="d")
order_q = st.number_input("MA order (q)", 0, 5, 1, key="q")

# ``forecast`` stays None when fitting fails, so only the forecast chart is
# skipped and the rest of the page still renders.
forecast = None
with st.spinner("Model fitting & forecastingโ€ฆ"):
    try:
        model = ARIMA(series, order=(order_p, order_d, order_q)).fit()
        # Continue the series at its own cadence; default to daily steps when
        # no regular frequency can be inferred.
        step = pd.infer_freq(series.index) or "D"
        # periods + 1 stamps starting at the last observation, minus that
        # first stamp, gives exactly ``periods`` future timestamps.
        idx_future = pd.date_range(series.index.max(), periods=periods + 1, freq=step)[1:]
        fc_vals = model.forecast(periods)
        forecast = pd.Series(fc_vals.values, index=idx_future, name="Forecast")
    except Exception as e:  # UI boundary: statsmodels/pandas can fail in many ways
        st.error(f"Model failed: {e}")

if forecast is not None:
    fig_fc = px.line(pd.concat([series, forecast], axis=1), title="Actual vs Forecast")
    st.plotly_chart(fig_fc, use_container_width=True)
163
+
164
+ # ----------------------------------------------------------------------------
165
+ # 7๏ธโƒฃ EDA DASHBOARD
166
+ # ----------------------------------------------------------------------------
167
st.subheader("๐Ÿ” Exploratory Data Dashboard")
numeric_cols = df.select_dtypes("number").columns.tolist()

with st.expander("Hist / KDE"):
    if numeric_cols:
        # Offer every numeric column as a list (a bare string would be split
        # into single characters) and preselect the forecast metric.
        default_pos = numeric_cols.index(metric_col) if metric_col in numeric_cols else 0
        num = st.selectbox("Numeric column", numeric_cols, index=default_pos, key="hist_sel")
        fig_h = px.histogram(df, x=num, nbins=50, marginal="box", template="plotly_dark")
        st.plotly_chart(fig_h, use_container_width=True)
    else:
        st.info("No numeric columns to plot.")

with st.expander("Correlation Heatmap"):
    # Pairwise correlation across the numeric columns only.
    corr = df.select_dtypes("number").corr()
    fig_c = px.imshow(corr, color_continuous_scale="RdBu", labels=dict(color="ฯ"), title="Correlation")
    st.plotly_chart(fig_c, use_container_width=True)
177
+
178
+ # ----------------------------------------------------------------------------
179
+ # 8๏ธโƒฃ STRATEGY DOWNLOAD
180
+ # ----------------------------------------------------------------------------
181
# Static five-point brief, assembled line by line and offered as a Markdown file.
brief = "\n".join(
    (
        "# Strategy Brief",
        "1. Clean missing timestamps for robust modeling.",
        "2. Investigate drivers behind top correlations.",
        "3. Leverage forecast to align ops & marketing.",
        "4. Monitor outliers >3ฯƒ each week.",
        "5. Drill into segment variations (region / product).",
    )
)
st.download_button(
    "โฌ‡๏ธย Download Strategy (.md)",
    brief,
    file_name="bizintel_brief.md",
    mime="text/markdown",
)