mgbam committed
Commit ddeba5b · verified · 1 Parent(s): beb1c10

Update app.py

Files changed (1):
  app.py +436 -252
app.py CHANGED
@@ -6,8 +6,8 @@ import base64
 import io
 import matplotlib.pyplot as plt
 import seaborn as sns
-from abc import ABC, abstractmethod  # For abstract base classes
-from sklearn.model_selection import train_test_split  # Machine learning modules
+from abc import ABC, abstractmethod
+from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score
 from statsmodels.tsa.seasonal import seasonal_decompose
@@ -17,177 +17,216 @@ from groq import Groq
 import os
 import numpy as np
 from scipy.stats import ttest_ind, f_oneway
+import json
 
 # Initialize Groq Client
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
-# ---------------------- Base Classes and Schemas ---------------------------
-class ResearchInput(BaseModel):
-    """Base schema for research tool inputs"""
-    data_key: str = Field(..., description="Session state key containing DataFrame")
-    columns: Optional[List[str]] = Field(None, description="List of columns to analyze")
-
-class TemporalAnalysisInput(ResearchInput):
-    """Schema for temporal analysis"""
-    time_col: str = Field(..., description="Name of timestamp column")
-    value_col: str = Field(..., description="Name of value column to analyze")
-
-class HypothesisInput(ResearchInput):
-    """Schema for hypothesis testing"""
-    group_col: str = Field(..., description="Categorical column defining groups")
-    value_col: str = Field(..., description="Numerical column to compare")
-
-class ModelTrainingInput(ResearchInput):
-    """Schema for model training"""
-    target_col: str = Field(..., description="Name of target column")
-
-class DataAnalyzer(ABC):
-    """Abstract base class for data analysis modules"""
+# ---------------------- Data Acquisition Layer ---------------------------
+class DataSource(ABC):
+    """Base class for data sources."""
+    @abstractmethod
+    def connect(self) -> None:
+        """Connect to the data source."""
+        pass
+
+    @abstractmethod
+    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
+        """Fetch the data based on a specific query."""
+        pass
+
+class CSVDataSource(DataSource):
+    """Data source for CSV files."""
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+        self.data: Optional[pd.DataFrame] = None
+
+    def connect(self):
+        self.data = pd.read_csv(self.file_path)
+
+    def fetch_data(self, query: str = None, **kwargs) -> pd.DataFrame:
+        if self.data is None:
+            raise Exception("No connection is made, call connect()")
+        return self.data
+
+class DatabaseSource(DataSource):
+    def __init__(self, connection_string: str, database_type: str):
+        self.connection_string = connection_string
+        self.database_type = database_type
+        self.connection = None
+
+    def connect(self):
+        if self.database_type.lower() == "sql":
+            # Placeholder for the actual database connection
+            self.connection = "Connected to SQL Database"
+        else:
+            raise Exception(f"Database type '{self.database_type}' is not supported")
+
+    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
+        if self.connection is None:
+            raise Exception("No connection is made, call connect()")
+        # Placeholder for the data fetching
+        return pd.DataFrame({"result": [f"Fetched data based on query: {query}"]})
+
+class DataIngestion:
+    def __init__(self):
+        self.sources: Dict[str, DataSource] = {}
+
+    def add_source(self, source_name: str, source: DataSource):
+        self.sources[source_name] = source
+
+    def ingest_data(self, source_name: str, query: str = None, **kwargs) -> pd.DataFrame:
+        if source_name not in self.sources:
+            raise Exception(f"Source '{source_name}' not found")
+        source = self.sources[source_name]
+        source.connect()
+        return source.fetch_data(query, **kwargs)
+
+class DataModel(BaseModel):
+    name: str
+    kpis: List[str] = Field(default_factory=list)
+    dimensions: List[str] = Field(default_factory=list)
+    custom_calculations: Optional[Dict[str, str]] = None
+    relations: Optional[Dict[str, str]] = None  # Example {table1: table2}
+
+    def to_json(self):
+        return json.dumps(self.dict())
+
+    @staticmethod
+    def from_json(json_str):
+        return DataModel(**json.loads(json_str))
+
+class DataModelling():
+    def __init__(self):
+        self.models: Dict[str, DataModel] = {}
+
+    def add_model(self, model: DataModel):
+        self.models[model.name] = model
+
+    def get_model(self, model_name: str) -> DataModel:
+        if model_name not in self.models:
+            raise Exception(f"Model '{model_name}' not found")
+        return self.models[model_name]
+
+# ---------------------- Business Logic Layer ---------------------------
+class BusinessRule(BaseModel):
+    name: str
+    condition: str
+    action: str
+
+class BusinessRulesEngine():
+    def __init__(self):
+        self.rules: Dict[str, BusinessRule] = {}
+
+    def add_rule(self, rule: BusinessRule):
+        self.rules[rule.name] = rule
+
+    def execute_rules(self, data: pd.DataFrame):
+        results = {}
+        for rule_name, rule in self.rules.items():
+            try:
+                if eval(rule.condition, {}, {"df": data}):
+                    results[rule_name] = {"rule_matched": True, "action": rule.action}
+                else:
+                    results[rule_name] = {"rule_matched": False, "action": None}
+            except Exception as e:
+                results[rule_name] = {"rule_matched": False, "error": str(e)}
+        return results
+
+class KPI(BaseModel):
+    name: str
+    calculation: str
+    threshold: Optional[float] = None
+
+class KPIMonitoring():
+    def __init__(self):
+        self.kpis: Dict[str, KPI] = {}
+
+    def add_kpi(self, kpi: KPI):
+        self.kpis[kpi.name] = kpi
+
+    def calculate_kpis(self, data: pd.DataFrame):
+        results = {}
+        for kpi_name, kpi in self.kpis.items():
+            try:
+                results[kpi_name] = eval(kpi.calculation, {}, {"df": data})
+            except Exception as e:
+                results[kpi_name] = {"error": str(e)}
+        return results
+
+class ForecastingEngine(ABC):
     @abstractmethod
-    def invoke(self, **kwargs) -> Dict[str, Any]:
-        pass
+    def predict(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        pass
 
-# ---------------------- Concrete Analyzer Implementations ---------------------------
-class AdvancedEDA(DataAnalyzer):
-    """Comprehensive Exploratory Data Analysis"""
-    def invoke(self, data_key: str, **kwargs) -> Dict[str, Any]:
-        try:
-            data = st.session_state[data_key]
-            analysis = {
-                "dimensionality": {
-                    "rows": len(data),
-                    "columns": list(data.columns),
-                    "memory_usage": f"{data.memory_usage().sum() / 1e6:.2f} MB"
-                },
-                "statistical_profile": data.describe(percentiles=[.25, .5, .75]).to_dict(),
-                "temporal_analysis": {
-                    "date_ranges": {
-                        col: {
-                            "min": data[col].min(),
-                            "max": data[col].max()
-                        } for col in data.select_dtypes(include='datetime').columns
-                    }
-                },
-                "data_quality": {
-                    "missing_values": data.isnull().sum().to_dict(),
-                    "duplicates": data.duplicated().sum(),
-                    "cardinality": {
-                        col: data[col].nunique() for col in data.columns
-                    }
-                }
-            }
-            return analysis
-        except Exception as e:
-            return {"error": f"EDA Failed: {str(e)}"}
-
-class DistributionVisualizer(DataAnalyzer):
-    """Distribution visualizations"""
-    def invoke(self, data_key: str, columns: List[str], **kwargs) -> str:
-        try:
-            data = st.session_state[data_key]
-            plt.figure(figsize=(12, 6))
-            for i, col in enumerate(columns, 1):
-                plt.subplot(1, len(columns), i)
-                sns.histplot(data[col], kde=True, stat="density")
-                plt.title(f'Distribution of {col}', fontsize=10)
-                plt.xticks(fontsize=8)
-                plt.yticks(fontsize=8)
-            plt.tight_layout()
-
-            buf = io.BytesIO()
-            plt.savefig(buf, format='png', dpi=300, bbox_inches='tight')
-            plt.close()
-            return base64.b64encode(buf.getvalue()).decode()
-        except Exception as e:
-            return f"Visualization Error: {str(e)}"
-
-class TemporalAnalyzer(DataAnalyzer):
-    """Time series analysis"""
-    def invoke(self, data_key: str, time_col: str, value_col: str, **kwargs) -> Dict[str, Any]:
-        try:
-            data = st.session_state[data_key]
-            ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col]
-            decomposition = seasonal_decompose(ts_data, period=365)
-
-            plt.figure(figsize=(12, 8))
-            decomposition.plot()
-            plt.tight_layout()
-
-            buf = io.BytesIO()
-            plt.savefig(buf, format='png')
-            plt.close()
-            plot_data = base64.b64encode(buf.getvalue()).decode()
-
-            return {
-                "trend_statistics": {
-                    "stationarity": adfuller(ts_data)[1],
-                    "seasonality_strength": max(decomposition.seasonal)
-                },
-                "visualization": plot_data
-            }
-        except Exception as e:
-            return {"error": f"Temporal Analysis Failed: {str(e)}"}
-
-class HypothesisTester(DataAnalyzer):
-    """Statistical hypothesis testing"""
-    def invoke(self, data_key: str, group_col: str, value_col: str, **kwargs) -> Dict[str, Any]:
-        try:
-            data = st.session_state[data_key]
-            groups = data[group_col].unique()
-
-            if len(groups) < 2:
-                return {"error": "Insufficient groups for comparison"}
-
-            if len(groups) == 2:
-                group_data = [data[data[group_col] == g][value_col] for g in groups]
-                stat, p = ttest_ind(*group_data)
-                test_type = "Independent t-test"
-            else:
-                group_data = [data[data[group_col] == g][value_col] for g in groups]
-                stat, p = f_oneway(*group_data)
-                test_type = "ANOVA"
-
-            return {
-                "test_type": test_type,
-                "test_statistic": stat,
-                "p_value": p,
-                "effect_size": {
-                    "cohens_d": abs(group_data[0].mean() - group_data[1].mean()) / np.sqrt(
-                        (group_data[0].var() + group_data[1].var()) / 2
-                    ) if len(groups) == 2 else None
-                },
-                "interpretation": self.interpret_p_value(p)
-            }
-        except Exception as e:
-            return {"error": f"Hypothesis Testing Failed: {str(e)}"}
-
-    def interpret_p_value(self, p: float) -> str:
-        if p < 0.001: return "Very strong evidence against H0"
-        elif p < 0.01: return "Strong evidence against H0"
-        elif p < 0.05: return "Evidence against H0"
-        elif p < 0.1: return "Weak evidence against H0"
-        else: return "No significant evidence against H0"
-
-class LogisticRegressionTrainer(DataAnalyzer):
-    """Logistic Regression Model Trainer"""
-    def invoke(self, data_key: str, target_col: str, columns: List[str], **kwargs) -> Dict[str, Any]:
-        try:
-            data = st.session_state[data_key]
-            X = data[columns]
-            y = data[target_col]
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-            model = LogisticRegression(max_iter=1000)
-            model.fit(X_train, y_train)
-            y_pred = model.predict(X_test)
-            accuracy = accuracy_score(y_test, y_pred)
-            return {
-                "model_type": "Logistic Regression",
-                "accuracy": accuracy,
-                "model_params": model.get_params()
-            }
-        except Exception as e:
-            return {"error": f"Logistic Regression Model Error: {str(e)}"}
+class SimpleForecasting(ForecastingEngine):
+    def predict(self, data: pd.DataFrame, period: int = 7, **kwargs) -> pd.DataFrame:
+        # Placeholder for actual forecasting
+        return pd.DataFrame({"forecast": [f"Forecast for the next {period} days"]})
+
+# ---------------------- Insights and Reporting Layer ---------------------------
+class AutomatedInsights():
+    def __init__(self):
+        self.analyses: Dict[str, DataAnalyzer] = {
+            "EDA": AdvancedEDA(),
+            "temporal": TemporalAnalyzer(),
+            "distribution": DistributionVisualizer(),
+            "hypothesis": HypothesisTester(),
+            "model": LogisticRegressionTrainer()
+        }
+
+    def generate_insights(self, data: pd.DataFrame, analysis_names: List[str], **kwargs):
+        results = {}
+        for name in analysis_names:
+            if name in self.analyses:
+                analyzer = self.analyses[name]
+                results[name] = analyzer.invoke(data=data, **kwargs)
+            else:
+                results[name] = {"error": "Analysis not found"}
+        return results
+
+class Dashboard():
+    def __init__(self):
+        self.layout: Dict[str, str] = {}
+
+    def add_visualisation(self, vis_name: str, vis_type: str):
+        self.layout[vis_name] = vis_type
+
+    def display_dashboard(self, data_dict: Dict[str, pd.DataFrame]):
+        st.header("Dashboard")
+        for vis_name, vis_type in self.layout.items():
+            st.subheader(vis_name)
+            if vis_type == "table":
+                if vis_name in data_dict:
+                    st.table(data_dict[vis_name])
+                else:
+                    st.write("Data Not Found")
+            elif vis_type == "plot":
+                if vis_name in data_dict:
+                    df = data_dict[vis_name]
+                    if len(df.columns) > 1:
+                        fig = plt.figure()
+                        sns.lineplot(data=df)
+                        st.pyplot(fig)
+                    else:
+                        st.write("Please have more than 1 column")
+                else:
+                    st.write("Data not found")
+
+class AutomatedReports():
+    def __init__(self):
+        self.report_definition: Dict[str, str] = {}
+
+    def create_report_definition(self, report_name: str, definition: str):
+        self.report_definition[report_name] = definition
+
+    def generate_report(self, report_name: str, data: Dict[str, pd.DataFrame]):
+        if report_name not in self.report_definition:
+            return {"error": "Report name not found"}
+        st.header(f"Report : {report_name}")
+        st.write(f"Report Definition: {self.report_definition[report_name]}")
+        for df_name, df in data.items():
+            st.subheader(f"Data: {df_name}")
+            st.table(df)
 
 # ---------------------- Groq Research Agent ---------------------------
 
@@ -238,117 +277,262 @@ class GroqResearcher:
 
         except Exception as e:
             return f"Research Error: {str(e)}"
+
 # ---------------------- Main Streamlit Application ---------------------------
 def main():
-    st.set_page_config(page_title="AI Data Analysis Lab", layout="wide")
-    st.title("🧪 Advanced AI Data Analysis Laboratory")
+    st.set_page_config(page_title="AI BI Automation Platform", layout="wide")
+    st.title("🚀 AI-Powered Business Intelligence Automation Platform")
 
     # Session State
     if 'data' not in st.session_state:
-        st.session_state.data = None
+        st.session_state.data = {}  # store pd.DataFrame under a name
+    if 'data_ingestion' not in st.session_state:
+        st.session_state.data_ingestion = DataIngestion()
+    if 'data_modelling' not in st.session_state:
+        st.session_state.data_modelling = DataModelling()
+    if 'business_rules' not in st.session_state:
+        st.session_state.business_rules = BusinessRulesEngine()
+    if 'kpi_monitoring' not in st.session_state:
+        st.session_state.kpi_monitoring = KPIMonitoring()
+    if 'forecasting_engine' not in st.session_state:
+        st.session_state.forecasting_engine = SimpleForecasting()
+    if 'automated_insights' not in st.session_state:
+        st.session_state.automated_insights = AutomatedInsights()
+    if 'dashboard' not in st.session_state:
+        st.session_state.dashboard = Dashboard()
+    if 'automated_reports' not in st.session_state:
+        st.session_state.automated_reports = AutomatedReports()
     if 'researcher' not in st.session_state:
        st.session_state.researcher = GroqResearcher()
 
-    # Data Upload
+    # Sidebar for Data Management
     with st.sidebar:
-        st.header("🔬 Data Management")
-        uploaded_file = st.file_uploader("Upload research dataset", type=["csv", "parquet"])
-        if uploaded_file:
-            with st.spinner("Initializing dataset..."):
+        st.header("⚙️ Data Management")
+        data_source_selection = st.selectbox("Select Data Source Type", ["CSV", "SQL Database"])
+        if data_source_selection == "CSV":
+            uploaded_file = st.file_uploader("Upload research dataset (CSV)", type=["csv"])
+            if uploaded_file:
+                source_name = st.text_input("Data Source Name")
+                if source_name:
                     try:
-                    st.session_state.data = pd.read_csv(uploaded_file)
-                    st.success(f"Loaded {len(st.session_state.data):,} research observations")
+                        csv_source = CSVDataSource(file_path=uploaded_file)
+                        st.session_state.data_ingestion.add_source(source_name, csv_source)
+                        st.success(f"Uploaded {uploaded_file.name}")
                     except Exception as e:
-                    st.error(f"Error loading dataset: {e}")
+                        st.error(f"Error loading dataset: {e}")
+        elif data_source_selection == "SQL Database":
+            conn_str = st.text_input("Enter connection string for SQL DB")
+            if conn_str:
+                source_name = st.text_input("Data Source Name")
+                if source_name:
+                    try:
+                        sql_source = DatabaseSource(connection_string=conn_str, database_type="sql")
+                        st.session_state.data_ingestion.add_source(source_name, sql_source)
+                        st.success(f"Added SQL DB Source {source_name}")
+                    except Exception as e:
+                        st.error(f"Error loading database source {e}")
 
-    if st.session_state.data is not None:
+        if st.button("Ingest Data"):
+            if st.session_state.data_ingestion.sources:
+                source_name_to_fetch = st.selectbox("Select Data Source to Ingest", list(st.session_state.data_ingestion.sources.keys()))
+                query = st.text_area("Optional Query to Fetch data")
+                if source_name_to_fetch:
+                    with st.spinner("Ingesting data..."):
+                        try:
+                            data = st.session_state.data_ingestion.ingest_data(source_name_to_fetch, query)
+                            st.session_state.data[source_name_to_fetch] = data
+                            st.success(f"Ingested data from {source_name_to_fetch}")
+                        except Exception as e:
+                            st.error(f"Ingestion failed: {e}")
+            else:
+                st.error("No data source added, please add data source")
+
+    if st.session_state.data:
         col1, col2 = st.columns([1, 3])
+
         with col1:
             st.subheader("Dataset Metadata")
-            st.json({
-                "Variables": list(st.session_state.data.columns),
-                "Time Range": {
-                    col: {
-                        "min": st.session_state.data[col].min(),
-                        "max": st.session_state.data[col].max()
-                    } for col in st.session_state.data.select_dtypes(include='datetime').columns
-                },
-                "Size": f"{st.session_state.data.memory_usage().sum() / 1e6:.2f} MB"
-            })
-
-        with col2:
-            analysis_tab, research_tab = st.tabs(["Automated Analysis", "Custom Research"])
-            with analysis_tab:
-                analysis_type = st.selectbox("Select Analysis Mode", [
-                    "Exploratory Data Analysis",
-                    "Temporal Pattern Analysis",
-                    "Comparative Statistics",
-                    "Distribution Analysis",
-                    "Train Logistic Regression Model"
-                ])
-
-                if analysis_type == "Exploratory Data Analysis":
-                    analyzer = AdvancedEDA()
-                    eda_result = analyzer.invoke(data_key="data")
-                    st.subheader("Data Quality Report")
-                    st.json(eda_result)
-
-                elif analysis_type == "Temporal Pattern Analysis":
-                    time_col = st.selectbox("Temporal Variable",
-                                            st.session_state.data.select_dtypes(include='datetime').columns)
-                    value_col = st.selectbox("Analysis Variable",
-                                             st.session_state.data.select_dtypes(include=np.number).columns)
-
-                    if time_col and value_col:
-                        analyzer = TemporalAnalyzer()
-                        result = analyzer.invoke(data_key="data", time_col=time_col, value_col=value_col)
-                        if "visualization" in result:
-                            st.image(f"data:image/png;base64,{result['visualization']}")
+            data_source_keys = list(st.session_state.data.keys())
+            selected_data_key = st.selectbox("Select Dataset", data_source_keys)
+
+            if selected_data_key:
+                data = st.session_state.data[selected_data_key]
+                st.json({
+                    "Variables": list(data.columns),
+                    "Time Range": {
+                        col: {
+                            "min": data[col].min(),
+                            "max": data[col].max()
+                        } for col in data.select_dtypes(include='datetime').columns
+                    },
+                    "Size": f"{data.memory_usage().sum() / 1e6:.2f} MB"
+                })
+        with col2:
+            analysis_tab, business_logic_tab, insights_tab, reports_tab, custom_research_tab = st.tabs([
+                "Data Analysis",
+                "Business Logic",
+                "Insights",
+                "Reports",
+                "Custom Research"
+            ])
+
+            with analysis_tab:
+                if selected_data_key:
+                    analysis_type = st.selectbox("Select Analysis Mode", [
+                        "Exploratory Data Analysis",
+                        "Temporal Pattern Analysis",
+                        "Comparative Statistics",
+                        "Distribution Analysis",
+                        "Train Logistic Regression Model"
+                    ])
+                    data = st.session_state.data[selected_data_key]
+                    if analysis_type == "Exploratory Data Analysis":
+                        analyzer = AdvancedEDA()
+                        eda_result = analyzer.invoke(data=data)
+                        st.subheader("Data Quality Report")
+                        st.json(eda_result)
+
+                    elif analysis_type == "Temporal Pattern Analysis":
+                        time_col = st.selectbox("Temporal Variable",
+                                                data.select_dtypes(include='datetime').columns)
+                        value_col = st.selectbox("Analysis Variable",
+                                                 data.select_dtypes(include=np.number).columns)
+
+                        if time_col and value_col:
+                            analyzer = TemporalAnalyzer()
+                            result = analyzer.invoke(data=data, time_col=time_col, value_col=value_col)
+                            if "visualization" in result:
+                                st.image(f"data:image/png;base64,{result['visualization']}")
+                            st.json(result)
+
+                    elif analysis_type == "Comparative Statistics":
+                        group_col = st.selectbox("Grouping Variable",
+                                                 data.select_dtypes(include='category').columns)
+                        value_col = st.selectbox("Metric Variable",
+                                                 data.select_dtypes(include=np.number).columns)
+
+                        if group_col and value_col:
+                            analyzer = HypothesisTester()
+                            result = analyzer.invoke(data=data, group_col=group_col, value_col=value_col)
+                            st.subheader("Statistical Test Results")
+                            st.json(result)
+
+                    elif analysis_type == "Distribution Analysis":
+                        num_cols = data.select_dtypes(include=np.number).columns.tolist()
+                        selected_cols = st.multiselect("Select Variables", num_cols)
+                        if selected_cols:
+                            analyzer = DistributionVisualizer()
+                            img_data = analyzer.invoke(data=data, columns=selected_cols)
+                            st.image(f"data:image/png;base64,{img_data}")
+
+                    elif analysis_type == "Train Logistic Regression Model":
+                        num_cols = data.select_dtypes(include=np.number).columns.tolist()
+                        target_col = st.selectbox("Select Target Variable",
+                                                  data.columns.tolist())
+                        selected_cols = st.multiselect("Select Feature Variables", num_cols)
+                        if selected_cols and target_col:
+                            analyzer = LogisticRegressionTrainer()
+                            result = analyzer.invoke(data=data, target_col=target_col, columns=selected_cols)
+                            st.subheader("Logistic Regression Model Results")
+                            st.json(result)
+
+            with business_logic_tab:
+                st.header("Business Logic")
+                st.subheader("Data Modelling")
+                model_name = st.text_input("Enter the name of the model")
+
+                if model_name:
+                    kpis = st.text_input("Enter KPIs (comma-separated)")
+                    dimensions = st.text_input("Enter Dimensions (comma-separated)")
+                    custom_calculations = st.text_area("Custom calculations (JSON format), use {'df': DataFrame}")
+                    relations = st.text_area("Relations (JSON format), use {'table1': 'table2'}")
+                    if st.button("Add Data Model"):
+                        try:
+                            custom_calculations_dict = None if not custom_calculations else json.loads(custom_calculations)
+                            relations_dict = None if not relations else json.loads(relations)
+                            model = DataModel(name=model_name,
+                                              kpis=[kpi.strip() for kpi in kpis.split(',')] if kpis else [],
+                                              dimensions=[dim.strip() for dim in dimensions.split(',')] if dimensions else [],
+                                              custom_calculations=custom_calculations_dict,
+                                              relations=relations_dict)
+                            st.session_state.data_modelling.add_model(model)
+                            st.success(f"Added data model {model_name}")
+                        except Exception as e:
+                            st.error(f"Error creating data model: {e}")
+
+                st.subheader("Business Rules")
+                rule_name = st.text_input("Enter Rule Name")
+                condition = st.text_area("Enter Rule Condition (use 'df' for data frame), Example df['sales'] > 100")
+                action = st.text_area("Enter Action to be Taken on Rule Match")
+                if st.button("Add Business Rule"):
+                    try:
+                        rule = BusinessRule(name=rule_name, condition=condition, action=action)
+                        st.session_state.business_rules.add_rule(rule)
+                        st.success("Added Business Rule")
+                    except Exception as e:
+                        st.error(f"Error in rule definition: {e}")
+
+                st.subheader("KPI Definition")
+                kpi_name = st.text_input("Enter KPI name")
+                kpi_calculation = st.text_area("Enter KPI calculation (use 'df' for data frame), Example df['revenue'].sum()")
+                threshold = st.text_input("Enter Threshold for KPI")
+                if st.button("Add KPI"):
+                    try:
+                        threshold_value = float(threshold) if threshold else None
+                        kpi = KPI(name=kpi_name, calculation=kpi_calculation, threshold=threshold_value)
+                        st.session_state.kpi_monitoring.add_kpi(kpi)
+                        st.success(f"Added KPI {kpi_name}")
+                    except Exception as e:
+                        st.error(f"Error creating KPI: {e}")
+
+                if selected_data_key:
+                    data = st.session_state.data[selected_data_key]
+                    if st.button("Execute Business Rules"):
+                        with st.spinner("Executing Business Rules.."):
+                            result = st.session_state.business_rules.execute_rules(data)
                             st.json(result)
-
-                elif analysis_type == "Comparative Statistics":
-                    group_col = st.selectbox("Grouping Variable",
-                                             st.session_state.data.select_dtypes(include='category').columns)
-                    value_col = st.selectbox("Metric Variable",
-                                             st.session_state.data.select_dtypes(include=np.number).columns)
-
-                    if group_col and value_col:
-                        analyzer = HypothesisTester()
-                        result = analyzer.invoke(data_key="data", group_col=group_col, value_col=value_col)
-                        st.subheader("Statistical Test Results")
-                        st.json(result)
-
-                elif analysis_type == "Distribution Analysis":
-                    num_cols = st.session_state.data.select_dtypes(include=np.number).columns.tolist()
-                    selected_cols = st.multiselect("Select Variables", num_cols)
-                    if selected_cols:
-                        analyzer = DistributionVisualizer()
-                        img_data = analyzer.invoke(data_key="data", columns=selected_cols)
-                        st.image(f"data:image/png;base64,{img_data}")
-
-                elif analysis_type == "Train Logistic Regression Model":
-                    num_cols = st.session_state.data.select_dtypes(include=np.number).columns.tolist()
-                    target_col = st.selectbox("Select Target Variable",
-                                              st.session_state.data.columns.tolist())
-                    selected_cols = st.multiselect("Select Feature Variables", num_cols)
-                    if selected_cols and target_col:
-                        analyzer = LogisticRegressionTrainer()
-                        result = analyzer.invoke(data_key="data", target_col=target_col, columns=selected_cols)
-                        st.subheader("Logistic Regression Model Results")
-                        st.json(result)
-
-            with research_tab:
-                research_query = st.text_area("Enter Research Question:", height=150,
-                                              placeholder="E.g., 'What factors are most predictive of X outcome?'")
-
-                if st.button("Execute Research"):
+                    if st.button("Calculate KPIs"):
+                        with st.spinner("Calculating KPIs..."):
+                            result = st.session_state.kpi_monitoring.calculate_kpis(data)
+                            st.json(result)
+
+            with insights_tab:
+                if selected_data_key:
+                    data = st.session_state.data[selected_data_key]
+                    available_analysis = ["EDA", "temporal", "distribution", "hypothesis", "model"]
+                    selected_analysis = st.multiselect("Select Analysis", available_analysis)
+                    if st.button("Generate Automated Insights"):
+                        with st.spinner("Generating Insights"):
+                            results = st.session_state.automated_insights.generate_insights(data, analysis_names=selected_analysis)
+                            st.json(results)
+
+            with reports_tab:
+                st.header("Reports")
+                report_name = st.text_input("Report Name")
+                report_def = st.text_area("Report definition")
+                if st.button("Create Report Definition"):
+                    st.session_state.automated_reports.create_report_definition(report_name, report_def)
+                    st.success("Report definition created")
+                if selected_data_key:
+                    data = st.session_state.data
+                    if st.button("Generate Report"):
+                        with st.spinner("Generating Report..."):
+                            report = st.session_state.automated_reports.generate_report(report_name, data)
+
+            with custom_research_tab:
+                research_query = st.text_area("Enter Research Question:", height=150,
+                                              placeholder="E.g., 'What factors are most predictive of X outcome?'")
+
+                if st.button("Execute Custom Research"):
                     with st.spinner("Conducting rigorous analysis..."):
-                        result = st.session_state.researcher.research(
-                            research_query, st.session_state.data
-                        )
-                        st.markdown("## Research Findings")
-                        st.markdown(result)
+                        if selected_data_key:
+                            data = st.session_state.data[selected_data_key]
+                            result = st.session_state.researcher.research(
+                                research_query, data
+                            )
+                            st.markdown("## Research Findings")
+                            st.markdown(result)
 
 if __name__ == "__main__":
     main()
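
A quick way to sanity-check the new data-acquisition layer outside Streamlit is a minimal ingestion sketch. This assumes app.py imports cleanly (GROQ_API_KEY set, dependencies installed); the file name "sales.csv" and the source name "sales" are made-up examples, not part of the commit.

    # Hypothetical usage of the DataSource/DataIngestion classes from this commit.
    from app import CSVDataSource, DataIngestion

    ingestion = DataIngestion()
    ingestion.add_source("sales", CSVDataSource(file_path="sales.csv"))  # placeholder path

    # ingest_data() calls connect() on the named source, then returns fetch_data();
    # CSVDataSource ignores the optional query argument.
    df = ingestion.ingest_data("sales")
    print(df.head())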
 
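The rule and KPI engines both eval() a Python expression with the active DataFrame bound to the name df, as the UI input hints suggest. A minimal sketch of that contract, under the same import assumptions as above (the column names "sales" and "revenue" are invented):

    import pandas as pd
    from app import BusinessRule, BusinessRulesEngine, KPI, KPIMonitoring

    df = pd.DataFrame({"sales": [120, 90, 150], "revenue": [1000.0, 800.0, 1250.0]})

    rules = BusinessRulesEngine()
    # condition is evaluated as eval(condition, {}, {"df": df}) and must yield a truthy value
    rules.add_rule(BusinessRule(name="high_sales",
                                condition="df['sales'].max() > 100",
                                action="notify_sales_team"))
    print(rules.execute_rules(df))
    # -> {'high_sales': {'rule_matched': True, 'action': 'notify_sales_team'}}

    kpis = KPIMonitoring()
    kpis.add_kpi(KPI(name="total_revenue", calculation="df['revenue'].sum()", threshold=3000.0))
    print(kpis.calculate_kpis(df))  # -> {'total_revenue': 3050.0}

    # Caveat: eval() on free-form strings executes arbitrary code; acceptable for a
    # local prototype, but untrusted rule/KPI strings should not be evaluated this way.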