mgbam committed · verified
Commit e7aa85e · 1 Parent(s): ddeba5b

Update app.py

Files changed (1):
  1. app.py +289 -128
app.py CHANGED
@@ -22,93 +22,217 @@ import json
 # Initialize Groq Client
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
-# ---------------------- Data Acquisition Layer ---------------------------
-class DataSource(ABC):
-    """Base class for data sources."""
     @abstractmethod
-    def connect(self) -> None:
-        """Connect to the data source."""
         pass
-
-    @abstractmethod
-    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
-        """Fetch the data based on a specific query."""
-        pass
-
-class CSVDataSource(DataSource):
-    """Data source for CSV files."""
-    def __init__(self, file_path: str):
-        self.file_path = file_path
-        self.data: Optional[pd.DataFrame] = None
-
-    def connect(self):
-        self.data = pd.read_csv(self.file_path)
-
-    def fetch_data(self, query: str = None, **kwargs) -> pd.DataFrame:
-        if self.data is None:
-            raise Exception("No connection is made, call connect()")
-        return self.data
-
-class DatabaseSource(DataSource):
-    def __init__(self, connection_string: str, database_type: str):
-        self.connection_string = connection_string
-        self.database_type = database_type
-        self.connection = None
-
-    def connect(self):
-        if self.database_type.lower() == "sql":
-            # Placeholder for the actual database connection
-            self.connection = "Connected to SQL Database"
-        else:
-            raise Exception(f"Database type '{self.database_type}' is not supported")
-
-    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
-        if self.connection is None:
-            raise Exception("No connection is made, call connect()")
-        # Placeholder for the data fetching
-        return pd.DataFrame({"result": [f"Fetched data based on query: {query}"]})
-
-class DataIngestion:
-    def __init__(self):
-        self.sources: Dict[str, DataSource] = {}
-
-    def add_source(self, source_name: str, source: DataSource):
-        self.sources[source_name] = source
-
-    def ingest_data(self, source_name: str, query: str = None, **kwargs) -> pd.DataFrame:
-        if source_name not in self.sources:
-            raise Exception(f"Source '{source_name}' not found")
-        source = self.sources[source_name]
-        source.connect()
-        return source.fetch_data(query, **kwargs)
-
-class DataModel(BaseModel):
-    name: str
-    kpis: List[str] = Field(default_factory=list)
-    dimensions: List[str] = Field(default_factory=list)
-    custom_calculations: Optional[Dict[str, str]] = None
-    relations: Optional[Dict[str, str]] = None  # Example: {table1: table2}
-
-    def to_json(self):
-        return json.dumps(self.dict())
-
-    @staticmethod
-    def from_json(json_str):
-        return DataModel(**json.loads(json_str))
-
-class DataModelling():
-    def __init__(self):
-        self.models: Dict[str, DataModel] = {}
-
-    def add_model(self, model: DataModel):
-        self.models[model.name] = model
-
-    def get_model(self, model_name: str) -> DataModel:
-        if model_name not in self.models:
-            raise Exception(f"Model '{model_name}' not found")
-        return self.models[model_name]
 # ---------------------- Business Logic Layer ---------------------------
 class BusinessRule(BaseModel):
     name: str
@@ -228,56 +352,93 @@ class AutomatedReports():
         st.subheader(f"Data: {df_name}")
         st.table(df)
 
-# ---------------------- Groq Research Agent ---------------------------
-
-class GroqResearcher:
-    """Advanced AI Research Engine using Groq"""
-    def __init__(self, model_name="mixtral-8x7b-32768"):
-        self.model_name = model_name
-        self.system_template = """You are a senior data scientist at a research institution.
-        Analyze this dataset with rigorous statistical methods and provide academic-quality insights:
-        {dataset_info}
-
-        User Question: {query}
-
-        Required Format:
-        - Executive Summary (1 paragraph)
-        - Methodology (bullet points)
-        - Key Findings (numbered list)
-        - Limitations
-        - Recommended Next Steps"""
-
-    def research(self, query: str, data: pd.DataFrame) -> str:
-        """Conduct academic-level analysis using Groq"""
-        try:
-            dataset_info = f"""
-            Dataset Dimensions: {data.shape}
-            Variables: {', '.join(data.columns)}
-            Temporal Coverage: {data.select_dtypes(include='datetime').columns.tolist()}
-            Missing Values: {data.isnull().sum().to_dict()}
-            """
-
-            prompt = PromptTemplate.from_template(self.system_template).format(
-                dataset_info=dataset_info,
-                query=query
-            )
-
-            completion = client.chat.completions.create(
-                messages=[
-                    {"role": "system", "content": "You are a research AI assistant"},
-                    {"role": "user", "content": prompt}
-                ],
-                model=self.model_name,
-                temperature=0.2,
-                max_tokens=4096,
-                stream=False
-            )
-
-            return completion.choices[0].message.content
-
-        except Exception as e:
-            return f"Research Error: {str(e)}"
 # ---------------------- Main Streamlit Application ---------------------------
 def main():
     st.set_page_config(page_title="AI BI Automation Platform", layout="wide")
 
@@ -22,93 +22,217 @@ import json
 # Initialize Groq Client
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
+# ---------------------- Base Classes and Schemas ---------------------------
+class ResearchInput(BaseModel):
+    """Base schema for research tool inputs"""
+    data_key: str = Field(..., description="Session state key containing DataFrame")
+    columns: Optional[List[str]] = Field(None, description="List of columns to analyze")
+
+class TemporalAnalysisInput(ResearchInput):
+    """Schema for temporal analysis"""
+    time_col: str = Field(..., description="Name of timestamp column")
+    value_col: str = Field(..., description="Name of value column to analyze")
+
+class HypothesisInput(ResearchInput):
+    """Schema for hypothesis testing"""
+    group_col: str = Field(..., description="Categorical column defining groups")
+    value_col: str = Field(..., description="Numerical column to compare")
+
+class ModelTrainingInput(ResearchInput):
+    """Schema for model training"""
+    target_col: str = Field(..., description="Name of target column")
+
+class DataAnalyzer(ABC):
+    """Abstract base class for data analysis modules"""
     @abstractmethod
+    def invoke(self, **kwargs) -> Dict[str, Any]:
         pass
+# ---------------------- Concrete Analyzer Implementations ---------------------------
+class AdvancedEDA(DataAnalyzer):
+    """Comprehensive Exploratory Data Analysis"""
+    def invoke(self, data: pd.DataFrame, **kwargs) -> Dict[str, Any]:
+        try:
+            analysis = {
+                "dimensionality": {
+                    "rows": len(data),
+                    "columns": list(data.columns),
+                    "memory_usage": f"{data.memory_usage().sum() / 1e6:.2f} MB"
+                },
+                "statistical_profile": data.describe(percentiles=[.25, .5, .75]).to_dict(),
+                "temporal_analysis": {
+                    "date_ranges": {
+                        col: {
+                            "min": data[col].min(),
+                            "max": data[col].max()
+                        } for col in data.select_dtypes(include='datetime').columns
+                    }
+                },
+                "data_quality": {
+                    "missing_values": data.isnull().sum().to_dict(),
+                    "duplicates": data.duplicated().sum(),
+                    "cardinality": {
+                        col: data[col].nunique() for col in data.columns
+                    }
+                }
+            }
+            return analysis
+        except Exception as e:
+            return {"error": f"EDA Failed: {str(e)}"}
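
A minimal usage sketch for the analyzer above, assuming a pandas DataFrame df is already loaded in the app:

    eda = AdvancedEDA()
    report = eda.invoke(data=df)
    # invoke() returns a plain dict, so results are easy to display or log:
    print(report["dimensionality"]["memory_usage"])
    print(report["data_quality"]["missing_values"])
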
+
+class DistributionVisualizer(DataAnalyzer):
+    """Distribution visualizations"""
+    def invoke(self, data: pd.DataFrame, columns: List[str], **kwargs) -> str:
+        try:
+            plt.figure(figsize=(12, 6))
+            for i, col in enumerate(columns, 1):
+                plt.subplot(1, len(columns), i)
+                sns.histplot(data[col], kde=True, stat="density")
+                plt.title(f'Distribution of {col}', fontsize=10)
+                plt.xticks(fontsize=8)
+                plt.yticks(fontsize=8)
+            plt.tight_layout()
+
+            buf = io.BytesIO()
+            plt.savefig(buf, format='png', dpi=300, bbox_inches='tight')
+            plt.close()
+            return base64.b64encode(buf.getvalue()).decode()
+        except Exception as e:
+            return f"Visualization Error: {str(e)}"
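
The visualizer returns a base64-encoded PNG string rather than a figure object, so the caller chooses how to render it. A usage sketch, assuming df has a numeric column named sales (hypothetical name):

    viz = DistributionVisualizer()
    encoded = viz.invoke(data=df, columns=["sales"])
    if not encoded.startswith("Visualization Error"):
        st.image(base64.b64decode(encoded))  # st.image accepts raw PNG bytes
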
+
+class TemporalAnalyzer(DataAnalyzer):
+    """Time series analysis"""
+    def invoke(self, data: pd.DataFrame, time_col: str, value_col: str, **kwargs) -> Dict[str, Any]:
+        try:
+            ts_data = data.set_index(pd.to_datetime(data[time_col]))[value_col]
+            decomposition = seasonal_decompose(ts_data, period=365)
+
+            plt.figure(figsize=(12, 8))
+            decomposition.plot()
+            plt.tight_layout()
+
+            buf = io.BytesIO()
+            plt.savefig(buf, format='png')
+            plt.close()
+            plot_data = base64.b64encode(buf.getvalue()).decode()
+
+            return {
+                "trend_statistics": {
+                    "stationarity": adfuller(ts_data)[1],
+                    "seasonality_strength": max(decomposition.seasonal)
+                },
+                "visualization": plot_data
+            }
+        except Exception as e:
+            return {"error": f"Temporal Analysis Failed: {str(e)}"}
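
Two caveats worth noting: seasonal_decompose(ts_data, period=365) presumes daily observations with yearly seasonality and needs at least two full periods of data, and adfuller(ts_data)[1] is the ADF test's p-value (small values argue for stationarity). A usage sketch with hypothetical column names:

    result = TemporalAnalyzer().invoke(data=df, time_col="date", value_col="sales")
    if "error" not in result:
        st.metric("ADF p-value", f"{result['trend_statistics']['stationarity']:.4f}")
        st.image(base64.b64decode(result["visualization"]))
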
 
+class HypothesisTester(DataAnalyzer):
+    """Statistical hypothesis testing"""
+    def invoke(self, data: pd.DataFrame, group_col: str, value_col: str, **kwargs) -> Dict[str, Any]:
+        try:
+            groups = data[group_col].unique()
+
+            if len(groups) < 2:
+                return {"error": "Insufficient groups for comparison"}
+
+            if len(groups) == 2:
+                group_data = [data[data[group_col] == g][value_col] for g in groups]
+                stat, p = ttest_ind(*group_data)
+                test_type = "Independent t-test"
+            else:
+                group_data = [data[data[group_col] == g][value_col] for g in groups]
+                stat, p = f_oneway(*group_data)
+                test_type = "ANOVA"
+
+            return {
+                "test_type": test_type,
+                "test_statistic": stat,
+                "p_value": p,
+                "effect_size": {
+                    "cohens_d": abs(group_data[0].mean() - group_data[1].mean()) / np.sqrt(
+                        (group_data[0].var() + group_data[1].var()) / 2
+                    ) if len(groups) == 2 else None
+                },
+                "interpretation": self.interpret_p_value(p)
+            }
+        except Exception as e:
+            return {"error": f"Hypothesis Testing Failed: {str(e)}"}
+
+    def interpret_p_value(self, p: float) -> str:
+        if p < 0.001: return "Very strong evidence against H0"
+        elif p < 0.01: return "Strong evidence against H0"
+        elif p < 0.05: return "Evidence against H0"
+        elif p < 0.1: return "Weak evidence against H0"
+        else: return "No significant evidence against H0"
+
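
The tester picks the test from the group count: an independent t-test for exactly two groups, one-way ANOVA otherwise, with Cohen's d computed as |m1 - m2| / sqrt((s1^2 + s2^2) / 2) in the two-group case. A usage sketch with hypothetical column names:

    result = HypothesisTester().invoke(data=df, group_col="region", value_col="sales")
    if "error" not in result:
        print(result["test_type"], round(result["p_value"], 4), result["interpretation"])
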
+class LogisticRegressionTrainer(DataAnalyzer):
+    """Logistic Regression Model Trainer"""
+    def invoke(self, data: pd.DataFrame, target_col: str, columns: List[str], **kwargs) -> Dict[str, Any]:
+        try:
+            X = data[columns]
+            y = data[target_col]
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+            model = LogisticRegression(max_iter=1000)
+            model.fit(X_train, y_train)
+            y_pred = model.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            return {
+                "model_type": "Logistic Regression",
+                "accuracy": accuracy,
+                "model_params": model.get_params()
+            }
+        except Exception as e:
+            return {"error": f"Logistic Regression Model Error: {str(e)}"}
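
A usage sketch, assuming a binary target churned and numeric features age and tenure (all hypothetical names); the feature list arrives through the columns field declared on ResearchInput:

    trainer = LogisticRegressionTrainer()
    result = trainer.invoke(data=df, target_col="churned", columns=["age", "tenure"])
    if "error" not in result:
        st.metric("Hold-out accuracy", f"{result['accuracy']:.2%}")
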
 
+# ---------------------- Groq Research Agent ---------------------------
+
+class GroqResearcher:
+    """Advanced AI Research Engine using Groq"""
+    def __init__(self, model_name="mixtral-8x7b-32768"):
+        self.model_name = model_name
+        self.system_template = """You are a senior data scientist at a research institution.
+        Analyze this dataset with rigorous statistical methods and provide academic-quality insights:
+        {dataset_info}
+
+        User Question: {query}
+
+        Required Format:
+        - Executive Summary (1 paragraph)
+        - Methodology (bullet points)
+        - Key Findings (numbered list)
+        - Limitations
+        - Recommended Next Steps"""
+
+    def research(self, query: str, data: pd.DataFrame) -> str:
+        """Conduct academic-level analysis using Groq"""
+        try:
+            dataset_info = f"""
+            Dataset Dimensions: {data.shape}
+            Variables: {', '.join(data.columns)}
+            Temporal Coverage: {data.select_dtypes(include='datetime').columns.tolist()}
+            Missing Values: {data.isnull().sum().to_dict()}
+            """
+
+            prompt = PromptTemplate.from_template(self.system_template).format(
+                dataset_info=dataset_info,
+                query=query
+            )
+
+            completion = client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": "You are a research AI assistant"},
+                    {"role": "user", "content": prompt}
+                ],
+                model=self.model_name,
+                temperature=0.2,
+                max_tokens=4096,
+                stream=False
+            )
+
+            return completion.choices[0].message.content
+
+        except Exception as e:
+            return f"Research Error: {str(e)}"
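
A usage sketch; the module-level client requires GROQ_API_KEY to be set in the environment:

    researcher = GroqResearcher()  # defaults to mixtral-8x7b-32768
    answer = researcher.research("Which variables best explain sales?", df)
    st.markdown(answer)  # the structured report renders well as Markdown
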
 # ---------------------- Business Logic Layer ---------------------------
 class BusinessRule(BaseModel):
     name: str

@@ -228,56 +352,93 @@ class AutomatedReports():
         st.subheader(f"Data: {df_name}")
         st.table(df)
 
+# ---------------------- Data Acquisition Layer ---------------------------
+class DataSource(ABC):
+    """Base class for data sources."""
+    @abstractmethod
+    def connect(self) -> None:
+        """Connect to the data source."""
+        pass
+
+    @abstractmethod
+    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
+        """Fetch the data based on a specific query."""
+        pass
+class CSVDataSource(DataSource):
+    """Data source for CSV files."""
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+        self.data: Optional[pd.DataFrame] = None
+
+    def connect(self):
+        self.data = pd.read_csv(self.file_path)
+
+    def fetch_data(self, query: str = None, **kwargs) -> pd.DataFrame:
+        if self.data is None:
+            raise Exception("No connection is made, call connect()")
+        return self.data
+
+class DatabaseSource(DataSource):
+    def __init__(self, connection_string: str, database_type: str):
+        self.connection_string = connection_string
+        self.database_type = database_type
+        self.connection = None
+
+    def connect(self):
+        if self.database_type.lower() == "sql":
+            # Placeholder for the actual database connection
+            self.connection = "Connected to SQL Database"
+        else:
+            raise Exception(f"Database type '{self.database_type}' is not supported")
+
+    def fetch_data(self, query: str, **kwargs) -> pd.DataFrame:
+        if self.connection is None:
+            raise Exception("No connection is made, call connect()")
+        # Placeholder for the data fetching
+        return pd.DataFrame({"result": [f"Fetched data based on query: {query}"]})
+
 
+class DataIngestion:
+    def __init__(self):
+        self.sources: Dict[str, DataSource] = {}
+
+    def add_source(self, source_name: str, source: DataSource):
+        self.sources[source_name] = source
+
+    def ingest_data(self, source_name: str, query: str = None, **kwargs) -> pd.DataFrame:
+        if source_name not in self.sources:
+            raise Exception(f"Source '{source_name}' not found")
+        source = self.sources[source_name]
+        source.connect()
+        return source.fetch_data(query, **kwargs)
+
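
A usage sketch wiring a CSV file through the registry (sales.csv is a hypothetical path):

    ingestion = DataIngestion()
    ingestion.add_source("sales", CSVDataSource("sales.csv"))
    df = ingestion.ingest_data("sales")

Note that ingest_data calls connect() on every invocation, so a CSVDataSource re-reads its file each time; simple, but worth caching for large files.
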
+class DataModel(BaseModel):
+    name: str
+    kpis: List[str] = Field(default_factory=list)
+    dimensions: List[str] = Field(default_factory=list)
+    custom_calculations: Optional[Dict[str, str]] = None
+    relations: Optional[Dict[str, str]] = None  # Example: {table1: table2}
+
+    def to_json(self):
+        return json.dumps(self.dict())
+
+    @staticmethod
+    def from_json(json_str):
+        return DataModel(**json.loads(json_str))
+
+class DataModelling():
+    def __init__(self):
+        self.models: Dict[str, DataModel] = {}
+
+    def add_model(self, model: DataModel):
+        self.models[model.name] = model
+
+    def get_model(self, model_name: str) -> DataModel:
+        if model_name not in self.models:
+            raise Exception(f"Model '{model_name}' not found")
+        return self.models[model_name]
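
A round-trip sketch for the model registry (to_json/from_json rely on pydantic v1's .dict(), which v2 deprecates in favor of model_dump):

    model = DataModel(name="sales_model", kpis=["revenue"], dimensions=["region"])
    registry = DataModelling()
    registry.add_model(model)
    # pydantic models compare by field values, so the JSON round trip is lossless:
    assert DataModel.from_json(model.to_json()) == registry.get_model("sales_model")
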
 # ---------------------- Main Streamlit Application ---------------------------
 def main():
     st.set_page_config(page_title="AI BI Automation Platform", layout="wide")