girishwangikar committed on
Commit
e5df187
·
verified ·
1 Parent(s): f65d1c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -6
app.py CHANGED
@@ -35,7 +35,20 @@ class GroqLLM:
35
 
36
  @tool
37
  def analyze_basic_stats(data: pd.DataFrame) -> str:
38
- """Calculate basic statistical measures for numerical columns in the dataset."""
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  stats = {}
40
  numeric_cols = data.select_dtypes(include=[np.number]).columns
41
 
@@ -52,7 +65,20 @@ def analyze_basic_stats(data: pd.DataFrame) -> str:
52
 
53
  @tool
54
  def generate_correlation_matrix(data: pd.DataFrame) -> str:
55
- """Generate a visual correlation matrix for numerical columns in the dataset."""
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  numeric_data = data.select_dtypes(include=[np.number])
57
 
58
  plt.figure(figsize=(10, 8))
@@ -66,7 +92,20 @@ def generate_correlation_matrix(data: pd.DataFrame) -> str:
66
 
67
  @tool
68
  def analyze_categorical_columns(data: pd.DataFrame) -> str:
69
- """Analyze categorical columns in the dataset for distribution and frequencies."""
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  categorical_cols = data.select_dtypes(include=['object', 'category']).columns
71
  analysis = {}
72
 
@@ -81,7 +120,20 @@ def analyze_categorical_columns(data: pd.DataFrame) -> str:
81
 
82
  @tool
83
  def suggest_features(data: pd.DataFrame) -> str:
84
- """Suggest potential feature engineering steps based on data characteristics."""
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  suggestions = []
86
  numeric_cols = data.select_dtypes(include=[np.number]).columns
87
  categorical_cols = data.select_dtypes(include=['object', 'category']).columns
@@ -124,11 +176,11 @@ def main():
124
  st.session_state['data'] = data
125
  st.session_state['file_uploaded'] = True
126
 
127
- # Initialize agent with GroqLLM instead of GroqModel
128
  st.session_state['agent'] = CodeAgent(
129
  tools=[analyze_basic_stats, generate_correlation_matrix,
130
  analyze_categorical_columns, suggest_features],
131
- model=GroqLLM(), # Fixed: Using GroqLLM instead of GroqModel
132
  additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
133
  )
134
 
 
35
 
36
  @tool
37
  def analyze_basic_stats(data: pd.DataFrame) -> str:
38
+ """Calculate basic statistical measures for numerical columns in the dataset.
39
+
40
+ This function computes fundamental statistical metrics including mean, median,
41
+ standard deviation, skewness, and counts of missing values for all numerical
42
+ columns in the provided DataFrame.
43
+
44
+ Args:
45
+ data: A pandas DataFrame containing the dataset to analyze. The DataFrame
46
+ should contain at least one numerical column for meaningful analysis.
47
+
48
+ Returns:
49
+ str: A string containing formatted basic statistics for each numerical column,
50
+ including mean, median, standard deviation, skewness, and missing value counts.
51
+ """
52
  stats = {}
53
  numeric_cols = data.select_dtypes(include=[np.number]).columns
54
 
 
65
 
66
  @tool
67
  def generate_correlation_matrix(data: pd.DataFrame) -> str:
68
+ """Generate a visual correlation matrix for numerical columns in the dataset.
69
+
70
+ This function creates a heatmap visualization showing the correlations between
71
+ all numerical columns in the dataset. The correlation values are displayed
72
+ using a color-coded matrix for easy interpretation.
73
+
74
+ Args:
75
+ data: A pandas DataFrame containing the dataset to analyze. The DataFrame
76
+ should contain at least two numerical columns for correlation analysis.
77
+
78
+ Returns:
79
+ str: A base64 encoded string representing the correlation matrix plot image,
80
+ which can be displayed in a web interface or saved as an image file.
81
+ """
82
  numeric_data = data.select_dtypes(include=[np.number])
83
 
84
  plt.figure(figsize=(10, 8))
 
92
 
93
  @tool
94
  def analyze_categorical_columns(data: pd.DataFrame) -> str:
95
+ """Analyze categorical columns in the dataset for distribution and frequencies.
96
+
97
+ This function examines categorical columns to identify unique values, top categories,
98
+ and missing value counts, providing insights into the categorical data distribution.
99
+
100
+ Args:
101
+ data: A pandas DataFrame containing the dataset to analyze. The DataFrame
102
+ should contain at least one categorical column (object or category dtype)
103
+ for meaningful analysis.
104
+
105
+ Returns:
106
+ str: A string containing formatted analysis results for each categorical column,
107
+ including unique value counts, top categories, and missing value counts.
108
+ """
109
  categorical_cols = data.select_dtypes(include=['object', 'category']).columns
110
  analysis = {}
111
 
 
120
 
121
  @tool
122
  def suggest_features(data: pd.DataFrame) -> str:
123
+ """Suggest potential feature engineering steps based on data characteristics.
124
+
125
+ This function analyzes the dataset's structure and statistical properties to
126
+ recommend possible feature engineering steps that could improve model performance.
127
+
128
+ Args:
129
+ data: A pandas DataFrame containing the dataset to analyze. The DataFrame
130
+ can contain both numerical and categorical columns for feature
131
+ engineering suggestions.
132
+
133
+ Returns:
134
+ str: A string containing line-separated suggestions for feature engineering,
135
+ based on the characteristics of the input data.
136
+ """
137
  suggestions = []
138
  numeric_cols = data.select_dtypes(include=[np.number]).columns
139
  categorical_cols = data.select_dtypes(include=['object', 'category']).columns
 
176
  st.session_state['data'] = data
177
  st.session_state['file_uploaded'] = True
178
 
179
+ # Initialize agent with GroqLLM
180
  st.session_state['agent'] = CodeAgent(
181
  tools=[analyze_basic_stats, generate_correlation_matrix,
182
  analyze_categorical_columns, suggest_features],
183
+ model=GroqLLM(),
184
  additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
185
  )
186