Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -35,7 +35,20 @@ class GroqLLM:
|
|
35 |
|
36 |
@tool
|
37 |
def analyze_basic_stats(data: pd.DataFrame) -> str:
|
38 |
-
"""Calculate basic statistical measures for numerical columns in the dataset.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
stats = {}
|
40 |
numeric_cols = data.select_dtypes(include=[np.number]).columns
|
41 |
|
@@ -52,7 +65,20 @@ def analyze_basic_stats(data: pd.DataFrame) -> str:
|
|
52 |
|
53 |
@tool
|
54 |
def generate_correlation_matrix(data: pd.DataFrame) -> str:
|
55 |
-
"""Generate a visual correlation matrix for numerical columns in the dataset.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
numeric_data = data.select_dtypes(include=[np.number])
|
57 |
|
58 |
plt.figure(figsize=(10, 8))
|
@@ -66,7 +92,20 @@ def generate_correlation_matrix(data: pd.DataFrame) -> str:
|
|
66 |
|
67 |
@tool
|
68 |
def analyze_categorical_columns(data: pd.DataFrame) -> str:
|
69 |
-
"""Analyze categorical columns in the dataset for distribution and frequencies.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
|
71 |
analysis = {}
|
72 |
|
@@ -81,7 +120,20 @@ def analyze_categorical_columns(data: pd.DataFrame) -> str:
|
|
81 |
|
82 |
@tool
|
83 |
def suggest_features(data: pd.DataFrame) -> str:
|
84 |
-
"""Suggest potential feature engineering steps based on data characteristics.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
suggestions = []
|
86 |
numeric_cols = data.select_dtypes(include=[np.number]).columns
|
87 |
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
|
@@ -124,11 +176,11 @@ def main():
|
|
124 |
st.session_state['data'] = data
|
125 |
st.session_state['file_uploaded'] = True
|
126 |
|
127 |
-
# Initialize agent with GroqLLM
|
128 |
st.session_state['agent'] = CodeAgent(
|
129 |
tools=[analyze_basic_stats, generate_correlation_matrix,
|
130 |
analyze_categorical_columns, suggest_features],
|
131 |
-
model=GroqLLM(),
|
132 |
additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
|
133 |
)
|
134 |
|
|
|
35 |
|
36 |
@tool
|
37 |
def analyze_basic_stats(data: pd.DataFrame) -> str:
|
38 |
+
"""Calculate basic statistical measures for numerical columns in the dataset.
|
39 |
+
|
40 |
+
This function computes fundamental statistical metrics including mean, median,
|
41 |
+
standard deviation, skewness, and counts of missing values for all numerical
|
42 |
+
columns in the provided DataFrame.
|
43 |
+
|
44 |
+
Args:
|
45 |
+
data: A pandas DataFrame containing the dataset to analyze. The DataFrame
|
46 |
+
should contain at least one numerical column for meaningful analysis.
|
47 |
+
|
48 |
+
Returns:
|
49 |
+
str: A string containing formatted basic statistics for each numerical column,
|
50 |
+
including mean, median, standard deviation, skewness, and missing value counts.
|
51 |
+
"""
|
52 |
stats = {}
|
53 |
numeric_cols = data.select_dtypes(include=[np.number]).columns
|
54 |
|
|
|
65 |
|
66 |
@tool
|
67 |
def generate_correlation_matrix(data: pd.DataFrame) -> str:
|
68 |
+
"""Generate a visual correlation matrix for numerical columns in the dataset.
|
69 |
+
|
70 |
+
This function creates a heatmap visualization showing the correlations between
|
71 |
+
all numerical columns in the dataset. The correlation values are displayed
|
72 |
+
using a color-coded matrix for easy interpretation.
|
73 |
+
|
74 |
+
Args:
|
75 |
+
data: A pandas DataFrame containing the dataset to analyze. The DataFrame
|
76 |
+
should contain at least two numerical columns for correlation analysis.
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
str: A base64 encoded string representing the correlation matrix plot image,
|
80 |
+
which can be displayed in a web interface or saved as an image file.
|
81 |
+
"""
|
82 |
numeric_data = data.select_dtypes(include=[np.number])
|
83 |
|
84 |
plt.figure(figsize=(10, 8))
|
|
|
92 |
|
93 |
@tool
|
94 |
def analyze_categorical_columns(data: pd.DataFrame) -> str:
|
95 |
+
"""Analyze categorical columns in the dataset for distribution and frequencies.
|
96 |
+
|
97 |
+
This function examines categorical columns to identify unique values, top categories,
|
98 |
+
and missing value counts, providing insights into the categorical data distribution.
|
99 |
+
|
100 |
+
Args:
|
101 |
+
data: A pandas DataFrame containing the dataset to analyze. The DataFrame
|
102 |
+
should contain at least one categorical column (object or category dtype)
|
103 |
+
for meaningful analysis.
|
104 |
+
|
105 |
+
Returns:
|
106 |
+
str: A string containing formatted analysis results for each categorical column,
|
107 |
+
including unique value counts, top categories, and missing value counts.
|
108 |
+
"""
|
109 |
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
|
110 |
analysis = {}
|
111 |
|
|
|
120 |
|
121 |
@tool
|
122 |
def suggest_features(data: pd.DataFrame) -> str:
|
123 |
+
"""Suggest potential feature engineering steps based on data characteristics.
|
124 |
+
|
125 |
+
This function analyzes the dataset's structure and statistical properties to
|
126 |
+
recommend possible feature engineering steps that could improve model performance.
|
127 |
+
|
128 |
+
Args:
|
129 |
+
data: A pandas DataFrame containing the dataset to analyze. The DataFrame
|
130 |
+
can contain both numerical and categorical columns for feature
|
131 |
+
engineering suggestions.
|
132 |
+
|
133 |
+
Returns:
|
134 |
+
str: A string containing line-separated suggestions for feature engineering,
|
135 |
+
based on the characteristics of the input data.
|
136 |
+
"""
|
137 |
suggestions = []
|
138 |
numeric_cols = data.select_dtypes(include=[np.number]).columns
|
139 |
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
|
|
|
176 |
st.session_state['data'] = data
|
177 |
st.session_state['file_uploaded'] = True
|
178 |
|
179 |
+
# Initialize agent with GroqLLM
|
180 |
st.session_state['agent'] = CodeAgent(
|
181 |
tools=[analyze_basic_stats, generate_correlation_matrix,
|
182 |
analyze_categorical_columns, suggest_features],
|
183 |
+
model=GroqLLM(),
|
184 |
additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
|
185 |
)
|
186 |
|