Spaces:

girishwangikar
/

SmolAgents_DA

Running

App Files Community

girishwangikar committed on Jan 11

Commit

7ef0257

verified ·

1 Parent(s): 4ed3667

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -101

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ import io
 class GroqLLM:
     """Compatible LLM interface for smolagents CodeAgent"""
-    def __init__(self, model_name="llama-3.1-8B-Instant"):
         self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         self.model_name = model_name
@@ -23,7 +23,6 @@ class GroqLLM:
         try:
             # Handle different prompt formats
             if isinstance(prompt, (dict, list)):
-                # If prompt is a dictionary or list, convert it to a string representation
                 prompt_str = str(prompt)
             else:
                 prompt_str = str(prompt)
@@ -40,17 +39,41 @@ class GroqLLM:
                 stream=False
             )
-            # Extract and return the response content
-            if completion.choices and len(completion.choices) > 0:
-                return completion.choices[0].message.content
-            return "Error: No response generated"
         except Exception as e:
-            # Provide more detailed error handling
             error_msg = f"Error generating response: {str(e)}"
-            print(error_msg)  # Log the error
             return error_msg
 @tool
 def analyze_basic_stats(data: pd.DataFrame) -> str:
     """Calculate basic statistical measures for numerical columns in the dataset.
@@ -67,16 +90,20 @@ def analyze_basic_stats(data: pd.DataFrame) -> str:
         str: A string containing formatted basic statistics for each numerical column,
             including mean, median, standard deviation, skewness, and missing value counts.
     """
     stats = {}
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     for col in numeric_cols:
         stats[col] = {
-            'mean': data[col].mean(),
-            'median': data[col].median(),
-            'std': data[col].std(),
-            'skew': data[col].skew(),
-            'missing': data[col].isnull().sum()
         }
     return str(stats)
@@ -97,6 +124,10 @@ def generate_correlation_matrix(data: pd.DataFrame) -> str:
         str: A base64 encoded string representing the correlation matrix plot image,
             which can be displayed in a web interface or saved as an image file.
     """
     numeric_data = data.select_dtypes(include=[np.number])
     plt.figure(figsize=(10, 8))
@@ -117,21 +148,24 @@ def analyze_categorical_columns(data: pd.DataFrame) -> str:
     Args:
         data: A pandas DataFrame containing the dataset to analyze. The DataFrame
-            should contain at least one categorical column (object or category dtype)
-            for meaningful analysis.
     Returns:
         str: A string containing formatted analysis results for each categorical column,
             including unique value counts, top categories, and missing value counts.
     """
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
     analysis = {}
     for col in categorical_cols:
         analysis[col] = {
-            'unique_values': data[col].nunique(),
             'top_categories': data[col].value_counts().head(5).to_dict(),
-            'missing': data[col].isnull().sum()
         }
     return str(analysis)
@@ -145,13 +179,16 @@ def suggest_features(data: pd.DataFrame) -> str:
     Args:
         data: A pandas DataFrame containing the dataset to analyze. The DataFrame
-            can contain both numerical and categorical columns for feature
-            engineering suggestions.
     Returns:
-        str: A string containing line-separated suggestions for feature engineering,
-            based on the characteristics of the input data.
     """
     suggestions = []
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
@@ -168,106 +205,89 @@ def suggest_features(data: pd.DataFrame) -> str:
     return '\n'.join(suggestions)
-# Initialize session state at the start
-if 'data' not in st.session_state:
-    st.session_state['data'] = None
-if 'file_uploaded' not in st.session_state:
-    st.session_state['file_uploaded'] = False
-if 'processing' not in st.session_state:
-    st.session_state['processing'] = False
-if 'agent' not in st.session_state:
-    st.session_state['agent'] = None
 def main():
     st.title("Data Analysis Assistant")
     st.write("Upload your dataset and get automated analysis with natural language interaction.")
-    # File uploader with error handling
     uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
     try:
-        if uploaded_file is not None and not st.session_state['file_uploaded']:
-            # Show loading spinner while processing the file
             with st.spinner('Loading and processing your data...'):
-                try:
-                    data = pd.read_csv(uploaded_file)
-                    st.session_state['data'] = data
-                    st.session_state['file_uploaded'] = True
-                    # Initialize agent with GroqLLM
-                    st.session_state['agent'] = CodeAgent(
-                        tools=[analyze_basic_stats, generate_correlation_matrix,
-                               analyze_categorical_columns, suggest_features],
-                        model=GroqLLM(),
-                        additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
-                    )
-                    # Show success message
-                    st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
-                    # Display data preview
-                    st.subheader("Data Preview")
-                    st.dataframe(data.head())
-                except Exception as e:
-                    st.error(f"Error loading file: {str(e)}")
-                    st.session_state['file_uploaded'] = False
-                    return
-        # Only show analysis options if data is loaded
-        if st.session_state['file_uploaded'] and st.session_state['data'] is not None:
-            # Analysis options
             analysis_type = st.selectbox(
                 "Choose analysis type",
                 ["Basic Statistics", "Correlation Analysis", "Categorical Analysis",
                  "Feature Engineering", "Custom Question"]
             )
-            # Process analysis with loading indicators
-            if analysis_type:
-                with st.spinner(f'Performing {analysis_type.lower()}...'):
-                    if analysis_type == "Basic Statistics":
-                        result = st.session_state['agent'].run(
-                            f"Analyze and explain the basic statistics of this dataset. "
-                            f"Dataset info: {st.session_state['data'].info()}\n"
-                            f"Use the analyze_basic_stats tool and provide natural language explanations."
-                        )
-                        st.write(result)
-                    elif analysis_type == "Correlation Analysis":
-                        correlation_plot = st.session_state['agent'].run(
-                            "Generate and explain correlations between numerical variables. "
-                            "Use the generate_correlation_matrix tool."
-                        )
-                        if correlation_plot:
-                            st.image(f"data:image/png;base64,{correlation_plot}")
-                    elif analysis_type == "Categorical Analysis":
-                        result = st.session_state['agent'].run(
-                            "Analyze categorical variables in the dataset. "
-                            "Use the analyze_categorical_columns tool and explain the findings."
-                        )
                         st.write(result)
-                    elif analysis_type == "Feature Engineering":
-                        result = st.session_state['agent'].run(
-                            "Suggest potential feature engineering steps for this dataset. "
-                            "Use the suggest_features tool and explain your suggestions."
-                        )
                         st.write(result)
-                    elif analysis_type == "Custom Question":
-                        question = st.text_input("What would you like to know about your data?")
-                        if question:
-                            result = st.session_state['agent'].run(
-                                f"Answer this question about the dataset: {question}\n"
-                                f"Use appropriate tools to analyze and explain."
-                            )
-                            st.write(result)
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
-        st.session_state['file_uploaded'] = False
 if __name__ == "__main__":
     main()

 class GroqLLM:
     """Compatible LLM interface for smolagents CodeAgent"""
+    def __init__(self, model_name="llama2-70b-3.5"):
         self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         self.model_name = model_name
         try:
             # Handle different prompt formats
             if isinstance(prompt, (dict, list)):
                 prompt_str = str(prompt)
             else:
                 prompt_str = str(prompt)
                 stream=False
             )
+            return completion.choices[0].message.content if completion.choices else "Error: No response generated"
         except Exception as e:
             error_msg = f"Error generating response: {str(e)}"
+            print(error_msg)
             return error_msg
+class DataAnalysisAgent(CodeAgent):
+    """Extended CodeAgent with dataset awareness"""
+    def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dataset = dataset
+    @property
+    def dataset(self) -> pd.DataFrame:
+        """Access the stored dataset"""
+        return self._dataset
+    def run(self, prompt: str) -> str:
+        """Override run method to include dataset context"""
+        dataset_info = f"""
+        Dataset Shape: {self.dataset.shape}
+        Columns: {', '.join(self.dataset.columns)}
+        Data Types: {self.dataset.dtypes.to_dict()}
+        """
+        enhanced_prompt = f"""
+        Analyze the following dataset:
+        {dataset_info}
+        Task: {prompt}
+        Use the provided tools to analyze this specific dataset and return detailed results.
+        """
+        return super().run(enhanced_prompt)
 @tool
 def analyze_basic_stats(data: pd.DataFrame) -> str:
     """Calculate basic statistical measures for numerical columns in the dataset.
         str: A string containing formatted basic statistics for each numerical column,
             including mean, median, standard deviation, skewness, and missing value counts.
     """
+    # Access dataset from agent if no data provided
+    if data is None:
+        data = tool.agent.dataset
     stats = {}
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     for col in numeric_cols:
         stats[col] = {
+            'mean': float(data[col].mean()),
+            'median': float(data[col].median()),
+            'std': float(data[col].std()),
+            'skew': float(data[col].skew()),
+            'missing': int(data[col].isnull().sum())
         }
     return str(stats)
         str: A base64 encoded string representing the correlation matrix plot image,
             which can be displayed in a web interface or saved as an image file.
     """
+    # Access dataset from agent if no data provided
+    if data is None:
+        data = tool.agent.dataset
     numeric_data = data.select_dtypes(include=[np.number])
     plt.figure(figsize=(10, 8))
     Args:
         data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            should contain at least one categorical column for meaningful analysis.
     Returns:
         str: A string containing formatted analysis results for each categorical column,
             including unique value counts, top categories, and missing value counts.
     """
+    # Access dataset from agent if no data provided
+    if data is None:
+        data = tool.agent.dataset
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
     analysis = {}
     for col in categorical_cols:
         analysis[col] = {
+            'unique_values': int(data[col].nunique()),
             'top_categories': data[col].value_counts().head(5).to_dict(),
+            'missing': int(data[col].isnull().sum())
         }
     return str(analysis)
     Args:
         data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            can contain both numerical and categorical columns.
     Returns:
+        str: A string containing suggestions for feature engineering based on
+            the characteristics of the input data.
     """
+    # Access dataset from agent if no data provided
+    if data is None:
+        data = tool.agent.dataset
     suggestions = []
     numeric_cols = data.select_dtypes(include=[np.number]).columns
     categorical_cols = data.select_dtypes(include=['object', 'category']).columns
     return '\n'.join(suggestions)
 def main():
     st.title("Data Analysis Assistant")
     st.write("Upload your dataset and get automated analysis with natural language interaction.")
+    # Initialize session state
+    if 'data' not in st.session_state:
+        st.session_state['data'] = None
+    if 'agent' not in st.session_state:
+        st.session_state['agent'] = None
     uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
     try:
+        if uploaded_file is not None:
             with st.spinner('Loading and processing your data...'):
+                # Load the dataset
+                data = pd.read_csv(uploaded_file)
+                st.session_state['data'] = data
+                # Initialize the agent with the dataset
+                st.session_state['agent'] = DataAnalysisAgent(
+                    dataset=data,
+                    tools=[analyze_basic_stats, generate_correlation_matrix,
+                           analyze_categorical_columns, suggest_features],
+                    model=GroqLLM(),
+                    additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"]
+                )
+                st.success(f'Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns')
+                st.subheader("Data Preview")
+                st.dataframe(data.head())
+        if st.session_state['data'] is not None:
             analysis_type = st.selectbox(
                 "Choose analysis type",
                 ["Basic Statistics", "Correlation Analysis", "Categorical Analysis",
                  "Feature Engineering", "Custom Question"]
             )
+            if analysis_type == "Basic Statistics":
+                with st.spinner('Analyzing basic statistics...'):
+                    result = st.session_state['agent'].run(
+                        "Use the analyze_basic_stats tool to analyze this dataset and "
+                        "provide insights about the numerical distributions."
+                    )
+                    st.write(result)
+            elif analysis_type == "Correlation Analysis":
+                with st.spinner('Generating correlation matrix...'):
+                    result = st.session_state['agent'].run(
+                        "Use the generate_correlation_matrix tool to analyze correlations "
+                        "and explain any strong relationships found."
+                    )
+                    if isinstance(result, str) and result.startswith('data:image') or ',' in result:
+                        st.image(f"data:image/png;base64,{result.split(',')[-1]}")
+                    else:
                         st.write(result)
+            elif analysis_type == "Categorical Analysis":
+                with st.spinner('Analyzing categorical columns...'):
+                    result = st.session_state['agent'].run(
+                        "Use the analyze_categorical_columns tool to examine the "
+                        "categorical variables and explain the distributions."
+                    )
+                    st.write(result)
+            elif analysis_type == "Feature Engineering":
+                with st.spinner('Generating feature suggestions...'):
+                    result = st.session_state['agent'].run(
+                        "Use the suggest_features tool to recommend potential "
+                        "feature engineering steps for this dataset."
+                    )
+                    st.write(result)
+            elif analysis_type == "Custom Question":
+                question = st.text_input("What would you like to know about your data?")
+                if question:
+                    with st.spinner('Analyzing...'):
+                        result = st.session_state['agent'].run(question)
                         st.write(result)
     except Exception as e:
         st.error(f"An error occurred: {str(e)}")
 if __name__ == "__main__":
     main()