mgbam committed
Commit 102a9b5 · verified · Parent: 06300b8

Update app.py

Files changed (1): app.py (+347 -167)
app.py CHANGED
@@ -1,216 +1,396 @@
 import streamlit as st
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import os
 import base64
 import io
-from groq import Groq
-from langchain.tools import tool
-from langchain.agents import AgentType, initialize_agent
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
-from typing import Optional, Dict, List
-
-# Initialize Groq Client
-client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-
-class GroqAnalyst:
-    """Advanced AI Researcher & Data Analyst using Groq"""
-    def __init__(self, model_name="mixtral-8x7b-32768"):
         self.model_name = model_name
-        self.system_prompt = """
-        You are an expert AI research assistant and data scientist.
-        Provide detailed, technical analysis with professional visualizations.
-        """
 
-    def analyze(self, prompt: str, data: pd.DataFrame) -> str:
-        """Execute complex data analysis using Groq"""
         try:
-            dataset_info = f"""
-            Dataset Shape: {data.shape}
-            Columns: {', '.join(data.columns)}
-            Data Types: {data.dtypes.to_dict()}
-            Sample Data: {data.head(3).to_dict()}
-            """
-
-            completion = client.chat.completions.create(
-                messages=[
-                    {"role": "system", "content": self.system_prompt},
-                    {"role": "user", "content": f"{dataset_info}\n\nTask: {prompt}"}
-                ],
                 model=self.model_name,
-                temperature=0.3,
-                max_tokens=4096,
-                stream=False
             )
-
-            return completion.choices[0].message.content
-
         except Exception as e:
-            return f"Analysis Error: {str(e)}"
 
 @tool
-def advanced_eda(data: pd.DataFrame) -> Dict:
-    """Perform comprehensive exploratory data analysis.
-
     Args:
-        data (pd.DataFrame): Input dataset for analysis
-
     Returns:
-        Dict: Contains statistical summary, missing values, and data quality report
     """
-    analysis = {
-        "statistical_summary": data.describe().to_dict(),
-        "missing_values": data.isnull().sum().to_dict(),
-        "data_quality": {
-            "duplicates": data.duplicated().sum(),
-            "zero_values": (data == 0).sum().to_dict()
         }
-    }
-    return analysis
 
 @tool
-def visualize_distributions(data: pd.DataFrame, columns: List[str]) -> str:
-    """Generate distribution plots for specified numerical columns.
-
     Args:
-        data (pd.DataFrame): Input dataset
-        columns (List[str]): List of numerical columns to visualize
-
     Returns:
-        str: Base64 encoded image of the visualization
     """
-    plt.figure(figsize=(12, 6))
-    for i, col in enumerate(columns, 1):
-        plt.subplot(1, len(columns), i)
-        sns.histplot(data[col], kde=True)
-        plt.title(f'Distribution of {col}')
-    plt.tight_layout()
-
     buf = io.BytesIO()
-    plt.savefig(buf, format='png')
     plt.close()
     return base64.b64encode(buf.getvalue()).decode()
 
 @tool
-def temporal_analysis(data: pd.DataFrame, time_col: str, value_col: str) -> str:
-    """Analyze time series data and generate trend visualization.
-
     Args:
-        data (pd.DataFrame): Dataset containing time series
-        time_col (str): Name of timestamp column
-        value_col (str): Name of value column to analyze
-
     Returns:
-        str: Base64 encoded image of time series plot
     """
-    plt.figure(figsize=(12, 6))
-    data[time_col] = pd.to_datetime(data[time_col])
-    data.set_index(time_col)[value_col].plot()
-    plt.title(f'Temporal Trend of {value_col}')
-    plt.xlabel('Date')
-    plt.ylabel('Value')
-
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    plt.close()
-    return base64.b64encode(buf.getvalue()).decode()
 
 @tool
-def hypothesis_testing(data: pd.DataFrame, group_col: str, value_col: str) -> Dict:
-    """Perform statistical hypothesis testing between groups.
-
     Args:
-        data (pd.DataFrame): Input dataset
-        group_col (str): Categorical column defining groups
-        value_col (str): Numerical column to compare
-
     Returns:
-        Dict: Contains test results, p-value, and conclusion
     """
-    from scipy.stats import ttest_ind
-
-    groups = data[group_col].unique()
-    if len(groups) != 2:
-        return {"error": "Hypothesis testing requires exactly two groups"}
 
-    group1 = data[data[group_col] == groups[0]][value_col]
-    group2 = data[data[group_col] == groups[1]][value_col]
 
-    t_stat, p_value = ttest_ind(group1, group2)
 
-    return {
-        "t_statistic": t_stat,
-        "p_value": p_value,
-        "conclusion": "Significant difference" if p_value < 0.05 else "No significant difference"
-    }
 
 def main():
-    st.title("🔬 AI Research Assistant with Groq")
-    st.markdown("Advanced data analysis powered by Groq's accelerated computing")
-
     # Initialize session state
-    if 'data' not in st.session_state:
-        st.session_state.data = None
-    if 'analyst' not in st.session_state:
-        st.session_state.analyst = GroqAnalyst()
-
-    # File upload section
-    with st.sidebar:
-        st.header("Data Upload")
-        uploaded_file = st.file_uploader("Upload dataset (CSV)", type="csv")
-        if uploaded_file:
-            with st.spinner("Analyzing dataset..."):
-                st.session_state.data = pd.read_csv(uploaded_file)
-                st.success(f"Loaded {len(st.session_state.data)} records")
 
-    # Main analysis interface
-    if st.session_state.data is not None:
-        st.subheader("Dataset Overview")
-        st.dataframe(st.session_state.data.head(), use_container_width=True)
-
-        analysis_type = st.selectbox("Select Analysis Type", [
-            "Exploratory Data Analysis",
-            "Temporal Analysis",
-            "Statistical Testing",
-            "Custom Research Query"
-        ])
-
-        if analysis_type == "Exploratory Data Analysis":
-            with st.expander("Advanced EDA"):
-                eda_result = advanced_eda(st.session_state.data)
-                st.json(eda_result)
 
-            num_cols = st.session_state.data.select_dtypes(include=np.number).columns.tolist()
-            if num_cols:
-                selected_cols = st.multiselect("Select columns for distribution analysis", num_cols)
-                if selected_cols:
-                    img_data = visualize_distributions(st.session_state.data, selected_cols)
-                    st.image(f"data:image/png;base64,{img_data}")
-
-        elif analysis_type == "Temporal Analysis":
-            time_col = st.selectbox("Select time column", st.session_state.data.columns)
-            value_col = st.selectbox("Select value column", st.session_state.data.select_dtypes(include=np.number).columns)
-            if time_col and value_col:
-                img_data = temporal_analysis(st.session_state.data, time_col, value_col)
-                st.image(f"data:image/png;base64,{img_data}")
-
-        elif analysis_type == "Statistical Testing":
-            group_col = st.selectbox("Select group column", st.session_state.data.select_dtypes(include='object').columns)
-            value_col = st.selectbox("Select metric to compare", st.session_state.data.select_dtypes(include=np.number).columns)
-            if group_col and value_col:
-                test_result = hypothesis_testing(st.session_state.data, group_col, value_col)
-                st.json(test_result)
-
-        elif analysis_type == "Custom Research Query":
-            research_query = st.text_area("Enter your research question:")
-            if research_query:
-                with st.spinner("Conducting advanced analysis..."):
-                    result = st.session_state.analyst.analyze(research_query, st.session_state.data)
-                    st.markdown("### Research Findings")
-                    st.markdown(result)
 
 if __name__ == "__main__":
     main()
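For context on the interface this commit removes: GroqAnalyst.analyze() embeds the DataFrame's shape, columns, dtypes, and a three-row sample into a Groq chat prompt and returns the model's reply. A minimal sketch of driving it outside Streamlit (the toy DataFrame is illustrative and GROQ_API_KEY is assumed to be exported; this snippet is not part of the commit):

    import pandas as pd

    # Toy data; any DataFrame works, since analyze() summarizes it for the prompt.
    df = pd.DataFrame({"group": ["a", "a", "b", "b"], "value": [1.0, 2.5, 3.1, 4.0]})

    analyst = GroqAnalyst(model_name="mixtral-8x7b-32768")
    print(analyst.analyze("Compare the two groups and summarize any difference.", df))

The rewritten app.py follows.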
 
 import streamlit as st
 import numpy as np
 import pandas as pd
+from smolagents import CodeAgent, tool
+from typing import Union, List, Dict, Optional
 import matplotlib.pyplot as plt
 import seaborn as sns
 import os
+from groq import Groq
+from dataclasses import dataclass
+import tempfile
 import base64
 import io
+import json
+from streamlit_ace import st_ace
+from contextlib import contextmanager
+
+
+class GroqLLM:
+    """Compatible LLM interface for smolagents CodeAgent"""
+
+    def __init__(self, model_name="llama-3.1-8b-instant"):  # Groq model ids are lowercase
+        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         self.model_name = model_name
 
+    def __call__(self, prompt: Union[str, dict, List[Dict]]) -> str:
+        """Make the class callable as required by smolagents"""
         try:
+            # str, dict, and list-of-messages prompts are all coerced to a string
+            prompt_str = str(prompt)
+
+            # Create a properly formatted message
+            completion = self.client.chat.completions.create(
                 model=self.model_name,
+                messages=[{"role": "user", "content": prompt_str}],
+                temperature=0.7,
+                max_tokens=1024,
+                stream=True,  # Enable streaming
             )
+
+            full_response = ""
+            for chunk in completion:
+                if chunk.choices[0].delta.content is not None:
+                    full_response += chunk.choices[0].delta.content
+            return full_response
         except Exception as e:
+            error_msg = f"Error generating response: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+
+class DataAnalysisAgent(CodeAgent):
+    """Extended CodeAgent with dataset awareness"""
+
+    def __init__(self, dataset: pd.DataFrame, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dataset = dataset
+
+    @property
+    def dataset(self) -> pd.DataFrame:
+        """Access the stored dataset"""
+        return self._dataset
+
+    def run(self, prompt: str, **kwargs) -> str:
+        """Override run method to include dataset context"""
+        dataset_info = f"""
+        Dataset Shape: {self.dataset.shape}
+        Columns: {', '.join(self.dataset.columns)}
+        Data Types: {self.dataset.dtypes.to_dict()}
+        """
+        enhanced_prompt = f"""
+        Analyze the following dataset:
+        {dataset_info}

+        Task: {prompt}
+
+        Use the provided tools to analyze this specific dataset and return detailed results.
+        """
+        return super().run(enhanced_prompt, data=self.dataset, **kwargs)  # Pass data as argument
+
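The two classes above can be smoke-tested before any UI is attached: GroqLLM is a plain callable, and DataAnalysisAgent.run() prepends the dataset's shape, columns, and dtypes to whatever task it receives. A minimal sketch, assuming GROQ_API_KEY is exported and that the installed smolagents version accepts a bare callable as the model, as this commit does (the prompt text and DataFrame are illustrative only):

    import pandas as pd

    llm = GroqLLM()  # defaults to Groq's llama-3.1-8b-instant
    print(llm("Reply with the single word: ready"))

    df = pd.DataFrame({"price": [10, 12, 9], "units": [100, 80, 120]})
    agent = DataAnalysisAgent(dataset=df, tools=[], model=llm)
    # run() embeds the shape, columns, and dtypes above into the task prompt.
    print(agent.run("Which column looks like a quantity sold?"))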
 
 @tool
+def analyze_basic_stats(data: pd.DataFrame) -> str:
+    """Calculate basic statistical measures for numerical columns in the dataset.
+
+    This function computes fundamental statistical metrics including mean, median,
+    standard deviation, skewness, and counts of missing values for all numerical
+    columns in the provided DataFrame.
+
     Args:
+        data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            should contain at least one numerical column for meaningful analysis.
+
     Returns:
+        str: A string containing formatted basic statistics for each numerical column,
+            including mean, median, standard deviation, skewness, and missing value counts.
     """
+    stats = {}
+    numeric_cols = data.select_dtypes(include=[np.number]).columns
+
+    for col in numeric_cols:
+        stats[col] = {
+            "mean": float(data[col].mean()),
+            "median": float(data[col].median()),
+            "std": float(data[col].std()),
+            "skew": float(data[col].skew()),
+            "missing": int(data[col].isnull().sum()),
         }
+
+    return str(stats)
+
 
 @tool
+def generate_correlation_matrix(data: pd.DataFrame) -> str:
+    """Generate a visual correlation matrix for numerical columns in the dataset.
+
+    This function creates a heatmap visualization showing the correlations between
+    all numerical columns in the dataset. The correlation values are displayed
+    using a color-coded matrix for easy interpretation.
+
     Args:
+        data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            should contain at least two numerical columns for correlation analysis.
+
     Returns:
+        str: A base64 encoded string representing the correlation matrix plot image,
+            which can be displayed in a web interface or saved as an image file.
     """
+    numeric_data = data.select_dtypes(include=[np.number])
+
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm")
+    plt.title("Correlation Matrix")
+
     buf = io.BytesIO()
+    plt.savefig(buf, format="png")
     plt.close()
     return base64.b64encode(buf.getvalue()).decode()
 
+
 @tool
+def analyze_categorical_columns(data: pd.DataFrame) -> str:
+    """Analyze categorical columns in the dataset for distribution and frequencies.
+
+    This function examines categorical columns to identify unique values, top categories,
+    and missing value counts, providing insights into the categorical data distribution.
+
     Args:
+        data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            should contain at least one categorical column for meaningful analysis.
+
     Returns:
+        str: A string containing formatted analysis results for each categorical column,
+            including unique value counts, top categories, and missing value counts.
     """
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    analysis = {}
+
+    for col in categorical_cols:
+        analysis[col] = {
+            "unique_values": int(data[col].nunique()),
+            "top_categories": data[col].value_counts().head(5).to_dict(),
+            "missing": int(data[col].isnull().sum()),
+        }
+
+    return str(analysis)
+
 
 @tool
+def suggest_features(data: pd.DataFrame) -> str:
+    """Suggest potential feature engineering steps based on data characteristics.
+
+    This function analyzes the dataset's structure and statistical properties to
+    recommend possible feature engineering steps that could improve model performance.
+
     Args:
+        data: A pandas DataFrame containing the dataset to analyze. The DataFrame
+            can contain both numerical and categorical columns.
+
     Returns:
+        str: A string containing suggestions for feature engineering based on
+            the characteristics of the input data.
     """
+    suggestions = []
+    numeric_cols = data.select_dtypes(include=[np.number]).columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+
+    if len(numeric_cols) >= 2:
+        suggestions.append("Consider creating interaction terms between numerical features")
+
+    if len(categorical_cols) > 0:
+        suggestions.append("Consider one-hot encoding for categorical variables")
+
+    for col in numeric_cols:
+        if data[col].skew() > 1 or data[col].skew() < -1:
+            suggestions.append(f"Consider log transformation for {col} due to skewness")
+
+    return "\n".join(suggestions)
+
+
+@tool
+def describe_data(data: pd.DataFrame) -> str:
+    """Generates a comprehensive descriptive statistics report for the entire DataFrame.
+
+    Args:
+        data: A pandas DataFrame containing the dataset to analyze.
+
+    Returns:
+        str: String representation of the descriptive statistics.
+    """
+    return data.describe(include='all').to_string()
+
+
+@tool
+def execute_code(code_string: str, data: pd.DataFrame) -> str:
+    """Executes Python code and returns results as a string.
+
+    Args:
+        code_string (str): Python code to execute.
+        data (pd.DataFrame): The dataframe to use in the code.
+
+    Returns:
+        str: The result of executing the code or an error message.
+    """
+    try:
+        # This dictionary will be available to the executed code
+        local_vars = {"data": data, "pd": pd, "np": np, "plt": plt, "sns": sns}
+
+        # Execute the code with the passed variables
+        exec(code_string, local_vars)
+
+        if "result" in local_vars:
+            if isinstance(local_vars["result"], (pd.DataFrame, pd.Series)):
+                return local_vars["result"].to_string()
+            elif isinstance(local_vars["result"], plt.Figure):
+                buf = io.BytesIO()
+                local_vars["result"].savefig(buf, format='png')
+                plt.close(local_vars["result"])
+                return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"
+            else:
+                return str(local_vars["result"])
+        else:
+            return "Code executed successfully, but no variable called 'result' was assigned."
+
+    except Exception as e:
+        return f"Error executing code: {str(e)}"
+
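execute_code relies on a result-variable convention: the snippet it runs must assign its output to a variable named result, which is then rendered as text, a DataFrame dump, or a data URI for figures. A sketch of exercising the tool directly on a toy DataFrame, assuming smolagents tool wrappers remain directly callable; note that exec() runs arbitrary code, so this tool is only appropriate for trusted input:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # data, pd, np, plt, and sns are injected into the snippet's namespace.
    print(execute_code("result = data['a'].sum()", df))  # -> "6"
    print(execute_code("x = data['a'].sum()", df))       # -> error/no-'result' message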
+
+@st.cache_data
+def load_data(uploaded_file):
+    """Loads data from an uploaded file with caching."""
+    try:
+        if uploaded_file.name.endswith(".csv"):
+            return pd.read_csv(uploaded_file)
+        elif uploaded_file.name.endswith((".xls", ".xlsx")):
+            return pd.read_excel(uploaded_file)
+        elif uploaded_file.name.endswith(".json"):
+            return pd.read_json(uploaded_file)
+        else:
+            raise ValueError(
+                "Unsupported file format. Please upload a CSV, Excel, or JSON file."
+            )
+    except Exception as e:
+        st.error(f"Error loading data: {e}")
+        return None
+
 
 def main():
+    st.title("Data Analysis Assistant")
+    st.write("Upload your dataset and get automated analysis with natural language interaction.")
+
     # Initialize session state
+    if "data" not in st.session_state:
+        st.session_state["data"] = None
+    if "agent" not in st.session_state:
+        st.session_state["agent"] = None
+    if "custom_code" not in st.session_state:
+        st.session_state["custom_code"] = ""
+
+    uploaded_file = st.file_uploader("Choose a CSV, Excel, or JSON file", type=["csv", "xlsx", "xls", "json"])
 
+    if uploaded_file:
+        with st.spinner("Loading and processing your data..."):
+            data = load_data(uploaded_file)
+            if data is not None:
+                st.session_state["data"] = data
+
+                st.session_state["agent"] = DataAnalysisAgent(
+                    dataset=data,
+                    tools=[
+                        analyze_basic_stats,
+                        generate_correlation_matrix,
+                        analyze_categorical_columns,
+                        suggest_features,
+                        describe_data,
+                        execute_code,
+                    ],
+                    model=GroqLLM(),
+                    additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"],
+                )
+                st.success(
+                    f"Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns"
+                )
+                st.subheader("Data Preview")
+                st.dataframe(data.head())
+
+    if st.session_state["data"] is not None:
+        analysis_type = st.selectbox(
+            "Choose analysis type",
+            [
+                "Basic Statistics",
+                "Correlation Analysis",
+                "Categorical Analysis",
+                "Feature Engineering",
+                "Data Description",
+                "Custom Code",
+                "Custom Question",
+            ],
+        )
+
+        if analysis_type == "Basic Statistics":
+            with st.spinner("Analyzing basic statistics..."):
+                result = st.session_state["agent"].run(
+                    "Use the analyze_basic_stats tool to analyze this dataset and "
+                    "provide insights about the numerical distributions."
+                )
+                st.write(result)
+
+        elif analysis_type == "Correlation Analysis":
+            with st.spinner("Generating correlation matrix..."):
+                result = st.session_state["agent"].run(
+                    "Use the generate_correlation_matrix tool to analyze correlations "
+                    "and explain any strong relationships found."
+                )
+                # The result may be a bare base64 payload or a full data URI
+                if isinstance(result, str) and (result.startswith("data:image") or "," in result):
+                    st.image(f"data:image/png;base64,{result.split(',')[-1]}")
+                else:
+                    st.write(result)
+
+        elif analysis_type == "Categorical Analysis":
+            with st.spinner("Analyzing categorical columns..."):
+                result = st.session_state["agent"].run(
+                    "Use the analyze_categorical_columns tool to examine the "
+                    "categorical variables and explain the distributions."
+                )
+                st.write(result)
+
+        elif analysis_type == "Feature Engineering":
+            with st.spinner("Generating feature suggestions..."):
+                result = st.session_state["agent"].run(
+                    "Use the suggest_features tool to recommend potential "
+                    "feature engineering steps for this dataset."
+                )
+                st.write(result)
+
+        elif analysis_type == "Data Description":
+            with st.spinner("Generating data description..."):
+                result = st.session_state["agent"].run(
+                    "Use the describe_data tool to generate a comprehensive description "
+                    "of the data."
+                )
+                st.write(result)
 
+        elif analysis_type == "Custom Code":
+            st.session_state["custom_code"] = st_ace(
+                placeholder="Enter your Python code here...",
+                language="python",
+                theme="github",
+                key="code_editor",
+                value=st.session_state["custom_code"],
+            )
+            if st.button("Run Code"):
+                with st.spinner("Executing custom code..."):
+                    result = st.session_state["agent"].run(
+                        "Execute the following code and return any 'result' variable:\n"
+                        f"```python\n{st.session_state['custom_code']}\n```"
+                    )
+                    if isinstance(result, str) and result.startswith("data:image"):
+                        st.image(result)
+                    else:
+                        st.write(result)
+
+        elif analysis_type == "Custom Question":
+            question = st.text_input("What would you like to know about your data?")
+            if question:
+                with st.spinner("Analyzing..."):
+                    result = st.session_state["agent"].run(question, stream=True)  # stream=True is forwarded via run()'s **kwargs
+                    st.write(result)
+
 
 if __name__ == "__main__":
     main()
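The wiring that main() performs can also be reproduced headlessly, which is convenient for testing the agent without Streamlit. A minimal sketch; the module import path and the CSV filename are hypothetical, and GROQ_API_KEY must be exported:

    import pandas as pd
    from app import (GroqLLM, DataAnalysisAgent, analyze_basic_stats,
                     generate_correlation_matrix, analyze_categorical_columns,
                     suggest_features, describe_data, execute_code)

    df = pd.read_csv("sales.csv")  # hypothetical input file
    agent = DataAnalysisAgent(
        dataset=df,
        tools=[analyze_basic_stats, generate_correlation_matrix,
               analyze_categorical_columns, suggest_features,
               describe_data, execute_code],
        model=GroqLLM(),
        additional_authorized_imports=["pandas", "numpy", "matplotlib", "seaborn"],
    )
    print(agent.run("Use the suggest_features tool and justify each suggestion."))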