Spaces:

Arxived
/

pandasai

Sleeping

File size: 7,854 Bytes

import logging
import os
import re

import pandas as pd
import plotly.express as px
import streamlit as st
from datasets import load_dataset
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from pandasai import Agent
from pandasai.llm.openai import OpenAI

# Logging setup: root logger at DEBUG, plus a module-scoped logger for this script.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Heading rendered at the top of the Streamlit page.
st.title("Data Analyzer")

# Credentials come from the process environment; each is None when unset.
api_key = os.environ.get("OPENAI_API_KEY")
pandasai_api_key = os.environ.get("PANDASAI_API_KEY")

# Both keys must be present and non-empty; otherwise report the problem in the
# UI and in the log, and leave the rest of the app (the else branch) unrendered.
if not (api_key and pandasai_api_key):
    st.error(
        "API keys not found in the environment. Please set the 'OPENAI_API_KEY' and 'PANDASAI_API_KEY' environment variables."
    )
    logger.error("API keys not found. Ensure they are set in the environment variables.")
else:
    def load_dataset_into_session():
        """Render the dataset-source picker and load the chosen data.

        Shows a radio selector with three sources (a CSV bundled in the repo,
        a Hugging Face dataset, or an uploaded CSV). On a successful load the
        resulting DataFrame is stored in ``st.session_state.df`` and its first
        ten rows are previewed. Any failure is shown in the UI and logged; it
        is never raised to the caller.

        Returns:
            None. The result is communicated through ``st.session_state.df``.
        """
        source_choice = st.radio(
            "Select Dataset Input:",
            ["Use Repo Dataset", "Use Hugging Face Dataset", "Upload CSV File"],
        )

        if source_choice == "Use Repo Dataset":
            # Source 1: fixed CSV path inside the repository.
            file_path = "./source/test.csv"
            if not st.button("Load Repo Dataset"):
                return
            try:
                st.session_state.df = pd.read_csv(file_path)
                st.success(f"File loaded successfully from '{file_path}'!")
                st.dataframe(st.session_state.df.head(10))
            except Exception as e:
                st.error(f"Error reading file from path: {e}")
                logger.error(f"Error reading file from path: {e}")

        elif source_choice == "Use Hugging Face Dataset":
            # Source 2: the "train" split of a named Hugging Face dataset.
            dataset_name = st.text_input("Enter Hugging Face Dataset Name:", value="HUPD/hupd")
            if not st.button("Load Hugging Face Dataset"):
                return
            try:
                hf_data = load_dataset(dataset_name, split="train", trust_remote_code=True)

                # Plain dicts/lists go through the DataFrame constructor; anything
                # exposing to_pandas() converts itself; everything else is rejected.
                if isinstance(hf_data, (dict, list)):
                    frame = pd.DataFrame(hf_data)
                elif hasattr(hf_data, "to_pandas"):
                    frame = hf_data.to_pandas()
                else:
                    raise ValueError("Invalid input data. Cannot convert it to a DataFrame.")
                st.session_state.df = frame

                st.success(f"Dataset '{dataset_name}' loaded successfully!")
                st.dataframe(st.session_state.df.head(10))
            except Exception as e:
                st.error(f"Error loading dataset from Hugging Face: {e}")
                logger.error(f"Error loading Hugging Face dataset: {e}")

        elif source_choice == "Upload CSV File":
            # Source 3: a CSV supplied through the file-upload widget.
            upload = st.file_uploader("Upload CSV File:", type=["csv"])
            if not upload:
                return
            try:
                st.session_state.df = pd.read_csv(upload)
                st.success("File uploaded successfully!")
                st.dataframe(st.session_state.df.head(10))
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")
                logger.error(f"Error reading uploaded file: {e}")

    # Make sure the "df" slot exists before anything reads it.
    if "df" not in st.session_state:
        st.session_state.df = None

    # Draw the dataset picker; on success it stores a DataFrame in session state.
    load_dataset_into_session()

    # Everything below requires a loaded DataFrame.
    if st.session_state.df is not None:
        df = st.session_state.df
        try:
            # PandasAI agent used by the analysis and visualization tabs.
            # NOTE(review): pandasai's OpenAI wrapper appears to take `api_token`,
            # not `api_key`, and is handed the PANDASAI key here - confirm which
            # credential and keyword are intended before changing this call.
            llm = OpenAI(api_key=pandasai_api_key, max_tokens=1500, timeout=60)
            agent = Agent(df, llm=llm)

            # One Document per row: "col: value" pairs for every non-null cell,
            # with the row's index label kept as metadata.
            documents = [
                Document(
                    page_content=", ".join(
                        f"{col}: {row[col]}" for col in df.columns if pd.notnull(row[col])
                    ),
                    metadata={"index": index},
                )
                for index, row in df.iterrows()
            ]
            logger.info("%d documents created for RAG.", len(documents))

            # Retrieval-augmented QA chain over a FAISS index of the row documents.
            # NOTE(review): this runs at script level, so the whole frame is
            # presumably re-embedded on every Streamlit rerun - consider caching.
            embeddings = OpenAIEmbeddings()
            vectorstore = FAISS.from_documents(documents, embeddings)
            retriever = vectorstore.as_retriever()
            qa_chain = RetrievalQA.from_chain_type(
                llm=ChatOpenAI(),
                chain_type="stuff",
                retriever=retriever,
            )

            tab1, tab2, tab3 = st.tabs(["PandasAI Analysis", "RAG QA", "Data Visualization"])

            # Tab 1: free-form questions answered by the PandasAI agent.
            with tab1:
                st.header("Data Analysis using PandasAI")
                pandas_question = st.text_input("Ask a question about the data (PandasAI):")
                if pandas_question:
                    try:
                        result = agent.chat(pandas_question)
                        # Test explicitly for "no answer": truth-testing a DataFrame
                        # raises ValueError, and a legitimate answer of 0 or False
                        # must not be reported as missing.
                        is_blank_text = isinstance(result, str) and not result.strip()
                        if result is not None and not is_blank_text:
                            st.write("PandasAI Answer:", result)
                        else:
                            st.warning("PandasAI returned no result. Please try another question.")
                    except Exception as e:
                        st.error(f"Error from PandasAI: {e}")
                        logger.exception("PandasAI error: %s", e)

            # Tab 2: questions answered by the retrieval QA chain.
            with tab2:
                st.header("Question Answering using RAG")
                rag_question = st.text_input("Ask a question about the data (RAG):")
                if rag_question:
                    try:
                        result = qa_chain.run(rag_question)
                        st.write("RAG Answer:", result)
                    except Exception as e:
                        st.error(f"Error from RAG Q&A: {e}")
                        logger.exception("RAG error: %s", e)

            # Tab 3: ask the agent for plotting code and render the resulting figure.
            with tab3:
                st.header("Data Visualization")
                viz_question = st.text_input("What kind of graph would you like to create? (e.g., 'Show a scatter plot of salary vs experience')")
                if viz_question:
                    try:
                        result = agent.chat(viz_question)

                        # Look for a fenced python snippet in the reply. The reply is
                        # coerced to text because re.search raises TypeError on
                        # non-string input.
                        code_pattern = r'```python\n(.*?)\n```'
                        code_match = re.search(code_pattern, str(result), re.DOTALL)

                        if code_match:
                            viz_code = code_match.group(1)
                            logger.debug("Extracted visualization code: %s", viz_code)

                            # Crude rewrite of matplotlib-style calls to plotly express.
                            viz_code = viz_code.replace('plt.', 'px.')

                            # SECURITY: this executes model-generated code in-process.
                            # The explicit namespace only limits which names the snippet
                            # sees and keeps its assignments out of module globals; it
                            # is NOT a sandbox. Do not expose this to untrusted users.
                            exec_scope = {"df": df, "pd": pd, "px": px, "st": st}
                            exec(viz_code, exec_scope)

                            # The snippet is expected to leave its figure in `fig`.
                            fig = exec_scope.get("fig")
                            if fig is None:
                                st.warning("Unable to generate a graph. Please try a different query.")
                                logger.warning("Visualization code ran but did not define 'fig'.")
                            else:
                                st.plotly_chart(fig)
                        else:
                            st.warning("Unable to generate a graph. Please try a different query.")
                            logger.warning("No valid visualization code found in PandasAI response.")
                    except Exception as e:
                        st.error(f"An error occurred: {e}")
                        logger.exception("Visualization error: %s", e)
        except Exception as e:
            # Top-level boundary for agent/RAG setup failures: report and log.
            st.error(f"An error occurred while processing the dataset: {e}")
            logger.exception("Dataset processing error: %s", e)