# File size: 4,809 Bytes
# d7bf121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import streamlit as st
import pandas as pd
import plotly.express as px
from pandasai import Agent
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from datasets import load_dataset  
import os

# Page heading displayed at the top of the app.
st.title("PandasAI Data Analysis Tool with RAG")

# Read both API keys from the process environment. Each value is None when
# its variable is unset; the values are checked at the end of the script.
api_key = os.environ.get("OPENAI_API_KEY")
pandasai_api_key = os.environ.get("PANDASAI_API_KEY")

# Sidebar: let the user pick where the dataset comes from.
with st.sidebar:
    st.header("Dataset Input Options")
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Hugging Face Dataset", "Upload CSV File"],
    )

# Ensure the "df" slot exists in session state before anything reads it;
# None means "no dataset loaded yet".
if "df" not in st.session_state:
    st.session_state["df"] = None

# Dataset loading logic.
# Whichever source is chosen, the resulting DataFrame is stored in
# st.session_state.df, which the rest of the script reads.
if input_option == "Use Hugging Face Dataset":
    dataset_name = st.sidebar.text_input("Enter Hugging Face Dataset Name:", value="HUPD/hupd")
    # Not every dataset defines a "sample" configuration, so the configuration
    # is user-editable. It defaults to "sample" (what the default dataset name
    # above is loaded with); a blank value is passed as None so that
    # load_dataset picks the dataset's default configuration.
    dataset_config = st.sidebar.text_input(
        "Dataset Configuration (leave blank for default):", value="sample"
    )
    if st.sidebar.button("Load Dataset"):
        try:
            # NOTE(security): trust_remote_code=True lets the dataset
            # repository's loading script execute on this machine. The
            # dataset name is free-form user input, so only run this app
            # for users you trust.
            dataset = load_dataset(
                dataset_name.strip(),
                name=dataset_config.strip() or None,
                split="train",
                trust_remote_code=True,
            )
            st.session_state.df = pd.DataFrame(dataset)
            st.sidebar.success(f"Dataset '{dataset_name}' loaded successfully!")
        except Exception as e:
            # Top-level UI boundary: report any loading failure in the sidebar
            # instead of crashing the app.
            st.sidebar.error(f"Error loading dataset: {e}")
elif input_option == "Upload CSV File":
    uploaded_file = st.sidebar.file_uploader("Upload CSV File:", type=["csv"])
    if uploaded_file is not None:
        try:
            st.session_state.df = pd.read_csv(uploaded_file)
            st.sidebar.success("File uploaded successfully!")
        except Exception as e:
            # Malformed or empty CSV files end up here.
            st.sidebar.error(f"Error loading file: {e}")

def _get_qa_chain(df):
    """Return a RetrievalQA chain over the rows of *df*.

    Every row is rendered as one "col: value, col: value, ..." document,
    embedded with OpenAIEmbeddings and indexed in FAISS. Building the index
    embeds the whole frame, so the finished chain is kept in
    st.session_state and reused for as long as the very same DataFrame
    object is passed in again. A different object always triggers a rebuild,
    so a stale index is never served.

    Raises whatever the embedding / vector-store libraries raise (for example
    when the OpenAI key is missing or *df* has no rows).
    """
    cached = st.session_state.get("_rag_cache")
    if cached is not None and cached[0] is df:
        return cached[1]

    documents = [
        Document(
            page_content=", ".join(f"{col}: {value}" for col, value in row.items()),
            metadata={"index": index},
        )
        for index, row in df.iterrows()
    ]
    vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )
    st.session_state["_rag_cache"] = (df, qa_chain)
    return qa_chain


# Show the loaded dataframe preview and the analysis tabs
if st.session_state.df is not None:
    df = st.session_state.df

    st.subheader("Dataset Preview")
    st.dataframe(df.head(10))

    # Set up PandasAI Agent
    agent = Agent(df)

    # Create tabs for different functionality
    tab1, tab2, tab3 = st.tabs(["PandasAI Analysis", "RAG Q&A", "Data Visualization"])

    with tab1:
        st.header("Data Analysis with PandasAI")
        pandas_question = st.text_input("Ask a question about your data (PandasAI):")
        if pandas_question:
            # Same error reporting as the visualization tab: show the failure
            # in the page rather than a traceback.
            try:
                result = agent.chat(pandas_question)
            except Exception as e:
                st.write(f"An error occurred: {str(e)}")
            else:
                st.write("PandasAI Answer:", result)

    with tab2:
        st.header("Q&A with RAG")
        rag_question = st.text_input("Ask a question about your data (RAG):")
        if rag_question:
            # The index is only built once a RAG question is actually asked,
            # so using the other tabs does not trigger embedding calls.
            try:
                result = _get_qa_chain(df).run(rag_question)
            except Exception as e:
                st.write(f"An error occurred: {str(e)}")
            else:
                st.write("RAG Answer:", result)

    with tab3:
        st.header("Data Visualization")
        viz_question = st.text_input("What kind of graph would you like to see? (e.g., 'Show a scatter plot of salary vs experience')")
        if viz_question:
            try:
                result = agent.chat(viz_question)

                # Look for a fenced Python snippet in the answer. The answer
                # is not guaranteed to be text, so anything that is not a
                # string is treated as "no code found".
                import re
                code_pattern = r'```python\n(.*?)\n```'
                code_match = (
                    re.search(code_pattern, result, re.DOTALL)
                    if isinstance(result, str)
                    else None
                )

                fig = None
                if code_match:
                    viz_code = code_match.group(1)
                    # Rewrite matplotlib-style calls to Plotly Express.
                    # 'plt.show()' must be replaced first: once the generic
                    # 'plt.' -> 'px.' rewrite has run, it no longer matches.
                    viz_code = viz_code.replace('plt.show()', 'fig = px.scatter(df, x=x, y=y)')
                    viz_code = viz_code.replace('plt.', 'px.')

                    # NOTE(security): this executes model-generated code.
                    # It runs in its own namespace so it cannot overwrite the
                    # app's variables, but it is NOT sandboxed.
                    namespace = {"df": df, "pd": pd, "px": px}
                    exec(viz_code, namespace)
                    fig = namespace.get("fig")

                if fig is not None:
                    st.plotly_chart(fig)
                else:
                    # No snippet in the answer, or the snippet never set `fig`.
                    st.write("Failed to generate a graph. Please try asking differently.")
            except Exception as e:
                st.write(f"An error occurred: {str(e)}")
                st.write("Please try rephrasing your question.")
else:
    st.warning("No dataset loaded. Please select a dataset input option from the sidebar.")

# Report every API key that was not found in the environment.
# Each entry pairs the value read earlier with the message to show when it
# is missing or empty.
_missing_key_messages = (
    (api_key, "Missing OpenAI API Key in environment variables."),
    (pandasai_api_key, "Missing PandasAI API Key in environment variables."),
)
for _key_value, _message in _missing_key_messages:
    if not _key_value:
        st.error(_message)