File size: 5,711 Bytes
7d0e4c5
 
 
 
f121a60
7d0e4c5
 
 
 
 
 
f121a60
 
 
 
 
7d0e4c5
 
f121a60
7d0e4c5
 
 
 
 
 
 
 
 
f121a60
7d0e4c5
 
 
 
 
f121a60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d0e4c5
f121a60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d0e4c5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import logging
import os
import re

import pandas as pd
import plotly.express as px
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from pandasai import Agent
from pandasai.llm.openai import OpenAI

# Logging setup: the root logger is configured at DEBUG and this script gets
# its own named logger.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Heading rendered at the top of the Streamlit page.
st.title("Data Analyzer on Hugging Face Spaces")

# Credentials come from the process environment; each is None when unset.
api_key = os.environ.get("OPENAI_API_KEY")
pandasai_api_key = os.environ.get("PANDASAI_API_KEY")

# Suffixes recognised as a saved chart image when a visualization answer is a
# file path rather than code.
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")

# Captures the body of the first fenced ```python block in a text response.
_CODE_FENCE = re.compile(r"```python\n(.*?)\n```", re.DOTALL)


def _load_dataframe(uploaded_file):
    """Read the uploaded Excel or CSV file into a DataFrame.

    The extension test is case-insensitive so that a name such as
    ``REPORT.XLSX`` is read as Excel instead of falling through to the CSV
    parser.
    """
    if uploaded_file.name.lower().endswith(".xlsx"):
        return pd.read_excel(uploaded_file)
    return pd.read_csv(uploaded_file)


def _build_documents(df):
    """Turn each row of *df* into a ``Document`` for retrieval.

    A document's text is a comma-separated list of ``column: value`` pairs for
    the row's non-null cells, and the row label is stored under
    ``metadata["index"]``. Rows with no non-null cell are skipped because they
    would produce an empty document with nothing to retrieve.
    """
    documents = []
    for index, row in df.iterrows():
        text = ", ".join(
            f"{col}: {value}" for col, value in row.items() if pd.notnull(value)
        )
        if text:
            documents.append(Document(page_content=text, metadata={"index": index}))
    return documents


def _is_empty_answer(answer):
    """Return True when an answer is ``None`` or blank text.

    A plain truthiness test is not used: it raises on a DataFrame answer and
    would hide legitimate falsy answers such as ``0``.
    """
    if answer is None:
        return True
    return isinstance(answer, str) and not answer.strip()


def _extract_python_code(response):
    """Return the body of the first ```python fence in *response*, or ``None``.

    Non-string responses (for example a DataFrame or a number) cannot contain
    a code fence, so they yield ``None`` instead of raising ``TypeError``.
    """
    if not isinstance(response, str):
        return None
    match = _CODE_FENCE.search(response)
    return match.group(1) if match else None


def _show_chart(response, df):
    """Try to display a chart for a visualization *response*.

    Returns True when something was rendered and False when the response held
    neither an image path nor usable code. Exceptions raised by the generated
    code propagate to the caller.
    """
    # NOTE(review): PandasAI appears to answer some chart requests with the
    # path of an image it saved - confirm against the installed version. The
    # branch only fires for an existing file with an image suffix.
    if (
        isinstance(response, str)
        and response.lower().endswith(_IMAGE_SUFFIXES)
        and os.path.isfile(response)
    ):
        st.image(response)
        return True

    viz_code = _extract_python_code(response)
    if viz_code is None:
        logger.warning("No valid visualization code found in PandasAI response.")
        return False
    logger.debug("Extracted visualization code: %s", viz_code)

    # Translate matplotlib calls to Plotly. 'plt.show()' must be rewritten
    # before the generic 'plt.' prefix, otherwise it has already become
    # 'px.show()' and the specific replacement can never match. The substituted
    # statement relies on the generated code having defined x and y.
    viz_code = viz_code.replace("plt.show()", "fig = px.scatter(df, x=x, y=y)")
    viz_code = viz_code.replace("plt.", "px.")

    # SECURITY: this executes model-generated code that is steered by the
    # user's question. It is confined to a namespace holding only the data and
    # plotting modules, but exec() is not a sandbox; review before exposing
    # the app to untrusted users.
    namespace = {"df": df, "pd": pd, "px": px}
    exec(viz_code, namespace)

    fig = namespace.get("fig")
    if fig is None:
        logger.warning("Visualization code ran but did not define 'fig'.")
        return False
    st.plotly_chart(fig)
    return True


def _render_pandasai_tab(agent):
    """Render the free-form PandasAI question box and its answer."""
    st.header("Data Analysis using PandasAI")
    pandas_question = st.text_input("Ask a question about the data (PandasAI):")
    if not pandas_question:
        return
    try:
        result = agent.chat(pandas_question)
        if _is_empty_answer(result):
            st.warning("PandasAI returned no result. Please try another question.")
        else:
            st.write("PandasAI Answer:", result)
    except Exception as e:
        st.error(f"Error from PandasAI: {e}")
        logger.exception("PandasAI error: %s", e)


def _render_rag_tab(qa_chain):
    """Render the retrieval-augmented question box and its answer."""
    st.header("Question Answering using RAG")
    rag_question = st.text_input("Ask a question about the data (RAG):")
    if not rag_question:
        return
    try:
        result = qa_chain.run(rag_question)
        st.write("RAG Answer:", result)
    except Exception as e:
        st.error(f"Error from RAG Q&A: {e}")
        logger.exception("RAG error: %s", e)


def _render_visualization_tab(agent, df):
    """Render the chart request box and display the resulting chart."""
    st.header("Data Visualization")
    viz_question = st.text_input("What kind of graph would you like to create? (e.g., 'Show a scatter plot of salary vs experience')")
    if not viz_question:
        return
    try:
        response = agent.chat(viz_question)
        if not _show_chart(response, df):
            st.warning("Unable to generate a graph. Please try a different query.")
    except Exception as e:
        st.error(f"An error occurred: {e}")
        logger.exception("Visualization error: %s", e)


def _run_analysis(uploaded_file, pandasai_token):
    """Load the upload, build the PandasAI agent and RAG chain, render the tabs.

    Stops early with a warning when the file yields no rows to index, so the
    vector store is never built from an empty document list.
    """
    df = _load_dataframe(uploaded_file)

    st.write("Data Preview:")
    st.write(df.head())
    logger.info("Uploaded file loaded successfully with shape: %s", df.shape)

    documents = _build_documents(df)
    if not documents:
        st.warning("The uploaded file contains no data rows to analyze.")
        logger.warning("Uploaded file produced no documents; skipping analysis.")
        return
    logger.info("%d documents created for RAG.", len(documents))

    # NOTE(review): PandasAI's OpenAI wrapper documents the credential keyword
    # as api_token; confirm that api_key= is honoured by the installed version.
    llm = OpenAI(api_key=pandasai_token, max_tokens=1500, timeout=60)
    agent = Agent(df, llm=llm)

    # NOTE(review): Streamlit re-runs the script on each interaction, so the
    # embeddings and index are rebuilt every time - consider caching.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )

    tab1, tab2, tab3 = st.tabs(["PandasAI Analysis", "RAG Q&A", "Data Visualization"])
    with tab1:
        _render_pandasai_tab(agent)
    with tab2:
        _render_rag_tab(qa_chain)
    with tab3:
        _render_visualization_tab(agent, df)


if not api_key or not pandasai_api_key:
    st.error(
        "API keys not found in the environment. Please set the 'OPENAI_API_KEY' and 'PANDASAI_API_KEY' environment variables."
    )
    logger.error("API keys not found. Ensure they are set in the environment variables.")
else:
    uploaded_file = st.file_uploader("Upload an Excel or CSV file", type=["xlsx", "csv"])

    if uploaded_file is None:
        st.info("Please upload a file to begin analysis.")
    else:
        # Top-level UI boundary: report any failure to the user and keep the
        # traceback in the log.
        try:
            _run_analysis(uploaded_file, pandasai_api_key)
        except Exception as e:
            st.error(f"An error occurred while processing the file: {e}")
            logger.exception("File processing error: %s", e)