File size: 5,698 Bytes
944593e
 
 
1c7a5e7
7a84307
eef276d
dff1e7c
eef276d
dff1e7c
 
944593e
 
 
 
 
 
 
f2ed4e7
944593e
 
1c7a5e7
944593e
 
69c78ba
944593e
fa8f268
69c78ba
944593e
 
f080dd9
944593e
 
 
 
f080dd9
249a008
f080dd9
 
 
 
1c7a5e7
 
f2ed4e7
1c7a5e7
 
f2ed4e7
eef276d
 
f080dd9
 
 
 
 
f2ed4e7
 
f080dd9
 
eef276d
f080dd9
 
 
 
eef276d
f080dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f90e38
f080dd9
 
eef276d
f080dd9
 
 
 
 
 
 
 
 
 
 
f2ed4e7
f080dd9
f2ed4e7
f080dd9
eef276d
f080dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eef276d
f080dd9
 
 
 
 
 
f2ed4e7
 
 
 
eef276d
f080dd9
11dd106
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import tempfile
from pathlib import Path

import faiss
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.readers.file.paged_csv.base import PagedCSVReader
from llama_index.vector_stores.faiss import FaissVectorStore

# Load environment variables from a local .env file (if present) into os.environ
load_dotenv()
# Fail with a readable message instead of a TypeError: writing a missing
# (None) value back into os.environ raises, and the OpenAI clients configured
# below cannot work without the key anyway.
if not os.getenv("OPENAI_API_KEY"):
    st.error(
        "OPENAI_API_KEY is not set. Add it to your environment or a .env file "
        "and reload the app."
    )
    st.stop()

# Global settings for LlamaIndex
# EMBED_DIMENSION is reused further down when the vector indexes are built,
# so the embedding size and the index size stay in sync.
EMBED_DIMENSION = 512
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

# Streamlit app: upload a CSV, preview it, then query it through two RAG
# stacks (LangChain and LlamaIndex) shown side by side in separate tabs.
st.title("Chat with CSV Files - LangChain vs LlamaIndex")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    # Initialised before the try so the cleanup in ``finally`` is safe even
    # when parsing fails before the temporary file exists.
    temp_file_path = None
    try:
        # Read and preview CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)

        # Both loaders below take a file path, so persist the parsed frame.
        # The handle is closed before pandas writes to the path, which avoids
        # having the same file open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            temp_file_path = temp_file.name
        data.to_csv(temp_file_path, index=False, encoding="utf-8")

        # Tabs for LangChain and LlamaIndex
        tab1, tab2 = st.tabs(["LangChain", "LlamaIndex"])

        # LangChain Tab
        with tab1:
            st.subheader("LangChain Query")
            try:
                # The file was written as UTF-8 above; read it back the same way
                loader = CSVLoader(file_path=temp_file_path, encoding="utf-8")
                docs = loader.load_and_split()

                # Preview the first document chunk
                if docs:
                    st.write("Preview of a document chunk (LangChain):")
                    st.text(docs[0].page_content)

                # Query input
                query = st.text_input("Ask a question about your data (LangChain):")
                if query and not docs:
                    st.warning("The uploaded CSV has no rows to search.")
                elif query:
                    # Build the index only once a question exists, so that
                    # reruns without a query do not trigger embedding calls.
                    # from_documents creates the FAISS index, docstore and id
                    # mapping together; the embedding model/dimension mirror
                    # the LlamaIndex settings so both tabs are comparable.
                    langchain_vector_store = LangChainFAISS.from_documents(
                        docs,
                        OpenAIEmbeddings(
                            model="text-embedding-3-small",
                            dimensions=EMBED_DIMENSION,
                        ),
                    )

                    # LangChain Retrieval Chain
                    retriever = langchain_vector_store.as_retriever()
                    system_prompt = (
                        "You are an assistant for question-answering tasks. "
                        "Use the following pieces of retrieved context to answer "
                        "the question. If you don't know the answer, say that you "
                        "don't know. Use three sentences maximum and keep the "
                        "answer concise.\n\n{context}"
                    )
                    prompt = ChatPromptTemplate.from_messages(
                        [("system", system_prompt), ("human", "{input}")]
                    )
                    question_answer_chain = create_stuff_documents_chain(
                        ChatOpenAI(model="gpt-4o"), prompt
                    )
                    langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)

                    answer = langchain_rag_chain.invoke({"input": query})
                    st.write(f"Answer: {answer['answer']}")
            except Exception as e:
                st.error(f"Error processing with LangChain: {e}")

        # LlamaIndex Tab
        with tab2:
            st.subheader("LlamaIndex Query")
            try:
                # Parse the temporary CSV with PagedCSVReader
                csv_reader = PagedCSVReader()
                docs = csv_reader.load_data(file=Path(temp_file_path))

                # Preview the first document chunk
                if docs:
                    st.write("Preview of a document chunk (LlamaIndex):")
                    st.text(docs[0].text)

                # Query input
                query = st.text_input("Ask a question about your data (LlamaIndex):")
                if query and not docs:
                    st.warning("The uploaded CSV has no rows to search.")
                elif query:
                    # Initialize FAISS Vector Store (built lazily, see LangChain tab)
                    llama_faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                    llama_vector_store = FaissVectorStore(faiss_index=llama_faiss_index)

                    # Create the ingestion pipeline and process the data
                    pipeline = IngestionPipeline(vector_store=llama_vector_store, documents=docs)
                    nodes = pipeline.run()

                    # Create a query engine
                    # NOTE(review): the index is built from the returned nodes, so
                    # retrieval may be served by the index's own default store
                    # rather than the FAISS store filled by the pipeline — confirm
                    # whether FAISS-backed retrieval is intended here.
                    llama_index = VectorStoreIndex(nodes)
                    query_engine = llama_index.as_query_engine(similarity_top_k=3)

                    response = query_engine.query(query)
                    st.write(f"Answer: {response.response}")
            except Exception as e:
                st.error(f"Error processing with LlamaIndex: {e}")

    except Exception as e:
        st.error(f"Error reading uploaded file: {e}")
    finally:
        # Clean up the temporary file on every exit path, not only after the
        # LlamaIndex tab has run.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)