Spaces:

Arxived
/

chat-w-csv

Sleeping

File size: 5,970 Bytes

944593e
 
 
e020ff3
1c7a5e7
7a84307
eef276d
dff1e7c
eef276d
dff1e7c
 
944593e
d47491b
944593e
 
 
 
e020ff3
944593e
f2ed4e7
944593e
 
 
 
8d99061
e020ff3
 
 
8d99061
 
 
 
 
 
fa8f268
8d99061
944593e
 
f080dd9
944593e
 
 
 
f080dd9
249a008
f080dd9
 
 
 
08f6ce3
1c7a5e7
f2ed4e7
1c7a5e7
573b41b
08f6ce3
e020ff3
08f6ce3
 
d47491b
 
 
f2ed4e7
eef276d
 
f080dd9
e020ff3
f080dd9
 
cfb9d35
f080dd9
e020ff3
573b41b
 
 
 
8d99061
e020ff3
 
8d99061
 
 
 
f080dd9
e020ff3
 
cfb9d35
f080dd9
8d99061
f080dd9
d47491b
 
f080dd9
cfb9d35
8d99061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f080dd9
e020ff3
f080dd9
cfb9d35
f080dd9
cfb9d35
 
 
 
 
e020ff3
cfb9d35
8d99061
cfb9d35
f080dd9
e020ff3
f080dd9
8d99061
eef276d
f080dd9
e020ff3
11dd106
8d99061

import streamlit as st
import pandas as pd
import os
import traceback
from dotenv import load_dotenv
from llama_index.readers.file.paged_csv.base import PagedCSVReader
from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document  
import faiss
import tempfile

# Load environment variables.
# load_dotenv() reads a local .env file (when one exists) into os.environ, so
# the API key can come either from the real environment or from that file.
load_dotenv()

# ✅ Check OpenAI API Key
# Halt the script when the key is absent: every statement below needs it, and
# continuing would only replace this message with an authentication traceback.
# (Assigning os.getenv(...) back into os.environ is deliberately avoided: it is
# a no-op when the key is set and raises TypeError when it is not.)
if not os.getenv("OPENAI_API_KEY"):
    st.error("⚠️ OpenAI API Key is missing! Please check your .env file or environment variables.")
    st.stop()

# ✅ Ensure OpenAI Embeddings match FAISS dimensions
# Embed a throwaway string once and measure the vector length, so the FAISS
# index created later is sized from the embedding model actually in use rather
# than from a hard-coded number.
embedding_function = OpenAIEmbeddings()
test_vector = embedding_function.embed_query("test")  # Sample embedding
faiss_dimension = len(test_vector)  # ✅ Dynamically detect correct dimension

# ✅ Update global settings for LlamaIndex
# The LlamaIndex embedding model is asked for the same dimensionality that was
# measured above.
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=faiss_dimension)

# Streamlit app
# Flow: upload a CSV -> preview it -> embed every row into an in-memory FAISS
# index -> answer free-text questions with a LangChain retrieval chain.
st.title("Chat with CSV Files - LangChain vs LlamaIndex")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    try:
        # Read and preview CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)

        # Save the uploaded data to a temporary CSV for the debug preview.
        # delete=False allows the file to be reopened by name after it is
        # closed; it is removed explicitly in the `finally` below because
        # nothing else in this script reads it and Streamlit re-executes the
        # whole script on every interaction, which would otherwise leave one
        # stray file behind per rerun.
        # newline="" stops the text layer from translating the line endings
        # pandas writes.
        temp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
        )
        temp_file_path = temp_file.name
        try:
            # Write through the handle that is already open instead of opening
            # the same path a second time (which fails on Windows).
            with temp_file:
                data.to_csv(temp_file, index=False)

            # Debugging: Verify the temporary file (Display partial content)
            st.write("Temporary file path:", temp_file_path)
            # Read back with the encoding used for writing, and only as many
            # characters as are displayed.
            with open(temp_file_path, "r", encoding="utf-8") as f:
                preview = f.read(500)
        finally:
            os.remove(temp_file_path)
        st.write("Partial file content (first 500 characters):")
        st.text(preview)

        # Tabs for LangChain and LlamaIndex
        # NOTE(review): only the LangChain tab is populated in this script;
        # tab2 is created but left empty.
        tab1, tab2 = st.tabs(["LangChain", "LlamaIndex"])

        # ✅ LangChain Processing
        with tab1:
            st.subheader("LangChain Query")

            try:
                # ✅ Convert CSV rows into LangChain Document objects
                # Each row becomes one document made of "column: value" lines.
                st.write("Processing CSV with a custom loader...")
                documents = [
                    Document(
                        page_content="\n".join(
                            f"{col}: {row[col]}" for col in data.columns
                        )
                    )
                    for _, row in data.iterrows()
                ]

                # ✅ Create FAISS VectorStore with Correct Dimensions
                st.write(f"✅ Initializing FAISS with dimension: {faiss_dimension}")
                langchain_index = faiss.IndexFlatL2(faiss_dimension)

                docstore = InMemoryDocstore()
                index_to_docstore_id = {}

                langchain_vector_store = LangChainFAISS(
                    embedding_function=embedding_function,
                    index=langchain_index,
                    docstore=docstore,
                    index_to_docstore_id=index_to_docstore_id,
                )

                # ✅ Ensure documents are added correctly
                # The query UI is only built when indexing succeeded; offering
                # a question box over an empty index would just produce
                # "I don't know" answers after an error was already shown.
                try:
                    langchain_vector_store.add_documents(documents)
                except Exception as e:
                    st.error(f"Error adding documents to FAISS: {e}")
                else:
                    st.write("✅ Documents successfully added to FAISS VectorStore.")

                    # ✅ Create LangChain Query Execution Pipeline
                    retriever = langchain_vector_store.as_retriever()
                    system_prompt = (
                        "You are an assistant for question-answering tasks. "
                        "Use the following pieces of retrieved context to answer "
                        "the question. If you don't know the answer, say that you "
                        "don't know. Use three sentences maximum and keep the "
                        "answer concise.\n\n{context}"
                    )
                    prompt = ChatPromptTemplate.from_messages(
                        [("system", system_prompt), ("human", "{input}")]
                    )
                    question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model="gpt-4o"), prompt)
                    langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)

                    # ✅ Query Processing
                    query = st.text_input("Ask a question about your data (LangChain):")

                    if query:
                        try:
                            st.write("Processing your question...")
                            answer = langchain_rag_chain.invoke({"input": query})
                            st.write(f"**Answer:** {answer['answer']}")
                        except Exception as e:
                            # Show both the short message and the traceback.
                            error_message = traceback.format_exc()
                            st.error(f"Error processing query: {e}")
                            st.text(error_message)

            except Exception as e:
                error_message = traceback.format_exc()
                st.error(f"Error processing with LangChain: {e}")
                st.text(error_message)

    except Exception as e:
        # Boundary handler for anything that went wrong while reading or
        # previewing the upload.
        error_message = traceback.format_exc()
        st.error(f"Error reading uploaded file: {e}")
        st.text(error_message)