import streamlit as st
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
from llama_index.readers.file.paged_csv.base import PagedCSVReader
from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import faiss
import tempfile
# Load environment variables (expects OPENAI_API_KEY in the environment or a local .env file)
load_dotenv()
if not os.getenv("OPENAI_API_KEY"):
    st.error("OPENAI_API_KEY is not set. Add it to your environment or a .env file.")
    st.stop()
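# Example .env file contents (placeholder value):
#   OPENAI_API_KEY=sk-...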
# Global settings for LlamaIndex
EMBED_DIMENSION = 512
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)
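# Note: text-embedding-3-small accepts a `dimensions` argument that shortens
# its embeddings (to 512 here). Every FAISS index created below must use this
# same dimension, or inserting vectors will fail with a shape mismatch.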
# Streamlit app
st.title("Chat with CSV Files - LangChain vs LlamaIndex")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

if uploaded_file:
    try:
        # Read and preview the CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)
        # Save the uploaded data to a temporary CSV file so the LlamaIndex
        # reader can load it from disk; write after the handle is closed so
        # this also works on Windows
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            temp_file_path = temp_file.name
        data.to_csv(temp_file_path, index=False, encoding="utf-8")

        # Debugging: verify the temporary file (display partially)
        st.write("Temporary file path:", temp_file_path)
        with open(temp_file_path, "r", encoding="utf-8") as f:
            content = f.read()
        st.write("Partial file content (first 500 characters):")
        st.text(content[:500])
        # Tabs for LangChain and LlamaIndex
        tab1, tab2 = st.tabs(["LangChain", "LlamaIndex"])
        # LangChain tab with proper FAISS initialization
        with tab1:
            st.subheader("LangChain Query")
            try:
                # Custom preprocessing: serialize each CSV row as "column: value" lines;
                # FAISS.add_documents expects Document objects, not plain dicts
                st.write("Processing CSV with a custom loader...")
                documents = []
                for _, row in data.iterrows():
                    content = "\n".join([f"{col}: {row[col]}" for col in data.columns])
                    documents.append(Document(page_content=content))

                # Debugging: preview the first processed document
                st.write("Successfully processed documents:")
                if documents:
                    st.text(documents[0].page_content)
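                # For illustration (hypothetical columns): a row of a CSV with
                # columns "name" and "age" becomes a document whose text reads
                #   name: Alice
                #   age: 30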
                # Create the FAISS vector store; the embedding model must emit
                # vectors matching the index dimension, so pass the same model
                # and dimensions used for LlamaIndex above (the default
                # OpenAIEmbeddings model is 1536-dimensional and would mismatch)
                langchain_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                docstore = InMemoryDocstore()  # In-memory store for the raw documents
                index_to_docstore_id = {}  # Mapping of FAISS index position to document ID
                langchain_vector_store = LangChainFAISS(
                    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small", dimensions=EMBED_DIMENSION),
                    index=langchain_index,
                    docstore=docstore,
                    index_to_docstore_id=index_to_docstore_id,
                )
                langchain_vector_store.add_documents(documents)
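                # At this point the index holds one embedded vector per CSV row:
                # add_documents embeds each document, appends the vectors to the
                # FAISS index, and fills index_to_docstore_id so retrieved vectors
                # can be resolved back to their source documents.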
                # LangChain retrieval chain
                retriever = langchain_vector_store.as_retriever()
                system_prompt = (
                    "You are an assistant for question-answering tasks. "
                    "Use the following pieces of retrieved context to answer "
                    "the question. If you don't know the answer, say that you "
                    "don't know. Use three sentences maximum and keep the "
                    "answer concise.\n\n{context}"
                )
                prompt = ChatPromptTemplate.from_messages(
                    [("system", system_prompt), ("human", "{input}")]
                )
                question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model="gpt-4o"), prompt)
                langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)
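                # Chain wiring: the "input" value goes to the retriever, the
                # retrieved documents are stuffed into the {context} placeholder
                # of the system prompt, and ChatOpenAI generates the answer.
                # invoke() returns a dict that includes an "answer" key, used below.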
                # Query input
                query = st.text_input("Ask a question about your data (LangChain):")
                if query:
                    answer = langchain_rag_chain.invoke({"input": query})
                    st.write(f"Answer: {answer['answer']}")
            except Exception as e:
                st.error(f"Error processing with LangChain: {e}")
        # LlamaIndex tab
        with tab2:
            st.subheader("LlamaIndex Query")
            try:
                # Load the temporary CSV with PagedCSVReader, which yields one
                # document per row (the reader API is load_data, taking a Path)
                st.write("Loading file with LlamaIndex PagedCSVReader...")
                csv_reader = PagedCSVReader()
                docs = csv_reader.load_data(file=Path(temp_file_path))

                # Debugging: preview the first loaded document
                st.write("Successfully loaded documents:")
                if docs:
                    st.text(docs[0].text)
                # Initialize a FAISS vector store for LlamaIndex
                llama_faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                llama_vector_store = FaissVectorStore(faiss_index=llama_faiss_index)

                # Create the ingestion pipeline and process the data
                pipeline = IngestionPipeline(vector_store=llama_vector_store, documents=docs)
                nodes = pipeline.run()
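                # With no explicit transformations, pipeline.run() falls back to
                # the pipeline defaults (in recent llama-index versions, sentence
                # splitting plus Settings.embed_model), writes the embedded nodes
                # into the attached FAISS store, and returns the resulting nodes.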
                # Create a query engine over the ingested nodes
                llama_index = VectorStoreIndex(nodes)
                query_engine = llama_index.as_query_engine(similarity_top_k=3)
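                # similarity_top_k=3 retrieves the three most similar rows per
                # query. Note that Streamlit reruns this script on every
                # interaction, so the index is rebuilt each time; fine for a
                # demo, but worth caching (e.g. st.cache_resource) for large files.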
                # Query input
                query = st.text_input("Ask a question about your data (LlamaIndex):")
                if query:
                    response = query_engine.query(query)
                    st.write(f"Answer: {response.response}")
            except Exception as e:
                st.error(f"Error processing with LlamaIndex: {e}")
            finally:
                # Clean up the temporary file
                if "temp_file_path" in locals() and os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
    except Exception as e:
        st.error(f"Error reading uploaded file: {e}")