import streamlit as st
import pandas as pd
import os
from dotenv import load_dotenv
from llama_index.readers.file.paged_csv.base import PagedCSVReader
from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import faiss
import tempfile
from pathlib import Path
# Load environment variables
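# OPENAI_API_KEY is expected in a local .env file (or the process environment)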
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# Global settings for LlamaIndex
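# text-embedding-3-small embeddings are shortened to 512 dimensions here; this value
# must match the dimension of the FAISS IndexFlatL2 indexes created below.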
EMBED_DIMENSION = 512
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)
# Streamlit app
st.title("Chat with CSV Files - LangChain vs LlamaIndex")
# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    try:
        # Read and preview CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)

        # Save the uploaded file to a temporary location
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", encoding="utf-8") as temp_file:
            temp_file_path = temp_file.name
            data.to_csv(temp_file, index=False)
            temp_file.flush()

        # Debugging: Verify the temporary file (display partially)
        st.write("Temporary file path:", temp_file_path)
        with open(temp_file_path, "r", encoding="utf-8") as f:
            content = f.read()
        st.write("Partial file content (first 500 characters):")
        st.text(content[:500])
        # Tabs for LangChain and LlamaIndex
        tab1, tab2 = st.tabs(["LangChain", "LlamaIndex"])
        # LangChain tab with proper FAISS initialization
        with tab1:
            st.subheader("LangChain Query")
            try:
                # Custom preprocessing: turn each CSV row into a standalone document
                st.write("Processing CSV with a custom loader...")
                documents = []
                for _, row in data.iterrows():
                    content = "\n".join([f"{col}: {row[col]}" for col in data.columns])
                    documents.append(Document(page_content=content))

                # Debugging: Preview the first processed document
                st.write("Successfully processed documents:")
                if documents:
                    st.text(documents[0].page_content)

                # Create FAISS VectorStore with proper arguments
                langchain_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                docstore = InMemoryDocstore()  # In-memory docstore
                index_to_docstore_id = {}  # Mapping of index position to document ID
                langchain_vector_store = LangChainFAISS(
                    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small", dimensions=EMBED_DIMENSION),
                    index=langchain_index,
                    docstore=docstore,
                    index_to_docstore_id=index_to_docstore_id,
                )
                langchain_vector_store.add_documents(documents)

                # LangChain retrieval chain
                retriever = langchain_vector_store.as_retriever()
                system_prompt = (
                    "You are an assistant for question-answering tasks. "
                    "Use the following pieces of retrieved context to answer "
                    "the question. If you don't know the answer, say that you "
                    "don't know. Use three sentences maximum and keep the "
                    "answer concise.\n\n{context}"
                )
                prompt = ChatPromptTemplate.from_messages(
                    [("system", system_prompt), ("human", "{input}")]
                )
                question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model="gpt-4o"), prompt)
                langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)

                # Query input
                query = st.text_input("Ask a question about your data (LangChain):")
                if query:
                    answer = langchain_rag_chain.invoke({"input": query})
                    st.write(f"Answer: {answer['answer']}")
            except Exception as e:
                st.error(f"Error processing with LangChain: {e}")
        # LlamaIndex tab
        with tab2:
            st.subheader("LlamaIndex Query")
            try:
                # Use PagedCSVReader directly on the saved temporary file
                st.write("Loading file with LlamaIndex PagedCSVReader...")
                csv_reader = PagedCSVReader()
                docs = csv_reader.load_data(file=Path(temp_file_path))

                # Debugging: Preview the first loaded document
                st.write("Successfully loaded documents:")
                if docs:
                    st.text(docs[0].text)

                # Initialize FAISS Vector Store
                llama_faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                llama_vector_store = FaissVectorStore(faiss_index=llama_faiss_index)

                # Create the ingestion pipeline and process the data
                pipeline = IngestionPipeline(vector_store=llama_vector_store, documents=docs)
                nodes = pipeline.run()

                # Create a query engine
                llama_index = VectorStoreIndex(nodes)
                query_engine = llama_index.as_query_engine(similarity_top_k=3)

                # Query input
                query = st.text_input("Ask a question about your data (LlamaIndex):")
                if query:
                    response = query_engine.query(query)
                    st.write(f"Answer: {response.response}")
            except Exception as e:
                st.error(f"Error processing with LlamaIndex: {e}")
            finally:
                # Clean up the temporary file
                if "temp_file_path" in locals() and os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
    except Exception as e:
        st.error(f"Error reading uploaded file: {e}")