import streamlit as st
import pandas as pd
import os
from dotenv import load_dotenv
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.file import PagedCSVReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import faiss
import tempfile
# Load environment variables (expects OPENAI_API_KEY in .env or the host environment)
load_dotenv()
# Global settings for LlamaIndex
EMBED_DIMENSION = 512
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)
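
# Note: the FAISS indexes below are created with this same dimension, so the
# embedding models must emit 512-dim vectors (text-embedding-3-small accepts
# a `dimensions` argument that truncates its output accordingly).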
# Streamlit app
st.title("Chat with CSV Files - LangChain vs LlamaIndex")
# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
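
# st.file_uploader hands back a file-like UploadedFile object, so pandas can
# read it directly without writing it to disk first.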
if uploaded_file:
    try:
        # Read and preview CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)

        # Tabs
        tab1, tab2 = st.tabs(["Chat with CSV using LangChain", "Chat with CSV using LlamaIndex"])

        # LangChain Tab
        with tab1:
            st.subheader("LangChain Query")
            try:
                # Save the uploaded file to a temporary file for LangChain
                with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w") as temp_file:
                    # Write the DataFrame to the temp file
                    data.to_csv(temp_file.name, index=False)
                    temp_file_path = temp_file.name

                # Use CSVLoader with the temporary file path
                loader = CSVLoader(file_path=temp_file_path)
                docs = loader.load_and_split()
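
                # CSVLoader yields one Document per CSV row; load_and_split()
                # additionally splits any rows that exceed the default chunk size.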

                # Preview the first document
                if docs:
                    st.write("Preview of a document chunk (LangChain):")
                    st.text(docs[0].page_content)

                # LangChain FAISS VectorStore: the index dimension must match the
                # embedding size, and FAISS needs a docstore plus an id mapping.
                langchain_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                langchain_vector_store = LangChainFAISS(
                    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small", dimensions=EMBED_DIMENSION),
                    index=langchain_index,
                    docstore=InMemoryDocstore(),
                    index_to_docstore_id={},
                )
                langchain_vector_store.add_documents(docs)

                # LangChain Retrieval Chain
                retriever = langchain_vector_store.as_retriever()
                system_prompt = (
                    "You are an assistant for question-answering tasks. "
                    "Use the following pieces of retrieved context to answer "
                    "the question. If you don't know the answer, say that you "
                    "don't know. Use three sentences maximum and keep the "
                    "answer concise.\n\n{context}"
                )
                prompt = ChatPromptTemplate.from_messages(
                    [("system", system_prompt), ("human", "{input}")]
                )
                question_answer_chain = create_stuff_documents_chain(ChatOpenAI(), prompt)
                langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)
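
                # The retrieval chain runs in two steps: the retriever fetches the
                # top-matching row chunks, then the "stuff" chain packs them into
                # {context} of the prompt and asks ChatOpenAI for the answer.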

                # Query input for LangChain
                query = st.text_input("Ask a question about your data (LangChain):")
                if query:
                    answer = langchain_rag_chain.invoke({"input": query})
                    st.write(f"Answer: {answer['answer']}")
            except Exception as e:
                st.error(f"Error processing with LangChain: {e}")
            finally:
                # Clean up the temporary file
                if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
                    os.remove(temp_file_path)

        # LlamaIndex Tab
        with tab2:
            st.subheader("LlamaIndex Query")
            try:
                # Save uploaded file content to a temporary CSV file for LlamaIndex
                with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w") as temp_file:
                    data.to_csv(temp_file.name, index=False)
                    temp_file_path = temp_file.name

                # Use PagedCSVReader for LlamaIndex
                csv_reader = PagedCSVReader()
                reader = SimpleDirectoryReader(
                    input_files=[temp_file_path],
                    file_extractor={".csv": csv_reader},
                )
                docs = reader.load_data()
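
                # PagedCSVReader is LlamaIndex's analogue of CSVLoader: each CSV row
                # becomes its own Document, rendered as "column: value" lines.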

                # Preview the first document
                if docs:
                    st.write("Preview of a document chunk (LlamaIndex):")
                    st.text(docs[0].text)

                # Initialize FAISS Vector Store
                llama_faiss_index = faiss.IndexFlatL2(EMBED_DIMENSION)
                llama_vector_store = FaissVectorStore(faiss_index=llama_faiss_index)

                # Create the ingestion pipeline and process the data
                pipeline = IngestionPipeline(vector_store=llama_vector_store, documents=docs)
                nodes = pipeline.run()
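
                # run() pushes the documents through the pipeline's transformations
                # and stores the resulting embedded nodes in the FAISS vector store.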

                # Create a query engine
                llama_index = VectorStoreIndex(nodes)
                query_engine = llama_index.as_query_engine(similarity_top_k=3)
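
                # similarity_top_k=3 retrieves the three most similar rows per query;
                # Settings.llm (gpt-3.5-turbo) then synthesizes the final answer.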

                # Query input for LlamaIndex
                query = st.text_input("Ask a question about your data (LlamaIndex):")
                if query:
                    response = query_engine.query(query)
                    st.write(f"Answer: {response.response}")
            except Exception as e:
                st.error(f"Error processing with LlamaIndex: {e}")
            finally:
                # Clean up the temporary file
                if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
    except Exception as e:
        st.error(f"Error reading uploaded file: {e}")