import streamlit as st
import pandas as pd
import os
import traceback
from dotenv import load_dotenv
from llama_index.readers.file.paged_csv.base import PagedCSVReader
from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from langchain_community.vectorstores import FAISS as LangChainFAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
import faiss
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load environment variables from a .env file, if present
load_dotenv()

# ✅ Check OpenAI API Key
if not os.getenv("OPENAI_API_KEY"):
    st.error("⚠️ OpenAI API Key is missing! Please check your .env file or environment variables.")
    st.stop()  # Don't continue without a key; the embedding calls below would fail
# ✅ Ensure OpenAI Embeddings match FAISS dimensions
embedding_function = OpenAIEmbeddings()
test_vector = embedding_function.embed_query("test")
faiss_dimension = len(test_vector)
# ✅ Update global settings for LlamaIndex
Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=faiss_dimension)
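# NOTE: faiss.IndexFlatL2 must be created with the exact dimensionality the
# embedding model emits, so we probe it with a test query above rather than
# hard-coding a size.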
# Streamlit app
st.title("Chat with CSV Files - LangChain vs LlamaIndex")
# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    try:
        # Read and preview CSV data using pandas
        data = pd.read_csv(uploaded_file)
        st.write("Preview of uploaded data:")
        st.dataframe(data)

        # Save the uploaded file to a temporary location so file-based loaders can read it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", encoding="utf-8") as temp_file:
            temp_file_path = temp_file.name
            data.to_csv(temp_file, index=False)
        # Tabs for LangChain and LlamaIndex
        tab1, tab2 = st.tabs(["Chat with CSV using LangChain", "Chat with CSV using LlamaIndex"])

        # ✅ LangChain Processing
        with tab1:
            st.subheader("LangChain Query")
            try:
                # ✅ Convert CSV rows into LangChain Document objects with chunking
                st.write("Processing CSV with a custom loader...")
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=90)
                documents = []
                for _, row in data.iterrows():
                    content = "\n".join([f"{col}: {row[col]}" for col in data.columns])
                    chunks = text_splitter.split_text(content)
                    for chunk in chunks:
                        doc = Document(page_content=chunk)
                        documents.append(doc)
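                # Each row becomes its own Document ("column: value" per line), so
                # retrieval returns whole rows; the splitter only produces multiple
                # chunks when a single row exceeds chunk_size characters.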
                # ✅ Create FAISS VectorStore
                st.write(f"✅ Initializing FAISS with dimension: {faiss_dimension}")
                langchain_index = faiss.IndexFlatL2(faiss_dimension)
                docstore = InMemoryDocstore()
                index_to_docstore_id = {}
                langchain_vector_store = LangChainFAISS(
                    embedding_function=embedding_function,
                    index=langchain_index,
                    docstore=docstore,
                    index_to_docstore_id=index_to_docstore_id,
                )
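                # The FAISS index stores only vectors; InMemoryDocstore holds the
                # Document payloads, and index_to_docstore_id maps vector positions
                # to docstore ids. LangChainFAISS keeps all three in sync as
                # documents are added.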
                # ✅ Ensure documents are added correctly
                try:
                    langchain_vector_store.add_documents(documents)
                    st.write("✅ Documents successfully added to FAISS VectorStore.")
                except Exception as e:
                    st.error(f"Error adding documents to FAISS: {e}")

                # ✅ Limit number of retrieved documents
                retriever = langchain_vector_store.as_retriever(search_kwargs={"k": 5})
                # ✅ Create LangChain Query Execution Pipeline
                system_prompt = (
                    "You are an assistant for question-answering tasks. "
                    "Use the following pieces of retrieved context to answer "
                    "the question. Keep the answer concise.\n\n{context}"
                )
                prompt = ChatPromptTemplate.from_messages(
                    [("system", system_prompt), ("human", "{input}")]
                )
                question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model="gpt-4o"), prompt)
                langchain_rag_chain = create_retrieval_chain(retriever, question_answer_chain)
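                # create_retrieval_chain feeds the user's "input" to the retriever,
                # stuffs the hits into the prompt's {context} slot, and returns a
                # dict with "input", "context", and "answer" keys.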
                # ✅ Query Processing
                query = st.text_input("Ask a question about your data (LangChain):")
                if query:
                    try:
                        # The chain performs retrieval itself; there is no need to
                        # rebuild the system prompt with the retrieved text by hand.
                        answer = langchain_rag_chain.invoke({"input": query})
                        st.write(f"**Answer:** {answer['answer']}")

                        # Show the retrieved context for transparency, truncated to 3000 chars
                        retrieved_context = "\n\n".join(
                            doc.page_content for doc in retriever.invoke(query)
                        )[:3000]
                        with st.expander("Retrieved context"):
                            st.text(retrieved_context)
                    except Exception as e:
                        error_message = traceback.format_exc()
                        st.error(f"Error processing query: {e}")
                        st.text(error_message)
            except Exception as e:
                error_message = traceback.format_exc()
                st.error(f"Error processing with LangChain: {e}")
                st.text(error_message)
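        # ✅ LlamaIndex Processing
        # The original tab2 body is not included in this excerpt. The sketch below
        # is a hedged reconstruction based on the imports at the top of the file
        # (PagedCSVReader, IngestionPipeline, FaissVectorStore, VectorStoreIndex);
        # it is one plausible wiring, not necessarily the author's original code.
        with tab2:
            st.subheader("LlamaIndex Query")
            try:
                # PagedCSVReader loads the CSV one Document per row
                llama_docs = PagedCSVReader().load_data(file=temp_file_path)

                # Wrap a FAISS index in a LlamaIndex vector store and ingest
                llama_faiss = FaissVectorStore(faiss_index=faiss.IndexFlatL2(faiss_dimension))
                pipeline = IngestionPipeline(
                    transformations=[Settings.embed_model],
                    vector_store=llama_faiss,
                )
                pipeline.run(documents=llama_docs)

                # Build an index over the populated store and query it
                llama_vs_index = VectorStoreIndex.from_vector_store(llama_faiss)
                query_engine = llama_vs_index.as_query_engine()
                li_query = st.text_input("Ask a question about your data (LlamaIndex):")
                if li_query:
                    response = query_engine.query(li_query)
                    st.write(f"**Answer:** {response.response}")
            except Exception as e:
                error_message = traceback.format_exc()
                st.error(f"Error processing with LlamaIndex: {e}")
                st.text(error_message)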
    except Exception as e:
        error_message = traceback.format_exc()
        st.error(f"Error reading uploaded file: {e}")
        st.text(error_message)
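# To run locally (assuming this file is saved as app.py):
#   pip install streamlit pandas python-dotenv faiss-cpu \
#       langchain langchain-community langchain-openai \
#       llama-index llama-index-vector-stores-faiss llama-index-readers-file
#   streamlit run app.py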