# Streamlit app: chat with a PDF using cached per-book FAISS vector stores.
import os

import streamlit as st
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import CTransformers
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
# === Configuration ===
pdfs_directory = 'pdfs'                   # where uploaded/predefined PDFs live
vectorstores_directory = 'vectorstores'   # root for per-book FAISS caches
os.makedirs(pdfs_directory, exist_ok=True)
os.makedirs(vectorstores_directory, exist_ok=True)

# PDFs already on disk, offered in the sidebar selectbox.
# sorted() so the listing order is deterministic across platforms
# (os.listdir order is arbitrary).
PREDEFINED_BOOKS = sorted(f for f in os.listdir(pdfs_directory) if f.endswith(".pdf"))

# Prompt for the QA chain; must keep the {question} and {context} placeholders.
TEMPLATE = """
You are a helpful assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
If you don't know the answer, say "I don't know". Limit your answer to three concise sentences.
Question: {question}
Context: {context}
Answer:
"""
# === Embeddings and LLM (CPU-friendly) ===
# Small sentence-transformer for embeddings plus a 4-bit quantized Mistral
# GGUF served by ctransformers, so the whole pipeline can run on CPU.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
llm = CTransformers(
    model='TheBloke/Mistral-7B-Instruct-v0.1-GGUF',
    model_file='mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    model_type='mistral',
    config={'max_new_tokens': 512, 'temperature': 0.5}
)
# === Functions ===
def upload_pdf(file):
    """Persist an uploaded PDF into ``pdfs_directory`` and return its stored name.

    Args:
        file: A Streamlit ``UploadedFile`` (has ``.name`` and ``.getbuffer()``).

    Returns:
        The base file name the PDF was saved under.

    Only the base name of the upload is used, so a crafted file name with
    path components cannot escape the upload directory (path-traversal
    hardening); for ordinary uploads the returned name is unchanged.
    """
    safe_name = os.path.basename(file.name)
    save_path = os.path.join(pdfs_directory, safe_name)
    with open(save_path, "wb") as f:
        f.write(file.getbuffer())
    return safe_name
def load_pdf(file_path):
    """Parse the PDF at *file_path* with pdfplumber and return its documents.

    Returns:
        The list of LangChain documents produced by ``PDFPlumberLoader.load``.
    """
    loader = PDFPlumberLoader(file_path)
    return loader.load()
def split_text(documents):
    """Split *documents* into overlapping chunks suitable for embedding.

    Uses ~1000-character chunks with 200 characters of overlap so context
    is not lost at chunk boundaries; ``add_start_index`` records each
    chunk's offset in its source document.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    return splitter.split_documents(documents)
def get_vectorstore_path(book_filename, base_dir=None):
    """Return the directory where the FAISS index for *book_filename* is cached.

    Args:
        book_filename: The PDF file name, e.g. ``"book.pdf"``.
        base_dir: Optional override of the cache root; defaults to the
            module-level ``vectorstores_directory``.

    Returns:
        ``<base_dir>/<book name without extension>``.
    """
    root = vectorstores_directory if base_dir is None else base_dir
    base_name = os.path.splitext(book_filename)[0]
    return os.path.join(root, base_name)
def load_or_create_vectorstore(book_filename, documents=None):
    """Load the cached FAISS store for *book_filename*, or build and cache one.

    Args:
        book_filename: PDF file name used to derive the cache directory.
        documents: Parsed documents; required only when no cache exists yet.

    Returns:
        A ready-to-query FAISS vector store.

    Raises:
        ValueError: If no cached store exists and *documents* is None.
    """
    vs_path = get_vectorstore_path(book_filename)
    if os.path.exists(os.path.join(vs_path, "index.faiss")):
        # NOTE: load_local unpickles data; this is only safe because we
        # created these cache files ourselves.
        return FAISS.load_local(vs_path, embedding_model, allow_dangerous_deserialization=True)
    if documents is None:
        raise ValueError("Documents must be provided when creating a new vectorstore.")
    os.makedirs(vs_path, exist_ok=True)
    chunks = split_text(documents)
    vector_store = FAISS.from_documents(chunks, embedding_model)
    vector_store.save_local(vs_path)
    return vector_store
def retrieve_docs(vector_store, query, k=4):
    """Return the *k* chunks most similar to *query* from *vector_store*.

    Args:
        vector_store: Any store exposing ``similarity_search(query, k=...)``.
        query: The user's question text.
        k: Number of chunks to retrieve (4 is the LangChain default that
            the previous no-argument call used implicitly).
    """
    return vector_store.similarity_search(query, k=k)
def answer_question(question, documents):
    """Answer *question* using the retrieved *documents* as context.

    Joins the chunks' text into one context string, fills the module-level
    TEMPLATE prompt, and runs it through the module-level LLM.

    Returns:
        The model's answer text.
    """
    context = "\n\n".join(doc.page_content for doc in documents)
    prompt = ChatPromptTemplate.from_template(TEMPLATE)
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run({"question": question, "context": context})
# === UI ===
# NOTE(review): the original emoji were mojibake ("π"); restored to
# plausible glyphs — confirm against the upstream source if available.
st.set_page_config(page_title="📄 PDF Q&A (Cached FAISS)", layout="centered")
st.title("📚 Chat with PDF - Cached Vector Stores")
with st.sidebar:
    st.header("Select or Upload a Book")
    selected_book = st.selectbox("Choose a PDF", PREDEFINED_BOOKS + ["Upload new book"])
    if selected_book == "Upload new book":
        uploaded_file = st.file_uploader("Upload PDF", type="pdf")
        if uploaded_file:
            filename = upload_pdf(uploaded_file)
            # Bug fix: show the actual stored file name (was a broken literal).
            st.success(f"Uploaded: {filename}")
            # Treat the fresh upload as the current selection.
            selected_book = filename
# === Load or Create Vector Store ===
if selected_book and selected_book != "Upload new book":
    file_path = os.path.join(pdfs_directory, selected_book)
    vectorstore_path = get_vectorstore_path(selected_book)
    try:
        if os.path.exists(os.path.join(vectorstore_path, "index.faiss")):
            # Cached index on disk: reuse it, skipping PDF parsing entirely.
            st.info("✅ Using cached vector store.")
            vector_store = load_or_create_vectorstore(selected_book)
        else:
            # First time for this book: parse, embed, and cache the index.
            st.warning("⏳ Creating new vector store (first-time load)...")
            documents = load_pdf(file_path)
            vector_store = load_or_create_vectorstore(selected_book, documents)
            st.success("✅ Vector store created and cached.")
        question = st.chat_input("Ask a question about the book...")
        if question:
            st.chat_message("user").write(question)
            related_docs = retrieve_docs(vector_store, question)
            answer = answer_question(question, related_docs)
            st.chat_message("assistant").write(answer)
    except Exception as e:
        # Top-level UI boundary: surface any parsing/indexing failure to the user.
        st.error(f"❌ Error loading PDF or vector store: {e}")