# app.py -- last change: "Update app.py" (commit 656b3bd, verified), author: amine-01
import streamlit as st
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
import os
# Folder holding the source PDFs, and folder where Chroma persists its index.
# NOTE(review): both constants name the same folder, so the vector DB files
# end up next to the PDFs -- confirm this is intentional.
DATA_DIR = "MyData"
DB_DIR = "MyData"

# Embedding model handed to the Chroma vector store in load_data().
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embeddings_model = SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)
# Load and process PDF documents
@st.cache_resource
def load_data():
    """Build the Chroma vector store from the PDFs found in ``DATA_DIR``.

    The PDFs are loaded, split into overlapping chunks of up to 2000
    characters (200 characters of overlap), embedded with
    ``embeddings_model`` and stored in a Chroma collection persisted under
    ``DB_DIR``.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, every rerun would reload, re-split and re-embed all
    PDFs and write to the persisted collection again. ``st.cache_resource``
    makes the build happen once per server process; later reruns reuse the
    same vector store object.

    Consequence of caching: PDFs added to ``DATA_DIR`` while the app is
    running are not picked up until the cache is cleared or the app restarts.

    Returns:
        The Chroma vector store containing the embedded document chunks.
    """
    loader = PyPDFDirectoryLoader(DATA_DIR)
    data_on_pdf = loader.load()

    # Separators are tried in order: paragraph, line, sentence, word and
    # finally character boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=2000,
        chunk_overlap=200,
    )
    splits = text_splitter.split_documents(data_on_pdf)

    # NOTE(review): the collection is persisted to DB_DIR, so a process
    # restart presumably appends the same chunks to the existing collection
    # -- verify, and consider reopening the persisted store instead.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=DB_DIR,
    )
    return vectorstore
# Set up the generative AI model.
# The API key is read from the GOOGLE_API_KEY environment variable (e.g. a
# deployment secret) rather than being embedded in the source: a key committed
# to a repository is readable by anyone who can see the file.
# NOTE: the key that was previously hard-coded here must be treated as leaked
# and should be revoked and replaced.
google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    # Fail with an explicit message instead of an opaque authentication error
    # at the first model call.
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=google_api_key)

# Load vector store
vectorstore = load_data()
# Streamlit interface
st.title("RAG App: Question-Answering with PDFs")


def format_docs(docs):
    """Concatenate the page contents of *docs*, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# User input for question
question = st.text_input("Ask a question about the documents:")
submitted = st.button("Submit")

if submitted and not question:
    # Submit was pressed with an empty text box.
    st.warning("Please enter a question.")
elif submitted:
    retriever = vectorstore.as_retriever()
    prompt = hub.pull("rlm/rag-prompt")

    # The question is piped to the retriever, whose documents are flattened
    # into the prompt's "context"; the question itself is passed through
    # unchanged as "question".
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    rag_chain = chain_inputs | prompt | llm | StrOutputParser()

    response = rag_chain.invoke(question)
    st.markdown(response)