|
import os

import streamlit as st

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader
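
# NOTE: these langchain.* import paths follow the legacy 0.0.x-era LangChain
# API; newer releases relocate these classes to the langchain_community and
# langchain_openai packages.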

# Streamlit reruns this script from the top on every interaction, so chat
# history, the retrieval chain, and the vector store are kept in
# st.session_state to survive reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "chain" not in st.session_state:
    st.session_state.chain = None
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None

|
def create_sidebar():
    with st.sidebar:
        st.title("PDF Chat")
        st.markdown("### Quick Demo of RAG")
        api_key = st.text_input("OpenAI API Key:", type="password")
        st.markdown("""
### Tools Used
- OpenAI
- LangChain
- ChromaDB

### Steps
1. Add API key
2. Upload PDF
3. Chat!
""")
    return api_key


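# Ingestion pipeline: save each upload to disk, load it with PyPDFLoader,
# split it into chunks, embed the chunks into Chroma, and build a
# conversational retrieval chain over the resulting index.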
def process_pdfs(papers, api_key):
    """Process PDFs and return whether processing was successful."""
    if not papers:
        return False

    with st.spinner("Processing PDFs..."):
        try:
            embeddings = OpenAIEmbeddings(openai_api_key=api_key)

            all_texts = []
            for paper in papers:
                # Write the upload to disk so PyPDFLoader can read it by path.
                os.makedirs('./uploads', exist_ok=True)
                file_path = os.path.join('./uploads', paper.name)
                with open(file_path, "wb") as f:
                    f.write(paper.getbuffer())

                # Split each PDF into overlapping ~1000-character chunks so
                # retrieved passages stay small enough to fit in the prompt.
                loader = PyPDFLoader(file_path)
                documents = loader.load()
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=200,
                )
                texts = text_splitter.split_documents(documents)
                all_texts.extend(texts)

                # The on-disk copy is no longer needed once chunks are extracted.
                os.remove(file_path)

            # Embed every chunk into an in-memory Chroma collection.
            st.session_state.vectorstore = Chroma.from_documents(
                documents=all_texts,
                embedding=embeddings,
            )

            st.session_state.chain = ConversationalRetrievalChain.from_llm(
                llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=api_key),
                retriever=st.session_state.vectorstore.as_retriever(
                    search_kwargs={"k": 3}  # retrieve the 3 most similar chunks
                ),
                memory=ConversationBufferMemory(
                    memory_key="chat_history",
                    # output_key is required when return_source_documents=True;
                    # otherwise the memory cannot tell which output to store.
                    output_key="answer",
                    return_messages=True,
                ),
                return_source_documents=True,
            )

            st.success(f"Processed {len(papers)} PDF(s) successfully!")
            return True

        except Exception as e:
            st.error(f"Error processing PDFs: {str(e)}")
            return False


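# UI flow: the sidebar collects the API key, the uploader feeds process_pdfs,
# and the chat loop below answers questions against the indexed PDFs.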
def main():
    st.set_page_config(page_title="PDF Chat")

    api_key = create_sidebar()
    if not api_key:
        st.warning("Please enter your OpenAI API key")
        return

    st.title("Chat with PDF")

    papers = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
    if papers:
        if st.button("Process PDFs"):
            process_pdfs(papers, api_key)

    # Replay the conversation so far; rendered widgets do not survive reruns.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Ask about your PDFs"):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            if st.session_state.chain is None:
                response = "Please upload and process a PDF first."
            else:
                with st.spinner("Thinking..."):
                    result = st.session_state.chain({"question": prompt})
                    response = result["answer"]

                # Append retrieved source snippets, if the chain returned any.
                sources = result.get("source_documents", [])
                if sources:
                    response += "\n\nSources:"
                    for i, doc in enumerate(sources, 1):
                        # PyPDFLoader stores 0-indexed page numbers in metadata.
                        page_info = f" (Page {doc.metadata['page'] + 1})" if 'page' in doc.metadata else ""
                        # Double newline so each source renders as its own
                        # markdown paragraph rather than one run-on line.
                        response += f"\n\n{i}.{page_info} {doc.page_content[:200]}..."

            st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})


if __name__ == "__main__":
    main()
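
# Example local run (assuming this file is saved as app.py and the legacy
# LangChain stack is installed):
#   pip install streamlit langchain openai chromadb pypdf tiktoken
#   streamlit run app.py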