"""DocumentGPT: a Streamlit app for chatting with uploaded PDF and DOCX files."""

import os

import docx
import streamlit as st
from dotenv import load_dotenv
from langchain.callbacks import get_openai_callback
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
from streamlit_chat import message

# Load environment variables
load_dotenv()
openapi_key = os.getenv("OPENAI_API_KEY")


def main() -> None:
    """Render the page, process uploaded files on demand, and run the chat."""
    st.set_page_config(page_title="Chat with your file")
    st.header("DocumentGPT")

    # Streamlit re-runs the script on every interaction, so anything that
    # must survive a rerun lives in session_state.
    for state_key in ("conversation", "chat_history", "processComplete"):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    with st.sidebar:
        uploaded_files = st.file_uploader(
            "Upload your file",
            type=['pdf', 'docx'],
            accept_multiple_files=True,
        )
        process = st.button("Process")

    if process:
        if not openapi_key:
            st.info("Please add your OpenAI API key to continue.")
            st.stop()
        # Without this guard an empty upload list flows through to the
        # vector store, which cannot be built from zero chunks.
        if not uploaded_files:
            st.warning("Please upload at least one PDF or DOCX file.")
            st.stop()

        files_text = get_files_text(uploaded_files)
        st.write("File loaded...")
        text_chunks = get_text_chunks(files_text)
        # Files with no extractable text (e.g. image-only PDFs) yield no
        # chunks; stop here with a clear message.
        if not text_chunks:
            st.warning("No text could be extracted from the uploaded files.")
            st.stop()
        st.write("File chunks created...")
        vectorstore = get_vectorstore(text_chunks)
        st.write("Vector Store Created...")

        st.session_state.conversation = get_conversation_chain(
            vectorstore, openapi_key
        )
        st.session_state.processComplete = True

    if st.session_state.processComplete:
        user_question = st.chat_input("Ask a question about your files.")
        if user_question:
            handle_user_input(user_question)


def get_files_text(uploaded_files) -> str:
    """Return the combined text of every supported uploaded file.

    Files are dispatched on their extension (case-insensitively); anything
    other than ``.pdf`` or ``.docx`` is skipped. The text of consecutive
    files is separated by a newline so that words at file boundaries do not
    run together and the splitter has a break point there.
    """
    parts = []
    for uploaded_file in uploaded_files:
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension == ".pdf":
            parts.append(get_pdf_text(uploaded_file))
        elif file_extension == ".docx":
            parts.append(get_docx_text(uploaded_file))
    return "\n".join(parts)


def get_pdf_text(pdf) -> str:
    """Return the text of all pages of *pdf*, one page per line group.

    A page for which ``extract_text()`` yields nothing contributes an empty
    string instead of breaking the concatenation.
    """
    pdf_reader = PdfReader(pdf)
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


def get_docx_text(file) -> str:
    """Return the text of a DOCX file with one paragraph per line.

    Paragraphs are joined with newlines because ``get_text_chunks`` splits
    on ``"\\n"``; joining with spaces would leave the splitter no place to
    break the document.
    """
    doc = docx.Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def get_text_chunks(text: str) -> list:
    """Split *text* on newlines into overlapping chunks and return them."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=900,
        chunk_overlap=100,
        length_function=len,
    )
    return text_splitter.split_text(text)


def get_vectorstore(text_chunks):
    """Embed *text_chunks* with all-MiniLM-L6-v2 and return a FAISS store."""
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(text_chunks, embeddings)


def get_conversation_chain(vectorstore, openapi_key):
    """Build a retrieval chat chain over *vectorstore* with buffer memory.

    The memory is stored under the ``chat_history`` key, which is the key
    ``handle_user_input`` reads from the chain's response.
    """
    llm = ChatOpenAI(
        openai_api_key=openapi_key,
        model_name='gpt-3.5-turbo',
        temperature=0,
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )


def handle_user_input(user_question: str) -> None:
    """Send *user_question* to the chain and render the whole chat history."""
    # The callback context wraps the OpenAI call; its result is not used here.
    with get_openai_callback():
        response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    response_container = st.container()
    with response_container:
        # The loop variable must not be called ``message``: that would shadow
        # the streamlit_chat ``message`` renderer inside this function.
        for i, chat_message in enumerate(st.session_state.chat_history):
            # Even positions are treated as the user's turns, odd positions
            # as the assistant's replies.
            message(chat_message.content, is_user=(i % 2 == 0), key=str(i))


if __name__ == '__main__':
    main()