Waseem771 committed on
Commit
fa23cf8
·
verified ·
1 Parent(s): 0422410

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -0
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from dotenv import load_dotenv
4
+ import streamlit as st
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
8
+ from langchain_cohere import CohereEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.memory import ConversationBufferMemory
11
+ from langchain.chains import ConversationalRetrievalChain
12
+ # from langchain.llms import Ollama
13
+ from langchain_groq import ChatGroq
14
+
15
+ # Load environment variables
16
# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# Root logger configuration: INFO and above, each record prefixed with its
# timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
23
+
24
+ # Function to extract text from PDF files
25
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open (for
            example the uploaded file objects from the Streamlit sidebar).
            ``None`` or an empty iterable is treated as "no documents".

    Returns:
        str: The text of all pages, in document order then page order,
        joined with no separator. An empty string when there is nothing
        to read.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        # Defensive: a page without a text layer may yield an empty (or
        # missing) result; coerce it to "" so the join below never fails.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    # Single join instead of repeated string concatenation in the loop.
    return "".join(page_texts)
32
+
33
+ # Function to split the extracted text into chunks
34
def get_text_chunks(text):
    """Split *text* into overlapping chunks on newline boundaries.

    The splitter is configured with a newline separator, a chunk size of
    1000 and an overlap of 200, both measured with ``len``.

    Args:
        text: The full text to split.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
43
+
44
+ # Function to create a FAISS vectorstore
45
+ # def get_vectorstore(text_chunks):
46
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
47
+ # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
48
+ # return vectorstore
49
+
50
def get_vectorstore(text_chunks):
    """Embed the text chunks with Cohere and index them in a FAISS store.

    Args:
        text_chunks: Non-empty sequence of text chunks to embed.

    Returns:
        The FAISS vector store built from ``text_chunks``.

    Raises:
        ValueError: If ``text_chunks`` is empty. Building an index needs at
            least one embedding, so failing here gives a clear message
            instead of an opaque error from inside the index construction.
    """
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; the documents contained no extractable text."
        )

    # The key is read from the environment (populated from .env at startup).
    cohere_api_key = os.getenv("COHERE_API_KEY")
    embeddings = CohereEmbeddings(
        model="embed-english-v3.0",
        cohere_api_key=cohere_api_key,
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
55
+
56
+ # Function to set up the conversational retrieval chain
57
def get_conversation_chain(vectorstore):
    """Build the conversational retrieval chain over *vectorstore*.

    The chain combines a Groq-hosted chat model, a buffer memory stored
    under the ``chat_history`` key, and the vector store's retriever.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``, or ``None`` when
        setup fails. On failure the error is logged with its traceback and
        a message is shown in the Streamlit UI; callers must handle ``None``.
    """
    try:
        llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory,
        )
    except Exception as e:
        # UI boundary: keep the app alive, but record the full traceback so
        # the failure can actually be diagnosed from the logs.
        logging.exception("Error creating conversation chain: %s", e)
        st.error("An error occurred while setting up the conversation chain.")
        return None

    logging.info("Conversation chain created successfully.")
    return conversation_chain
74
+
75
+ # Handle user input
76
+ def handle_userinput(user_question):
77
+ if st.session_state.conversation is not None:
78
+ response = st.session_state.conversation({'question': user_question})
79
+ st.session_state.chat_history = response['chat_history']
80
+
81
+ for i, message in enumerate(st.session_state.chat_history):
82
+ if i % 2 == 0:
83
+ st.write(f"*User:* {message.content}")
84
+ else:
85
+ st.write(f"*Bot:* {message.content}")
86
+ else:
87
+ st.warning("Please process the documents first.")
88
+
89
+ # Main function to run the Streamlit app
90
def main():
    """Run the Streamlit "Chat with multiple PDFs" page.

    Initialises the session state, renders the question box, and provides a
    sidebar where PDFs are uploaded and processed into a conversation chain
    that is stored in ``st.session_state.conversation``.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Session defaults so later reads never hit a missing key.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            type="pdf",  # reject non-PDF uploads before they reach the PDF reader
            accept_multiple_files=True,
        )
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: avoid building an index from no text.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    if not text_chunks:
                        # e.g. scanned PDFs without a text layer.
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        vectorstore = get_vectorstore(text_chunks)
                        st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == '__main__':
    main()