sango07 committed
Commit 4622ed5 · verified · 1 Parent(s): 02ddb5d

Update app.py

Files changed (1): app.py +138 −62

app.py CHANGED
@@ -16,80 +16,156 @@ from langchain_openai import AzureOpenAIEmbeddings
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from langchain_openai import ChatOpenAI
+os.environ["OPENAI_API_KEY"] = "sk-.............."
+os.environ["GROQ_API_KEY"] = "........."
+from langchain_groq import ChatGroq
+
+llmtemplate = """You’re an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
+
+Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
+{question}
+
+Keep in mind the following instructions:
+- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
+- Avoid using introductory phrases like "yes" or "no."
+- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
+- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
+- Do not fabricate information, include questions, or use confirmatory phrases.
+- Remember not to prompt for additional information or ask any questions.
+
+Ensure your response is strictly based on the content of the markdown document.
+"""
 
 
-def main():
-    load_dotenv()
-
-    st.set_page_config(
-        page_title="PDF Insights AI",
-        page_icon=":books:",
-        layout="wide"
+def prepare_docs(pdf_docs):
+    docs = []
+    metadata = []
+    content = []
+
+    for pdf in pdf_docs:
+        print(pdf.name)
+        pdf_reader = PyPDF2.PdfReader(pdf)
+        for index, text in enumerate(pdf_reader.pages):
+            doc_page = {'title': pdf.name + " page " + str(index + 1),
+                        'content': pdf_reader.pages[index].extract_text()}
+            docs.append(doc_page)
+    for doc in docs:
+        content.append(doc["content"])
+        metadata.append({
+            "title": doc["title"]
+        })
+    return content, metadata
+
+
+def get_text_chunks(content, metadata):
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=1024,
+        chunk_overlap=256,
     )
-    st.write(css, unsafe_allow_html=True)
-
-    # Welcome section
-    st.title("📚 PDF Insights AI")
-    st.markdown("""
-    ### Unlock the Knowledge in Your PDFs
-    - 🤖 AI-powered document analysis
-    - 💬 Ask questions about your uploaded documents
-    - 📄 Support for multiple PDF files
-    """)
+    split_docs = text_splitter.create_documents(content, metadatas=metadata)
+    print(f"Split documents into {len(split_docs)} passages")
+    return split_docs
+
+
+def ingest_into_vectordb(split_docs):
+    # embeddings = OpenAIEmbeddings()
+    # embeddings = FastEmbedEmbeddings()
+    embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
+    db = FAISS.from_documents(split_docs, embeddings)
+    DB_FAISS_PATH = 'vectorstore/db_faiss'
+    db.save_local(DB_FAISS_PATH)
+    return db
+
+
+def get_conversation_chain(vectordb):
+    # llama_llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
+    llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
+    retriever = vectordb.as_retriever()
+    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)
+
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True, output_key='answer')
+
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=retriever,
+        # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+        memory=memory,
+        return_source_documents=True)
+    print("Conversational Chain created for the LLM using the vector store")
+    return conversation_chain
+
+
+def validate_answer_against_sources(response_answer, source_documents):
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    similarity_threshold = 0.5
+    source_texts = [doc.page_content for doc in source_documents]
+
+    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
+    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
+
+    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
+
+    if any(score.item() > similarity_threshold for score in cosine_scores[0]):
+        return True
+
+    return False
+
+
+def handle_userinput(user_question):
+    response = st.session_state.conversation({'question': user_question})
+    st.session_state.chat_history = response['chat_history']
+
+    for i, message in enumerate(st.session_state.chat_history):
+        print(i)
+        if i % 2 == 0:
+            st.write(user_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            print(message.content)
+            st.write(bot_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+
+
+def main():
+    load_dotenv()
+
+    st.set_page_config(page_title="Chat with your PDFs",
+                       page_icon=":books:")
+    st.write(css, unsafe_allow_html=True)
 
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
 
-    # File upload section
+    st.header("Chat with multiple PDFs :books:")
+    user_question = st.text_input("Ask a question about your documents:")
+
+    if user_question:
+        handle_userinput(user_question)
+
     with st.sidebar:
-        st.header("📤 Upload Documents")
+        st.subheader("Your documents")
         pdf_docs = st.file_uploader(
-            "Upload your PDFs here",
-            type=['pdf'],
-            accept_multiple_files=True,
-            help="Upload PDF files to analyze. Max file size: 200MB"
-        )
-
-        # File validation
-        if pdf_docs:
-            for doc in pdf_docs:
-                if doc.size > 200 * 1024 * 1024:  # 200 MB
-                    st.error(f"File {doc.name} is too large. Maximum file size is 200MB.")
-                    pdf_docs.remove(doc)
-
-        if st.button("Process Documents", type="primary"):
-            if not pdf_docs:
-                st.warning("Please upload at least one PDF file.")
-            else:
-                with st.spinner("Processing your documents..."):
-                    try:
-                        # get pdf text
-                        content, metadata = prepare_docs(pdf_docs)
-
-                        # get the text chunks
-                        split_docs = get_text_chunks(content, metadata)
-
-                        # create vector store
-                        vectorstore = ingest_into_vectordb(split_docs)
-
-                        # create conversation chain
-                        st.session_state.conversation = get_conversation_chain(vectorstore)
-
-                        st.success("Documents processed successfully! You can now ask questions.")
-                    except Exception as e:
-                        st.error(f"An error occurred while processing documents: {str(e)}")
-
-    # Question input section
-    user_question = st.text_input(
-        "📝 Ask a question about your documents",
-        placeholder="What insights can you provide from these documents?"
-    )
-
-    if user_question:
-        if st.session_state.conversation is None:
-            st.warning("Please upload and process documents first.")
-        else:
-            handle_userinput(user_question)
+            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+
+        if st.button("Process"):
+            with st.spinner("Processing"):
+                # get pdf text
+                content, metadata = prepare_docs(pdf_docs)
+
+                # get the text chunks
+                split_docs = get_text_chunks(content, metadata)
+
+                # create vector store
+                vectorstore = ingest_into_vectordb(split_docs)
+
+                # create conversation chain
+                st.session_state.conversation = get_conversation_chain(
+                    vectorstore)
+
+
+if __name__ == '__main__':
+    main()
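
Note on this revision: validate_answer_against_sources is added but nothing calls it yet. Because the chain is built with return_source_documents=True and output_key='answer', the response dict in handle_userinput already carries the answer string and source_documents list the check needs. A minimal sketch of wiring it in (a suggestion, not part of this commit):

    def handle_userinput(user_question):
        response = st.session_state.conversation({'question': user_question})
        st.session_state.chat_history = response['chat_history']

        # Hypothetical grounding gate, not in this commit: warn when the
        # answer fails the cosine-similarity check against the sources.
        if not validate_answer_against_sources(response['answer'],
                                               response['source_documents']):
            st.warning("This answer may not be grounded in the uploaded documents.")

        # ...then render st.session_state.chat_history as above.

Separately, the hardcoded os.environ placeholder keys duplicate what load_dotenv() already loads; in practice the keys would live only in the .env file rather than in source.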