sango07 committed · verified
Commit ea1e50b · 1 Parent(s): 56b5e53

Update app.py

Files changed (1): app.py (+124 -18)
app.py CHANGED
@@ -1,25 +1,134 @@
  import streamlit as st
  from dotenv import load_dotenv
  import os
- from htmlTemplate import css, bot_template, user_template
+ import traceback
+
+ # PDF and NLP Libraries
  import PyPDF2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
- from langchain_community.llms import LlamaCpp
- from langchain.embeddings import HuggingFaceEmbeddings
+ from sentence_transformers import SentenceTransformer, util
+
+ # Embedding and Vector Store
+ from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores import FAISS
+
+ # LLM and Conversational Chain
+ from langchain_groq import ChatGroq
  from langchain.memory import ConversationBufferMemory
  from langchain.chains import ConversationalRetrievalChain
  from langchain.prompts import PromptTemplate
- from sentence_transformers import SentenceTransformer, util
- from langchain_openai import AzureOpenAIEmbeddings
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- from langchain_openai import ChatOpenAI
 
+ # Custom Templates
+ from htmlTemplate import css, bot_template, user_template
+
+ # Load environment variables
+ os.environ["GROQ_API_KEY"] = "sss"
+
+ # LLM Template for focused responses
+ llmtemplate = """You're an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
+ Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
+ {question}
+ Keep in mind the following instructions:
+ - Your response should be direct and factual, limited to 50 words and 2-3 sentences.
+ - Avoid using introductory phrases like "yes" or "no."
+ - Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
+ - If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
+ - Do not fabricate information, include questions, or use confirmatory phrases.
+ - Remember not to prompt for additional information or ask any questions.
+ Ensure your response is strictly based on the content of the markdown document.
+ """
+
+ def prepare_docs(pdf_docs):
+     """Extract text from uploaded PDF documents"""
+     docs = []
+     metadata = []
+     content = []
+
+     for pdf in pdf_docs:
+         pdf_reader = PyPDF2.PdfReader(pdf)
+         for index, text in enumerate(pdf_reader.pages):
+             doc_page = {
+                 'title': f"{pdf.name} page {index + 1}",
+                 'content': pdf_reader.pages[index].extract_text()
+             }
+             docs.append(doc_page)
+
+     for doc in docs:
+         content.append(doc["content"])
+         metadata.append({"title": doc["title"]})
+
+     return content, metadata
+
+ def get_text_chunks(content, metadata):
+     """Split documents into manageable chunks"""
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+         chunk_size=1024,
+         chunk_overlap=256,
+     )
+     split_docs = text_splitter.create_documents(content, metadatas=metadata)
+     print(f"Split documents into {len(split_docs)} passages")
+     return split_docs
+
+ def ingest_into_vectordb(split_docs):
+     """Create vector embeddings and store in FAISS"""
+     embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={'device': 'cpu'}
+     )
+     db = FAISS.from_documents(split_docs, embeddings)
+     DB_FAISS_PATH = 'vectorstore/db_faiss'
+     db.save_local(DB_FAISS_PATH)
+     return db
+
+ def get_conversation_chain(vectordb):
+     """Create conversational retrieval chain"""
+     llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
+     retriever = vectordb.as_retriever()
+
+     memory = ConversationBufferMemory(
+         memory_key='chat_history',
+         return_messages=True,
+         output_key='answer'
+     )
+
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         memory=memory,
+         return_source_documents=True
+     )
+
+     print("Conversational Chain created for the LLM using the vector store")
+     return conversation_chain
+
+ def validate_answer_against_sources(response_answer, source_documents):
+     """Validate AI's response against source documents"""
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     similarity_threshold = 0.5
+     source_texts = [doc.page_content for doc in source_documents]
+
+     answer_embedding = model.encode(response_answer, convert_to_tensor=True)
+     source_embeddings = model.encode(source_texts, convert_to_tensor=True)
+
+     cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
 
+     return any(score.item() > similarity_threshold for score in cosine_scores[0])
+
+ def handle_userinput(user_question):
+     """Process user input and display chat history"""
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+
+     for i, message in enumerate(st.session_state.chat_history):
+         if i % 2 == 0:
+             st.write(user_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             st.write(bot_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
 
  def main():
+     """Main Streamlit application"""
      load_dotenv()
 
      st.set_page_config(
@@ -28,7 +137,6 @@ def main():
          layout="wide"
      )
      st.write(css, unsafe_allow_html=True)
- 
 
      # Welcome section
      st.title("📚 PDF Insights AI")
@@ -39,6 +147,7 @@ def main():
      - 📄 Support for multiple PDF files
      """)
 
+     # Initialize session state
      if "conversation" not in st.session_state:
          st.session_state.conversation = None
      if "chat_history" not in st.session_state:
@@ -67,16 +176,10 @@ def main():
          else:
              with st.spinner("Processing your documents..."):
                  try:
-                     # get pdf text
+                     # Process documents
                      content, metadata = prepare_docs(pdf_docs)
-
-                     # get the text chunks
                      split_docs = get_text_chunks(content, metadata)
-
-                     # create vector store
                      vectorstore = ingest_into_vectordb(split_docs)
-
-                     # create conversation chain
                      st.session_state.conversation = get_conversation_chain(vectorstore)
 
                      st.success("Documents processed successfully! You can now ask questions.")
@@ -93,4 +196,7 @@ def main():
          if st.session_state.conversation is None:
              st.warning("Please upload and process documents first.")
          else:
-             handle_userinput(user_question)
+             handle_userinput(user_question)
+
+ if __name__ == '__main__':
+     main()
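
Notes on the new helpers follow; the sketches are illustrative, not part of the commit.

In prepare_docs, the loop variable already holds each page object (indexing pdf_reader.pages[index] again is redundant), and PyPDF2's extract_text() can come back empty for scanned or image-only pages. A defensive variant of the inner loop, assuming empty pages should be skipped:

    for pdf in pdf_docs:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for index, page in enumerate(pdf_reader.pages):
            text = page.extract_text() or ""  # may be empty for scanned pages
            if text.strip():  # assumption: silently skip pages with no extractable text
                docs.append({'title': f"{pdf.name} page {index + 1}", 'content': text})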
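
get_text_chunks splits with RecursiveCharacterTextSplitter.from_tiktoken_encoder, so chunk_size=1024 and chunk_overlap=256 are measured in tokens rather than characters, and the tiktoken package must be installed at runtime. A standalone usage sketch with placeholder inputs:

    content = ["First long document text ...", "Second long document text ..."]
    metadata = [{"title": "a.pdf page 1"}, {"title": "b.pdf page 1"}]
    split_docs = get_text_chunks(content, metadata)  # returns langchain Documents
    print(split_docs[0].metadata)  # {'title': 'a.pdf page 1'}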
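
ingest_into_vectordb saves the index to vectorstore/db_faiss, but nothing in the diff ever reloads it, so every run re-embeds from scratch. If reuse across sessions is intended, a reload sketch (recent langchain-community releases require the allow_dangerous_deserialization flag because the stored metadata is pickled; check the installed version):

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    db = FAISS.load_local('vectorstore/db_faiss', embeddings,
                          allow_dangerous_deserialization=True)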
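
llmtemplate is defined and PromptTemplate is imported, but neither appears to be wired into the chain in the hunks shown, so ConversationalRetrievalChain falls back to its default QA prompt. One way to attach the template, assuming that was the intent (the {context} slot is an addition, since the default stuff-documents chain expects one):

    qa_prompt = PromptTemplate(
        template=llmtemplate + "\nDocument:\n{context}",
        input_variables=["context", "question"],
    )
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )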
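
Likewise, validate_answer_against_sources is added but never called: handle_userinput renders the answer unchecked, even though the chain already returns source_documents. A sketch of the missing wiring (the warning text and its placement are assumptions):

    def handle_userinput(user_question):
        response = st.session_state.conversation({'question': user_question})
        st.session_state.chat_history = response['chat_history']

        # these keys exist because the chain sets return_source_documents=True
        if not validate_answer_against_sources(response['answer'],
                                               response['source_documents']):
            st.warning("This answer may not be grounded in the uploaded documents.")
        # ...then render the chat history exactly as in the commit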
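
Finally, os.environ["GROQ_API_KEY"] = "sss" ships a placeholder that will fail against the Groq API and normalizes hardcoding secrets in a public Space. Since main() already calls load_dotenv(), the key can come from the environment instead; a sketch (the error message is illustrative):

    load_dotenv()  # reads GROQ_API_KEY from .env or the Space's secrets
    if not os.getenv("GROQ_API_KEY"):
        raise RuntimeError("GROQ_API_KEY is not set")

With that in place the app runs as usual with: streamlit run app.py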