sango07 committed
Commit 766f16f · verified · 1 Parent(s): e2ab520

Create app.py

Files changed (1)
  app.py +171 -0
app.py ADDED
@@ -0,0 +1,171 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ import os
+ from htmlTemplate import css, bot_template, user_template
+ import PyPDF2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.prompts import PromptTemplate
+ from sentence_transformers import SentenceTransformer, util
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_groq import ChatGroq
+ 
+ # Load keys from .env first; the literals below are placeholders used only
+ # as a fallback. Real credentials should never be committed.
+ load_dotenv()
+ os.environ.setdefault("OPENAI_API_KEY", "sk-..............")
+ os.environ.setdefault("GROQ_API_KEY", "gsk_..............")
+ 
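+ # Hypothetical .env contents this app expects (variable names taken from the
+ # code above; the values shown are placeholders):
+ #   OPENAI_API_KEY=sk-...
+ #   GROQ_API_KEY=gsk_...
+ 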
+ llmtemplate = """You’re an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
+ 
+ Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
+ {question}
+ 
+ Keep in mind the following instructions:
+ - Your response should be direct and factual, limited to 50 words and 2-3 sentences.
+ - Avoid using introductory phrases like "yes" or "no."
+ - Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
+ - If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
+ - Do not fabricate information, include questions, or use confirmatory phrases.
+ - Remember not to prompt for additional information or ask any questions.
+ 
+ Ensure your response is strictly based on the content of the markdown document.
+ """
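+ 
+ # A minimal sketch of how llmtemplate is consumed (the chain below only
+ # applies it if the condense_question_prompt argument is uncommented):
+ #   prompt = PromptTemplate.from_template(llmtemplate)
+ #   prompt.format(question="What does the report conclude?")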
+ 
+ 
+ def prepare_docs(pdf_docs):
+     """Extract text from each uploaded PDF, producing one record per page."""
+     docs = []
+     metadata = []
+     content = []
+ 
+     for pdf in pdf_docs:
+         print(pdf.name)
+         pdf_reader = PyPDF2.PdfReader(pdf)
+         for index, page in enumerate(pdf_reader.pages):
+             doc_page = {'title': pdf.name + " page " + str(index + 1),
+                         'content': page.extract_text()}
+             docs.append(doc_page)
+     for doc in docs:
+         content.append(doc["content"])
+         metadata.append({
+             "title": doc["title"]
+         })
+     return content, metadata
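+ 
+ # Illustrative shape of prepare_docs output (hypothetical file name):
+ #   content  -> ["text of page 1", "text of page 2", ...]
+ #   metadata -> [{"title": "report.pdf page 1"}, {"title": "report.pdf page 2"}, ...]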
+ 
+ 
+ def get_text_chunks(content, metadata):
+     """Split page texts into overlapping passages, carrying page metadata."""
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+         chunk_size=1024,
+         chunk_overlap=256,
+     )
+     split_docs = text_splitter.create_documents(content, metadatas=metadata)
+     print(f"Split documents into {len(split_docs)} passages")
+     return split_docs
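+ 
+ # Note: from_tiktoken_encoder sizes chunks in tokens rather than characters,
+ # so this yields roughly 1024-token passages with 256-token overlap (the
+ # tiktoken package must be installed).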
+ 
+ 
+ def ingest_into_vectordb(split_docs):
+     """Embed the passages and persist them in a local FAISS index."""
+     # embeddings = OpenAIEmbeddings()
+     # embeddings = FastEmbedEmbeddings()
+     embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
+     db = FAISS.from_documents(split_docs, embeddings)
+     DB_FAISS_PATH = 'vectorstore/db_faiss'
+     db.save_local(DB_FAISS_PATH)
+     return db
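+ 
+ # The saved index can be reloaded later without re-ingesting; depending on
+ # the LangChain version this may also require allow_dangerous_deserialization=True:
+ #   db = FAISS.load_local(DB_FAISS_PATH, embeddings)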
+ 
+ 
+ def get_conversation_chain(vectordb):
+     """Build a retrieval-augmented chat chain backed by the FAISS store."""
+     # llama_llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
+     llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
+     retriever = vectordb.as_retriever()
+     CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)
+ 
+     memory = ConversationBufferMemory(
+         memory_key='chat_history', return_messages=True, output_key='answer')
+ 
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+         memory=memory,
+         return_source_documents=True)
+     print("Conversational Chain created for the LLM using the vector store")
+     return conversation_chain
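+ 
+ # Usage sketch: the chain is called with a dict and returns the answer plus
+ # chat history and retrieved sources:
+ #   response = conversation_chain({'question': 'What is covered in chapter 2?'})
+ #   response['answer'], response['chat_history'], response['source_documents']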
+ 
+ 
+ def validate_answer_against_sources(response_answer, source_documents):
+     """Heuristic grounding check: is the answer similar to any source chunk?"""
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     similarity_threshold = 0.5
+     source_texts = [doc.page_content for doc in source_documents]
+ 
+     answer_embedding = model.encode(response_answer, convert_to_tensor=True)
+     source_embeddings = model.encode(source_texts, convert_to_tensor=True)
+ 
+     cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
+ 
+     if any(score.item() > similarity_threshold for score in cosine_scores[0]):
+         return True
+ 
+     return False
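+ 
+ # Not wired into the UI in this commit; an illustrative call would be:
+ #   grounded = validate_answer_against_sources(response['answer'],
+ #                                              response['source_documents'])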
+ 
+ 
+ def handle_userinput(user_question):
+     """Run the question through the chain and render the chat history."""
+     if st.session_state.conversation is None:
+         st.warning("Please upload and process your PDFs first.")
+         return
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+ 
+     # Messages alternate user/assistant, so even indices are the user's turns.
+     for i, message in enumerate(st.session_state.chat_history):
+         if i % 2 == 0:
+             st.write(user_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             st.write(bot_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
+ 
+ 
+ def main():
+     load_dotenv()
+ 
+     st.set_page_config(page_title="Chat with your PDFs",
+                        page_icon=":books:")
+     st.write(css, unsafe_allow_html=True)
+ 
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = []
+ 
+     st.header("Chat with multiple PDFs :books:")
+     user_question = st.text_input("Ask a question about your documents:")
+ 
+     if user_question:
+         handle_userinput(user_question)
+ 
+     with st.sidebar:
+         st.subheader("Your documents")
+         pdf_docs = st.file_uploader(
+             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+ 
+         if st.button("Process"):
+             with st.spinner("Processing"):
+                 # get pdf text
+                 content, metadata = prepare_docs(pdf_docs)
+ 
+                 # get the text chunks
+                 split_docs = get_text_chunks(content, metadata)
+ 
+                 # create vector store
+                 vectorstore = ingest_into_vectordb(split_docs)
+ 
+                 # create conversation chain
+                 st.session_state.conversation = get_conversation_chain(
+                     vectorstore)
+ 
+ 
+ if __name__ == '__main__':
+     main()