sango07 committed
Commit 7541c6e · verified · 1 Parent(s): 8c7b46a

Update app.py

Files changed (1):
  app.py +61 -163
app.py CHANGED
@@ -1,176 +1,74 @@
- import streamlit as st
- from dotenv import load_dotenv
- import os
- from langchain.embeddings import HuggingFaceEmbeddings
- from htmlTemplate import css, bot_template, user_template
- import PyPDF2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
- from langchain_community.llms import LlamaCpp
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from langchain.prompts import PromptTemplate
- from sentence_transformers import SentenceTransformer, util
- #from langchain_openai import AzureOpenAIEmbeddings
- #from langchain_openai import OpenAIEmbeddings
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- #from langchain_openai import ChatOpenAI
- os.environ["OPENAI_API_KEY"] = "sk-.............."
-
- import os
- os.environ["GROQ_API_KEY"]=os.getenv('GROQ_API_KEY')
- from langchain_groq import ChatGroq
-
- llmtemplate = """You’re an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
-
- Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
- {question}
-
- Keep in mind the following instructions:
- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
- Avoid using introductory phrases like "yes" or "no."
- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
- Do not fabricate information, include questions, or use confirmatory phrases.
- Remember not to prompt for additional information or ask any questions.
-
- Ensure your response is strictly based on the content of the markdown document.
- """
-
-
-
- def prepare_docs(pdf_docs):
-     docs = []
-     metadata = []
-     content = []
-
-     for pdf in pdf_docs:
-         print(pdf.name)
-         pdf_reader = PyPDF2.PdfReader(pdf)
-         for index, text in enumerate(pdf_reader.pages):
-             doc_page = {'title': pdf.name + " page " + str(index + 1),
-                         'content': pdf_reader.pages[index].extract_text()}
-             docs.append(doc_page)
-     for doc in docs:
-         content.append(doc["content"])
-         metadata.append({
-             "title": doc["title"]
-         })
-     return content, metadata
-
-
- def get_text_chunks(content, metadata):
-     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-         chunk_size=1024,
-         chunk_overlap=256,
-     )
-     split_docs = text_splitter.create_documents(content, metadatas=metadata)
-     print(f"Split documents into {len(split_docs)} passages")
-     return split_docs
-
-
- def ingest_into_vectordb(split_docs):
-     # embeddings = OpenAIEmbeddings()
-     # embeddings = FastEmbedEmbeddings()
-     # embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
-     embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
-                                      model_kwargs={'device':'cpu'})
-     db = FAISS.from_documents(split_docs, embeddings)
-     DB_FAISS_PATH = 'vectorstore/db_faiss'
-     db.save_local(DB_FAISS_PATH)
-     return db
-
-
- def get_conversation_chain(vectordb):
-     # llama_llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
-     llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
-     retriever = vectordb.as_retriever()
-     CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)
-
-     memory = ConversationBufferMemory(
-         memory_key='chat_history', return_messages=True, output_key='answer')
-
-     conversation_chain = (ConversationalRetrievalChain.from_llm
-                           (llm=llm,
-                            retriever=retriever,
-                            #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
-                            memory=memory,
-                            return_source_documents=True))
-     print("Conversational Chain created for the LLM using the vector store")
-     return conversation_chain
-
- def validate_answer_against_sources(response_answer, source_documents):
-     model = SentenceTransformer('all-MiniLM-L6-v2')
-     similarity_threshold = 0.5
-     source_texts = [doc.page_content for doc in source_documents]
-
-     answer_embedding = model.encode(response_answer, convert_to_tensor=True)
-     source_embeddings = model.encode(source_texts, convert_to_tensor=True)
-
-     cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
-
-
-     if any(score.item() > similarity_threshold for score in cosine_scores[0]):
-         return True
-
-     return False
-
- def handle_userinput(user_question):
-     response = st.session_state.conversation({'question': user_question})
-     st.session_state.chat_history = response['chat_history']
-
-     for i, message in enumerate(st.session_state.chat_history):
-         print(i)
-         if i % 2 == 0:
-             st.write(user_template.replace(
-                 "{{MSG}}", message.content), unsafe_allow_html=True)
-         else:
-             print(message.content)
-             st.write(bot_template.replace(
-                 "{{MSG}}", message.content), unsafe_allow_html=True)
-
-
-
  def main():
      load_dotenv()
 
-     st.set_page_config(page_title="Chat with your PDFs",
-                        page_icon=":books:")
+     st.set_page_config(
+         page_title="PDF Insights AI",
+         page_icon=":books:",
+         layout="wide"
+     )
      st.write(css, unsafe_allow_html=True)
 
+     # Welcome section
+     st.title("📚 PDF Insights AI")
+     st.markdown("""
+     ### Unlock the Knowledge in Your PDFs
+     - 🤖 AI-powered document analysis
+     - 💬 Ask questions about your uploaded documents
+     - 📄 Support for multiple PDF files
+     """)
+
      if "conversation" not in st.session_state:
          st.session_state.conversation = None
      if "chat_history" not in st.session_state:
          st.session_state.chat_history = []
 
-     st.header("Chat with multiple PDFs :books:")
-     user_question = st.text_input("Ask a question about your documents:")
-
-     if user_question:
-         handle_userinput(user_question)
-
+     # File upload section
      with st.sidebar:
-         st.subheader("Your documents")
+         st.header("📤 Upload Documents")
          pdf_docs = st.file_uploader(
-             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
-
-         if st.button("Process"):
-             with st.spinner("Processing"):
-                 # get pdf text
-                 content, metadata = prepare_docs(pdf_docs)
-
-                 # get the text chunks
-                 split_docs = get_text_chunks(content, metadata)
-
-                 # create vector store
-                 vectorstore = ingest_into_vectordb(split_docs)
-
-                 # create conversation chain
-                 st.session_state.conversation = get_conversation_chain(
-                     vectorstore)
-
+             "Upload your PDFs here",
+             type=['pdf'],
+             accept_multiple_files=True,
+             help="Upload PDF files to analyze. Max file size: 200MB"
+         )
+
+         # File validation
+         if pdf_docs:
+             for doc in pdf_docs:
+                 if doc.size > 200 * 1024 * 1024:  # 200 MB
+                     st.error(f"File {doc.name} is too large. Maximum file size is 200MB.")
+                     pdf_docs.remove(doc)
+
+         if st.button("Process Documents", type="primary"):
+             if not pdf_docs:
+                 st.warning("Please upload at least one PDF file.")
+             else:
+                 with st.spinner("Processing your documents..."):
+                     try:
+                         # get pdf text
+                         content, metadata = prepare_docs(pdf_docs)
+
+                         # get the text chunks
+                         split_docs = get_text_chunks(content, metadata)
+
+                         # create vector store
+                         vectorstore = ingest_into_vectordb(split_docs)
+
+                         # create conversation chain
+                         st.session_state.conversation = get_conversation_chain(vectorstore)
+
+                         st.success("Documents processed successfully! You can now ask questions.")
+                     except Exception as e:
+                         st.error(f"An error occurred while processing documents: {str(e)}")
+
+     # Question input section
+     user_question = st.text_input(
+         "📝 Ask a question about your documents",
+         placeholder="What insights can you provide from these documents?"
+     )
 
- if __name__ == '__main__':
-     main()
+     if user_question:
+         if st.session_state.conversation is None:
+             st.warning("Please upload and process documents first.")
+         else:
+             handle_userinput(user_question)
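
For reviewers who want to sanity-check the pipeline this file wires together, below is a minimal sketch of the same flow run outside the Streamlit UI. It assumes the helper functions shown in the diff (prepare_docs, get_text_chunks, ingest_into_vectordb, get_conversation_chain) are available in scope and that GROQ_API_KEY is exported; "sample.pdf" is a hypothetical path, and none of this is part of the commit. The app itself is launched with "streamlit run app.py".

# Minimal sketch (not part of this commit): exercises the RAG flow above
# without Streamlit. Assumes the helpers from this file are in scope,
# GROQ_API_KEY is set, and "sample.pdf" is an illustrative path.
with open("sample.pdf", "rb") as f:              # PyPDF2 accepts binary file objects
    content, metadata = prepare_docs([f])        # per-page text plus "<name> page N" titles

split_docs = get_text_chunks(content, metadata)  # 1024-token chunks with 256 overlap
vectordb = ingest_into_vectordb(split_docs)      # FAISS index, saved under vectorstore/db_faiss
chain = get_conversation_chain(vectordb)         # ChatGroq llama3-70b-8192 with buffer memory

result = chain({"question": "What is this document about?"})
print(result["answer"])                          # output_key='answer' in the memory config
for doc in result["source_documents"]:           # present because return_source_documents=True
    print(doc.metadata["title"])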