Somnath3570 commited on
Commit
eb2a41f
·
verified ·
1 Parent(s): 8fea7fb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.prompts import PromptTemplate
7
+ from langchain_huggingface import HuggingFaceEndpoint
8
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+
11
+ # Load environment variables
12
+ from dotenv import load_dotenv, find_dotenv
13
+ load_dotenv(find_dotenv())
14
+
15
+ # Constants
16
+ DATA_PATH = "data/"
17
+ DB_FAISS_PATH = "vectorstore/db_faiss"
18
+ HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
19
+ HF_TOKEN = os.environ.get("HF_TOKEN")
20
+
21
+ # Custom prompt template
22
+ CUSTOM_PROMPT_TEMPLATE = """
23
+ Use the pieces of information provided in the context to answer user's question.
24
+ If you dont know the answer, just say that you dont know, dont try to make up an answer.
25
+
26
+ Dont provide anything out of the given context
27
+
28
+ Context: {context}
29
+ Question: {question}
30
+
31
+ Start the answer directly. No small talk please.
32
+ """
33
+
34
def load_pdf_files(data_path):
    """Load every PDF directly under *data_path* as LangChain documents.

    The glob is non-recursive ('*.pdf'). On any failure the error is shown
    in the Streamlit UI and an empty list is returned.
    """
    try:
        pdf_loader = DirectoryLoader(
            data_path,
            glob='*.pdf',
            loader_cls=PyPDFLoader,
        )
        return pdf_loader.load()
    except Exception as e:
        st.error(f"Error loading PDF files: {e}")
        return []
44
+
45
def create_chunks(extracted_data):
    """Split loaded documents into overlapping chunks (500 chars, 50 overlap)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(extracted_data)
50
+
51
def get_embedding_model():
    """Return the sentence-transformers MiniLM embedding model used app-wide."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
54
+
55
def create_vectorstore():
    """Build and persist a FAISS vector store from the PDFs under DATA_PATH.

    Creates the data directory on first run, loads and chunks the PDFs,
    embeds the chunks, and saves the index to DB_FAISS_PATH.

    Returns:
        The FAISS database, or None when there was nothing to index.
    """
    if not os.path.exists(DATA_PATH):
        # First run: create the folder so the user has somewhere to upload to.
        os.makedirs(DATA_PATH)
        st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
        return None

    # BUG FIX: the original called load_pdf_files(data=DATA_PATH), but the
    # function's parameter is named data_path, so this always raised TypeError.
    documents = load_pdf_files(DATA_PATH)
    if not documents:
        st.warning("No PDF files found in data directory. Please upload some PDFs.")
        return None

    st.info(f"Loaded {len(documents)} PDF pages")
    text_chunks = create_chunks(extracted_data=documents)
    st.info(f"Created {len(text_chunks)} text chunks")

    embedding_model = get_embedding_model()

    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)

    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    st.success(f"Created vector store at {DB_FAISS_PATH}")
    return db
79
+
80
@st.cache_resource
def get_vectorstore():
    """Load the persisted FAISS store; cached across Streamlit reruns.

    Returns:
        The FAISS database, or None (with a UI message) when the store is
        missing or cannot be deserialized.
    """
    if not os.path.exists(DB_FAISS_PATH):
        st.warning("Vector store not found. Please create it first.")
        return None

    # CONSISTENCY FIX: reuse the shared get_embedding_model() helper instead
    # of duplicating the model name here, so create/load always agree.
    embedding_model = get_embedding_model()
    try:
        # allow_dangerous_deserialization is required by FAISS.load_local;
        # acceptable here because the index was written by this same app.
        return FAISS.load_local(DB_FAISS_PATH, embedding_model,
                                allow_dangerous_deserialization=True)
    except Exception as e:
        st.error(f"Error loading vector store: {e}")
        return None
93
+
94
def set_custom_prompt():
    """Wrap CUSTOM_PROMPT_TEMPLATE in a PromptTemplate for the QA chain."""
    return PromptTemplate(
        template=CUSTOM_PROMPT_TEMPLATE,
        input_variables=["context", "question"],
    )
97
+
98
def load_llm():
    """Create the hosted Mistral text-generation endpoint.

    Returns:
        A HuggingFaceEndpoint LLM, or None (with a UI error) when HF_TOKEN
        is missing or the endpoint cannot be created.
    """
    if not HF_TOKEN:
        st.error("HF_TOKEN not found. Please set it in your environment variables.")
        return None

    try:
        # FIX: current langchain_huggingface rejects credentials passed via
        # model_kwargs — the token must go in huggingfacehub_api_token, and
        # generation length is the explicit max_new_tokens parameter
        # (the old "max_length" key was silently ambiguous).
        llm = HuggingFaceEndpoint(
            repo_id=HUGGINGFACE_REPO_ID,
            task="text-generation",
            temperature=0.5,
            max_new_tokens=512,
            huggingfacehub_api_token=HF_TOKEN,
        )
        return llm
    except Exception as e:
        st.error(f"Error loading LLM: {e}")
        return None
117
+
118
def upload_pdf():
    """Streamlit uploader that saves the chosen PDFs into DATA_PATH.

    Returns:
        True if at least one file was saved, False otherwise.
    """
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
    if not uploaded_files:
        return False

    # FIX: ensure the target directory exists before writing — on a fresh
    # deployment DATA_PATH is absent and open() raised FileNotFoundError.
    os.makedirs(DATA_PATH, exist_ok=True)
    for uploaded_file in uploaded_files:
        with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
            f.write(uploaded_file.getbuffer())
    st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
    return True
127
+
128
def _render_upload_page():
    """'Upload PDFs' page: collect files and point the user at the next step."""
    st.header("Upload PDF Files")
    st.info("Upload PDF files that will be used for question answering")
    if upload_pdf():
        st.info("Now go to 'Create Vector Store' to process your documents")


def _render_create_page():
    """'Create Vector Store' page: embed the uploaded PDFs on demand."""
    st.header("Create Vector Store")
    st.info("This will process your PDF files and create embeddings")
    if st.button("Create Vector Store"):
        with st.spinner("Processing documents..."):
            create_vectorstore()


def _render_chat_page():
    """'Chat with Documents' page: chat UI backed by the RetrievalQA chain."""
    st.header("Ask Questions About Your Documents")

    # Conversation history persists across Streamlit reruns via session_state.
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    for message in st.session_state.messages:
        st.chat_message(message['role']).markdown(message['content'])

    prompt = st.chat_input("Ask a question about your documents")
    if not prompt:
        return

    st.chat_message('user').markdown(prompt)
    st.session_state.messages.append({'role': 'user', 'content': prompt})

    vectorstore = get_vectorstore()
    if vectorstore is None:
        st.error("Vector store not available. Please create it first.")
        return

    llm = load_llm()
    if llm is None:
        return

    try:
        with st.spinner("Thinking..."):
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
                return_source_documents=True,
                chain_type_kwargs={'prompt': set_custom_prompt()}
            )

            response = qa_chain.invoke({'query': prompt})

            result = response["result"]
            source_documents = response["source_documents"]

            # Append provenance: page number plus a 200-char excerpt per hit.
            source_docs_text = "\n\n**Source Documents:**\n"
            for i, doc in enumerate(source_documents, 1):
                source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"

            result_to_show = f"{result}\n{source_docs_text}"

            st.chat_message('assistant').markdown(result_to_show)
            st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})

    except Exception as e:
        st.error(f"Error: {str(e)}")
        st.error("Please check your HuggingFace token and model access permissions")


def main():
    """Entry point: sidebar navigation across the three app pages.

    Decomposed into one private helper per page; behavior and the sidebar
    labels are unchanged.
    """
    st.title("PDF Question Answering System")

    # Sidebar
    st.sidebar.title("Settings")
    page = st.sidebar.radio("Choose an action",
                            ["Upload PDFs", "Create Vector Store", "Chat with Documents"])

    if page == "Upload PDFs":
        _render_upload_page()
    elif page == "Create Vector Store":
        _render_create_page()
    elif page == "Chat with Documents":
        _render_chat_page()
200
+
201
# Run the app when executed directly (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()