Somnath3570 committed
Commit 12f6335 · verified · 1 Parent(s): 53c6af5

Update app.py

Files changed (1): app.py +110 -149
app.py CHANGED
@@ -8,195 +8,156 @@ from langchain_huggingface import HuggingFaceEndpoint
  from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter

- # Load environment variables
- from dotenv import load_dotenv, find_dotenv
- load_dotenv(find_dotenv())
-
- # Constants
+ # Use environment variable for Hugging Face token
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
  DATA_PATH = "data/"
  DB_FAISS_PATH = "vectorstore/db_faiss"
- HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- # Custom prompt template
- CUSTOM_PROMPT_TEMPLATE = """
- Use the pieces of information provided in the context to answer user's question.
- If you dont know the answer, just say that you dont know, dont try to make up an answer.
-
- Dont provide anything out of the given context
-
- Context: {context}
- Question: {question}
-
- Start the answer directly. No small talk please.
- """

  def load_pdf_files(data_path):
-     try:
-         loader = DirectoryLoader(data_path,
-                                  glob='*.pdf',
-                                  loader_cls=PyPDFLoader)
-         documents = loader.load()
-         return documents
-     except Exception as e:
-         st.error(f"Error loading PDF files: {e}")
-         return []
+     """Load PDF files from the specified directory"""
+     loader = DirectoryLoader(data_path,
+                              glob='*.pdf',
+                              loader_cls=PyPDFLoader)
+     documents = loader.load()
+     return documents

  def create_chunks(extracted_data):
+     """Split documents into chunks"""
      text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                     chunk_overlap=50)
      text_chunks = text_splitter.split_documents(extracted_data)
      return text_chunks

  def get_embedding_model():
+     """Get the embedding model"""
      embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
      return embedding_model

- def create_vectorstore():
-     if not os.path.exists(DATA_PATH):
-         os.makedirs(DATA_PATH)
-         st.warning(f"Created empty data directory at {DATA_PATH}. Please upload PDF files.")
-         return None
-
-     documents = load_pdf_files(data=DATA_PATH)
-     if not documents:
-         st.warning("No PDF files found in data directory. Please upload some PDFs.")
-         return None
-
-     st.info(f"Loaded {len(documents)} PDF pages")
+ def create_embeddings():
+     """Create embeddings and save to FAISS database"""
+     # Step 1: Load PDFs
+     documents = load_pdf_files(data_path=DATA_PATH)
+     st.info(f"Loaded {len(documents)} documents")
+
+     # Step 2: Create chunks
      text_chunks = create_chunks(extracted_data=documents)
      st.info(f"Created {len(text_chunks)} text chunks")

+     # Step 3: Get embedding model
      embedding_model = get_embedding_model()

-     if not os.path.exists(os.path.dirname(DB_FAISS_PATH)):
-         os.makedirs(os.path.dirname(DB_FAISS_PATH))
-
+     # Step 4: Create and save embeddings
+     os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)
      db = FAISS.from_documents(text_chunks, embedding_model)
      db.save_local(DB_FAISS_PATH)
-     st.success(f"Created vector store at {DB_FAISS_PATH}")
+     st.success("Embeddings created and saved successfully!")
      return db

- @st.cache_resource
+ def set_custom_prompt(custom_prompt_template):
+     """Set custom prompt template"""
+     prompt = PromptTemplate(template=custom_prompt_template, input_variables=["context", "question"])
+     return prompt
+
+ def load_llm(huggingface_repo_id):
+     """Load Hugging Face LLM"""
+     llm = HuggingFaceEndpoint(
+         repo_id=huggingface_repo_id,
+         task="text-generation",
+         temperature=0.5,
+         model_kwargs={
+             "token": HF_TOKEN,
+             "max_length": 512
+         }
+     )
+     return llm
+
  def get_vectorstore():
+     """Get or create vector store"""
      if os.path.exists(DB_FAISS_PATH):
-         embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+         st.info("Loading existing vector store...")
+         embedding_model = get_embedding_model()
          try:
              db = FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
              return db
          except Exception as e:
              st.error(f"Error loading vector store: {e}")
-             return None
+             st.info("Creating new vector store...")
+             return create_embeddings()
      else:
-         st.warning("Vector store not found. Please create it first.")
-         return None
-
- def set_custom_prompt():
-     prompt = PromptTemplate(template=CUSTOM_PROMPT_TEMPLATE, input_variables=["context", "question"])
-     return prompt
-
- def load_llm():
-     if not HF_TOKEN:
-         st.error("HF_TOKEN not found. Please set it in your environment variables.")
-         return None
-
-     try:
-         llm = HuggingFaceEndpoint(
-             repo_id=HUGGINGFACE_REPO_ID,
-             task="text-generation",
-             temperature=0.5,
-             model_kwargs={
-                 "token": HF_TOKEN,
-                 "max_length": 512
-             }
-         )
-         return llm
-     except Exception as e:
-         st.error(f"Error loading LLM: {e}")
-         return None
-
- def upload_pdf():
-     uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
-     if uploaded_files:
-         for uploaded_file in uploaded_files:
-             with open(os.path.join(DATA_PATH, uploaded_file.name), "wb") as f:
-                 f.write(uploaded_file.getbuffer())
-         st.success(f"Uploaded {len(uploaded_files)} files to {DATA_PATH}")
-         return True
-     return False
+         st.info("Creating new vector store...")
+         return create_embeddings()

  def main():
-     st.title("PDF Question Answering System")
+     st.title("BeepKart FAQ Chatbot")
+     st.markdown("Ask questions about buying or selling bikes on BeepKart!")

-     # Sidebar
-     st.sidebar.title("Settings")
-     page = st.sidebar.radio("Choose an action", ["Upload PDFs", "Create Vector Store", "Chat with Documents"])
+     # Initialize session state for messages
+     if 'messages' not in st.session_state:
+         st.session_state.messages = []

-     if page == "Upload PDFs":
-         st.header("Upload PDF Files")
-         st.info("Upload PDF files that will be used for question answering")
-         if upload_pdf():
-             st.info("Now go to 'Create Vector Store' to process your documents")
+     # Display chat history
+     for message in st.session_state.messages:
+         st.chat_message(message['role']).markdown(message['content'])

-     elif page == "Create Vector Store":
-         st.header("Create Vector Store")
-         st.info("This will process your PDF files and create embeddings")
-         if st.button("Create Vector Store"):
-             with st.spinner("Processing documents..."):
-                 create_vectorstore()
+     # Get user input
+     prompt = st.chat_input("Ask a question about BeepKart...")

-     elif page == "Chat with Documents":
-         st.header("Ask Questions About Your Documents")
-
-         if 'messages' not in st.session_state:
-             st.session_state.messages = []
-
-         for message in st.session_state.messages:
-             st.chat_message(message['role']).markdown(message['content'])
-
-         prompt = st.chat_input("Ask a question about your documents")
+     # Custom prompt template
+     CUSTOM_PROMPT_TEMPLATE = """
+     Use the pieces of information provided in the context to answer user's question.
+     If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+     Don't provide anything out of the given context
+
+     Context: {context}
+     Question: {question}
+
+     Start the answer directly. No small talk please.
+     """

-         if prompt:
-             st.chat_message('user').markdown(prompt)
-             st.session_state.messages.append({'role': 'user', 'content': prompt})
-
-             vectorstore = get_vectorstore()
-             if vectorstore is None:
-                 st.error("Vector store not available. Please create it first.")
-                 return
-
-             llm = load_llm()
-             if llm is None:
-                 return
-
-             try:
-                 with st.spinner("Thinking..."):
-                     qa_chain = RetrievalQA.from_chain_type(
-                         llm=llm,
-                         chain_type="stuff",
-                         retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
-                         return_source_documents=True,
-                         chain_type_kwargs={'prompt': set_custom_prompt()}
-                     )
-
-                     response = qa_chain.invoke({'query': prompt})
-
-                     result = response["result"]
-                     source_documents = response["source_documents"]
-
-                     # Format source documents more cleanly
-                     source_docs_text = "\n\n**Source Documents:**\n"
-                     for i, doc in enumerate(source_documents, 1):
-                         source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:200]}...\n\n"
-
-                     result_to_show = f"{result}\n{source_docs_text}"
-
-                     st.chat_message('assistant').markdown(result_to_show)
-                     st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
-
-             except Exception as e:
-                 st.error(f"Error: {str(e)}")
-                 st.error("Please check your HuggingFace token and model access permissions")
+     if prompt:
+         # Display user message
+         st.chat_message('user').markdown(prompt)
+         st.session_state.messages.append({'role': 'user', 'content': prompt})
+
+         try:
+             with st.spinner("Thinking..."):
+                 # Get vector store
+                 vectorstore = get_vectorstore()
+
+                 # Create QA chain
+                 qa_chain = RetrievalQA.from_chain_type(
+                     llm=load_llm(huggingface_repo_id=HUGGINGFACE_REPO_ID),
+                     chain_type="stuff",
+                     retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
+                     return_source_documents=True,
+                     chain_type_kwargs={'prompt': set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)}
+                 )
+
+                 # Get response
+                 response = qa_chain.invoke({'query': prompt})
+
+                 # Extract result and sources
+                 result = response["result"]
+                 source_documents = response["source_documents"]
+
+                 # Format source documents
+                 source_docs_text = "\n\n**Sources:**\n"
+                 for i, doc in enumerate(source_documents, 1):
+                     source_docs_text += f"{i}. Page {doc.metadata.get('page', 'N/A')}: {doc.page_content[:100]}...\n\n"
+
+                 # Display result and sources
+                 result_to_show = f"{result}\n{source_docs_text}"
+
+                 st.chat_message('assistant').markdown(result_to_show)
+                 st.session_state.messages.append({'role': 'assistant', 'content': result_to_show})
+
+         except Exception as e:
+             error_message = f"Error: {str(e)}"
+             st.error(error_message)
+             st.error("Please check your HuggingFace token and model access permissions")
+             st.session_state.messages.append({'role': 'assistant', 'content': error_message})

  if __name__ == "__main__":
      main()
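One caveat on the committed load_llm(): in recent langchain_huggingface releases, generation settings such as max_new_tokens are top-level constructor arguments, and the token is read from the HF_TOKEN environment variable or passed as huggingfacehub_api_token; keeping "token" and "max_length" inside model_kwargs can be rejected by the endpoint wrapper. A minimal sketch of the equivalent call under that assumption (hypothetical, not part of this commit):

import os
from langchain_huggingface import HuggingFaceEndpoint

HF_TOKEN = os.environ.get("HF_TOKEN")

def load_llm(huggingface_repo_id):
    # Hypothetical variant of the committed load_llm(); parameter names
    # assume a recent langchain_huggingface release.
    return HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        task="text-generation",
        temperature=0.5,
        max_new_tokens=512,                 # rather than model_kwargs["max_length"]
        huggingfacehub_api_token=HF_TOKEN,  # rather than model_kwargs["token"]
    )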
 
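To sanity-check the retrieval side of this commit without launching Streamlit (normally: set HF_TOKEN in the environment, then streamlit run app.py), a minimal sketch is below. It is a hypothetical helper, not part of the commit; it assumes FAISS comes from langchain_community.vectorstores as in current LangChain releases, that data/ holds at least one PDF, and the query string is only an example.

# smoke_test.py - hypothetical helper, not part of this commit.
# Rebuilds the FAISS index the same way create_embeddings() does,
# then runs a plain similarity search (no LLM call, no HF_TOKEN needed).
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

documents = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader).load()
chunks = RecursiveCharacterTextSplitter(chunk_size=500,
                                        chunk_overlap=50).split_documents(documents)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(chunks, embeddings)
db.save_local("vectorstore/db_faiss")

# k=3 mirrors the retriever setting used in the app's QA chain.
for doc in db.similarity_search("How do I sell my bike?", k=3):
    print(doc.metadata.get("page", "N/A"), "-", doc.page_content[:100])

If the three printed chunks look relevant, any remaining failure is on the LLM side (token, model access, endpoint parameters) rather than in the vector store.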