KushwanthK committed on
Commit
6fb423e
·
verified ·
1 Parent(s): 1f6076c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +280 -12
app.py CHANGED
@@ -1,13 +1,127 @@
1
- import pymupdf # PyMuPDF
 
 
 
 
 
 
 
 
2
  import tempfile
 
 
 
 
 
 
3
  import nltk
 
 
4
  from nltk.corpus import stopwords
5
- from collections import Counter
 
6
  from streamlit_image_zoom import image_zoom
7
  from PIL import Image
8
- import streamlit as st
9
 
10
- nltk.download('stopwords')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def highlight_pdf(file_path, text_to_highlight, page_numbers):
13
  # Open the original PDF
@@ -45,6 +159,10 @@ def highlight_pdf(file_path, text_to_highlight, page_numbers):
45
 
46
  return temp_pdf_path, new_page_numbers
47
 
 
 
 
 
48
  def pdf_to_images(pdf_path, page_numbers):
49
  doc = pymupdf.open(pdf_path)
50
  images = []
@@ -55,13 +173,13 @@ def pdf_to_images(pdf_path, page_numbers):
55
  images.append(img)
56
  return images
57
 
58
- def display_highlighted_pdf():
59
- pdf_path = "Bhagavad-Gita-As-It-Is.pdf"
60
- sources = [7, 8, 18, 20, 40, 66]
61
- response_text = ("I offer my respectful obeisances unto the lotus feet of my spiritual master "
62
- "and unto the feet of all Vaishnavas. I offer my respectful")
63
-
64
- highlighted_pdf_path, new_page_numbers = highlight_pdf(file_path=pdf_path, text_to_highlight=response_text, page_numbers=sources)
65
 
66
  images = pdf_to_images(highlighted_pdf_path, new_page_numbers)
67
 
@@ -84,4 +202,154 @@ def display_highlighted_pdf():
84
  else:
85
  st.error("The provided image is not a valid Pillow Image object.")
86
 
87
- display_highlighted_pdf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from streamlit_chat import message
4
+ import numpy as np
5
+ import pandas as pd
6
+ from io import StringIO
7
+ import io
8
+ import PyPDF2
9
+ import pymupdf
10
  import tempfile
11
+ import base64
12
+ # from tqdm.auto import tqdm
13
+ import math
14
+ # from transformers import pipeline
15
+
16
+ from collections import Counter
17
  import nltk
18
+
19
+ nltk.download('stopwords')
20
  from nltk.corpus import stopwords
21
+ import re
22
+
23
  from streamlit_image_zoom import image_zoom
24
  from PIL import Image
 
25
 
26
+
27
+ from sentence_transformers import SentenceTransformer
28
+ import torch
29
+ from langchain_community.llms.ollama import Ollama
30
+ from langchain.prompts import ChatPromptTemplate
31
+ from langchain_community.vectorstores import FAISS
32
+
33
+ from langchain_community.llms import HuggingFaceHub
34
+ # from langchain.vectorstores import faiss
35
+ # from langchain.vectorstores import FAISS
36
+
37
+ import time
38
+ from time import sleep
39
+ from stqdm import stqdm
40
+ from dotenv import load_dotenv
41
+
42
+ # Load environment variables from .env file
43
+ load_dotenv()
44
+
45
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
46
+
47
+
48
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
49
+
50
+ # if device != 'cuda':
51
+ # st.markdown(f"you are using {device}. This is much slower than using "
52
+ # "a CUDA-enabled GPU. If on colab you can change this by "
53
+ # "clicking Runtime > change runtime type > GPU.")
54
+
55
+ model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
56
def display_title():
    """Render the page header for the scripture stored in the session.

    Reads the current selection from ``st.session_state["value"]`` and
    writes it into a Streamlit header.
    """
    scripture = st.session_state["value"]
    st.header(f'Vedic Scriptures: {scripture} :blue[book] :books:')
60
+
61
+ question = "ask anything about scriptures"
62
def open_chat():
    """Look up the FAQ entry stored under ``st.session_state["faq"]``.

    The value is bound to a local name only, so nothing outside this
    function changes.
    """
    # NOTE(review): the looked-up value is never used; confirm whether it was
    # meant to update the chat prompt.
    selected_faq = st.session_state["faq"]
64
+
65
+
66
+
67
+ if "value" not in st.session_state:
68
+ st.session_state["value"] = None
69
+
70
+ if "faq" not in st.session_state:
71
+ st.session_state["faq"] = None
72
+
73
+ # st.divider()
74
+
75
def upload_file():
    """Show a PDF uploader and echo the chosen file's name.

    Returns:
        The uploaded file's name, or ``None`` while nothing has been uploaded.
    """
    pdf = st.file_uploader("Upload a file", type=["pdf"])
    if pdf is None:
        return None
    st.write(pdf.name)
    return pdf.name
80
+
81
def create_pickle_file(filepath):
    """Build a FAISS index from a PDF and pickle it under a timestamped name.

    The PDF is loaded with ``PyMuPDFLoader``, embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model on CPU, indexed with
    ``FAISS.from_documents`` and dumped to
    ``./<path without extension>_<timestamp>.pkl``.

    Args:
        filepath: Path of the PDF to index.

    Returns:
        The path of the pickle file that was written.
    """
    import os
    import pickle
    from datetime import datetime

    from langchain_community.document_loaders import PyMuPDFLoader
    from langchain_community.embeddings import HuggingFaceEmbeddings

    pages = PyMuPDFLoader(filepath).load()

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    # splitext removes only the final extension; splitting on the first "."
    # returned an empty stem for "./book.pdf" and truncated "my.book.pdf".
    stem, _extension = os.path.splitext(filepath)
    print(stem)

    # Millisecond-precision timestamp. "-" replaces ":" in the time part
    # because ":" is not a valid file-name character on Windows.
    formatted_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")[:-3]
    print(formatted_datetime)

    faiss_index = FAISS.from_documents(pages, embeddings)
    output_path = f"./{stem}_{formatted_datetime}.pkl"
    with open(output_path, "wb") as f:
        pickle.dump(faiss_index, f)
    return output_path
120
+
121
+
122
+ # uploaded_file_name = upload_file()
123
+ # if uploaded_file_name is not None:
124
+ # create_pickle_file(uploaded_file_name)
125
 
126
  def highlight_pdf(file_path, text_to_highlight, page_numbers):
127
  # Open the original PDF
 
159
 
160
  return temp_pdf_path, new_page_numbers
161
 
162
+ file_path = "Bhagavad-Gita-As-It-Is.pdf"
163
+ text_to_highlight = ""
164
+ sources = []
165
+
166
  def pdf_to_images(pdf_path, page_numbers):
167
  doc = pymupdf.open(pdf_path)
168
  images = []
 
173
  images.append(img)
174
  return images
175
 
176
+ # Function to display PDF in Streamlit
177
+ def display_highlighted_pdf(file_path, text_to_highlight, sources):
178
+ # pdf_path = "../Transformers/Bhagavad-Gita-As-It-Is.pdf"
179
+ # sources = [7,8]
180
+ # response_text = "I offer my respectful obeisances unto the lotus feet of my spiritual master and unto the feet of all Vaiñëavas. I offer my respectful"
181
+
182
+ highlighted_pdf_path, new_page_numbers = highlight_pdf(file_path=file_path, text_to_highlight=text_to_highlight, page_numbers=sources)
183
 
184
  images = pdf_to_images(highlighted_pdf_path, new_page_numbers)
185
 
 
202
  else:
203
  st.error("The provided image is not a valid Pillow Image object.")
204
 
205
+ # Load the FAISS semantic index from disk (the Pinecone import below is disabled)
206
+ import os
207
+ # import pinecone
208
+
209
+ import pickle
210
@st.cache_resource
def get_faiss_semantic_index():
    """Load the pickled FAISS index from ``./HuggingFaceEmbeddings.pkl``.

    The result is cached with ``st.cache_resource`` so the index is
    deserialized once and shared across reruns instead of being copied on
    every access.

    Returns:
        The unpickled index, or ``None`` if loading failed (an error is
        shown in the Streamlit UI in that case).
    """
    index_path = "./HuggingFaceEmbeddings.pkl"
    print(index_path)
    try:
        # SECURITY: pickle.load can execute arbitrary code. Only load index
        # files that this application produced itself.
        # A single-step progress bar keeps the loading indicator without
        # re-reading the same file several times.
        for _ in stqdm(range(1)):
            with open(index_path, "rb") as f:
                faiss_index = pickle.load(f)
        return faiss_index
    except Exception as e:
        st.error(f"Error loading embeddings: {e}")
        return None
225
+ faiss_index = get_faiss_semantic_index()
226
+ print(faiss_index)
227
+
228
+ # def promt_engineer(text):
229
+ PROMPT_TEMPLATE = """
230
+ Instructions:
231
+ -----------------------------------------------------------------------------------------
232
+ Answer the question only based on the below context:
233
+ - You're a Vedic AI expert in the Hindu Vedic scriptures.
234
+ - Always try to provide Keep it simple answers in nice format without incomplete sentence.
235
+ - Give the answer atleast 5 seperate lines addition to the title info.
236
+ - provide Title: <title> Chapter: <chapter> Text No: <textnumber> Page No: <pagenumber>
237
+ ------------------------------------------------------------------------------------------
238
+
239
+ {context}
240
+
241
+ ----------------------------------------------------------------------------------
242
+
243
+ Answer the question based on the above context: {question}
244
+ """
245
+ # # Load the summarization pipeline with the specified model
246
+ # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
247
+
248
+ # # Generate the prompt
249
+ # prompt = prompt_template.format(text=text)
250
+
251
+ # # Generate the summary
252
+ # summary = summarizer(prompt, max_length=1024, min_length=50)[0]["summary_text"]
253
+
254
+ # with st.sidebar:
255
+ # st.divider()
256
+ # st.markdown("*:red[Text Summary Generation]* from above Top 5 **:green[similarity search results]**.")
257
+ # st.write(summary)
258
+ # st.divider()
259
+
260
def chat_actions():
    """Handle a submitted chat message.

    Steps:
      1. Append the user's message to ``st.session_state["chat_history"]``.
      2. Retrieve the 6 most similar pages from the module-level
         ``faiss_index``.
      3. Fill ``PROMPT_TEMPLATE`` with the retrieved text and the question,
         then send it to the hosted Llama 3 model through ``HuggingFaceHub``.
      4. Append the answer to the chat history and show the source pages via
         ``display_highlighted_pdf``.

    If the index is unavailable an error is shown and the function returns
    early. If the model call fails, the error is shown and an empty answer
    is recorded.
    """
    query = st.session_state["chat_input"]
    st.session_state["chat_history"].append(
        {"role": "user", "content": query},
    )

    # Without an index there is nothing to retrieve; stop before ``docs`` is
    # referenced (previously this path raised NameError).
    if faiss_index is None:
        st.error("Failed to load embeddings.")
        return

    docs = faiss_index.similarity_search(query, k=6)

    for doc in docs:
        page = doc.metadata.get("page")
        # Metadata pages look 0-based (the original printed page + 1);
        # tolerate documents that carry no page number.
        page_label = page + 1 if page is not None else "?"
        print("\n")
        print(f"{page_label}:", doc.page_content)

    context_text = "\n\n---\n\n".join(doc.page_content for doc in docs)

    # Drop documents without a page number so only real pages are handed to
    # the PDF highlighter.
    sources = [
        doc.metadata["page"]
        for doc in docs
        if doc.metadata.get("page") is not None
    ]

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    result = ""
    try:
        llm = HuggingFaceHub(
            repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
            model_kwargs={"temperature": 0.1, "max_new_tokens": 256, "task": "text-generation"},
        )
        response_text = llm.invoke(prompt)
        # The response may echo the prompt; keep only what follows the final
        # instruction line of the template.
        escaped_query = re.escape(query)
        result = re.split(f'Answer the question based on the above context: {escaped_query}\n', response_text)[-1]
        st.write(result)
    except Exception as e:
        st.error(f"Error invoke: {e}")

    formatted_response = f"Response: {result}\nSources: {sources}"
    print(formatted_response)

    st.session_state["chat_history"].append(
        {
            "role": "assistant",
            "content": f"{result}",
        },
    )

    # Highlight the answer text on the retrieved source pages.
    file_path = "Bhagavad-Gita-As-It-Is.pdf"
    display_highlighted_pdf(file_path, result, sources)
312
+
313
+ with st.sidebar:
314
+ option = st.selectbox(
315
+ "Select Your Favorite Scriptures",
316
+ ("Bhagvatgeetha", "Bhagavatham", "Ramayanam"),
317
+ # index=None,
318
+ # placeholder="Select scriptures...",
319
+ key="value",
320
+ on_change=display_title
321
+ )
322
+
323
+ st.write("You selected:", option)
324
+
325
+ faq = st.selectbox(
326
+ "Select Your Favorite Scriptures",
327
+ ("what is jeeva and paramathma?",
328
+ "What are the Krishna told to Arjuna?",
329
+ "What are the key points from Krishna?",
330
+ "Why does atheism exist even when all questions are answered in Bhagavad Gita?",
331
+ "Why don’t all souls surrender to Lord Krishna, although he has demonstrated that everyone is part and parcel of Him, and all can be liberated from all sufferings by surrendering to Him?",
332
+ "Why do souls misuse their independence by rebelling against Lord Krishna?",
333
+ "How do I put an end to my suffering in this world?",
334
+ "what is the reason behind Krishna decided to go far war?"),
335
+ # index=None,
336
+ # placeholder="Select scriptures...",
337
+ key="faq",
338
+ on_change=open_chat
339
+ )
340
+ st.write("You selected:", faq)
341
+
342
+
343
+ if "chat_history" not in st.session_state:
344
+ st.session_state["chat_history"] = []
345
+
346
+ st.chat_input(question, on_submit=chat_actions, key="chat_input")
347
+
348
+ for i in st.session_state["chat_history"]:
349
+ with st.chat_message(name=i["role"]):
350
+ st.write(i["content"])
351
+
352
+
353
+
354
+
355
+