Madiharehan committed on
Commit e3b386e
• 1 Parent(s): 85ddf27

Update app.py

Files changed (1)
  1. app.py +73 -59
app.py CHANGED
@@ -1,15 +1,13 @@
 import os
-import requests
 import streamlit as st
-from io import BytesIO
-from PyPDF2 import PdfReader
+import pdfplumber
+from concurrent.futures import ThreadPoolExecutor
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from transformers import pipeline
-import torch
 
-# Set up the page configuration as the first Streamlit command
+# Set up the page configuration
 st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
 
 # Load the summarization pipeline model
@@ -20,53 +18,6 @@ def load_summarization_pipeline():
 
 summarizer = load_summarization_pipeline()
 
-# Dictionary of Hugging Face PDF URLs grouped by folders
-PDF_FOLDERS = {
-    "PPC and Administration": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PPC%20and%20Administration",
-    ],
-    "IHC": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/IHC"
-    "LHC": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/LHC"
-    "Lahore High Court Rules and Orders": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/Lahore%20High%20Court%20Rules%20and%20Orders"
-    "PHC": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PHC"
-    "SC": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/SC"
-    ],
-}
-
-# Helper function to convert Hugging Face blob URLs to direct download URLs
-def get_huggingface_raw_url(url):
-    if "huggingface.co" in url and "/blob/" in url:
-        return url.replace("/blob/", "/resolve/")
-    return url
-
-# Fetch and extract text from all PDFs in specified folders
-def fetch_pdf_text_from_folders(pdf_folders):
-    all_text = ""
-    for folder_name, urls in pdf_folders.items():
-        folder_text = f"\n[Folder: {folder_name}]\n"
-        for url in urls:
-            raw_url = get_huggingface_raw_url(url)
-            response = requests.get(raw_url)
-            if response.status_code == 200:
-                pdf_file = BytesIO(response.content)
-                try:
-                    pdf_reader = PdfReader(pdf_file)
-                    for page in pdf_reader.pages:
-                        page_text = page.extract_text()
-                        if page_text:
-                            folder_text += page_text
-                except Exception as e:
-                    st.error(f"Failed to read PDF from URL {url}: {e}")
-            else:
-                st.error(f"Failed to fetch PDF from URL: {url}")
-        all_text += folder_text
-    return all_text
-
 # Split text into manageable chunks
 @st.cache_data
 def get_text_chunks(text):
@@ -77,22 +28,80 @@ def get_text_chunks(text):
 # Initialize embedding function
 embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
-# Create a FAISS vector store with embeddings
+# Create a FAISS vector store with embeddings, checking for empty chunks
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
+    if not text_chunks:
+        st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
+        return None
     vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
     return vector_store
 
+# Helper function to process a single PDF
+def process_single_pdf(file_path):
+    text = ""
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text
+    except Exception as e:
+        st.error(f"Failed to read PDF: {file_path} - {e}")
+    return text
+
+# Function to load PDFs with progress display
+def load_pdfs_with_progress(folder_path):
+    all_text = ""
+    pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
+    num_files = len(pdf_files)
+
+    if num_files == 0:
+        st.error("No PDF files found in the specified folder.")
+        st.session_state['vector_store'] = None
+        st.session_state['loading'] = False
+        return
+
+    # Title for the progress bar
+    st.markdown("### Loading data...")
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    processed_count = 0
+
+    for file_path in pdf_files:
+        result = process_single_pdf(file_path)
+        all_text += result
+        processed_count += 1
+        progress_percentage = int((processed_count / num_files) * 100)
+        progress_bar.progress(processed_count / num_files)
+        status_text.text(f"Loading documents: {progress_percentage}% completed")
+
+    progress_bar.empty()  # Remove the progress bar when done
+    status_text.text("Document loading completed!")  # Show completion message
+
+    if all_text:
+        text_chunks = get_text_chunks(all_text)
+        vector_store = load_or_create_vector_store(text_chunks)
+        st.session_state['vector_store'] = vector_store
+    else:
+        st.session_state['vector_store'] = None
+
+    st.session_state['loading'] = False  # Mark loading as complete
+
 # Generate summary based on the retrieved text
 def generate_summary_with_huggingface(query, retrieved_text):
-    summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"
+    summarization_input = f"{query} Related information:{retrieved_text}"
     max_input_length = 1024
     summarization_input = summarization_input[:max_input_length]
     summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
     return summary[0]["summary_text"]
 
 # Generate response for user query
-def user_input(user_question, vector_store):
+def user_input(user_question):
+    vector_store = st.session_state.get('vector_store')
+    if vector_store is None:
+        return "The app is still loading documents or no documents were successfully loaded."
     docs = vector_store.similarity_search(user_question)
     context_text = " ".join([doc.page_content for doc in docs])
     return generate_summary_with_huggingface(user_question, context_text)
@@ -100,18 +109,23 @@ def user_input(user_question, vector_store):
 # Main function to run the Streamlit app
 def main():
     st.title("📄 Gen AI Lawyers Guide")
-    raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
-    text_chunks = get_text_chunks(raw_text)
-    vector_store = load_or_create_vector_store(text_chunks)
+
+    # Start loading documents if not already loaded
+    if 'loading' not in st.session_state or st.session_state['loading']:
+        st.session_state['loading'] = True
+        load_pdfs_with_progress('documents1')
 
     user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
 
+    if st.session_state.get('loading', True):
+        st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")
+
     if st.button("Get Response"):
         if not user_question:
             st.warning("Please enter a question before submitting.")
         else:
             with st.spinner("Generating response..."):
-                answer = user_input(user_question, vector_store)
+                answer = user_input(user_question)
                 st.markdown(f"**🤖 AI:** {answer}")
 
 if __name__ == "__main__":
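
Note: the updated file imports ThreadPoolExecutor, but load_pdfs_with_progress still reads the PDFs one at a time. Below is a minimal sketch of how that import could be put to work, running process_single_pdf across files in parallel while keeping the progress widgets; the helper name load_pdfs_in_parallel and the max_workers value are illustrative, not part of the commit.

from concurrent.futures import ThreadPoolExecutor, as_completed

def load_pdfs_in_parallel(pdf_files, max_workers=4):
    # Hypothetical variant of the commit's sequential loop: text extraction runs
    # in a thread pool, while all Streamlit calls stay on the main script thread.
    all_text = ""
    processed = 0
    progress_bar = st.progress(0)
    status_text = st.empty()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_pdf, path) for path in pdf_files]
        for future in as_completed(futures):
            all_text += future.result()  # concatenates in completion order, not folder order
            processed += 1
            progress_bar.progress(processed / len(pdf_files))
            status_text.text(f"Loading documents: {int(processed / len(pdf_files) * 100)}% completed")
    return all_text

One wrinkle with this approach: process_single_pdf calls st.error, which Streamlit expects on the main script thread, so a threaded version would be better off collecting errors and reporting them after the pool completes.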