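"""RAG-based PDF chat app for Streamlit.

Extracts text from a folder of PDFs, chunks it, indexes the chunks in a FAISS
vector store using sentence-transformer embeddings, and answers questions by
summarizing the retrieved context with a BART summarization model.

Run with (assuming this file is saved as app.py):
    streamlit run app.py
"""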
import os

import pdfplumber
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Note: in newer LangChain releases these two classes live in
# langchain_community.embeddings and langchain_community.vectorstores
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline
# Set up the page configuration
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")


# Load the summarization pipeline model once and cache it across reruns
@st.cache_resource
def load_summarization_pipeline():
    return pipeline("summarization", model="facebook/bart-large-cnn")


summarizer = load_summarization_pipeline()
# Split extracted text into manageable chunks for embedding and retrieval
@st.cache_data
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return text_splitter.split_text(text)
# Initialize embedding function
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
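# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings and truncates
# input beyond roughly 256 tokens, so the 10,000-character chunks above are
# only partially represented in each vector.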
# Create a FAISS vector store from the embedded chunks, guarding against empty input
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    if not text_chunks:
        st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
        return None
    return FAISS.from_texts(text_chunks, embedding=embedding_function)
# Helper function to extract text from a single PDF, skipping pages with no extractable text
def process_single_pdf(file_path):
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"  # newline keeps words at page boundaries from merging
    except Exception as e:
        st.error(f"Failed to read PDF: {file_path} - {e}")
    return text
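
# Extracting many PDFs one by one can be slow; a minimal sketch of concurrent
# extraction (not wired into the progress bar below, and st.error calls from
# worker threads would need extra care in Streamlit):
#
#     from concurrent.futures import ThreadPoolExecutor
#
#     with ThreadPoolExecutor(max_workers=4) as executor:
#         texts = list(executor.map(process_single_pdf, pdf_files))
#     all_text = "\n".join(texts)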
# Load all PDFs in a folder, updating a progress bar as each file completes
def load_pdfs_with_progress(folder_path):
    all_text = ""

    # Check that the folder exists before proceeding; reset session state so
    # the app does not stay stuck in the loading state
    if not os.path.exists(folder_path):
        st.error(f"The folder {folder_path} does not exist.")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
    num_files = len(pdf_files)
    if num_files == 0:
        st.error(f"No PDF files found in the folder: {folder_path}")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    # Title for the progress bar
    st.markdown("### Loading data...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    processed_count = 0
    for file_path in pdf_files:
        all_text += process_single_pdf(file_path)
        processed_count += 1
        progress_percentage = int((processed_count / num_files) * 100)
        progress_bar.progress(processed_count / num_files)
        status_text.text(f"Loading documents: {progress_percentage}% completed")

    progress_bar.empty()  # Remove the progress bar when done
    status_text.text("Document loading completed!")  # Show completion message

    if all_text:
        text_chunks = get_text_chunks(all_text)
        st.session_state['vector_store'] = load_or_create_vector_store(text_chunks)
    else:
        st.session_state['vector_store'] = None

    st.session_state['loading'] = False  # Mark loading as complete
# Summarize the retrieved context in light of the user's query
def generate_summary_with_huggingface(query, retrieved_text):
    summarization_input = f"{query} Related information: {retrieved_text}"
    # BART accepts at most 1024 tokens; truncating by characters is a rough
    # but conservative proxy that keeps the input inside that limit
    max_input_length = 1024
    summarization_input = summarization_input[:max_input_length]
    summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
    return summary[0]["summary_text"]
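
# A token-accurate guard is possible with the model's own tokenizer; a sketch
# (loading the tokenizer separately is an assumption, not part of this app):
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
#     ids = tokenizer.encode(summarization_input, truncation=True, max_length=1024)
#     summarization_input = tokenizer.decode(ids, skip_special_tokens=True)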
# Retrieve relevant chunks for a user query and summarize them
def user_input(user_question):
    vector_store = st.session_state.get('vector_store')
    if vector_store is None:
        return "The app is still loading documents or no documents were successfully loaded."
    docs = vector_store.similarity_search(user_question)
    context_text = " ".join([doc.page_content for doc in docs])
    return generate_summary_with_huggingface(user_question, context_text)
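
# similarity_search returns the top 4 matches by default; pass k=... to trade
# retrieval breadth against the summarizer's input limit.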
# Main function to run the Streamlit app
def main():
    st.title("📄 Gen AI Lawyers Guide")

    # Load documents on the first run of the session
    if 'loading' not in st.session_state or st.session_state['loading']:
        st.session_state['loading'] = True
        folder_path = 'documents1'  # Folder containing the source PDFs; adjust for your environment
        load_pdfs_with_progress(folder_path)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.session_state.get('loading', True):
        st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question)
                st.markdown(f"**🤖 AI:** {answer}")


if __name__ == "__main__":
    main()