File size: 5,309 Bytes
454484f
 
e3b386e
 
454484f
 
 
 
 
e3b386e
454484f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3b386e
454484f
 
e3b386e
 
 
454484f
 
 
e3b386e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d32e6d
 
 
 
 
e3b386e
 
 
 
8d32e6d
e3b386e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454484f
 
e3b386e
454484f
 
 
 
 
 
e3b386e
 
 
 
454484f
 
 
 
 
 
 
e3b386e
 
 
 
8d32e6d
 
 
454484f
 
 
e3b386e
 
 
454484f
 
 
 
 
e3b386e
454484f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import streamlit as st
import pdfplumber
from concurrent.futures import ThreadPoolExecutor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline

# Set up the page configuration (browser tab title, centered layout, page icon).
# The icon was previously a mis-decoded byte sequence; it is the "page facing up" emoji.
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")

# Build the summarization model through a cached factory
@st.cache_resource
def load_summarization_pipeline():
    """Return a Hugging Face ``summarization`` pipeline using ``facebook/bart-large-cnn``.

    The function is wrapped in ``st.cache_resource`` so the pipeline object is
    cached by Streamlit instead of being rebuilt on each call.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Module-level handle used by the summary-generation code further down.
summarizer = load_summarization_pipeline()

# Split text into manageable chunks
@st.cache_data
def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split ``text`` into overlapping chunks with a recursive character splitter.

    Args:
        text: The full text to split.
        chunk_size: Maximum size of each chunk, as measured by the splitter
            (characters by default). Defaults to the previously hard-coded 10000.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Defaults to the previously hard-coded 1000.

    Returns:
        A list of chunk strings, as produced by ``split_text``.

    The result is cached by ``st.cache_data``; the size arguments take part in
    the cache key, so different settings are cached separately.
    """
    # NOTE(review): the default chunk size is much larger than what the
    # embedding model used in this file (all-MiniLM-L6-v2) appears to encode
    # per input -- confirm against the model card and consider smaller chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

# Initialize embedding function
@st.cache_resource
def load_embedding_function():
    """Return the sentence-transformers embedding model used for the vector store.

    Streamlit re-executes this script on every user interaction, so creating
    the model at module level would reload it on each rerun. Wrapping the
    construction in ``st.cache_resource`` lets the same instance be reused.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Same module-level name as before, so existing references keep working.
embedding_function = load_embedding_function()

# Build a FAISS index from the text chunks, refusing to index an empty input
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Return a FAISS vector store built from ``text_chunks``.

    Args:
        text_chunks: Sequence of text chunks to embed and index.

    Returns:
        The FAISS store created with the module-level ``embedding_function``,
        or ``None`` (after showing an error in the UI) when ``text_chunks``
        is empty or otherwise falsy.
    """
    if text_chunks:
        return FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Nothing to index: report it to the user and signal failure with None.
    st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
    return None

# Helper function to process a single PDF
def process_single_pdf(file_path):
    """Extract the text of every page of one PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of all pages that yielded text, joined with newlines so the
        last word of one page is not fused with the first word of the next.
        Returns an empty string when nothing could be extracted.

    Any exception raised while opening or reading the file is reported through
    ``st.error`` rather than propagated; text collected from pages read before
    the failure is still returned (best-effort behaviour).
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # extract_text() may yield nothing for a page; skip those.
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort the whole load.
        st.error(f"Failed to read PDF: {file_path} - {e}")
    return "\n".join(page_texts)

# Function to load PDFs with progress display
def load_pdfs_with_progress(folder_path):
    """Read every PDF in ``folder_path`` and build the vector store.

    Progress is shown with a Streamlit progress bar and status line while the
    files are processed one after another. The outcome is stored in
    ``st.session_state``:

    * ``'vector_store'`` -- the store built from the extracted text, or
      ``None`` when the folder is missing, contains no PDFs, or no text could
      be extracted.
    * ``'loading'`` -- always set to ``False`` before returning, on every
      path, so callers do not treat a failed load as still in progress.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.pdf`` files.

    Returns:
        None. Results are communicated through ``st.session_state``.
    """
    # isdir (rather than exists) also rejects a path that is a regular file,
    # which would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        st.error(f"The folder {folder_path} does not exist.")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    # Case-insensitive extension match so "REPORT.PDF" is picked up too;
    # sorted so the combined text does not depend on directory listing order.
    pdf_files = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith('.pdf')
    )
    num_files = len(pdf_files)

    if num_files == 0:
        st.error(f"No PDF files found in the folder: {folder_path}")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    # Title for the progress bar
    st.markdown("### Loading data...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    document_texts = []
    for processed_count, file_path in enumerate(pdf_files, start=1):
        document_text = process_single_pdf(file_path)
        if document_text:
            document_texts.append(document_text)
        fraction_done = processed_count / num_files
        progress_bar.progress(fraction_done)
        status_text.text(f"Loading documents: {int(fraction_done * 100)}% completed")

    progress_bar.empty()  # Remove the progress bar when done
    status_text.text("Document loading completed!")  # Show completion message

    # Separate documents with a newline so adjacent files do not run together.
    all_text = "\n".join(document_texts)

    if all_text:
        text_chunks = get_text_chunks(all_text)
        st.session_state['vector_store'] = load_or_create_vector_store(text_chunks)
    else:
        st.session_state['vector_store'] = None

    st.session_state['loading'] = False  # Mark loading as complete

# Summarize the retrieved passages in the context of the user's query
def generate_summary_with_huggingface(query, retrieved_text):
    """Return a summary of ``retrieved_text`` prefixed by ``query``.

    The query and retrieved text are combined into one prompt, which is cut to
    its first 1024 characters before being passed to the module-level
    ``summarizer``. Generation is deterministic (``do_sample=False``) with a
    summary length bounded by ``min_length=50`` and ``max_length=500``.

    Args:
        query: The user's question.
        retrieved_text: Text retrieved from the vector store.

    Returns:
        The ``summary_text`` field of the first result from the summarizer.
    """
    char_budget = 1024
    prompt = f"{query} Related information:{retrieved_text}"[:char_budget]
    results = summarizer(prompt, max_length=500, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result["summary_text"]

# Answer a user question from the indexed documents
def user_input(user_question):
    """Return an answer to ``user_question`` based on the stored vector index.

    Looks up ``'vector_store'`` in ``st.session_state``. When a store is
    present, the passages returned by its similarity search are joined with
    single spaces and summarized together with the question. When it is
    absent (or ``None``), a fixed explanatory message is returned instead.

    Args:
        user_question: The question typed by the user.

    Returns:
        The generated summary string, or the fallback message.
    """
    store = st.session_state.get('vector_store')
    if store is not None:
        matches = store.similarity_search(user_question)
        combined_context = " ".join(match.page_content for match in matches)
        return generate_summary_with_huggingface(user_question, combined_context)

    return "The app is still loading documents or no documents were successfully loaded."

# Main function to run the Streamlit app
def main():
    """Render the app: load documents if needed, then answer questions.

    On a run where ``st.session_state['loading']`` is missing or truthy, the
    PDFs in the ``documents1`` folder are loaded (synchronously) before the
    rest of the page is drawn. The page then shows a question box and a
    "Get Response" button; pressing the button with a non-empty question
    displays the generated answer, otherwise a warning is shown.
    """
    # The title and answer label previously contained mis-decoded bytes where
    # the document and robot emoji belong.
    st.title("📄 Gen AI Lawyers Guide")

    # Start loading documents if not already loaded. A missing key counts as
    # "needs loading", hence the True default.
    if st.session_state.get('loading', True):
        st.session_state['loading'] = True
        # Update folder path for your environment, e.g., 'documents1' should be checked
        folder_path = 'documents1'
        load_pdfs_with_progress(folder_path)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    # Only shown if the loader returned without clearing the 'loading' flag.
    if st.session_state.get('loading', True):
        st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question)
                st.markdown(f"**🤖 AI:** {answer}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()