import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# Global variables for caching the model and embeddings
model = None
index = None
embeddings = None
text_chunks = []
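# NOTE: Streamlit re-runs this script from top to bottom on every interaction,
# so these module-level globals are reset on each rerun and the "only once"
# checks in main() only hold within a single run. A minimal sketch of caching
# that survives reruns (an alternative to the globals used here, not part of
# the original design) would keep the objects in st.session_state, e.g.:
#
#   if "model" not in st.session_state:
#       st.session_state.model = SentenceTransformer('all-MiniLM-L6-v2')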

# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        file.seek(0)  # Rewind the upload buffer in case it was already read on an earlier rerun
        tmpfile.write(file.read())  # Write the uploaded file's content to the temp file
        tmpfile_path = tmpfile.name  # Get the temporary file path
    return tmpfile_path

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""

# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    try:
        st.write("Chunking text into smaller sections...")
        words = text.split()
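        # Split into fixed-size, non-overlapping windows of `chunk_size` words;
        # overlapping windows are a common variation but are not used here.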
        chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
        return chunks
    except Exception as e:
        st.error(f"Error chunking text: {e}")
        return []

# Function to load the embedding model
def load_model():
    global model
    st.write("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to generate embeddings
def generate_embeddings():
    global embeddings, text_chunks, index
    st.write("Generating embeddings...")
    embeddings = []
    for chunk in text_chunks:
        embeddings.append(model.encode(chunk, convert_to_numpy=True))
    embeddings = np.array(embeddings)
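    # Encoding chunk by chunk works, but SentenceTransformer.encode also accepts
    # a list of strings (e.g. model.encode(text_chunks, convert_to_numpy=True)),
    # which batches the inputs and is noticeably faster for large documents.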

    # Build FAISS index
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
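    # IndexFlatL2 performs exact (brute-force) nearest-neighbour search using
    # L2 distance, which is fine at this scale.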
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

# Main function to run the Streamlit app
def main():
    global embeddings, text_chunks, index, model

    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

    if uploaded_file is not None:
        # Process the uploaded PDF and get its file path
        tmp_file_path = process_pdf(uploaded_file)

        # Extract text from the uploaded PDF
        pdf_text = extract_text_from_pdf(tmp_file_path)
        os.remove(tmp_file_path)  # Delete the temporary copy once its text has been extracted

        if not pdf_text:
            st.error("No text extracted from the PDF. Please upload a valid file.")
            return

        # Initialize Sentence-Transformer model and embeddings only once
        if model is None:
            load_model()

        # Chunk text into smaller sections for embedding generation
        if not text_chunks:
            text_chunks = chunk_text(pdf_text, chunk_size=200)

        # Generate embeddings only once
        if embeddings is None:
            generate_embeddings()

        # Query input field for users to enter their search queries
        query = st.text_input("Enter a query to search:")

        if query:
            # Generate embedding for the query
            query_embedding = model.encode([query], convert_to_numpy=True)

            # Perform similarity search using FAISS
            st.write("Searching...")
            start_time = time.time()
            D, I = index.search(query_embedding, k=min(5, len(text_chunks)))
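            # D holds the (squared) L2 distances and I the indices of the
            # best-matching chunks, both ordered from closest to farthest.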
            end_time = time.time()

            # Display the results
            st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
            for i in range(len(I[0])):
                st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")

if __name__ == "__main__":
    main()
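
# To try this locally (assuming the script is saved as app.py), install the
# dependencies and launch it with Streamlit:
#
#   pip install streamlit PyPDF2 sentence-transformers faiss-cpu numpy
#   streamlit run app.py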