# app.py — PDF embedding & query Streamlit app
# (uploaded by Ahmadkhan12, commit 0e65123)
import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time
# Module-level cache shared across Streamlit reruns within one process:
# the embedding model, the FAISS index, the chunk embeddings, and the
# raw text chunks. All are populated lazily by main().
model = None  # SentenceTransformer instance, loaded once by load_model()
index = None  # faiss.IndexFlatL2 built by generate_embeddings()
embeddings = None  # numpy array of chunk embeddings, one row per chunk
text_chunks = []  # list of ~200-word text chunks from the uploaded PDF
# Persist the uploaded PDF to disk so PyPDF2 can open it by path.
def process_pdf(file):
    """Write the uploaded file's bytes to a temporary .pdf and return its path.

    The file is created with delete=False so the path remains valid after
    the context manager closes it; the caller is responsible for cleanup.
    """
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        tmpfile.write(file.read())
        return tmpfile.name
# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of the PDF at *pdf_path*.

    Returns:
        The concatenated page text, or "" if the file cannot be read.
    """
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            # extract_text() returns None for pages with no extractable text
            # (e.g. scanned images); guard against concatenating None, which
            # would raise TypeError in the original code.
            text += page.extract_text() or ""
        return text
    except Exception as e:
        # Best-effort: surface the error in the UI and let the caller
        # treat "" as "no text extracted".
        st.error(f"Error extracting text from PDF: {e}")
        return ""
# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    """Split *text* on whitespace into chunks of at most *chunk_size* words.

    Returns a list of space-joined chunk strings; [] on any failure.
    """
    try:
        st.write("Chunking text into smaller sections...")
        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            chunks.append(" ".join(words[start:start + chunk_size]))
        return chunks
    except Exception as e:
        st.error(f"Error chunking text: {e}")
        return []
# Lazily initialise the sentence-embedding model exactly once per process.
def load_model():
    """Load the all-MiniLM-L6-v2 encoder into the module-level ``model``."""
    global model
    st.write("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to generate embeddings
def generate_embeddings():
    """Encode every text chunk and build a FAISS L2 index over the vectors.

    Reads the module-level ``model`` and ``text_chunks``; writes the
    resulting ``embeddings`` matrix and ``index``. No-op (with a UI error)
    when there are no chunks, which previously crashed on
    ``embeddings.shape[-1]`` for an empty array.
    """
    global embeddings, text_chunks, index
    st.write("Generating embeddings...")
    if not text_chunks:
        st.error("No text chunks available to embed.")
        return
    # Batch-encode all chunks in one call instead of one encode() per chunk
    # in a Python loop — substantially faster and returns a 2-D array directly.
    encoded = model.encode(text_chunks, convert_to_numpy=True)
    # FAISS requires float32 vectors; coerce explicitly rather than relying
    # on the model's default output dtype.
    embeddings = np.asarray(encoded, dtype=np.float32)
    # Build FAISS index
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
# Main function to run the Streamlit app
def main():
    """Streamlit entry point: upload a PDF, embed its text, answer queries."""
    global embeddings, text_chunks, index, model
    st.title("PDF Embedding and Query System")
    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        # Process the uploaded PDF and get its file path
        tmp_file_path = process_pdf(uploaded_file)
        try:
            # Extract text from the uploaded PDF
            pdf_text = extract_text_from_pdf(tmp_file_path)
        finally:
            # process_pdf() creates the file with delete=False; remove it so
            # repeated uploads/reruns do not accumulate temp files on disk.
            try:
                os.remove(tmp_file_path)
            except OSError:
                pass
        if not pdf_text:
            st.error("No text extracted from the PDF. Please upload a valid file.")
            return
        # Initialize Sentence-Transformer model and embeddings only once
        if model is None:
            load_model()
        # Chunk text into smaller sections for embedding generation.
        # TODO(review): this caches the FIRST uploaded PDF forever — uploading
        # a different file reuses the old chunks; key the cache on the file.
        if not text_chunks:
            text_chunks = chunk_text(pdf_text, chunk_size=200)
        # Generate embeddings only once
        if embeddings is None:
            generate_embeddings()
        if index is None or not text_chunks:
            # Embedding/indexing failed (e.g. empty document); nothing to query.
            return
        # Query input field for users to enter their search queries
        query = st.text_input("Enter a query to search:")
        if query:
            # FAISS expects float32 query vectors; coerce explicitly.
            query_embedding = np.asarray(
                model.encode([query], convert_to_numpy=True), dtype=np.float32
            )
            # Perform similarity search using FAISS
            st.write("Searching...")
            start_time = time.time()
            # Never ask for more neighbours than indexed vectors: FAISS pads
            # the result with -1 indices, which text_chunks[-1] would then
            # silently display as a bogus match.
            k = min(5, len(text_chunks))
            D, I = index.search(query_embedding, k)
            end_time = time.time()
            # Display the results
            st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
            for i in range(k):
                st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")

if __name__ == "__main__":
    main()