# PDF Embedding and Query System — Streamlit app.
import tempfile | |
import os | |
import streamlit as st | |
from PyPDF2 import PdfReader | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
import time | |
# Module-level cache shared across Streamlit reruns within this process,
# so the model and index are built only once per session.
model = None        # SentenceTransformer instance, loaded lazily
index = None        # FAISS index over the chunk embeddings
embeddings = None   # numpy array of chunk embeddings
text_chunks = []    # word-based chunks extracted from the uploaded PDF
# Persist the uploaded file to disk so PyPDF2 can read it by path.
def process_pdf(file):
    """Write the uploaded file-like object to a temporary .pdf and return its path.

    The temp file is created with delete=False, so the caller owns cleanup.
    """
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(file.read())
        # Exiting the `with` closes the handle; the file itself persists.
        return tmp.name
# Pull the raw text out of a PDF file on disk.
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Returns "" (and shows a Streamlit error) if the PDF cannot be read.
    """
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        # extract_text() can return None for image-only pages (and in older
        # PyPDF2 versions); coerce to "" so concatenation never raises.
        # "".join avoids the quadratic cost of repeated `text +=`.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""
# Break a long text into fixed-size word windows for embedding.
def chunk_text(text, chunk_size=200):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of chunk strings, or [] (with a Streamlit error) on failure.
    """
    try:
        st.write("Chunking text into smaller sections...")
        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            chunks.append(" ".join(words[start:start + chunk_size]))
        return chunks
    except Exception as e:
        st.error(f"Error chunking text: {e}")
        return []
# Populate the module-level `model` cache with the embedding model.
def load_model():
    """Load the all-MiniLM-L6-v2 sentence-embedding model into the global cache."""
    global model
    st.write("Loading embedding model...")
    # Loaded once per process; main() only calls this while `model is None`.
    model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed all cached text chunks and build a FAISS index over them.
def generate_embeddings():
    """Encode every chunk in `text_chunks` and build `index` (FAISS IndexFlatL2).

    Populates the module-level `embeddings` array and `index`. Does nothing
    (beyond an error message) when there are no chunks, which would otherwise
    crash FAISS with a zero-dimension index.
    """
    global embeddings, index
    st.write("Generating embeddings...")
    if not text_chunks:
        st.error("No text chunks available to embed.")
        return
    # Batch-encode all chunks in one call — much faster than encoding each
    # chunk in a Python loop, with identical resulting vectors.
    embeddings = np.asarray(
        model.encode(text_chunks, convert_to_numpy=True), dtype=np.float32
    )
    # Build FAISS index (FAISS requires float32 input).
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
# Main function to run the Streamlit app.
def main():
    """Streamlit entry point: upload a PDF, embed its text, answer similarity queries."""
    global embeddings, text_chunks, index, model
    st.title("PDF Embedding and Query System")
    # File uploader for the user to upload a PDF.
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        # Save the upload to a temp file so PyPDF2 can read it by path.
        tmp_file_path = process_pdf(uploaded_file)
        try:
            pdf_text = extract_text_from_pdf(tmp_file_path)
        finally:
            # The temp file was created with delete=False; remove it here or
            # it leaks on every upload/rerun.
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass
        if not pdf_text:
            st.error("No text extracted from the PDF. Please upload a valid file.")
            return
        # Load the embedding model only once per process.
        if model is None:
            load_model()
        # Chunk the text once; subsequent reruns reuse the cached chunks.
        if not text_chunks:
            text_chunks = chunk_text(pdf_text, chunk_size=200)
        # Generate embeddings (and the FAISS index) only once.
        if embeddings is None:
            generate_embeddings()
        # Query input field for users to enter their search queries.
        query = st.text_input("Enter a query to search:")
        # Guard on `index`: embedding generation may have failed (e.g. a PDF
        # with no extractable chunks), leaving no index to search.
        if query and index is not None:
            # Embed the query and run a k-NN search against the chunk index.
            query_embedding = model.encode([query], convert_to_numpy=True)
            st.write("Searching...")
            start_time = time.time()
            D, I = index.search(query_embedding, k=5)
            end_time = time.time()
            # Display the top matches with their L2 distances.
            st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
            for i in range(len(I[0])):
                st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()