Spaces:
Sleeping
Sleeping
File size: 3,914 Bytes
ba5f07e 41a527e 180125b 2559e80 b5b7646 ba5f07e 0e65123 54146e4 ba5f07e 61651bd ba5f07e 54146e4 ba5f07e 2559e80 61651bd 2559e80 b5b7646 61651bd b5b7646 0e65123 54146e4 ba5f07e 0e65123 ba5f07e 54146e4 ba5f07e 54146e4 ba5f07e 54146e4 ba5f07e 54146e4 2559e80 61651bd 0e65123 54146e4 b5b7646 61651bd 0e65123 61651bd 0e65123 54146e4 ba5f07e 2559e80 ba5f07e 2559e80 afbfc0e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time
# Module-level state shared by the functions in this file.
model = None  # embedding model; assigned by load_model()
index = None  # FAISS index built over `embeddings`; assigned by generate_embeddings()
embeddings = None  # NumPy array of chunk embeddings; assigned by generate_embeddings()
text_chunks = []  # list of text chunks; filled in main() from chunk_text()
# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    """Persist an uploaded PDF to a temporary file on disk.

    Args:
        file: Binary file-like object holding the upload; it must support
            ``read()``.

    Returns:
        str: Path of the temporary ``.pdf`` file. The file is created with
        ``delete=False``, so it outlives this function and the caller is
        responsible for removing it.
    """
    st.write("Processing uploaded PDF...")

    # Rewind before reading: if the stream was already consumed, read()
    # would return b"" and an empty, unparsable PDF would be written.
    seekable = getattr(file, "seekable", None)
    if callable(seekable) and seekable():
        file.seek(0)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        tmpfile.write(file.read())

    # The name stays valid after the handle is closed because delete=False.
    return tmpfile.name
# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        str: The text of all pages that yielded any text, separated by
        newlines. Returns ``""`` when no page yields text or when any error
        occurs (the error is reported through ``st.error``).
    """
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in reader.pages)
        # Skip pages that yield nothing (empty or None) so one such page
        # cannot abort the whole extraction, and so a PDF without any text
        # still produces "" for the caller's emptiness check.
        # Joining with a newline keeps the last word of one page from fusing
        # with the first word of the next.
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"Error extracting text from PDF: {e}")
        return ""
# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    """Split text into consecutive chunks of whitespace-separated words.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        list: Chunks in document order, each a string of up to ``chunk_size``
        words joined by single spaces. Returns ``[]`` if anything goes wrong
        (the error is reported through ``st.error``).
    """
    try:
        st.write("Chunking text into smaller sections...")
        tokens = text.split()
        pieces = []
        for start in range(0, len(tokens), chunk_size):
            window = tokens[start:start + chunk_size]
            pieces.append(" ".join(window))
        return pieces
    except Exception as e:
        st.error(f"Error chunking text: {e}")
        return []
# Function to load the embedding model
def load_model():
    """Instantiate the embedding model and store it in the global ``model``."""
    global model
    st.write("Loading embedding model...")
    model_name = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name)
# Function to generate embeddings
def generate_embeddings():
    """Embed every entry of ``text_chunks`` and build a FAISS L2 index.

    Reads the module-level ``model`` and ``text_chunks``. Writes the
    module-level ``embeddings`` (2-D array, one row per chunk) and ``index``
    (a ``faiss.IndexFlatL2`` containing those rows).

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to
            index and the embedding dimension cannot be determined.
    """
    global embeddings, text_chunks, index

    # Fail fast with a clear message instead of an obscure shape error from
    # FAISS when there is nothing to embed.
    if not text_chunks:
        raise ValueError("No text chunks available to embed.")

    st.write("Generating embeddings...")
    # Encode all chunks in one batched call rather than one call per chunk.
    vectors = model.encode(text_chunks, convert_to_numpy=True)
    # FAISS operates on C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(vectors, dtype=np.float32)

    # Build FAISS index
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
# Main function to run the Streamlit app
def main():
    """Run the Streamlit app: upload a PDF, index its text, answer queries.

    Flow: save the upload to a temporary file, extract its text, load the
    embedding model, chunk the text, build the FAISS index, then embed the
    user's query and display the nearest chunks with their L2 distances.

    Reads and writes the module-level ``model``, ``text_chunks``,
    ``embeddings`` and ``index``.
    """
    global embeddings, text_chunks, index, model

    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # Save the upload to disk, extract its text, then always clean up.
    tmp_file_path = process_pdf(uploaded_file)
    try:
        pdf_text = extract_text_from_pdf(tmp_file_path)
    finally:
        # process_pdf creates the file with delete=False; remove it once the
        # text has been read so temporary files do not accumulate.
        try:
            os.remove(tmp_file_path)
        except OSError:
            # Best-effort cleanup: a failed delete must not break the app.
            pass

    if not pdf_text:
        st.error("No text extracted from the PDF. Please upload a valid file.")
        return

    # Load the embedding model only if it is not already loaded.
    if model is None:
        load_model()

    # NOTE(review): these guards reuse existing chunks/embeddings whenever the
    # module globals are already populated, regardless of which PDF produced
    # them -- confirm that is intended if the globals can outlive one upload.
    if not text_chunks:
        text_chunks = chunk_text(pdf_text, chunk_size=200)

    # Whitespace-only text (or a chunking failure) yields no chunks; stop
    # here rather than failing while building the index.
    if not text_chunks:
        st.error("No searchable text found in the PDF.")
        return

    if embeddings is None:
        generate_embeddings()

    # Query input field for users to enter their search queries
    query = st.text_input("Enter a query to search:")
    if not query:
        return

    # Generate embedding for the query
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with label -1, which would index text_chunks[-1] and display
    # the last chunk as a bogus match.
    top_k = min(5, len(text_chunks))

    st.write("Searching...")
    start_time = time.perf_counter()
    D, I = index.search(query_embedding, k=top_k)
    end_time = time.perf_counter()

    # Display the results
    st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
    rank = 0
    for distance, chunk_idx in zip(D[0], I[0]):
        if chunk_idx < 0:
            # Defensive: skip padding labels should the index return any.
            continue
        rank += 1
        st.write(f"Match {rank}: {text_chunks[chunk_idx]} (Distance: {distance:.4f})")
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|