Spaces:

Ahmadkhan12
/

Rag-university-act-2016

Sleeping

Rag-university-act-2016

File size: 3,914 Bytes

import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# Module-level state shared by the functions below.
# `model` is assigned by load_model(); `index` and `embeddings` are assigned
# by generate_embeddings(); `text_chunks` is assigned by main().
model = index = embeddings = None
text_chunks = []

# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    """Copy an uploaded file's bytes into a temporary ``.pdf`` file.

    Args:
        file: An object with a ``read()`` method returning the file's bytes.

    Returns:
        The path of the temporary file. The file is created with
        ``delete=False``, so it stays on disk after this function returns.
    """
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        # Dump the whole upload into the temp file in one write.
        handle.write(file.read())
        saved_path = handle.name
    return saved_path

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Page texts are joined with a newline so that the last word of one page
    and the first word of the next are not fused into a single token when
    the result is later split on whitespace. Pages that yield no text are
    skipped, so a PDF with no extractable text still produces ``""``.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The extracted text, or ``""`` if nothing could be extracted or an
        error occurred (the error is reported through ``st.error``).
    """
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        # `or ""` guards against a page returning None/empty (e.g. an
        # image-only page); previously a None would raise TypeError and the
        # broad handler below would throw away the whole document.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        # UI boundary: report the failure and fall back to "no text".
        st.error(f"Error extracting text from PDF: {e}")
        return ""

# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are obtained by splitting on whitespace and each chunk is re-joined
    with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, or ``[]`` if an error occurred (the error is
        reported through ``st.error``).
    """
    try:
        st.write("Chunking text into smaller sections...")
        tokens = text.split()
        pieces = []
        for start in range(0, len(tokens), chunk_size):
            window = tokens[start:start + chunk_size]
            pieces.append(" ".join(window))
        return pieces
    except Exception as exc:
        st.error(f"Error chunking text: {exc}")
        return []

# Function to load the embedding model
def load_model():
    """Instantiate the sentence-embedding model and store it in ``model``."""
    global model
    st.write("Loading embedding model...")
    model_name = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name)

# Function to generate embeddings
def generate_embeddings():
    """Embed every entry of ``text_chunks`` and build a FAISS L2 index.

    Reads the module-level ``model`` and ``text_chunks``; assigns the
    module-level ``embeddings`` (one row per chunk) and ``index`` (a
    ``faiss.IndexFlatL2`` containing those rows).

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to
            encode and no embedding dimension to build an index from.
    """
    global embeddings, text_chunks, index

    if not text_chunks:
        raise ValueError(
            "Cannot generate embeddings: there are no text chunks to encode."
        )

    st.write("Generating embeddings...")
    # Encode all chunks in one call so the model can batch them internally
    # instead of paying the per-call overhead once per chunk.
    encoded = model.encode(text_chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(encoded, dtype=np.float32)

    # Build FAISS index
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

# Main function to run the Streamlit app
def main():
    """Run the Streamlit app: upload a PDF, index it, and answer queries.

    Flow:
        1. Let the user upload a PDF.
        2. Extract its text, chunk it, embed the chunks and build a FAISS
           index (done once per uploaded file, then reused).
        3. Embed the user's query and show the closest chunks.

    Streamlit re-executes the script on every widget interaction, which
    resets module-level globals. The expensive artifacts (model, chunks,
    embeddings, index) are therefore kept in ``st.session_state`` and keyed
    by the uploaded file, so typing a query does not re-process the PDF and
    uploading a different PDF does not reuse the previous file's index.
    """
    global embeddings, text_chunks, index, model

    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # Identify the upload by name and size; a different file invalidates
    # the cached chunks/embeddings/index.
    file_key = (uploaded_file.name, uploaded_file.size)
    cached = st.session_state.get("rag_cache")

    if cached is not None and cached["file_key"] == file_key:
        # Same file as the previous run: restore the artifacts built then.
        model = cached["model"]
        text_chunks = cached["text_chunks"]
        embeddings = cached["embeddings"]
        index = cached["index"]
    else:
        # The model does not depend on the file, so reuse it across uploads.
        if model is None and cached is not None:
            model = cached["model"]

        # Make sure the upload is read from the start.
        uploaded_file.seek(0)
        tmp_file_path = process_pdf(uploaded_file)
        try:
            pdf_text = extract_text_from_pdf(tmp_file_path)
        finally:
            # The temp file is only needed for extraction; remove it so it
            # does not accumulate on disk. Cleanup is best-effort.
            try:
                os.remove(tmp_file_path)
            except OSError:
                pass

        # Whitespace-only text would produce zero chunks, so treat it the
        # same as no text at all.
        if not pdf_text.strip():
            st.error("No text extracted from the PDF. Please upload a valid file.")
            return

        if model is None:
            load_model()

        text_chunks = chunk_text(pdf_text, chunk_size=200)
        if not text_chunks:
            # chunk_text has already reported its own failure.
            return

        generate_embeddings()

        st.session_state["rag_cache"] = {
            "file_key": file_key,
            "model": model,
            "text_chunks": text_chunks,
            "embeddings": embeddings,
            "index": index,
        }

    # Query input field for users to enter their search queries
    query = st.text_input("Enter a query to search:")
    if not query:
        return

    # Generate embedding for the query
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Never ask for more neighbours than there are chunks: FAISS pads the
    # missing results with label -1, which would index the *last* chunk.
    top_k = min(5, len(text_chunks))

    st.write("Searching...")
    start_time = time.perf_counter()
    distances, labels = index.search(query_embedding, k=top_k)
    end_time = time.perf_counter()

    # Display the results
    st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
    for rank, (chunk_idx, distance) in enumerate(zip(labels[0], distances[0]), start=1):
        if chunk_idx < 0:
            # Defensive: skip "no result" placeholders.
            continue
        st.write(f"Match {rank}: {text_chunks[chunk_idx]} (Distance: {distance:.4f})")

# Entry point: run the app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()