# PDF Embedding and Query System — Streamlit app.
import tempfile | |
import os | |
import streamlit as st | |
from PyPDF2 import PdfReader | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
import time | |
# Module-level cache shared across Streamlit reruns within this process,
# so the model and index are built only once per session.
model = None        # SentenceTransformer instance, loaded lazily
index = None        # FAISS index over the chunk embeddings
embeddings = None   # numpy array of chunk embeddings
text_chunks = []    # word-based chunks extracted from the uploaded PDF
# Persist the uploaded file to disk so PyPDF2 can read it by path.
def process_pdf(file):
    """Write the uploaded file-like object to a temporary .pdf and return its path.

    The temp file is created with delete=False, so the caller owns cleanup.
    """
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(file.read())
        # Exiting the `with` closes the handle; the file itself persists.
        return tmp.name
# Pull the raw text out of a PDF file on disk.
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Returns "" (and shows a Streamlit error) if the PDF cannot be read.
    """
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        # extract_text() can return None for image-only pages (and in older
        # PyPDF2 versions); coerce to "" so concatenation never raises.
        # "".join avoids the quadratic cost of repeated `text +=`.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""
# Break a long text into fixed-size word windows for embedding.
def chunk_text(text, chunk_size=200):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of chunk strings, or [] (with a Streamlit error) on failure.
    """
    try:
        st.write("Chunking text into smaller sections...")
        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            chunks.append(" ".join(words[start:start + chunk_size]))
        return chunks
    except Exception as e:
        st.error(f"Error chunking text: {e}")
        return []
# Populate the module-level `model` cache with the embedding model.
def load_model():
    """Load the all-MiniLM-L6-v2 sentence-embedding model into the global cache."""
    global model
    st.write("Loading embedding model...")
    # Loaded once per process; main() only calls this while `model is None`.
    model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed all cached text chunks and build a FAISS index over them.
def generate_embeddings():
    """Encode every chunk in `text_chunks` and build `index` (FAISS IndexFlatL2).

    Populates the module-level `embeddings` array and `index`. Does nothing
    (beyond an error message) when there are no chunks, which would otherwise
    crash FAISS with a zero-dimension index.
    """
    global embeddings, index
    st.write("Generating embeddings...")
    if not text_chunks:
        st.error("No text chunks available to embed.")
        return
    # Batch-encode all chunks in one call — much faster than encoding each
    # chunk in a Python loop, with identical resulting vectors.
    embeddings = np.asarray(
        model.encode(text_chunks, convert_to_numpy=True), dtype=np.float32
    )
    # Build FAISS index (FAISS requires float32 input).
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
# Main function to run the Streamlit app.
def main():
    """Streamlit entry point: upload a PDF, embed its text, answer similarity queries."""
    global embeddings, text_chunks, index, model
    st.title("PDF Embedding and Query System")
    # File uploader for the user to upload a PDF.
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        # Save the upload to a temp file so PyPDF2 can read it by path.
        tmp_file_path = process_pdf(uploaded_file)
        try:
            pdf_text = extract_text_from_pdf(tmp_file_path)
        finally:
            # The temp file was created with delete=False; remove it here or
            # it leaks on every upload/rerun.
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass
        if not pdf_text:
            st.error("No text extracted from the PDF. Please upload a valid file.")
            return
        # Load the embedding model only once per process.
        if model is None:
            load_model()
        # Chunk the text once; subsequent reruns reuse the cached chunks.
        if not text_chunks:
            text_chunks = chunk_text(pdf_text, chunk_size=200)
        # Generate embeddings (and the FAISS index) only once.
        if embeddings is None:
            generate_embeddings()
        # Query input field for users to enter their search queries.
        query = st.text_input("Enter a query to search:")
        # Guard on `index`: embedding generation may have failed (e.g. a PDF
        # with no extractable chunks), leaving no index to search.
        if query and index is not None:
            # Embed the query and run a k-NN search against the chunk index.
            query_embedding = model.encode([query], convert_to_numpy=True)
            st.write("Searching...")
            start_time = time.time()
            D, I = index.search(query_embedding, k=5)
            end_time = time.time()
            # Display the top matches with their L2 distances.
            st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
            for i in range(len(I[0])):
                st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()