Spaces:

datascientist22
/

rag-pdf-chatbot

Sleeping

App Files Files Community

rag-pdf-chatbot / app.py

datascientist22

Create app.py

e868234 verified 11 months ago

raw

history blame

2.52 kB

	import streamlit as st
	import os
	import PyPDF2
	import torch
	from transformers import AutoTokenizer, AutoModel
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	# Page header: the app title, then a Markdown link to the author's LinkedIn.
	_APP_TITLE = "Engr. Hamesh Raj's PDF Chunking & Embedding Viewer"
	_AUTHOR_LINK_MD = "[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)"
	st.title(_APP_TITLE)
	st.markdown(_AUTHOR_LINK_MD)

	# Build the DistilBERT tokenizer/encoder pair behind Streamlit's resource
	# cache so the checkpoint is not reloaded on every script rerun.
	@st.cache_resource
	def load_model():
	    """Return the ``(tokenizer, model)`` pair for ``distilbert-base-uncased``."""
	    checkpoint = 'distilbert-base-uncased'
	    return (
	        AutoTokenizer.from_pretrained(checkpoint),
	        AutoModel.from_pretrained(checkpoint),
	    )

	# Module-level handles used by the embedding code further down.
	tokenizer, model = load_model()

	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the app passes the
	            uploaded-file object returned by Streamlit's file uploader.

	    Returns:
	        A single string with each page's extracted text joined back-to-back
	        (no separator is inserted between pages).
	    """
	    reader = PyPDF2.PdfReader(pdf_file)
	    # Walk the pages directly and join once instead of growing a string with
	    # ``+=``. ``or ''`` means a page that yields no text cannot break the join.
	    return ''.join(page.extract_text() or '' for page in reader.pages)

	def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
	    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

	    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
	    splitter; the result of its ``split_text`` call is returned as-is.
	    """
	    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
	    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

	def get_embeddings(texts, batch_size=16):
	    """Embed each text as the mean of its real (non-padding) token states.

	    Uses the module-level ``tokenizer`` and ``model``.

	    Args:
	        texts: A string or a sequence of strings to embed.
	        batch_size: How many texts go through the model per forward pass.
	            Bounds peak memory when a PDF produces many chunks.

	    Returns:
	        A 2-D tensor with one row per input text. An empty ``texts`` yields
	        a tensor with zero rows rather than being sent to the tokenizer.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be at least 1")

	    # A bare string is one text, not a sequence of characters to slice up.
	    texts = [texts] if isinstance(texts, str) else list(texts)
	    if not texts:
	        return torch.empty((0, model.config.hidden_size))

	    pooled_batches = []
	    for start in range(0, len(texts), batch_size):
	        batch = texts[start:start + batch_size]
	        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
	        with torch.no_grad():
	            hidden = model(**inputs).last_hidden_state

	        # ``padding=True`` pads every text to the longest one in the batch.
	        # Weight positions by the attention mask so padding does not dilute
	        # the mean and a text's embedding does not depend on its batch-mates.
	        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
	        token_counts = mask.sum(dim=1).clamp(min=1)  # avoid division by zero
	        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

	    return torch.cat(pooled_batches, dim=0)

	# Sidebar for file upload
	st.sidebar.title("Upload PDF")
	uploaded_files = st.sidebar.file_uploader("Choose a PDF file(s)", type="pdf", accept_multiple_files=True)

	if uploaded_files:
	    # Maps each processed file name to its chunks and their embeddings.
	    pdf_chunks_embeddings = {}

	    for uploaded_file in uploaded_files:
	        pdf_name = uploaded_file.name
	        st.write(f"### Processing `{pdf_name}`...")

	        # Extract text from the uploaded PDF
	        text = extract_text_from_pdf(uploaded_file)

	        # Chunkize the extracted text
	        chunks = chunkize_text(text)

	        # A PDF with no extractable text (e.g. scanned images) produces no
	        # chunks. Skip it with a notice instead of sending an empty batch to
	        # the embedding step and aborting the remaining uploads.
	        if not chunks:
	            st.warning(f"No extractable text found in `{pdf_name}`; skipping.")
	            continue

	        # Generate embeddings for each chunk
	        embeddings = get_embeddings(chunks)

	        # Store the chunks and embeddings
	        pdf_chunks_embeddings[pdf_name] = {
	            'chunks': chunks,
	            'embeddings': embeddings,
	        }

	        # Display chunks and embeddings, numbered from 1 for readers
	        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
	        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings), start=1):
	            st.write(f"Chunk {i}:\n{chunk}")
	            st.write(f"Embedding {i}:\n{embedding}\n{'-'*50}")

	    st.success("Processing completed!")
	else:
	    st.write("Upload a PDF file to get started.")