rag-pdf-chatbot / app.py
datascientist22's picture
Create app.py
e868234 verified
raw
history blame
2.52 kB
import streamlit as st
import os
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Page header: the app title followed by a markdown link to the author's
# LinkedIn profile.
APP_TITLE = "Engr. Hamesh Raj's PDF Chunking & Embedding Viewer"
AUTHOR_LINK_MD = "[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)"

st.title(APP_TITLE)
st.markdown(AUTHOR_LINK_MD)
# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model(model_name='distilbert-base-uncased'):
    """Load the tokenizer and encoder for ``model_name``.

    The function is wrapped in ``st.cache_resource`` so the load is not
    repeated on every Streamlit rerun.

    Args:
        model_name: Checkpoint identifier passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``. Using one parameter for both
            keeps the tokenizer and the model from drifting apart.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModel.from_pretrained(model_name)
    # The model is only used for inference; make sure dropout is disabled.
    loaded_model.eval()
    return loaded_tokenizer, loaded_model


# Module-level handles used by get_embeddings().
tokenizer, model = load_model()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; the app passes the
            file object returned by Streamlit's uploader.

    Returns:
        The per-page text concatenated in page order with no separator.
        An empty string when no page yields any text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Iterate pages directly and join once instead of indexing by position
    # and growing a string with ``+=``.
    # NOTE(review): ``or ''`` is a defensive guard in case a page yields no
    # text value (e.g. image-only pages on some library versions) -- confirm
    # against the installed PyPDF2 release.
    return ''.join(page.extract_text() or '' for page in reader.pages)
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` produces for ``text`` (the caller treats it
        as a sequence of chunk strings).
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def get_embeddings(texts, batch_size=32):
    """Embed each text as the mean of its token hidden states.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are tokenized
    with padding and truncation, run through the model without gradient
    tracking, and pooled over the sequence dimension.

    Pooling is weighted by the attention mask so that padding positions do
    not contribute to the mean; a plain ``mean(dim=1)`` would make a text's
    embedding depend on how much padding its batch happened to need.

    Args:
        texts: A string or a sequence of strings.
        batch_size: Number of texts per forward pass. Bounds peak memory
            for documents that split into many chunks.

    Returns:
        A 2-D tensor with one row per input text. An empty input yields a
        tensor with zero rows.
    """
    if isinstance(texts, str):
        # A bare string is treated as a batch of one.
        texts = [texts]
    texts = list(texts)

    if not texts:
        # Nothing to tokenize; hand back an empty result instead of calling
        # the tokenizer/model with an empty batch.
        return torch.zeros((0, model.config.hidden_size))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            hidden = model(**inputs).last_hidden_state
            # 1 for real tokens, 0 for padding; add a trailing axis so it
            # broadcasts across the hidden dimension.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(token_sum / token_count)

    return torch.cat(pooled_batches, dim=0)
# Sidebar for file upload
st.sidebar.title("Upload PDF")
uploaded_files = st.sidebar.file_uploader("Choose a PDF file(s)", type="pdf", accept_multiple_files=True)

# Main flow: for every uploaded PDF, extract its text, split it into chunks,
# embed the chunks, and render each chunk next to its embedding.
if uploaded_files:
    # Results keyed by file name; a later upload with the same name replaces
    # the earlier entry.
    pdf_chunks_embeddings = {}

    for uploaded_file in uploaded_files:
        pdf_name = uploaded_file.name
        st.write(f"### Processing `{pdf_name}`...")

        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(uploaded_file)

        # Chunkize the extracted text
        chunks = chunkize_text(text)

        # A PDF without extractable text (e.g. scanned images) produces no
        # chunks; skip it rather than sending an empty batch to the model.
        if not chunks:
            st.warning(f"No extractable text found in `{pdf_name}`; skipping.")
            continue

        # Generate embeddings for each chunk
        embeddings = get_embeddings(chunks)

        # Store the chunks and embeddings
        pdf_chunks_embeddings[pdf_name] = {
            'chunks': chunks,
            'embeddings': embeddings
        }

        # Display chunks and embeddings
        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
        for number, (chunk, embedding) in enumerate(zip(chunks, embeddings), start=1):
            st.write(f"**Chunk {number}:**\n{chunk}")
            st.write(f"**Embedding {number}:**\n{embedding}\n{'-'*50}")

    st.success("Processing completed!")
else:
    st.write("Upload a PDF file to get started.")