Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
import PyPDF2 | |
import torch | |
from transformers import AutoTokenizer, AutoModel | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Page header: render the application title, then a Markdown link to the
# author's LinkedIn profile directly beneath it.
_PAGE_TITLE = "Engr. Hamesh Raj's PDF Chunking & Embedding Viewer"
_LINKEDIN_LINK = "[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)"

st.title(_PAGE_TITLE)
st.markdown(_LINKEDIN_LINK)
# Load the pre-trained model and tokenizer | |
@st.cache_resource
def load_model():
    """Load the DistilBERT tokenizer and encoder used to embed text chunks.

    Streamlit re-executes the whole script on every user interaction, so
    the loaded objects are cached as a shared resource instead of being
    re-instantiated on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple for the ``distilbert-base-uncased``
        checkpoint.
    """
    checkpoint = 'distilbert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return tokenizer, model


# Module-level handles read by get_embeddings().
tokenizer, model = load_model()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; the app passes the
            object returned by Streamlit's file uploader.

    Returns:
        A single string with the pages' text joined back to back (no
        separator is inserted between pages). Pages that yield no text
        contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ''`` keeps a page whose extraction yields a falsy result (e.g.
    # ``None``) from raising ``TypeError`` during concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_text(text)``.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def get_embeddings(texts):
    """Embed each text as the mean of its token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are padded to
    a common length and truncated by the tokenizer (``truncation=True``).

    Args:
        texts: A string or a sequence of strings.

    Returns:
        A tensor with one row per input text. For an empty sequence, an
        empty tensor with zero rows and ``model.config.hidden_size``
        columns is returned.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # The tokenizer cannot encode an empty batch, so short-circuit here.
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight by the attention mask so padding positions do not dilute the
    # average; otherwise a chunk's vector would depend on how long the
    # longest chunk in the batch happens to be.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts
# Sidebar for file upload
st.sidebar.title("Upload PDF")
uploaded_files = st.sidebar.file_uploader(
    "Choose a PDF file(s)", type="pdf", accept_multiple_files=True
)

if uploaded_files:
    # Results per uploaded file, keyed by file name (a later file with the
    # same name replaces the earlier entry).
    pdf_chunks_embeddings = {}

    for uploaded_file in uploaded_files:
        pdf_name = uploaded_file.name
        st.write(f"### Processing `{pdf_name}`...")

        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(uploaded_file)

        # Chunkize the extracted text
        chunks = chunkize_text(text)

        # A PDF with no extractable text (e.g. scanned images only) yields
        # no chunks; skip it rather than crash in the embedding step.
        if not chunks:
            st.warning(f"No extractable text found in `{pdf_name}`; skipping.")
            continue

        # Generate embeddings for each chunk
        embeddings = get_embeddings(chunks)

        # Store the chunks and embeddings
        pdf_chunks_embeddings[pdf_name] = {
            'chunks': chunks,
            'embeddings': embeddings,
        }

        # Display chunks and embeddings
        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            st.write(f"**Chunk {i+1}:**\n{chunk}")
            st.write(f"**Embedding {i+1}:**\n{embedding}\n{'-'*50}")

    # NOTE(review): the original indentation was lost; this is assumed to
    # run once after all files are processed — confirm.
    st.success("Processing completed!")
else:
    st.write("Upload a PDF file to get started.")