datascientist22 committed on
Commit
e868234
·
verified ·
1 Parent(s): 7f3f8f5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import PyPDF2
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModel
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
+ # Page header: render the app title, then a Markdown link to the author's LinkedIn profile
9
+ st.title("Engr. Hamesh Raj's PDF Chunking & Embedding Viewer")
10
+ st.markdown("[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)")
11
+
12
# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model():
    """Load the DistilBERT tokenizer and encoder used to embed text chunks.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are produced by a cached call rather than rebuilt on every invocation.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``distilbert-base-uncased`` checkpoint.
    """
    checkpoint = 'distilbert-base-uncased'
    # Tokenizer is loaded first, then the model, matching the tuple order.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )
18
+
19
+ # Module-level tokenizer/model pair read by get_embeddings(); load_model is wrapped in st.cache_resource
+ tokenizer, model = load_model()
20
+
21
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; the app passes the
            uploaded file object from the Streamlit file uploader.

    Returns:
        A single string with each page's extracted text appended directly
        after the previous page's (no separator is inserted). Pages that
        yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ''` is defensive: if extraction yields None for a page (e.g. one
    # without a text layer), skip it instead of raising TypeError on concat.
    return ''.join(page.extract_text() or '' for page in reader.pages)
27
+
28
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text`` (the caller iterates it
        as a sequence of chunk strings).
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
35
+
36
def get_embeddings(texts):
    """Embed each text as the mean of its real (non-padding) token vectors.

    The texts are tokenized as one padded batch and run through the
    module-level ``model`` without gradient tracking. Each row of the result
    is the average of that text's token vectors from ``last_hidden_state``,
    weighted by the tokenizer's attention mask so padding positions do not
    contribute.

    Args:
        texts: A list of strings (a single string is passed through to the
            tokenizer unchanged).

    Returns:
        A tensor with one embedding row per input text. An empty list/tuple
        yields an empty tensor of shape ``(0, model.config.hidden_size)``.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to tokenize; return an empty batch so callers that zip
        # chunks with embeddings simply iterate zero times.
        return torch.empty((0, model.config.hidden_size))

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # padding=True pads shorter texts up to the longest one in the batch.
    # A plain mean over dim=1 would average those pad positions in too,
    # making a text's embedding depend on its batch neighbours.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp avoids a divide-by-zero should a row have an all-zero mask.
    token_count = mask.sum(dim=1).clamp(min=1)
    return token_sum / token_count
42
+
43
# Sidebar for file upload
st.sidebar.title("Upload PDF")
uploaded_files = st.sidebar.file_uploader(
    "Choose a PDF file(s)", type="pdf", accept_multiple_files=True
)

if uploaded_files:
    # Results per uploaded file, keyed by file name. Two uploads sharing a
    # name would overwrite each other's entry.
    pdf_chunks_embeddings = {}

    for uploaded_file in uploaded_files:
        pdf_name = uploaded_file.name
        st.write(f"### Processing `{pdf_name}`...")

        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(uploaded_file)

        # Chunkize the extracted text
        chunks = chunkize_text(text)

        # A PDF with no extractable text produces no chunks; embedding an
        # empty batch is not meaningful, so report it and move on.
        if not chunks:
            st.warning(f"No extractable text found in `{pdf_name}`; skipping.")
            continue

        # Generate embeddings for each chunk
        embeddings = get_embeddings(chunks)

        # Store the chunks and embeddings
        pdf_chunks_embeddings[pdf_name] = {
            'chunks': chunks,
            'embeddings': embeddings,
        }

        # Display chunks and embeddings, numbered from 1
        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings), start=1):
            st.write(f"**Chunk {i}:**\n{chunk}")
            st.write(f"**Embedding {i}:**\n{embedding}\n{'-'*50}")

    st.success("Processing completed!")
else:
    st.write("Upload a PDF file to get started.")