Samarth991 committed on
Commit 4294bc8 · verified · 1 Parent(s): 83cecf4

Update PDF_Reader.py

Files changed (1)
  1. PDF_Reader.py +16 -4
PDF_Reader.py CHANGED

@@ -1,8 +1,10 @@
 import os
 from langchain_experimental.text_splitter import SemanticChunker
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_chroma import Chroma
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.embeddings import HuggingFaceEmbeddings
+from PyPDF2 import PdfReader
 
 embedding_modelPath = "sentence-transformers/all-MiniLM-l6-v2"
 embeddings = HuggingFaceEmbeddings(model_name=embedding_modelPath,model_kwargs = {'device':'cpu'},encode_kwargs = {'normalize_embeddings': False})
@@ -22,6 +24,16 @@ def replace_t_with_space(list_of_documents):
         doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces
     return list_of_documents
 
+def read_pdf_text(pdf_path):
+    text = ""
+    pdf_reader = PdfReader(pdf_path)
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
+    text_chunks = text_splitter.split_text(text)
+    return text_chunks
+
 def read_pdf(pdf_path):
     loader = PyPDFLoader(pdf_path)
     docs = loader.load()
@@ -29,15 +41,15 @@ def read_pdf(pdf_path):
     return docs
 
 def Chunks(docs):
-
     text_splitter = SemanticChunker(embeddings,breakpoint_threshold_type='interquartile')
     docs = text_splitter.split_documents(docs)
     cleaned_docs = replace_t_with_space(docs)
     return cleaned_docs
 
-def PDF_4_QA(file):
-    docs = read_pdf(file)
-    cleaned_docs = Chunks(docs)
+def PDF_4_QA(file_path):
+    #docs = read_pdf(file_path)
+    #cleaned_docs = Chunks(docs)
+    read_pdf_text(file_path)
     vectordb = Chroma.from_documents(
         documents=cleaned_docs,
         embedding=embeddings,
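
Note that, as committed, the new PDF_4_QA body calls read_pdf_text(file_path) without binding its return value, while Chroma.from_documents still receives cleaned_docs, which is undefined once the read_pdf/Chunks lines are commented out, so the function would raise a NameError at runtime. The sketch below shows one way the new text-chunk path could feed the vector store; it assumes the imports and module-level embeddings from the updated PDF_Reader.py above and uses Chroma.from_texts, since read_pdf_text returns plain strings rather than Document objects. It is an illustration, not part of the commit.

def PDF_4_QA(file_path):
    # Capture the chunks produced by the new PyPDF2-based helper.
    text_chunks = read_pdf_text(file_path)
    # from_texts accepts a list of strings; from_documents expects Document
    # objects, which the commented-out read_pdf/Chunks pipeline used to provide.
    vectordb = Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
    )
    return vectordb

Alternatively, the read_pdf and Chunks pipeline could be restored so that Chroma.from_documents keeps receiving Document objects, which preserves per-page metadata in the vector store.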