Samarth991 committed
Commit 5ab154b · verified · 1 Parent(s): 3e7ea26

Update PDF_Reader.py

Files changed (1)
  1. PDF_Reader.py +39 -25
PDF_Reader.py CHANGED
@@ -1,31 +1,45 @@
- import PyPDF2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.embeddings import HuggingFaceBgeEmbeddings
- from langchain.vectorstores import FAISS
-
- def read_pdf(uploaded_file):
-     pdf_reader = PyPDF2.PdfReader(uploaded_file)
-     text = ""
-     for page in pdf_reader.pages:
-         text += page.extract_text()
-     return text
-
- def Chunks(docs):
-     text_splitter = RecursiveCharacterTextSplitter(
-         # Set a really small chunk size, just to show.
-         chunk_size = 1000,
-         chunk_overlap = 100,
-     )
-     doc = text_splitter.split_text(docs)
-     return doc
-
- def PDF_4_QA(file):
-     content = read_pdf(file)
-     pdf_chunks = Chunks(docs=content)
-
-     embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
-                                           model_kwargs={'device': 'cpu'})
-     vectorstore_openai = FAISS.from_texts(pdf_chunks, embeddings)
-
-     return vectorstore_openai
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_chroma import Chroma
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ embedding_modelPath = "sentence-transformers/all-MiniLM-L6-v2"
+ embeddings = HuggingFaceEmbeddings(model_name=embedding_modelPath,
+                                    model_kwargs={'device': 'cpu'},
+                                    encode_kwargs={'normalize_embeddings': False})
+ persist_directory = "chroma_db"  # assumed default directory for the persisted Chroma index
+
+ def replace_t_with_space(list_of_documents):
+     """
+     Replaces all tab characters ('\t') with spaces in the page content of each document.
+
+     Args:
+         list_of_documents: A list of document objects, each with a 'page_content' attribute.
+
+     Returns:
+         The modified list of documents with tab characters replaced by spaces.
+     """
+     for doc in list_of_documents:
+         doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces
+     return list_of_documents
+
+ def read_pdf(uploaded_file):
+     # Load the PDF into one Document per page
+     loader = PyPDFLoader(uploaded_file)
+     docs = loader.load()
+     print("Total Documents:", len(docs))
+     return docs
+
+ def Chunks(docs):
+     # Split on semantic breakpoints instead of a fixed character count
+     text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type='interquartile')
+     docs = text_splitter.split_documents(docs)
+     cleaned_docs = replace_t_with_space(docs)
+     return cleaned_docs
+
+ def PDF_4_QA(file):
+     docs = read_pdf(file)
+     cleaned_docs = Chunks(docs)
+
+     vectordb = Chroma.from_documents(
+         documents=cleaned_docs,
+         embedding=embeddings,
+         persist_directory=persist_directory
+     )
+     return vectordb
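
A minimal usage sketch of the updated entry point, assuming a local PDF path and Chroma's standard similarity_search API; the sample.pdf path and the query string are placeholders, not part of this commit.

vectordb = PDF_4_QA("sample.pdf")  # placeholder path to a local PDF
results = vectordb.similarity_search("What is this document about?", k=4)  # placeholder query
for doc in results:
    print(doc.page_content[:200])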