from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
from langchain_core.documents import Document

# Embedding model shared by the semantic chunker and the Chroma vector store.
embedding_modelPath = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)

def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document.

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The modified list of documents with tab characters replaced by spaces.
    """

    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces
    return list_of_documents

def read_pdf_text(pdf_path):
    """Extracts raw text from a PDF and splits it into fixed-size overlapping chunks."""
    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    text_chunks = text_splitter.split_text(text)
    text_docs = [Document(page_content=txt) for txt in text_chunks]
    return text_docs

def read_pdf(pdf_path):
    """Loads a PDF into one Document per page using PyPDFLoader."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    print("Total documents:", len(docs))
    return docs

def Chunks(docs):
    """Splits documents at semantic breakpoints detected from embedding distances."""
    text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type='interquartile')
    docs = text_splitter.split_documents(docs)
    cleaned_docs = replace_t_with_space(docs)
    return cleaned_docs

def PDF_4_QA(file_path):
    """Prepares a PDF for question answering: chunks it and indexes it in Chroma."""
    # Alternative path: page-wise loading followed by semantic chunking.
    # docs = read_pdf(file_path)
    # cleaned_docs = Chunks(docs)
    cleaned_docs = read_pdf_text(file_path)
    vectordb = Chroma.from_documents(cleaned_docs, embedding=embeddings, persist_directory="Chroma/docs")
    return vectordb, cleaned_docs
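
# --- Example usage: a minimal, illustrative sketch. "sample.pdf" and the query
# string below are placeholders, not part of the original module. ---
if __name__ == "__main__":
    vectordb, chunks = PDF_4_QA("sample.pdf")
    # Retrieve the chunks most similar to a question from the Chroma index.
    results = vectordb.similarity_search("What is this document about?", k=3)
    for i, doc in enumerate(results, start=1):
        print(f"[{i}] {doc.page_content[:200]}")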