File size: 2,076 Bytes
df8bb52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents.base import Document

class DocReader:
    """Load PDFs from a folder, chunk their text, and index it in Qdrant.

    Intended flow: ``load_pdfs`` -> ``split_text`` -> ``generate_embeddings``
    -> ``search_similar``.

    Attributes:
        pdfs: Sorted paths of the ``*.pdf`` files found directly in the folder.
        model_path: Name or path of the HuggingFace embedding model.
        persist_directory: Stored for callers; no method of this class reads
            it (the vector store is created in memory).
        device: Device string passed to the embedding model.
        db: Vector store built by ``generate_embeddings``; ``None`` until then.
    """

    def __init__(self, pdf_path, model_path="sentence-transformers/all-mpnet-base-v2",
                 persist_directory="db", device="cuda:0"):
        """Collect the PDF paths and remember the embedding configuration.

        Args:
            pdf_path: Folder that is searched (non-recursively) for ``*.pdf``.
            model_path: HuggingFace model name or local path for embeddings.
            persist_directory: Kept on the instance; currently unused.
            device: Device for the embedding model, e.g. ``"cuda:0"`` or
                ``"cpu"``. Defaults to ``"cuda:0"`` as before.
        """
        # Sort so that the page order does not depend on the OS directory order.
        self.pdfs = sorted(glob.glob(f"{pdf_path}/*.pdf"))
        self.model_path = model_path
        self.persist_directory = persist_directory
        self.device = device
        # Filled in by generate_embeddings(); search_similar() depends on it.
        self.db = None

    def load_pdfs(self):
        """Load every discovered PDF and return all pages as one flat list.

        Returns:
            The concatenated results of ``PyPDFLoader(path).load()`` for each
            path in ``self.pdfs``, in that order.
        """
        all_pages = []
        for pdf_file in self.pdfs:
            all_pages.extend(PyPDFLoader(pdf_file).load())
        return all_pages

    def convert_to_markdown(self, documents):
        """Join the text of ``documents`` into a single Markdown string.

        Every newline is doubled (a paragraph break in Markdown) and each
        document is followed by a ``---`` horizontal rule.

        Args:
            documents: Iterable of objects exposing a ``page_content`` string.

        Returns:
            The combined Markdown text; ``""`` when ``documents`` is empty.
        """
        # join() avoids the quadratic cost of repeated string concatenation.
        return "".join(
            doc.page_content.replace('\n', '\n\n') + "\n\n---\n\n"
            for doc in documents
        )

    def split_text(self, pages, chunk_size=128, chunk_overlap=24):
        """Split page content into overlapping text chunks.

        Args:
            pages: A single string, or an iterable whose items are strings
                and/or ``Document`` objects (e.g. the result of ``load_pdfs``).
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Number of characters shared by adjacent chunks.

        Returns:
            List of chunk strings.
        """
        # A bare string must be treated as one page; iterating it would
        # produce one Document per character.
        if isinstance(pages, str):
            pages = [pages]
        # Reuse existing Documents instead of nesting them inside new ones,
        # which fails validation because page_content must be a string.
        documents = [
            page if isinstance(page, Document) else Document(page_content=page)
            for page in pages
        ]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        split_documents = text_splitter.split_documents(documents)
        return [doc.page_content for doc in split_documents]

    def generate_embeddings(self, texts):
        """Embed ``texts`` and index them in an in-memory Qdrant collection.

        The resulting store is also kept on ``self.db`` so that
        ``search_similar`` can query it.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            The Qdrant vector store holding the embedded texts.
        """
        embeddings = HuggingFaceEmbeddings(
            model_name=self.model_path,
            model_kwargs={"device": self.device},
            encode_kwargs={"normalize_embeddings": True},
        )
        documents = [Document(page_content=text) for text in texts]

        self.db = Qdrant.from_documents(
            documents,
            embeddings,
            location=":memory:",
            collection_name="pdf_collection",
        )
        return self.db

    def search_similar(self, input_text, k=3):
        """Return the ``k`` indexed chunks most similar to ``input_text``.

        Args:
            input_text: Query string.
            k: Number of results to return.

        Returns:
            Whatever ``self.db.similarity_search`` returns for the query.

        Raises:
            RuntimeError: If no vector store has been built or assigned yet.
        """
        # getattr keeps this safe even for instances created without __init__.
        db = getattr(self, "db", None)
        if db is None:
            raise RuntimeError(
                "No vector store available: call generate_embeddings() "
                "before search_similar()."
            )
        return db.similarity_search(input_text, k=k)