File size: 3,724 Bytes
af9408a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
import os
import uuid
import datetime
from qdrant_client import QdrantClient, models
from langchain_core.load import dumpd, dumps, load, loads
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import NLTKTextSplitter, RecursiveCharacterTextSplitter
from langchain_qdrant import Qdrant

class PDFLoader:
    """Pipeline helper: parse PDFs with Azure Document Intelligence, split
    them into overlapping text chunks, embed the chunks, and persist or
    retrieve them in a Qdrant vector collection.

    Credentials are read from environment variables (``qdrant_url`` /
    ``qdrant_api`` for Qdrant; optionally ``azure_di_key`` /
    ``azure_di_endpoint`` for Azure) unless passed explicitly.
    """

    def __init__(self):
        # Stateless helper: all configuration arrives per call / via env vars.
        pass

    def pdf_reader(self, path, key=None, endpoint=None):
        """Parse the PDF at *path* into per-page langchain Documents.

        Args:
            path: Filesystem path of the PDF to analyze.
            key: Azure Document Intelligence API key; falls back to the
                ``azure_di_key`` environment variable (previously a
                hard-coded ``None`` placeholder).
            endpoint: Azure Document Intelligence endpoint; falls back to
                the ``azure_di_endpoint`` environment variable.

        Returns:
            list of Documents, one per page (``mode="page"``).
        """
        key = key or os.getenv("azure_di_key")
        endpoint = endpoint or os.getenv("azure_di_endpoint")
        azure_pdf_loader = AzureAIDocumentIntelligenceLoader(
            api_endpoint=endpoint,
            api_key=key,
            file_path=path,
            api_model="prebuilt-layout",
            mode="page",
            # High-resolution OCR pass for scanned / low-quality pages.
            analysis_features=["ocrHighResolution"],
        )
        return azure_pdf_loader.load()

    def save_raw_documents(self, path, name, documents):
        """Serialize *documents* (via langchain ``dumpd``) to ``path/name``
        as a JSON file of the form ``{"documents": ...}``.

        Fix: write with ``json.dump`` and an explicit UTF-8 encoding instead
        of ``print(json.dumps(...))``.
        """
        log_file = {"documents": dumpd(documents)}
        log_file_name = os.path.join(path, name)
        with open(log_file_name, "w", encoding="utf-8") as output_file:
            json.dump(log_file, output_file, indent=2)

    def load_raw_documents(self, path, name):
        """Inverse of :meth:`save_raw_documents`: reconstruct Documents
        from the JSON file at ``path/name`` via langchain ``load``.

        Fix: open in text mode with explicit UTF-8 (was ``'rb'`` with a
        variable misleadingly named ``output_file``).
        """
        log_file_name = os.path.join(path, name)
        with open(log_file_name, "r", encoding="utf-8") as input_file:
            log_file = json.load(input_file)
        return load(log_file["documents"])

    def recursive_splitter(self, documents, chunk_size=1024, chunk_overlap=256):
        """Split *documents* into overlapping chunks.

        Args:
            documents: Documents to split (e.g. output of :meth:`pdf_reader`).
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks.

        Returns:
            list of chunk Documents.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=False,
        )
        return splitter.split_documents(documents)

    def generate_vectors(self, chunks, embeddings, source_name):
        """Embed *chunks* and tag each chunk's metadata with *source_name*.

        Fixes: (1) embed all texts in ONE batched ``embed_documents`` call
        instead of one backend round-trip per chunk; (2) copy each chunk's
        metadata dict so the caller's chunk objects are not mutated.

        Returns:
            (page_contents, vectors, metadatas) — parallel lists where
            ``vectors[i]`` is a one-element list ``[embedding]``, preserving
            the original return shape (callers unwrap with ``vector[0]``).
        """
        page_contents = [chunk.page_content for chunk in chunks]
        # Guard the empty case: the original made zero calls for no chunks.
        embedded = embeddings.embed_documents(page_contents) if page_contents else []
        vectors = [[embedding] for embedding in embedded]
        metadatas = []
        for chunk in chunks:
            meta = dict(chunk.metadata)  # copy: don't mutate caller's chunk
            meta["source"] = source_name
            metadatas.append(meta)
        return page_contents, vectors, metadatas

    def save_to_database(self, chunks, embeddings, collection_name):
        """Embed *chunks* and upsert them into the Qdrant collection
        *collection_name* (created if missing). Connection settings come
        from the ``qdrant_url`` / ``qdrant_api`` environment variables.
        """
        # Fix: the returned client was bound to an unused local; drop it.
        Qdrant.from_documents(
            chunks,
            embeddings,
            url=os.getenv("qdrant_url"),
            api_key=os.getenv("qdrant_api"),
            prefer_grpc=True,
            collection_name=collection_name,
        )

    def load_from_database(self, embeddings, collection_name):
        """Open an existing Qdrant collection as a langchain vector store.

        Returns:
            Qdrant vector store bound to *collection_name*.
        """
        return Qdrant.from_existing_collection(
            embedding=embeddings,
            url=os.getenv("qdrant_url"),
            api_key=os.getenv("qdrant_api"),
            collection_name=collection_name,
        )

    def save_manuals(self, client, collection_name, car_id, model_year, vectors, metadatas, page_contents):
        """Upsert pre-computed manual chunks into Qdrant with extra payload
        fields (``car_id``, ``model_year``, ``create_date``).

        Args:
            client: A QdrantClient instance.
            collection_name: Target collection.
            car_id, model_year: Identifiers stored on every point's payload.
            vectors, metadatas, page_contents: Parallel lists as returned by
                :meth:`generate_vectors`; ``vectors[idx][0]`` is the embedding.

        Fix: compute ONE UTC-aware timestamp for the whole batch instead of
        a fresh naive ``datetime.now()`` per point.
        """
        create_date = datetime.datetime.now(datetime.timezone.utc).isoformat()
        client.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=uuid.uuid4().hex,
                    payload={
                        "metadata": metadatas[idx],
                        "page_content": page_contents[idx],
                        "car_id": car_id,
                        "model_year": model_year,
                        "create_date": create_date,
                    },
                    # Unwrap the one-element list produced by generate_vectors.
                    vector=vector[0],
                )
                for idx, vector in enumerate(vectors)
            ],
        )