File size: 3,319 Bytes
fa82d94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee8a916
ce45214
278278b
ce45214
 
 
95da4bf
 
 
 
ce45214
 
 
 
95da4bf
 
ce45214
 
 
 
 
74bda08
 
ce45214
 
 
 
278278b
 
ce45214
 
 
 
 
ee8a916
9a8dfa4
ee8a916
 
 
 
 
 
9a8dfa4
ee8a916
 
 
 
 
 
fa82d94
 
9a8dfa4
ce45214
3d9274d
fa82d94
3d9274d
fa82d94
3d9274d
 
 
278278b
2d7e5db
 
fa82d94
3d9274d
 
fa82d94
3d9274d
 
fa82d94
 
3d9274d
fa82d94
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import json
from .base import Base
from .chunk import Chunk


class Document(Base):
    """Client-side handle for a single document inside a dataset.

    The HTTP helpers used below (``get``, ``post``, ``put``, ``rm``) are not
    defined in this class; they are presumably inherited from ``Base``.
    """

    class ParserConfig(Base):
        """Parser configuration wrapper; adds nothing on top of ``Base``."""

        def __init__(self, rag, res_dict):
            super().__init__(rag, res_dict)

    def __init__(self, rag, res_dict):
        """Set default field values, then load the known fields of ``res_dict``.

        Args:
            rag: Client object, forwarded unchanged to ``Base.__init__``.
            res_dict: Mapping of field names to values. Keys that do not
                match one of the defaults declared below are ignored. The
                mapping passed by the caller is not modified.
        """
        self.id = ""
        self.name = ""
        self.thumbnail = None
        self.dataset_id = None
        self.chunk_method = "naive"
        self.parser_config = {"pages": [[1, 1000000]]}
        self.source_type = "local"
        self.type = ""
        self.created_by = ""
        self.size = 0
        self.token_count = 0
        self.chunk_count = 0
        self.progress = 0.0
        self.progress_msg = ""
        self.process_begin_at = None
        self.process_duration = 0.0
        self.run = "0"
        self.status = "1"
        # Only the attributes declared above are accepted. Filter into a new
        # dict instead of popping keys, so the caller's mapping stays intact.
        known_fields = {k: v for k, v in res_dict.items() if k in self.__dict__}
        super().__init__(rag, known_fields)

    def update(self, update_message: dict) -> None:
        """Send ``update_message`` to this document's endpoint with PUT.

        Args:
            update_message: Payload passed as-is to the request.

        Raises:
            Exception: If the response ``code`` is not ``0``; carries the
                response ``message``.
        """
        res = self.put(f'/datasets/{self.dataset_id}/documents/{self.id}',
                       update_message)
        res = res.json()
        if res.get("code") != 0:
            # .get() keeps the error style consistent with the other methods
            # and avoids masking the failure with a KeyError.
            raise Exception(res.get("message"))

    def download(self):
        """Fetch the document and return the raw response body.

        Returns:
            ``res.content`` of the GET response.

        Raises:
            Exception: If the body is a JSON object whose ``code`` field is
                present and not ``0``; carries the response ``message``.
        """
        res = self.get(f"/datasets/{self.dataset_id}/documents/{self.id}")
        try:
            payload = res.json()
        except ValueError:
            # Not JSON at all, so this is file content. ValueError is the
            # common base of json.JSONDecodeError and the decode errors
            # raised by alternative JSON backends.
            return res.content
        # NOTE(review): assumes failures use the same {"code": <non-zero>, ...}
        # envelope that the other methods of this class check - confirm
        # against the server. Any other JSON body (for example a stored
        # .json document) is treated as file content rather than an error.
        if isinstance(payload, dict) and payload.get("code", 0) != 0:
            raise Exception(payload.get("message"))
        return res.content

    def list_chunks(self, page=1, page_size=30, keywords="") -> list[Chunk]:
        """Return one page of this document's chunks.

        Args:
            page: Sent as the ``page`` request parameter.
            page_size: Sent as the ``page_size`` request parameter.
            keywords: Sent as the ``keywords`` request parameter.

        Returns:
            One ``Chunk`` per entry of ``data.chunks`` in the response.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        params = {"keywords": keywords, "page": page, "page_size": page_size}
        res = self.get(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks', params)
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))
        return [Chunk(self.rag, item) for item in res["data"].get("chunks")]

    def add_chunk(self, content: str, important_keywords: list[str] | None = None,
                  questions: list[str] | None = None) -> Chunk:
        """Create a chunk in this document and return it.

        Args:
            content: Sent as the ``content`` field of the request.
            important_keywords: Sent as ``important_keywords``; an empty list
                is sent when omitted or ``None``.
            questions: Sent as ``questions``; an empty list is sent when
                omitted or ``None``.

        Returns:
            A ``Chunk`` built from ``data.chunk`` of the response.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        # None sentinels replace mutable [] defaults, which are shared
        # across calls.
        payload = {
            "content": content,
            "important_keywords": [] if important_keywords is None else important_keywords,
            "questions": [] if questions is None else questions,
        }
        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks', payload)
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))
        return Chunk(self.rag, res["data"].get("chunk"))

    def delete_chunks(self, ids: list[str] | None = None) -> None:
        """Request deletion of chunks from this document.

        Args:
            ids: Sent as the ``chunk_ids`` field; ``None`` is forwarded
                unchanged.

        Raises:
            Exception: If the response ``code`` is not ``0``.
        """
        res = self.rm(f"/datasets/{self.dataset_id}/documents/{self.id}/chunks", {"chunk_ids": ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))