File size: 2,951 Bytes
6abb254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import time
from collections.abc import Iterator

from pydantic import BaseModel

from embedding import Embedding
from storage import Storage

from .record import Record

class Document(BaseModel):
    # Base model for a named document whose content can be turned into
    # Record objects. Documented with comments rather than a class docstring
    # because pydantic copies a model's docstring into the description of
    # its generated JSON schema.

    # Public fields.
    name: str
    description: str | None = None
    # One of: uploading, processing, done, failed (not validated here).
    status: str = "uploading"
    # Location handed to Storage.load by the subclasses in this file.
    url: str | None = None

    # Collaborators; the subclasses in this file assign them in __init__.
    _embedding: Embedding
    _storage: Storage

    def load_records(self) -> list[Record]:
        """Placeholder for producing the records of this document.

        The base implementation does nothing and returns None.

        NOTE(review): the annotation says ``list[Record]``, but the overrides
        visible in this file are generator functions that yield records one
        at a time.
        """
        return None

class PlainTextDocument(Document):
    # Document whose stored content is plain text, embedded line by line.
    # Documented with comments rather than a class docstring because pydantic
    # copies a model's docstring into the description of its JSON schema.

    def __init__(
        self,
        embedding: Embedding,
        storage: Storage,
        **kwargs,
    ) -> None:
        """Build the model from ``kwargs`` and attach its collaborators.

        Args:
            embedding: Object providing ``generate_embedding(text)`` and a
                ``type`` attribute.
            storage: Object providing ``load(url)``.
            **kwargs: Field values forwarded to the parent initializer.
        """
        super().__init__(**kwargs)
        self._embedding = embedding
        self._storage = storage

    def _enhance_line(self, line: str) -> str:
        """Return the text that is embedded for ``line``.

        The default is the identity; only the embedding input goes through
        this hook, the stored content stays the original line.
        """
        return line

    def load_records(self) -> Iterator[Record]:
        """Lazily yield one Record per non-blank line of the stored text.

        This is a generator: nothing is loaded or embedded until the result
        is iterated.

        Yields:
            A Record whose ``content`` is the original line and whose
            ``meta_data['line_number']`` is the 0-based index of that line
            in the text split on ``'\\n'`` (blank lines are skipped but
            still counted).
        """
        text = self._storage.load(self.url)

        for line_number, line in enumerate(text.split('\n')):
            # Whitespace-only lines carry nothing worth embedding.
            if not line.strip():
                continue

            embedding = self._embedding.generate_embedding(
                self._enhance_line(line))
            # The type is read after the embedding call on every iteration,
            # as before, in case it is only known once an embedding exists.
            meta_data = {
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': line_number,
                'source': line,
            }

            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=line,
                document_id=self.name,
                timestamp=int(time.time()),
            )

class JsonDocument(Document):
    # Document whose stored content is a JSON array of record objects.
    # Documented with comments rather than a class docstring because pydantic
    # copies a model's docstring into the description of its JSON schema.

    def __init__(
        self,
        embedding: Embedding,
        storage: Storage,
        **kwargs,
    ) -> None:
        """Build the model from ``kwargs`` and attach its collaborators.

        Args:
            embedding: Object providing ``generate_embedding(text)`` and a
                ``type`` attribute.
            storage: Object providing ``load(url)``.
            **kwargs: Field values forwarded to the parent initializer.
        """
        super().__init__(**kwargs)
        self._embedding = embedding
        self._storage = storage

    def load_records(self, *, delay: float = 0.3) -> Iterator[Record]:
        """Lazily yield one Record per entry of the stored JSON array.

        Expected content is a JSON array of objects:

            [
                {
                    "content": "...",    // text of the record (required)
                    "meta_data": {}      // extra metadata (optional)
                }
            ]

        This is a generator: nothing is loaded, parsed or embedded until the
        result is iterated.

        Args:
            delay: Seconds to pause before each embedding call. Defaults to
                the previously hard-coded 0.3; values <= 0 disable the pause.

        Yields:
            A Record per entry. Its ``meta_data`` holds the entry's own
            ``meta_data`` (if any) overlaid with ``embedding_type``,
            ``document_id``, ``line_number`` (0-based entry index) and
            ``source``; these four keys win on conflict.

        Raises:
            json.JSONDecodeError: If the stored content is not valid JSON.
            KeyError: If an entry has no ``content`` key.
        """
        raw = self._storage.load(self.url)

        for index, item in enumerate(json.loads(raw)):
            # NOTE(review): presumably throttles the embedding backend;
            # the original code gave no reason for the pause -- confirm.
            if delay > 0:
                time.sleep(delay)

            content = item['content']
            embedding = self._embedding.generate_embedding(content)
            meta_data = {
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': index,
                'source': content,
            }

            # A missing, null or empty "meta_data" leaves the dict untouched.
            extra = item.get('meta_data')
            if extra:
                # Entry metadata first so the generated keys override it.
                meta_data = {**extra, **meta_data}

            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=content,
                document_id=self.name,
                timestamp=int(time.time()),
            )