Spaces:
Runtime error
Runtime error
File size: 2,951 Bytes
6abb254 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
from pydantic import BaseModel
from .record import Record
from storage import Storage
from embedding import Embedding
import time
import json
class Document(BaseModel):
    """Base model describing a document that can be turned into records.

    The subclasses in this module override ``load_records``; the base
    version does nothing and returns ``None``.
    """

    name: str
    description: str | None = None
    # Lifecycle state. Values listed by the original author:
    # uploading, processing, done, failed.
    status: str = 'uploading'
    url: str | None = None

    # Collaborators, declared here and assigned by the subclasses' __init__.
    _embedding: Embedding
    _storage: Storage

    def load_records(self) -> list[Record]:
        """Placeholder meant to be overridden; this version returns ``None``."""
class PlainTextDocument(Document):
    """Plain-text document that produces one record per non-blank line."""

    def __init__(
            self,
            embedding: Embedding,
            storage: Storage,
            **kwargs):
        """Initialise the model fields and attach the collaborators.

        Args:
            embedding: Object used to embed each line; its
                ``generate_embedding`` method and ``type`` attribute are
                used by ``load_records``.
            storage: Object whose ``load(url)`` returns the document text.
            **kwargs: Forwarded unchanged to ``Document.__init__``.
        """
        super().__init__(**kwargs)
        # Private collaborators are set once the base initialiser has run.
        self._embedding = embedding
        self._storage = storage

    def _enhance_line(self, line: str) -> str:
        """Return the text that gets embedded for ``line``.

        Currently the identity function; it is the single place where a
        line is transformed before being passed to the embedding.
        """
        return line

    def load_records(self) -> list[Record]:
        """Yield one ``Record`` per non-blank line of the stored text.

        NOTE: this is a generator function (it uses ``yield``), so callers
        receive a lazy iterator rather than an actual ``list``; the
        annotation is kept unchanged for interface compatibility.

        The text is fetched with ``self._storage.load(self.url)`` and split
        on newlines. A trailing carriage return left over from CRLF line
        endings is removed from each line. Blank (whitespace-only) lines
        are skipped, but still count towards ``line_number``, which is the
        zero-based index of the line in the split text.

        Yields:
            Record: built from the embedding of the enhanced line, the raw
            line as ``content``, ``self.name`` as ``document_id`` and the
            current Unix time (whole seconds) as ``timestamp``.
        """
        text = self._storage.load(self.url)
        for line_number, raw_line in enumerate(text.split('\n')):
            # CRLF input would otherwise leave a '\r' at the end of every
            # line, which would leak into the stored content and metadata.
            line = raw_line.removesuffix('\r')
            if not line.strip():
                # remove empty lines
                continue
            # The embedding is computed from the enhanced text, while the
            # record itself keeps the original line.
            enhanced_line = self._enhance_line(line)
            embedding = self._embedding.generate_embedding(enhanced_line)
            meta_data = {
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': line_number,
                'source': line,
            }
            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=line,
                document_id=self.name,
                timestamp=int(time.time()))
class JsonDocument(Document):
    """JSON document that produces one record per entry of a JSON array."""

    def __init__(
            self,
            embedding: Embedding,
            storage: Storage,
            **kwargs):
        """Initialise the model fields and attach the collaborators.

        Args:
            embedding: Object used to embed each entry; its
                ``generate_embedding`` method and ``type`` attribute are
                used by ``load_records``.
            storage: Object whose ``load(url)`` returns the JSON text.
            **kwargs: Forwarded unchanged to ``Document.__init__``.
        """
        super().__init__(**kwargs)
        # Private collaborators are set once the base initialiser has run.
        self._embedding = embedding
        self._storage = storage

    def load_records(self) -> list[Record]:
        """Yield one ``Record`` per entry of the stored JSON payload.

        NOTE: this is a generator function (it uses ``yield``), so callers
        receive a lazy iterator rather than an actual ``list``; the
        annotation is kept unchanged for interface compatibility.

        Expected payload: a JSON array in which every entry is an object
        of the form::

            {
                "content": "...",      the text of the record (required)
                "meta_data": {...}     extra metadata (optional)
            }

        A missing or ``null`` ``meta_data`` is treated as "no extra
        metadata". When present, its keys are merged into the record's
        metadata, with the generated keys (``embedding_type``,
        ``document_id``, ``line_number``, ``source``) taking precedence on
        conflict.

        Yields:
            Record: built from the embedding of ``content``, the merged
            metadata, ``self.name`` as ``document_id`` and the current
            Unix time (whole seconds) as ``timestamp``.

        Raises:
            KeyError: if an entry has no ``content`` key.
        """
        payload = self._storage.load(self.url)
        for index, item in enumerate(json.loads(payload)):
            # sleep 300ms before every embedding call
            # NOTE(review): presumably throttles the embedding backend - confirm.
            time.sleep(0.3)
            content = item['content']
            embedding = self._embedding.generate_embedding(content)
            meta_data = {
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': index,
                'source': content,
            }
            # JSON null decodes to None, which cannot be unpacked with **;
            # treat it the same as an absent key.
            extra_meta = item.get('meta_data')
            if extra_meta is not None:
                # merge meta data; generated keys win over user-supplied ones
                meta_data = {**extra_meta, **meta_data}
            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=content,
                document_id=self.name,
                timestamp=int(time.time()))