Spaces:
Runtime error
Runtime error
import json
import time
from collections.abc import Iterator

from pydantic import BaseModel

from embedding import Embedding
from storage import Storage

from .record import Record
class Document(BaseModel):
    """Metadata for an uploaded document plus the hook that turns it into records.

    Subclasses receive an embedding backend and a storage backend in their
    constructor and implement ``load_records`` for one concrete file format.
    """

    # Identifier of the document; subclasses also use it as ``document_id``.
    name: str
    # Optional free-text description.
    description: str | None = None
    status: str = 'uploading'  # uploading, processing, done, failed
    # Location passed to ``Storage.load`` by the subclasses.
    url: str | None = None
    # Underscore-prefixed collaborators: assigned by subclasses in __init__
    # instead of being passed as model fields.
    _embedding: Embedding
    _storage: Storage

    def load_records(self) -> Iterator[Record]:
        """Yield the records extracted from this document.

        The base class has no file format of its own, so it cannot produce
        records; concrete subclasses override this method.

        Raises:
            NotImplementedError: Always, when called on a class that does
                not override it.
        """
        # Fail loudly instead of silently returning None, which would only
        # surface later as an unrelated "NoneType is not iterable" error.
        raise NotImplementedError(
            f'{type(self).__name__} does not implement load_records()')
class PlainTextDocument(Document):
    """Plain-text document that is indexed one line at a time.

    Every non-blank line of the text loaded from ``url`` becomes one
    ``Record`` carrying the embedding of that line.
    """

    def __init__(
            self,
            embedding: Embedding,
            storage: Storage,
            **kwargs):
        """Create the document and attach its collaborators.

        Args:
            embedding: Backend used to embed each line.
            storage: Backend used to load the text stored at ``url``.
            **kwargs: Model fields forwarded to ``Document``.
        """
        super().__init__(**kwargs)
        # Assigned after the model is initialised: these two are passed
        # separately from the model fields carried in ``kwargs``.
        self._embedding = embedding
        self._storage = storage

    def _enhance_line(self, line: str) -> str:
        """Return the text that is embedded for ``line``.

        This implementation returns the line unchanged. Only the embedding
        uses the result; the record content and ``source`` keep the raw line.
        """
        return line

    def load_records(self) -> Iterator[Record]:
        """Lazily yield one ``Record`` per non-blank line of the document.

        The text is loaded from storage on first iteration and split on
        ``'\\n'``. ``line_number`` is the zero-based index of the line in
        the original text, so blank lines are skipped but still counted.

        Yields:
            Record: The raw line as content, its embedding, and meta data
            describing where it came from.
        """
        text = self._storage.load(self.url)
        for i, line in enumerate(text.split('\n')):
            # Blank and whitespace-only lines carry nothing to embed.
            if not line.strip():
                continue
            embedding = self._embedding.generate_embedding(
                self._enhance_line(line))
            meta_data = {
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': i,
                'source': line,
            }
            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=line,
                document_id=self.name,
                timestamp=int(time.time()))
class JsonDocument(Document):
    """JSON document in which every entry becomes one embedded record."""

    def __init__(
            self,
            embedding: Embedding,
            storage: Storage,
            **kwargs):
        """Create the document and attach its collaborators.

        Args:
            embedding: Backend used to embed each entry's content.
            storage: Backend used to load the JSON text stored at ``url``.
            **kwargs: Model fields forwarded to ``Document``.
        """
        super().__init__(**kwargs)
        # Assigned after the model is initialised: these two are passed
        # separately from the model fields carried in ``kwargs``.
        self._embedding = embedding
        self._storage = storage

    def load_records(self) -> Iterator[Record]:
        '''
        Lazily yield one ``Record`` per entry of the JSON document.

        The payload is a list of entries; a single top-level object is
        treated as a one-entry list. Format of an entry:
        {
            'content': str // the content of the record
            'meta_data': dict // the meta data of the record (optional)
        }

        Keys from an entry's ``meta_data`` are merged into the record's
        meta data, but the generated keys (``embedding_type``,
        ``document_id``, ``line_number``, ``source``) take precedence.
        ``line_number`` is the zero-based index of the entry.

        Raises:
            json.JSONDecodeError: If the stored text is not valid JSON.
            KeyError: If an entry has no ``content`` key.
        '''
        payload = json.loads(self._storage.load(self.url))
        if isinstance(payload, dict):
            # A lone object matches the documented entry shape; iterating it
            # directly would walk its keys instead of treating it as a record.
            payload = [payload]
        for i, item in enumerate(payload):
            # Pause 300 ms before every embedding request.
            # NOTE(review): presumably throttling for the embedding backend's
            # rate limit - confirm.
            time.sleep(0.3)
            content = item['content']
            embedding = self._embedding.generate_embedding(content)
            # A missing or null ``meta_data`` contributes nothing. Generated
            # keys are listed last so they override user-supplied ones.
            meta_data = {
                **(item.get('meta_data') or {}),
                'embedding_type': self._embedding.type,
                'document_id': self.name,
                'line_number': i,
                'source': content,
            }
            yield Record(
                embedding=embedding,
                meta_data=meta_data,
                content=content,
                document_id=self.name,
                timestamp=int(time.time()))