Spaces:
Sleeping
Sleeping
File size: 1,623 Bytes
b7be7da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# imports
from abc import ABC, abstractmethod
from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
from typing_extensions import TypedDict
from chroma_datasets.types import AddEmbedding, Datapoint
from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
class Dataset(ABC):
    """
    Abstract class for a dataset

    All datasets should inherit from this class and set the class
    attributes listed below.

    Properties:
        hf_dataset_name: the dataset name handed to load_huggingface_dataset
        hf_data: the raw data from huggingface; None until load_data() has run
        embedding_function: the embedding function used to generate the embeddings
        embedding_function_instructions: tell the user how to set up the embedding function
    """

    hf_dataset_name: str
    # A bare annotation does not create the attribute, so give it a real
    # default: the `is None` checks below would otherwise raise
    # AttributeError on subclasses that do not declare hf_data themselves.
    hf_data: Any = None
    embedding_function: str
    embedding_function_instructions: str

    @classmethod
    def load_data(cls):
        """Load the "data" split of cls.hf_dataset_name and store it on cls.hf_data."""
        cls.hf_data = load_huggingface_dataset(
            cls.hf_dataset_name,
            split_name="data",
        )

    @classmethod
    def raw_text(cls) -> str:
        """Return the entries of hf_data["document"] joined with newlines.

        Calls load_data() first when hf_data is still None.
        """
        if cls.hf_data is None:
            cls.load_data()
        return "\n".join(cls.hf_data["document"])

    @classmethod
    def chunked(cls) -> List[Datapoint]:
        """Return hf_data as-is, calling load_data() first when it is still None."""
        if cls.hf_data is None:
            cls.load_data()
        return cls.hf_data

    @classmethod
    def to_chroma(cls) -> AddEmbedding:
        """Return the result of to_chroma_schema applied to chunked()."""
        return to_chroma_schema(cls.chunked())
# class Memoires_DS(Dataset):
# """
# """
# hf_data = None
# hf_dataset_name = "eliot-hub/memoires_vec_800"
# embedding_function = "HFEmbeddingFunction"
# embedding_function_instructions = ef_instruction_dict[embedding_function]