File size: 1,623 Bytes
b7be7da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# imports
from abc import ABC, abstractmethod
from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
from typing_extensions import TypedDict
from chroma_datasets.types import AddEmbedding, Datapoint
from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema



class Dataset(ABC):
    """
        Abstract class for a dataset

        All datasets should inherit from this class

        Properties:
            hf_data: the raw data from huggingface
            embedding_function: the embedding function used to generate the embeddings
            embedding_function_instructions: tell the user how to set up the embedding function
    """
    hf_dataset_name: str  # HuggingFace dataset identifier passed to the loader
    hf_data: Any  # cached dataset; populated lazily by load_data()
    embedding_function: str
    embedding_function_instructions: str

    @classmethod
    def load_data(cls):
        """Fetch the dataset's "data" split from HuggingFace and cache it on the class."""
        cls.hf_data = load_huggingface_dataset(
            cls.hf_dataset_name,
            split_name="data"
        )

    @classmethod
    def raw_text(cls) -> str:
        """Return all documents joined into one newline-separated string.

        Loads the data lazily on first access.
        """
        # getattr guards subclasses that never assigned hf_data: the bare
        # `hf_data: Any` annotation above creates no class attribute, so a
        # plain `cls.hf_data` check would raise AttributeError.
        if getattr(cls, "hf_data", None) is None:
            cls.load_data()
        return "\n".join(cls.hf_data["document"])

    @classmethod
    def chunked(cls) -> List[Datapoint]:
        """Return the dataset as its pre-chunked datapoints, loading lazily."""
        # Same lazy-load guard as raw_text (see comment there).
        if getattr(cls, "hf_data", None) is None:
            cls.load_data()
        return cls.hf_data

    @classmethod
    def to_chroma(cls) -> AddEmbedding:
        """Convert the chunked datapoints into Chroma's add-embedding schema."""
        return to_chroma_schema(cls.chunked())
    

# class Memoires_DS(Dataset):
#     """
#     """
#     hf_data = None
#     hf_dataset_name = "eliot-hub/memoires_vec_800"
#     embedding_function = "HFEmbeddingFunction"
#     embedding_function_instructions = ef_instruction_dict[embedding_function]