chatbot_app / hf_to_chroma_ds.py
eliot-hub's picture
emb_func
b7be7da
raw
history blame
1.62 kB
# imports
from abc import ABC, abstractmethod
from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
from typing_extensions import TypedDict
from chroma_datasets.types import AddEmbedding, Datapoint
from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
class Dataset(ABC):
    """
    Abstract class for a dataset
    All datasets should inherit from this class

    Properties:
        hf_dataset_name: the dataset name passed to ``load_huggingface_dataset``
        hf_data: the raw data from huggingface (``None`` until it is loaded)
        embedding_function: the embedding function used to generate the embeddings
        embedding_function_instructions: tell the user how to set up the embedding function
    """

    hf_dataset_name: str
    # Default to None so the lazy-load checks in raw_text()/chunked() work even
    # when a subclass does not declare ``hf_data`` itself; a bare annotation
    # creates no attribute and ``cls.hf_data`` would raise AttributeError.
    hf_data: Any = None
    embedding_function: str
    embedding_function_instructions: str

    @classmethod
    def load_data(cls):
        """Load the ``"data"`` split of ``hf_dataset_name`` and store it on ``cls.hf_data``."""
        cls.hf_data = load_huggingface_dataset(
            cls.hf_dataset_name,
            split_name="data",
        )

    @classmethod
    def raw_text(cls) -> str:
        """Return the entries of the ``"document"`` field joined with newlines.

        The data is loaded first if ``hf_data`` is still ``None``.
        """
        if cls.hf_data is None:
            cls.load_data()
        return "\n".join(cls.hf_data["document"])

    @classmethod
    def chunked(cls) -> List[Datapoint]:
        """Return the loaded data as-is, loading it first if ``hf_data`` is ``None``."""
        if cls.hf_data is None:
            cls.load_data()
        return cls.hf_data

    @classmethod
    def to_chroma(cls) -> AddEmbedding:
        """Return ``to_chroma_schema`` applied to the result of ``chunked()``."""
        return to_chroma_schema(cls.chunked())
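# Example subclass, currently disabled.
# NOTE: `ef_instruction_dict` is not defined or imported in the visible part of
# this file; it must be made available before this class is re-enabled.
# class Memoires_DS(Dataset):
#     """
#     """
#     hf_data = None
#     hf_dataset_name = "eliot-hub/memoires_vec_800"
#     embedding_function = "HFEmbeddingFunction"
#     embedding_function_instructions = ef_instruction_dict[embedding_function]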