Spaces:

eliot-hub
/

chatbot_app

Sleeping

chatbot_app / hf_to_chroma_ds.py

emb_func

b7be7da 9 months ago

1.62 kB

	# imports
	from abc import ABC, abstractmethod
	from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
	from typing_extensions import TypedDict
	from chroma_datasets.types import AddEmbedding, Datapoint
	from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema



	class Dataset(ABC):
	"""
	Abstract class for a dataset

	All datasets should inherit from this class

	Properties:
	hf_data: the raw data from huggingface
	embedding_function: the embedding function used to generate the embeddings
	embeddingFunctionInstructions: tell the user how to set up the embedding function
	"""
	hf_dataset_name: str
	hf_data: Any
	embedding_function: str
	embedding_function_instructions: str

	@classmethod
	def load_data(cls):
	cls.hf_data = load_huggingface_dataset(
	cls.hf_dataset_name,
	split_name="data"
	)

	@classmethod
	def raw_text(cls) -> str:
	if cls.hf_data is None:
	cls.load_data()
	return "\n".join(cls.hf_data["document"])

	@classmethod
	def chunked(cls) -> List[Datapoint]:
	if cls.hf_data is None:
	cls.load_data()
	return cls.hf_data

	@classmethod
	def to_chroma(cls) -> AddEmbedding:
	return to_chroma_schema(cls.chunked())


	# class Memoires_DS(Dataset):
	# """
	# """
	# hf_data = None
	# hf_dataset_name = "eliot-hub/memoires_vec_800"
	# embedding_function = "HFEmbeddingFunction"
	# embedding_function_instructions = ef_instruction_dict[embedding_function]