Spaces:

eliot-hub
/

chatbot_app

Sleeping

App Files Files Community

eliot-hub committed on Sep 26, 2024

Commit

b7be7da

1 Parent(s): 9c5d425

emb_func

Browse files

Files changed (2) hide show

app.py +11 -4
hf_to_chroma_ds.py +56 -0

app.py CHANGED Viewed

@@ -22,8 +22,8 @@ from langchain_huggingface import HuggingFaceEmbeddings
 import os
 from chroma_datasets.utils import import_into_chroma
 from datasets import load_dataset
-import chromadb.utils.embedding_functions as embedding_functions
 # Global params
@@ -47,13 +47,20 @@ huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
 # Set up ChromaDB
 client = chromadb.Client()
-dataset = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
 # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
 db = import_into_chroma(
     chroma_client=client,
-    dataset=dataset,
     embedding_function=huggingface_ef
     )
 # db = Chroma(

 import os
 from chroma_datasets.utils import import_into_chroma
 from datasets import load_dataset
+from chromadb.utils import embedding_functions
+from hf_to_chroma_ds import Dataset
 # Global params
 # Set up ChromaDB
 client = chromadb.Client()
+# memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
 # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
+# Dataset wrapper around the Hugging Face dataset; handed to import_into_chroma
+memoires_ds = Dataset(
+    hf_data=None,
+    hf_dataset_name="eliot-hub/memoires_vec_800",
+    embedding_function=huggingface_ef,
+    embedding_function_instructions=None,
+)
 db = import_into_chroma(
     chroma_client=client,
+    dataset=memoires_ds,
     embedding_function=huggingface_ef
     )
 # db = Chroma(

hf_to_chroma_ds.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# imports
+from abc import ABC, abstractmethod
+from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
+from typing_extensions import TypedDict
+from chroma_datasets.types import AddEmbedding, Datapoint
+from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
+class _ClassOrInstanceMethod:
+    """
+        Descriptor that binds a function to the object it is looked up on
+        Accessed on a class, the function receives the class (like a classmethod)
+        Accessed on an instance, the function receives that instance
+    """
+    def __init__(self, func):
+        self._func = func
+        self.__doc__ = func.__doc__
+    def __get__(self, instance, owner=None):
+        target = owner if instance is None else instance
+        # Plain functions are descriptors: __get__ returns a method bound to target
+        return self._func.__get__(target, owner)
+class Dataset(ABC):
+    """
+        Base class for a dataset loaded from huggingface
+        Datasets can either inherit from this class and set the properties as
+        class attributes, or instantiate it and pass them as keyword arguments
+        Every method works on both: called on a class it reads/writes the class
+        attributes, called on an instance it reads/writes the instance attributes
+        Properties:
+            hf_dataset_name: the name given to load_huggingface_dataset by load_data
+            hf_data: the raw data from huggingface (None until it has been loaded)
+            embedding_function: the embedding function used to generate the embeddings
+            embedding_function_instructions: tell the user how to set up the embedding function
+    """
+    hf_dataset_name: str
+    # Default needed so the "hf_data is None" checks below do not raise AttributeError
+    hf_data: Any = None
+    embedding_function: Any
+    embedding_function_instructions: Optional[str]
+    def __init__(
+        self,
+        hf_data: Any = None,
+        hf_dataset_name: Optional[str] = None,
+        embedding_function: Any = None,
+        embedding_function_instructions: Optional[str] = None,
+    ):
+        """
+            Store the given properties on the instance
+            Arguments left as None are not stored, so values declared as class
+            attributes by a subclass stay visible through the instance
+        """
+        if hf_data is not None:
+            self.hf_data = hf_data
+        if hf_dataset_name is not None:
+            self.hf_dataset_name = hf_dataset_name
+        if embedding_function is not None:
+            self.embedding_function = embedding_function
+        if embedding_function_instructions is not None:
+            self.embedding_function_instructions = embedding_function_instructions
+    @_ClassOrInstanceMethod
+    def load_data(ds):
+        """
+            Load the "data" split of hf_dataset_name and keep it in hf_data
+        """
+        ds.hf_data = load_huggingface_dataset(
+            ds.hf_dataset_name,
+            split_name="data"
+        )
+    @_ClassOrInstanceMethod
+    def raw_text(ds) -> str:
+        """
+            Return the "document" column joined with newlines, loading the data first if needed
+        """
+        if ds.hf_data is None:
+            ds.load_data()
+        return "\n".join(ds.hf_data["document"])
+    @_ClassOrInstanceMethod
+    def chunked(ds) -> List[Datapoint]:
+        """
+            Return hf_data as is, loading it first if needed
+        """
+        if ds.hf_data is None:
+            ds.load_data()
+        return ds.hf_data
+    @_ClassOrInstanceMethod
+    def to_chroma(ds) -> AddEmbedding:
+        """
+            Return the result of to_chroma_schema applied to chunked()
+        """
+        return to_chroma_schema(ds.chunked())
+# class Memoires_DS(Dataset):
+#     """
+#     """
+#     hf_data = None
+#     hf_dataset_name = "eliot-hub/memoires_vec_800"
+#     embedding_function = "HFEmbeddingFunction"
+#     embedding_function_instructions = ef_instruction_dict[embedding_function]