eliot-hub committed on
Commit
b7be7da
·
1 Parent(s): 9c5d425
Files changed (2) hide show
  1. app.py +11 -4
  2. hf_to_chroma_ds.py +56 -0
app.py CHANGED
@@ -22,8 +22,8 @@ from langchain_huggingface import HuggingFaceEmbeddings
22
  import os
23
  from chroma_datasets.utils import import_into_chroma
24
  from datasets import load_dataset
25
- import chromadb.utils.embedding_functions as embedding_functions
26
-
27
 
28
 
29
  # Global params
@@ -47,13 +47,20 @@ huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
47
 
48
  # Set up ChromaDB
49
  client = chromadb.Client()
50
- dataset = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
51
  # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
52
 
 
 
 
 
 
 
 
53
 
54
  db = import_into_chroma(
55
  chroma_client=client,
56
- dataset=dataset,
57
  embedding_function=huggingface_ef
58
  )
59
  # db = Chroma(
 
22
  import os
23
  from chroma_datasets.utils import import_into_chroma
24
  from datasets import load_dataset
25
+ from chromadb.utils import embedding_functions
26
+ from hf_to_chroma_ds import Dataset
27
 
28
 
29
  # Global params
 
47
 
48
  # Set up ChromaDB
49
  client = chromadb.Client()
50
+ # memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
51
  # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
52
 
53
+ memoires_ds = Dataset(
54
+ hf_data = None,
55
+ hf_dataset_name = "eliot-hub/memoires_vec_800",
56
+ embedding_function = huggingface_ef,
57
+ embedding_function_instructions = None
58
+ )
59
+
60
 
61
  db = import_into_chroma(
62
  chroma_client=client,
63
+ dataset=memoires_ds,
64
  embedding_function=huggingface_ef
65
  )
66
  # db = Chroma(
hf_to_chroma_ds.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # imports
2
+ from abc import ABC, abstractmethod
3
+ from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
4
+ from typing_extensions import TypedDict
5
+ from chroma_datasets.types import AddEmbedding, Datapoint
6
+ from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
7
+
8
+
9
+
10
class _ClassOrInstanceMethod:
    """Descriptor that binds a function to the instance when accessed on one,
    and to the owning class otherwise.

    This lets the same method serve both usages of ``Dataset``: class-level
    calls on a subclass that declares its settings as class attributes, and
    calls on an instance configured through the constructor.
    """

    def __init__(self, func):
        self._func = func
        self.__doc__ = func.__doc__

    def __get__(self, instance, owner=None):
        target = owner if instance is None else instance
        # Plain functions are descriptors; reuse their binding machinery so the
        # result is an ordinary bound method.
        return self._func.__get__(target, type(target))


class Dataset(ABC):
    """
    Base class for a Hugging Face backed dataset.

    A dataset can be described in two ways:

    * subclass and set the attributes below at class level, then call the
      methods on the subclass itself; or
    * instantiate directly and pass the attributes as keyword arguments.

    Properties:
        hf_dataset_name: name of the Hugging Face dataset to load
        hf_data: the raw data from huggingface (``None`` until loaded)
        embedding_function: the embedding function used to generate the embeddings
        embedding_function_instructions: tell the user how to set up the embedding function
    """
    hf_dataset_name: str
    # Defaults so that "not loaded yet" / "not configured" is observable as
    # None instead of raising AttributeError on a bare annotation.
    hf_data: Any = None
    embedding_function: Any = None
    embedding_function_instructions: Optional[str] = None

    def __init__(
        self,
        hf_data=None,
        hf_dataset_name=None,
        embedding_function=None,
        embedding_function_instructions=None,
    ):
        """Configure one dataset instance.

        Only arguments that are not ``None`` are stored on the instance, so a
        subclass instantiated without arguments keeps its class-level values.
        """
        supplied = {
            "hf_data": hf_data,
            "hf_dataset_name": hf_dataset_name,
            "embedding_function": embedding_function,
            "embedding_function_instructions": embedding_function_instructions,
        }
        for attr, value in supplied.items():
            if value is not None:
                setattr(self, attr, value)

    @_ClassOrInstanceMethod
    def load_data(target):
        """Load the ``"data"`` split of ``hf_dataset_name`` into ``hf_data``.

        The result is stored on whatever the method was called on (the
        instance, or the class for class-level calls).
        """
        target.hf_data = load_huggingface_dataset(
            target.hf_dataset_name,
            split_name="data"
        )

    @_ClassOrInstanceMethod
    def raw_text(target) -> str:
        """Return every entry of ``hf_data["document"]`` joined by newlines,
        loading the data first if it has not been loaded yet."""
        if target.hf_data is None:
            target.load_data()
        return "\n".join(target.hf_data["document"])

    @_ClassOrInstanceMethod
    def chunked(target) -> "List[Datapoint]":
        """Return ``hf_data`` as-is, loading it first if necessary."""
        if target.hf_data is None:
            target.load_data()
        return target.hf_data

    @_ClassOrInstanceMethod
    def to_chroma(target) -> "AddEmbedding":
        """Return the chunked data converted by ``to_chroma_schema``."""
        return to_chroma_schema(target.chunked())
48
+
49
+
50
+ # class Memoires_DS(Dataset):
51
+ # """
52
+ # """
53
+ # hf_data = None
54
+ # hf_dataset_name = "eliot-hub/memoires_vec_800"
55
+ # embedding_function = "HFEmbeddingFunction"
56
+ # embedding_function_instructions = ef_instruction_dict[embedding_function]