File size: 2,599 Bytes
9cc7e25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# import packages
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from sentence_transformers import SentenceTransformer
import chromadb
from datasets import load_dataset
from gpt4all import GPT4All

# Embedding vector
class VectorStore:
    """Pair a sentence-embedding model with a ChromaDB collection.

    Documents are embedded with the
    ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` model and stored in a
    collection created through ``chromadb.Client()``.
    """

    # Dataset columns concatenated (in this order) into one document per row.
    _TEXT_COLUMNS = (
        'title_cleaned',
        'recipe_new',
        'meal_type',
        'allergy_type',
        'ingredients_alternatives',
    )

    def __init__(self, collection_name):
        """Load the embedding model and create the collection.

        Args:
            collection_name: Name passed to ``create_collection``.
        """
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name=collection_name)

    def populate_vectors(self, dataset, limit=5000, batch_size=256):
        """Embed rows of ``dataset['train']`` and add them to the collection.

        Each row becomes one document: the five text columns joined by single
        spaces (with a trailing space). The document id is the row index as a
        string.

        Args:
            dataset: Mapping with a ``'train'`` split whose columns can be
                looked up by name and sliced.
            limit: Maximum number of leading rows to index. ``None`` indexes
                every row. Defaults to 5000.
            batch_size: Number of documents encoded and inserted per call.
                Batching avoids one model call and one insert per document.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Select the text columns to concatenate, keeping only the first
        # ``limit`` rows of each.
        train = dataset['train']
        columns = [train[name][:limit] for name in self._TEXT_COLUMNS]

        # Concatenate the text of all columns row by row.
        texts = [
            f"{tit} {rep} {meal} {alle} {ingr} "
            for tit, rep, meal, alle, ingr in zip(*columns)
        ]

        # An empty ``texts`` list yields no iterations, so nothing is added.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Encoding a list returns one embedding per document.
            embeddings = self.embedding_model.encode(batch).tolist()
            self.collection.add(
                embeddings=embeddings,
                documents=batch,
                ids=[str(i) for i in range(start, start + len(batch))],
            )

    def search_context(self, query, n_results=1):
        """Query the collection for the documents closest to ``query``.

        Args:
            query: Text to embed and search with.
            n_results: Number of results requested from the collection.

        Returns:
            The result of ``collection.query`` for the embedded query.
        """
        query_embeddings = self.embedding_model.encode(query).tolist()
        return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)


# Load the recipe dataset hosted on Hugging Face.
# Dataset details - https://huggingface.co/datasets/Thefoodprocessor/recipe_new_with_features_full
# NOTE: this runs at module import time, as does the indexing below.
dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full')

# Create the vector store (embedding model + collection named "embedding_vector")
# and index the dataset's 'train' split into it.
vector_store = VectorStore("embedding_vector")
vector_store.populate_vectors(dataset)


# Load the GPT4All language model.
# The model loaded here is Meta-Llama-3-8B-Instruct.Q4_0.gguf.
# Details about gpt4all and model information - https://gpt4all.io/index.html
model_name = 'Meta-Llama-3-8B-Instruct.Q4_0.gguf' # .gguf represents quantized model
model_path = "gpt4all"
# model_path is the local directory for the model file: download once, then
# load from there for subsequent inference.
# NOTE(review): device="cuda" is hard-coded; presumably this fails on hosts
# without a usable CUDA GPU - confirm, and consider a CPU fallback.
model = GPT4All(model_name=model_name, model_path=model_path,device="cuda")