Omar Solano committed on
Commit
362f139
·
1 Parent(s): 9d31d4d

refactor cfg.py

Browse files
Files changed (1) hide show
  1. cfg.py +79 -65
cfg.py CHANGED
@@ -2,19 +2,23 @@ import logging
2
  import os
3
 
4
  import deeplake
5
-
6
  from buster.busterbot import Buster, BusterConfig
7
  from buster.completers import ChatGPTCompleter, DocumentAnswerer
8
  from buster.formatters.documents import DocumentsFormatterJSON
9
  from buster.formatters.prompts import PromptFormatter
 
10
  from buster.retriever import DeepLakeRetriever, Retriever
11
  from buster.tokenizers import GPTTokenizer
12
- from buster.validators import QuestionAnswerValidator, Validator
 
 
13
  from utils import init_mongo_db
14
 
15
  logger = logging.getLogger(__name__)
16
  logging.basicConfig(level=logging.INFO)
17
 
 
 
18
  MONGODB_URI = os.getenv("MONGODB_URI")
19
  mongo_db = (
20
  init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster")
@@ -28,25 +32,20 @@ logging.basicConfig(level=logging.INFO)
28
 
29
  ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
30
  if ACTIVELOOP_TOKEN is None:
31
- logger.warning("No activeloop token found, you will not be able to fetch data.")
32
 
33
- DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
34
- DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
35
 
36
- # if you want to use a local dataset, set the env. variable, it overrides all others
37
- DEEPLAKE_DATASET_PATH = os.getenv(
38
- "DEEPLAKE_DATASET_PATH", f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
39
- )
40
-
41
- deeplake.deepcopy(
42
- DEEPLAKE_DATASET_PATH,
43
- "local_dataset",
44
- overwrite=True,
45
- num_workers=12,
46
- )
47
- # DEEPLAKE_DATASET_PATH = "local_dataset"
48
 
49
- logger.info(f"{DEEPLAKE_DATASET_PATH=}")
 
 
 
 
50
 
51
  example_questions = [
52
  "What is the LLama model?",
@@ -54,51 +53,69 @@ example_questions = [
54
  "What is an embedding?",
55
  ]
56
 
 
 
 
 
 
 
 
 
 
57
 
58
  buster_cfg = BusterConfig(
59
  validator_cfg={
60
- "unknown_response_templates": [
61
- "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
62
- ],
63
- "unknown_threshold": 0.85,
64
- "embedding_model": "text-embedding-ada-002",
65
- "use_reranking": True,
66
- "invalid_question_response": "This question does not seem relevant my AI knowledge. If the question is related to AI, please send us feedback! \n PS: I'm still learning, so I might not know the answer to your question, you can also try without acronyms in your question. Email us at [email protected] for any issue with the bot!",
67
- "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
68
- Your job is to determine whether user's question is valid or not. Users will not always submit a question either.
69
- Users will ask all sorts of questions, and some might be tangentially related to artificial intelligence (AI), machine learning (ML) and natural language processing (NLP).
70
- Users will learn to build LLM-powered apps, with LangChain, LlamaIndex & Deep Lake among other technologies including OpenAI, RAG and more.
71
- As long as a question is somewhat related to the topic of AI, ML, NLP, RAG, data and techniques used in AI like vectors, memories, embeddings, tokenization, encoding, databases, RAG (Retrieval-Augmented Generation), Langchain, LlamaIndex, LLM (Large Language Models), Preprocessing techniques, Document loading, Chunking, Indexing of document segments, Embedding models, Chains, Memory modules, Vector stores, Chat models, Sequential chains, Information Retrieval, Data connectors, LlamaHub, Node objects, Query engines, Fine-tuning, Activeloop’s Deep Memory, Prompt engineering, Synthetic training dataset, Inference, Recall rates, Query construction, Query expansion, Query transformation, Re-ranking, Cohere Reranker, Recursive retrieval, Small-to-big retrieval, Hybrid searches, Hit Rate, Mean Reciprocal Rank (MRR), GPT-4, Agents, OpenGPTs, Zero-shot ReAct, Conversational Agent, OpenAI Assistants API, Hugging Face Inference API, Code Interpreter, Knowledge Retrieval, Function Calling, Whisper, Dall-E 3, GPT-4 Vision, Unstructured, Deep Lake, FaithfulnessEvaluator, RAGAS, LangSmith, LangChain Hub, LangServe, REST API, respond 'true'. If a question is on a different subject or unrelated, respond 'false'.
72
- Make sure the question is a valid question.
73
-
74
- Here is a list of acronyms and concepts related to Artificial Intelligence AI that you can accept from users, they can be uppercase or lowercase:
75
- [TQL, Deep Memory, LLM, Llama, llamaindex, llama-index, lang chain, langchain, llama index, GPT, NLP, RLHF, RLAIF, Mistral, SFT, Cohere, NanoGPT, ReAct, LoRA, QLoRA, LMMOps, Alpaca, Flan, Weights and Biases, W&B, IDEFICS, Flamingo, LLaVA, BLIP, Falcon]
76
-
77
- Here are some examples:
78
-
79
- Q: How can I setup my own chatbot?
80
- true
81
-
82
- Q: What is the meaning of life?
83
- false
84
-
85
- Q: What is rlhf?
86
- true
87
-
88
- Q:
89
- """,
90
- "completion_kwargs": {
91
- "model": "gpt-3.5-turbo-0125",
92
- "stream": False,
93
- "temperature": 0,
94
  },
 
 
 
 
 
 
 
 
 
 
95
  },
96
  retriever_cfg={
97
  "path": f"{DEEPLAKE_DATASET_PATH}",
98
  "top_k": 5,
99
- "thresh": 0.55,
100
- "max_tokens": 13000,
101
- "embedding_model": "text-embedding-ada-002",
102
  "exec_option": "compute_engine",
103
  "use_tql": True,
104
  "deep_memory": False,
@@ -109,20 +126,20 @@ Q:
109
  },
110
  completion_cfg={
111
  "completion_kwargs": {
112
- "model": "gpt-3.5-turbo-0125",
113
  "stream": True,
114
  "temperature": 0,
115
  },
116
  },
117
  tokenizer_cfg={
118
- "model_name": "gpt-3.5-turbo-0125",
119
  },
120
  documents_formatter_cfg={
121
- "max_tokens": 13500,
122
  "columns": ["content", "source", "title"],
123
  },
124
  prompt_formatter_cfg={
125
- "max_tokens": 13500,
126
  "text_before_docs": (
127
  "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
128
  "You are provided information found in the json documentation. "
@@ -143,11 +160,8 @@ Q:
143
  "* Do not refer to the json documentation directly, but use the instructions provided within it to answer questions. "
144
  "* Do not reference any links, urls or hyperlinks in your answers.\n"
145
  "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
146
- "* If you do not know the answer to a question, or if it is completely irrelevant to the AI courses, simply reply with:\n"
147
- "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'"
148
- "For example:\n"
149
- "What is the meaning of life for a qa bot?\n"
150
- "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?"
151
  "Now answer the following question:\n"
152
  ),
153
  },
@@ -167,7 +181,7 @@ def setup_buster(buster_cfg):
167
  ),
168
  **buster_cfg.documents_answerer_cfg,
169
  )
170
- validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
171
  buster: Buster = Buster(
172
  retriever=retriever, document_answerer=document_answerer, validator=validator
173
  )
 
2
  import os
3
 
4
  import deeplake
 
5
  from buster.busterbot import Buster, BusterConfig
6
  from buster.completers import ChatGPTCompleter, DocumentAnswerer
7
  from buster.formatters.documents import DocumentsFormatterJSON
8
  from buster.formatters.prompts import PromptFormatter
9
+ from buster.llm_utils import get_openai_embedding_constructor
10
  from buster.retriever import DeepLakeRetriever, Retriever
11
  from buster.tokenizers import GPTTokenizer
12
+ from buster.validators import Validator
13
+ from dotenv import load_dotenv
14
+
15
  from utils import init_mongo_db
16
 
17
  logger = logging.getLogger(__name__)
18
  logging.basicConfig(level=logging.INFO)
19
 
20
+ load_dotenv()
21
+
22
  MONGODB_URI = os.getenv("MONGODB_URI")
23
  mongo_db = (
24
  init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster")
 
32
 
33
  ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
34
  if ACTIVELOOP_TOKEN is None:
35
+ logger.warning("No activeloop token found.")
36
 
 
 
37
 
38
+ DEEPLAKE_DATASET_PATH = "local_dataset"
39
+ if os.path.exists(DEEPLAKE_DATASET_PATH):
40
+ logger.info(f"{DEEPLAKE_DATASET_PATH=}")
41
+ else:
42
+ from huggingface_hub import snapshot_download
 
 
 
 
 
 
 
43
 
44
+ snapshot_download(
45
+ repo_id="towardsai-tutors/buster-ai-tutor-data",
46
+ local_dir="local_dataset",
47
+ repo_type="dataset",
48
+ )
49
 
50
  example_questions = [
51
  "What is the LLama model?",
 
53
  "What is an embedding?",
54
  ]
55
 
56
+ # kwargs to pass to the client
57
+ client_kwargs = {
58
+ "timeout": 60,
59
+ "max_retries": 0,
60
+ }
61
+
62
+ embedding_fn = get_openai_embedding_constructor(
63
+ model="text-embedding-3-small", client_kwargs=client_kwargs
64
+ )
65
 
66
  buster_cfg = BusterConfig(
67
  validator_cfg={
68
+ "question_validator_cfg": {
69
+ "invalid_question_response": "This question does not seem relevant my AI knowledge. If the question is related to AI, please send us feedback! \n PS: I'm still learning, so I might not know the answer to your question, you can also try without acronyms in your question. Email us at [email protected] for any issue with the bot!",
70
+ "completion_kwargs": {
71
+ "model": "gpt-4o-mini",
72
+ "stream": False,
73
+ "temperature": 1,
74
+ },
75
+ "client_kwargs": client_kwargs,
76
+ # check_question_prompt is a system prompt
77
+ "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
78
+ # Your job is to determine whether user's question is valid or not. Users will not always submit a question either.
79
+ # Users will ask all sorts of questions, and some might be tangentially related to artificial intelligence (AI), machine learning (ML) and natural language processing (NLP).
80
+ # Users will learn to build LLM-powered apps, with LangChain, LlamaIndex & Deep Lake among other technologies including OpenAI, RAG and more.
81
+ # As long as a question is somewhat related to the topic of AI, ML, NLP, RAG, data and techniques used in AI like vectors, memories, embeddings, tokenization, encoding, databases, RAG (Retrieval-Augmented Generation), Langchain, LlamaIndex, LLM (Large Language Models), Preprocessing techniques, Document loading, Chunking, Indexing of document segments, Embedding models, Chains, Memory modules, Vector stores, Chat models, Sequential chains, Information Retrieval, Data connectors, LlamaHub, Node objects, Query engines, Fine-tuning, Activeloop’s Deep Memory, Prompt engineering, Synthetic training dataset, Inference, Recall rates, Query construction, Query expansion, Query transformation, Re-ranking, Cohere Reranker, Recursive retrieval, Small-to-big retrieval, Hybrid searches, Hit Rate, Mean Reciprocal Rank (MRR), GPT-4, Agents, OpenGPTs, Zero-shot ReAct, Conversational Agent, OpenAI Assistants API, Hugging Face Inference API, Code Interpreter, Knowledge Retrieval, Function Calling, Whisper, Dall-E 3, GPT-4 Vision, Unstructured, Deep Lake, FaithfulnessEvaluator, RAGAS, LangSmith, LangChain Hub, LangServe, REST API, respond 'true'. If a question is on a different subject or unrelated, respond 'false'.
82
+ # Make sure the question is a valid question.
83
+ # Here is a list of acronyms and concepts related to Artificial Intelligence AI that you can accept from users, they can be uppercase or lowercase:
84
+ # [TQL, Deep Memory, LLM, Llama, llamaindex, llama-index, lang chain, langchain, llama index, GPT, NLP, RLHF, RLAIF, Mistral, SFT, Cohere, NanoGPT, ReAct, LoRA, QLoRA, LMMOps, Alpaca, Flan, Weights and Biases, W&B, IDEFICS, Flamingo, LLaVA, BLIP, Falcon]
85
+ # Here are some examples:
86
+ # Q: How can I setup my own chatbot?
87
+ # true
88
+ # Q: What is the meaning of life?
89
+ # false
90
+ # Q: What is rlhf?
91
+ # true
92
+ # Q:
93
+ # """,
94
+ },
95
+ "answer_validator_cfg": {
96
+ "unknown_response_templates": [
97
+ "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
98
+ ],
99
+ "unknown_threshold": 0.3, # compare the embedding of the response to the embedding of the prompt-engineered "I don't know" embedding. if above threshold, we assume answer is not relevant
100
+ "embedding_fn": embedding_fn,
 
101
  },
102
+ "documents_validator_cfg": {
103
+ "completion_kwargs": {
104
+ "model": "gpt-4o-mini",
105
+ "stream": False,
106
+ "temperature": 1,
107
+ },
108
+ "client_kwargs": client_kwargs,
109
+ },
110
+ "use_reranking": True,
111
+ "validate_documents": False,
112
  },
113
  retriever_cfg={
114
  "path": f"{DEEPLAKE_DATASET_PATH}",
115
  "top_k": 5,
116
+ "thresh": 0.2,
117
+ "max_tokens": 100_000,
118
+ "embedding_model": embedding_fn,
119
  "exec_option": "compute_engine",
120
  "use_tql": True,
121
  "deep_memory": False,
 
126
  },
127
  completion_cfg={
128
  "completion_kwargs": {
129
+ "model": "gpt-4o-mini",
130
  "stream": True,
131
  "temperature": 0,
132
  },
133
  },
134
  tokenizer_cfg={
135
+ "model_name": "gpt-4o-mini",
136
  },
137
  documents_formatter_cfg={
138
+ "max_tokens": 100_000,
139
  "columns": ["content", "source", "title"],
140
  },
141
  prompt_formatter_cfg={
142
+ "max_tokens": 100_000,
143
  "text_before_docs": (
144
  "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
145
  "You are provided information found in the json documentation. "
 
160
  "* Do not refer to the json documentation directly, but use the instructions provided within it to answer questions. "
161
  "* Do not reference any links, urls or hyperlinks in your answers.\n"
162
  "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
163
+ "* If the documents retrieved do not answer the question, simply reply with:\n"
164
+ "I'm sorry, but I couldn't find any relevant information in the documents retrieved. If you have any other questions, feel free to ask!"
 
 
 
165
  "Now answer the following question:\n"
166
  ),
167
  },
 
181
  ),
182
  **buster_cfg.documents_answerer_cfg,
183
  )
184
+ validator: Validator = Validator(**buster_cfg.validator_cfg)
185
  buster: Buster = Buster(
186
  retriever=retriever, document_answerer=document_answerer, validator=validator
187
  )