File size: 6,796 Bytes
e9698e9
b4b5bdf
e9698e9
 
 
7710388
e9698e9
b4b5bdf
e9698e9
 
b4b5bdf
e9698e9
b4b5bdf
e9698e9
 
 
 
51727c4
 
 
e9698e9
 
a3a378d
e9698e9
fc1544a
 
 
 
a3a378d
 
 
fc1544a
a3a378d
 
 
51727c4
fc1544a
b4b5bdf
a3a378d
01b468b
 
 
a3a378d
 
e9698e9
 
 
 
 
 
 
 
 
 
01b468b
 
 
 
e9698e9
 
 
 
 
 
 
 
 
01b468b
e9698e9
 
 
 
 
 
 
fc1544a
e9698e9
 
 
 
e0e448c
 
e9698e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7710388
e9698e9
 
 
 
 
51727c4
 
e9698e9
51727c4
 
e9698e9
 
 
 
51727c4
e9698e9
51727c4
 
 
 
 
 
 
 
 
e9698e9
 
51727c4
e9698e9
 
 
 
 
51727c4
 
 
 
 
 
7710388
51727c4
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import logging
import os

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator
from huggingface_hub import hf_hub_download

from utils import extract_zip

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# For authentication (read by the app's auth layer; may be None if unset).
USERNAME = os.getenv("BUSTER_USERNAME")
PASSWORD = os.getenv("BUSTER_PASSWORD")

# Hugging Face Hub credentials and the dataset repo that hosts the archive.
HUB_TOKEN = os.getenv("HUB_TOKEN")
REPO_ID = os.getenv("HF_DATASET")

# Name of the DeepLake vector store; the hub stores it zipped as "<name>.zip".
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
ZIP_FILE = DEEPLAKE_DATASET + ".zip"

# Download the zipped DeepLake dataset from the HF dataset repo into the
# current working directory, then unpack it to ./<DEEPLAKE_DATASET> so the
# retriever below can open it by path.
logger.info(f"Downloading {ZIP_FILE} from hub...")
hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=ZIP_FILE,
    token=HUB_TOKEN,
    local_dir=".",
)

extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET)

# Canned questions — presumably surfaced as example prompts in the app UI
# (the consumer is outside this view; TODO confirm).
example_questions = [
    "What is the LLama model?",
    "What is a LLM?",
    "What is an embedding?",
]


# Central configuration for the Buster chatbot. Each *_cfg dict below is
# unpacked verbatim into the matching component in setup_buster().
# Fixes applied to the runtime prompt text: "infomration" -> "information",
# "providew" -> "provide", and missing separators where adjacent string
# literals concatenate (after "(AI)." and before "For example:").
buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        # Similarity threshold above which an answer is flagged as "unknown".
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        # Prompt used to classify whether an incoming question is on-topic;
        # the model is expected to reply literally 'true' or 'false'.
        "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
Users will ask all sorts of questions, and some might be tangentially related.
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
As long as a question is somewhat related to the topic, respond 'true'. If a question is completely unrelated, respond 'false'.

For example:

Q: How can I setup my own chatbot?
true

Q: What is the meaning of life?
false

A user will now submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        # Local path of the DeepLake dataset extracted at module import time.
        "path": f"./{DEEPLAKE_DATASET}",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
        "exec_option": "compute_engine",
        "use_tql": True,
    },
    documents_answerer_cfg={
        "no_documents_message": "No blog posts are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        # System text placed before the retrieved documents.
        "text_before_docs": (
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the <DOCUMENTS> tag. "
            "Only respond with information inside the <DOCUMENTS> tag. DO NOT use additional information, even if you know the answer. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "Here is the information you can use: "
        ),
        # Reminder text placed after the retrieved documents, just before the
        # user's question.
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the <DOCUMENTS> tag. "
            "Here are the rules you must follow:\n"
            "* Only respond with information inside the <DOCUMENTS> tag. DO NOT provide additional information, even if you know the answer. "
            "* If the answer is in the documentation, summarize it in a helpful way to the user. "
            "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "* Only summarize the information in the <DOCUMENTS> tag, do not respond otherwise. "
            "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "* Do not reference any links, urls or hyperlinks in your answers.\n"
            "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for a qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg):
    """Assemble and return a Buster instance from *buster_cfg*.

    Each component (retriever, tokenizer, completer, formatters, validator)
    is built from its matching ``*_cfg`` section of the config object.
    """
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)

    # Formatters share the tokenizer so token budgets are counted consistently.
    docs_formatter = DocumentsFormatterJSON(
        tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
    )
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
    )

    answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=docs_formatter,
        prompt_formatter=prompt_formatter,
        **buster_cfg.documents_answerer_cfg,
    )

    return Buster(
        retriever=DeepLakeRetriever(**buster_cfg.retriever_cfg),
        document_answerer=answerer,
        validator=QuestionAnswerValidator(**buster_cfg.validator_cfg),
    )