import logging
import os

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator
from huggingface_hub import hf_hub_download
from utils import extract_zip

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# For authentication
USERNAME = os.getenv("BUSTER_USERNAME")
PASSWORD = os.getenv("BUSTER_PASSWORD")
HUB_TOKEN = os.getenv("HUB_TOKEN")

# Hub dataset repo and the Deep Lake vector store it contains
REPO_ID = os.getenv("HF_DATASET")
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
ZIP_FILE = DEEPLAKE_DATASET + ".zip"

# Download the zipped Deep Lake dataset from the Hub and unpack it locally
logger.info(f"Downloading {ZIP_FILE} from hub...")
hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=ZIP_FILE,
    token=HUB_TOKEN,
    local_dir=".",
)
extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET)
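
# NOTE: `extract_zip` comes from the Space's local `utils` module and is not
# shown here. A minimal sketch of such a helper (an assumption, not the actual
# implementation) could use the standard-library zipfile module:
#
#   import zipfile
#
#   def extract_zip(zip_file_path: str, output_path: str) -> None:
#       """Unpack the downloaded archive into the target directory."""
#       with zipfile.ZipFile(zip_file_path, "r") as zf:
#           zf.extractall(output_path)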

example_questions = [
    "What is the LLaMA model?",
    "What is an LLM?",
    "What is an embedding?",
]
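
# BusterConfig groups the settings for each pipeline component: the question
# validator, the Deep Lake retriever, the document answerer, the completion
# model, the tokenizer, and the document/prompt formatters.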
buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
Users will ask all sorts of questions, and some might be tangentially related.
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
As long as a question is somewhat related to the topic, respond 'true'. If a question is completely unrelated, respond 'false'.
For example:
Q: How can I set up my own chatbot?
true
Q: What is the meaning of life?
false
A user will now submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        "path": f"./{DEEPLAKE_DATASET}",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
        "exec_option": "compute_engine",
        "use_tql": True,
    },
    documents_answerer_cfg={
        "no_documents_message": "No blog posts are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the <DOCUMENTS> tag. "
            "Only respond with information inside the <DOCUMENTS> tag. DO NOT use additional information, even if you know the answer. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "Here is the information you can use: "
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the <DOCUMENTS> tag. "
            "Here are the rules you must follow:\n"
            "* Only respond with information inside the <DOCUMENTS> tag. DO NOT provide additional information, even if you know the answer. "
            "* If the answer is in the documentation, summarize it in a helpful way to the user. "
            "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "* Only summarize the information in the <DOCUMENTS> tag, do not respond otherwise. "
            "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "* Do not reference any links, urls or hyperlinks in your answers.\n"
            "* Make sure to format your answers in Markdown format, including code blocks and snippets.\n"
            "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for a qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg):
    """Assemble a Buster instance from the config: retriever, document answerer, and validator."""
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(
            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
        ),
        prompt_formatter=PromptFormatter(
            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
        ),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(
        retriever=retriever, document_answerer=document_answerer, validator=validator
    )
    return buster
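

# Example usage (a sketch, not part of the original Space file): this assumes
# Buster exposes a `process_input` method and that the returned completion
# streams its answer via `answer_generator`, as in the library's demo apps.
#
#   buster = setup_buster(buster_cfg)
#   completion = buster.process_input("What is an embedding?")
#   for chunk in completion.answer_generator:  # streamed, since "stream": True in completion_cfg
#       print(chunk, end="")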