import torch
from datasets import Dataset as hfd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

from config import DATASET_HF_NAME, LLAMA3_CHECKPOINT

# Adapted from HF https://huggingface.co/blog/not-lain/rag-chatbot-using-llama3


def search_topk(
    data: hfd,
    feature_extractor: SentenceTransformer,
    query: str,
    k: int = 3,
    embedding_col: str = "embedding",
):
    """a function that embeds a new query and returns the most probable results"""
    embedded_query = feature_extractor.encode(query)  # embed new query
    scores, retrieved_examples = data.get_nearest_examples(  # retrieve results
        embedding_col,
        embedded_query,  # compare our new embedded query with the dataset embeddings
        k=k,  # get only top k results
    )
    return scores, retrieved_examples
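
# A hedged usage sketch for search_topk; the encoder checkpoint below is an
# assumption and must be the same SentenceTransformer that produced the
# dataset's "embedding" column:
#   encoder = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
#   scores, docs = search_topk(DATA, encoder, "What is quantization?", k=3)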


def format_prompt(
    prompt: str, retrieved_documents: hfd, k: int, text_col: str = "chunk"
):
    """using the retrieved documents we will prompt the model to generate our responses"""
    PROMPT = f"Question:{prompt}\nContext:"
    for idx in range(k):
        PROMPT += f"{retrieved_documents[text_col][idx]}\n"
    return PROMPT
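
# For k=2 the assembled prompt has the shape:
#   Question:<user question>
#   Context:<retrieved chunk 1>
#   <retrieved chunk 2>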


# Quantization Config
# Alternative 4-bit NF4 setup (kept for reference, currently unused):
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# 8-bit quantization. Note that bnb_4bit_compute_dtype only takes effect when
# load_in_4bit=True, so it is ignored here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer & Model
# You must request access to the checkpoints
TOKENIZER = AutoTokenizer.from_pretrained(LLAMA3_CHECKPOINT)
MODEL = AutoModelForCausalLM.from_pretrained(
    LLAMA3_CHECKPOINT,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)
# Stop on either the standard EOS token or Llama 3's end-of-turn token <|eot_id|>
TERMINATORS = [TOKENIZER.eos_token_id, TOKENIZER.convert_tokens_to_ids("<|eot_id|>")]

DATA = load_dataset(DATASET_HF_NAME)["train"]
# get_nearest_examples (used in search_topk) needs a FAISS index over the
# precomputed embedding column, so build it here.
DATA.add_faiss_index(column="embedding")

TEXT_GENERATION_PIPELINE = pipeline(
    model=MODEL,
    tokenizer=TOKENIZER,
    task="text-generation",
    device_map="auto",
)

# Generation settings shared by callers of TEXT_GENERATION_PIPELINE
PIPELINE_INFERENCE_ARGS = {
    "max_new_tokens": 256,
    "eos_token_id": TERMINATORS,
    "do_sample": True,
    "temperature": 0.1,
    "top_p": 0.9,
}
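

# A minimal end-to-end sketch showing how the pieces above fit together. The
# SentenceTransformer checkpoint, system prompt, and question are illustrative
# assumptions; the encoder must match the model that produced the dataset's
# "embedding" column.
if __name__ == "__main__":
    encoder = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

    question = "What does this repository do?"
    k = 3
    scores, retrieved = search_topk(DATA, encoder, question, k=k)
    rag_prompt = format_prompt(question, retrieved, k)

    # Wrap the retrieval-augmented prompt in Llama 3's chat template.
    messages = [
        {
            "role": "system",
            "content": "Answer the question using only the provided context.",
        },
        {"role": "user", "content": rag_prompt},
    ]
    chat_prompt = TOKENIZER.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    outputs = TEXT_GENERATION_PIPELINE(chat_prompt, **PIPELINE_INFERENCE_ARGS)
    # The pipeline echoes the prompt plus the completion; keep only the completion.
    print(outputs[0]["generated_text"][len(chat_prompt):])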