File size: 6,352 Bytes
07ffad3
42df98c
1b7e4b0
07ffad3
 
31d7c4a
07ffad3
 
eaca477
1b7e4b0
43ae797
07ffad3
 
31d7c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b7e4b0
31d7c4a
 
 
 
 
1b7e4b0
31d7c4a
 
 
 
07ffad3
31d7c4a
 
 
1b7e4b0
 
42df98c
31d7c4a
 
 
 
 
 
eaca477
31d7c4a
1b7e4b0
31d7c4a
 
 
 
 
 
07ffad3
 
cc1edc1
31d7c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42df98c
07ffad3
31d7c4a
 
07ffad3
31d7c4a
07ffad3
 
 
 
 
31d7c4a
07ffad3
 
 
31d7c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07ffad3
 
18b530b
07ffad3
 
e4b2161
 
31630cf
 
18b530b
31d7c4a
18b530b
31d7c4a
07ffad3
 
18b530b
ef4a283
 
 
 
 
 
 
 
 
 
 
e1ca399
ef4a283
 
8b048b4
ef4a283
18b530b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import gradio as gr
from datasets import load_dataset

import os
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
import torch
from threading import Thread
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import time

# Hugging Face access token; a missing HF_TOKEN variable raises KeyError here.
token = os.environ["HF_TOKEN"]

# Sentence-embedding model used to embed incoming queries.
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Load the "embedded" revision of the dataset, then attach a FAISS index to
# the "embeddings" column of its train split so it can be searched by vector.
dataset = load_dataset("not-lain/wikipedia", revision="embedded")
data = dataset["train"].add_faiss_index("embeddings")


model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization with double quantization and bfloat16 compute,
# used to lower GPU memory usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Token ids passed to generate() as eos_token_id: the tokenizer's EOS token
# plus the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# System message sent with every request (see talk()).
SYS_PROMPT = """You are an assistant for answering questions.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "I do not know." Don't make up an answer."""



def search(query: str, k: int = 3):
    """Embed ``query`` and fetch its ``k`` nearest rows from the indexed dataset.

    The query is encoded with the module-level ``ST`` model and compared
    against the "embeddings" FAISS index attached to ``data``.

    Returns:
        A ``(scores, retrieved_examples)`` tuple as produced by
        ``data.get_nearest_examples``.
    """
    query_vector = ST.encode(query)
    nearest = data.get_nearest_examples("embeddings", query_vector, k=k)
    distances, examples = nearest
    return distances, examples

def format_prompt(prompt, retrieved_documents, k):
    """Build the user message from the question and the retrieved passages.

    Args:
        prompt: The user's question.
        retrieved_documents: Mapping whose ``"text"`` entry is the sequence of
            retrieved passages, best match first.
        k: Maximum number of passages to include.

    Returns:
        A string starting with ``Question:<prompt>``, then a line beginning
        with ``Context:`` followed by the passages, each terminated by a
        newline.
    """
    # Slice instead of indexing range(k): if retrieval returned fewer than k
    # passages we use what is available rather than raising IndexError.
    # max(k, 0) keeps a negative k meaning "no passages", as range(k) did.
    passages = retrieved_documents["text"][:max(k, 0)]
    context = "".join(f"{passage}\n" for passage in passages)
    return f"Question:{prompt}\nContext:{context}"


@spaces.GPU(duration=150)
def talk(message, history):
    """Answer ``message`` with retrieval-augmented generation, streaming the reply.

    The message is used to retrieve context from the indexed dataset, a chat
    prompt is built from the system prompt plus the question and context, and
    the model's output is streamed back as it is generated.

    Args:
        message: The user's latest chat message.
        history: Previous chat turns supplied by the chat UI. Currently
            unused; each question is answered independently.

    Yields:
        The reply accumulated so far, growing with every streamed chunk.
    """
    k = 1  # number of retrieved documents
    # The original code referenced an undefined name `prompt` here; the
    # user's question is the `message` parameter.
    _, retrieved_documents = search(message, k)
    formatted_prompt = format_prompt(message, retrieved_documents, k)
    formatted_prompt = formatted_prompt[:2000]  # cap at 2000 characters to avoid GPU OOM
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": formatted_prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate exactly once, on a worker thread, so tokens can be consumed
    # from the streamer while generation is still running. (A previous
    # blocking model.generate() call whose result was discarded has been
    # removed: it doubled GPU work and delayed the first streamed token.)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        temperature=0.75,
        eos_token_id=terminators,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
    
# def talk(message, history):
#     print("history, ", history)
#     print("message ", message)
#     print("searching dataset ...")
#     retrieved_examples = search(message)
#     print("preparing prompt ...")
#     message, metadata = prepare_prompt(message, retrieved_examples)
#     resources = HEADER
#     print("preparing metadata ...")
#     for title, url in metadata:
#         resources += f"[{title}]({url}),  "
#     print("preparing chat template ...")
#     chat = []
#     for item in history:
#         chat.append({"role": "user", "content": item[0]})
#         cleaned_past = item[1].split(HEADER)[0]
#         chat.append({"role": "assistant", "content": cleaned_past})
#     chat.append({"role": "user", "content": message})
#     messages = tokenizer.apply_chat_template(
#         chat, tokenize=False, add_generation_prompt=True
#     )
#     print("chat template prepared, ", messages)
#     print("tokenizing input ...")
#     # Tokenize the messages string
#     model_inputs = tokenizer([messages], return_tensors="pt").to(device)
#     streamer = TextIteratorStreamer(
#         tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
#     )
    # generate_kwargs = dict(
    #     model_inputs,
    #     streamer=streamer,
    #     max_new_tokens=1024,
    #     do_sample=True,
    #     top_p=0.95,
    #     top_k=1000,
    #     temperature=0.75,
    #     num_beams=1,
    # )
#     print("initializing thread ...")
#     t = Thread(target=model.generate, kwargs=generate_kwargs)
#     t.start()
#     time.sleep(1)
#     # Initialize an empty string to store the generated text
#     partial_text = ""
#     i = 0
#     while t.is_alive():
#         try:
#             for new_text in streamer:
#                 if new_text is not None:
#                     partial_text += new_text
#                     yield partial_text
#         except Exception as e:
#             print(f"retry number {i}\n LOGS:\n")
#             i+=1
#             print(e, e.args)
#     partial_text += resources
#     yield partial_text


# Heading text passed to gr.ChatInterface as `title`.
TITLE = "# RAG"

# Description text passed to gr.ChatInterface as `description`; lists the
# embedding model, dataset, FAISS docs and chat model this demo relies on.
DESCRIPTION = """
A rag pipeline with a chatbot feature

Resources used to build this project :

* embedding model : https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
* dataset : https://huggingface.co/datasets/not-lain/wikipedia
* faiss docs : https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Dataset.add_faiss_index 
* chatbot : https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
"""


# Chat interface wired to talk(); the Chatbot widget uses the bubble layout
# with label, share, copy and like controls enabled.
demo = gr.ChatInterface(
    fn=talk,
    title=TITLE,
    description=DESCRIPTION,
    examples=[["what's anarchy ? "]],
    theme="Soft",
    chatbot=gr.Chatbot(
        layout="bubble",
        bubble_full_width=False,
        likeable=True,
        show_label=True,
        show_copy_button=True,
        show_share_button=True,
    ),
)

# Start the Gradio server with debug mode enabled.
demo.launch(debug=True)