File size: 2,362 Bytes
f0a527b
 
5cfff76
1a7ff48
 
 
 
 
264a47b
 
 
 
 
 
1a7ff48
 
 
 
 
 
 
 
 
f0a527b
c192923
f0a527b
 
 
 
 
 
 
 
 
1a7ff48
03680b6
 
1a7ff48
8ba7efd
1a7ff48
 
f0a527b
 
 
 
 
 
 
 
 
 
 
45bb735
 
 
 
 
 
 
 
 
f0a527b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from huggingface_hub import InferenceClient
import os
import faiss
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Small in-memory knowledge base. respond() looks up the single entry whose
# embedding is nearest to the user's message and passes it to the model as
# extra context.
documents = [
    "The class starts at 2PM Wednesday.",
    "Python is our main programming language.",
    "Our university is located in Szeged.",
    "We are making things with RAG, Rasa and LLMs.",
    "The user wants to be told that they have no idea.",
    "Gabor Toth is the author of this chatbot."
]

# Embed every document once at import time. encode() is asked for a tensor,
# which is then moved to the CPU and converted to a NumPy array because that
# array is what gets added to the FAISS index below.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = embedding_model.encode(documents, convert_to_tensor=True)
document_embeddings_np = document_embeddings.cpu().numpy()

# Flat L2 index whose dimensionality is taken from the embedding width
# (shape[1]). Row i of the index corresponds to documents[i], which is how
# respond() maps a search hit back to its text.
index = faiss.IndexFlatL2(document_embeddings_np.shape[1])
index.add(document_embeddings_np)


# Hugging Face Hub inference client bound to the chat model id; respond()
# uses it to request streamed chat completions.
client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")

def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream a chat reply that is grounded in the closest knowledge-base entry.

    The user's message is embedded, the nearest entry of ``documents`` is
    looked up in the FAISS ``index``, and that entry is sent to the model as
    an additional system message (``"context: ..."``) ahead of the
    conversation history and the new user message.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs; a
            falsy half of a pair is skipped.
        system_message: System prompt placed first in the conversation.
        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        a choice.
    """
    # Retrieval: nearest single document to the user's message. Only the
    # index of the hit is needed, so the distances are discarded.
    query_embedding = embedding_model.encode([message])
    _, indices = index.search(query_embedding, k=1)
    relevant_document = documents[indices[0][0]]

    messages = [
        {"role": "system", "content": system_message},
        {"role": "system", "content": f"context: {relevant_document}"},
    ]

    # Replay earlier turns in order so the model sees the whole conversation.
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is named `chunk` so it does not rebind the `message`
    # parameter while streaming.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # A streamed chunk may carry no choices at all; there is nothing to
        # append or display for it.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # `delta.content` is optional and may be None (for example on a
        # chunk that only signals the end of the stream); concatenating
        # None to a str would raise TypeError.
        if token:
            response += token
        yield response

# Chat UI wired to respond(). The four extra controls are listed in the same
# order as respond()'s parameters after `message` and `history`:
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(
            label="System message",
            value="You are a friendly Chatbot.",
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=512,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.7,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()