import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr

# Model ID and Hugging Face access token (Llama 2 is a gated model, so the
# token must belong to an account that has been granted access to it).
llama_model = "meta-llama/Llama-2-7b-chat-hf"
access_token = " "  # placeholder: set your own token here

# Load the model in half precision and let Accelerate place it on the available
# device(s); the tokenizer must match the model checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    llama_model,
    token=access_token,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(llama_model, token=access_token)

# Text-generation pipeline around the already-loaded model. When a model object
# (rather than a model name) is passed, the tokenizer must be passed explicitly.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

SYSTEM_PROMPT = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

"""

# Formatting function for message and history
def message_format(message: str, history: list, memory_limit: int = 5) -> str:
    """Build a Llama 2 chat prompt from the current message and the history.

    history is a list of (user_message, model_answer) pairs; only the last
    memory_limit exchanges are kept so the prompt length stays bounded.
    """
    # Always keep len(history) <= memory_limit
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    # No history yet: wrap just the current message after the system prompt
    if len(history) == 0:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    # The first exchange is prefixed with the system prompt
    formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} </s>"

    # Handle conversation history
    for user_msg, model_answer in history[1:]:
        formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"

    # Handle the current message
    formatted_message += f"<s>[INST] {message} [/INST]"

    return formatted_message
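
# Illustrative example (comments only, not executed): with
#   message = "What can you do?"
#   history = [("Hi", "Hello! How can I help you today?")]
# message_format returns a prompt of the form
#   <s>[INST] <<SYS>> ...system prompt... <</SYS>>
#   Hi [/INST] Hello! How can I help you today? </s><s>[INST] What can you do? [/INST]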

# Generate a response from the Llama model
def llama_response(message: str, history: list) -> str:
    """Generate a reply for the Gradio ChatInterface using the Llama 2 pipeline."""
    query = message_format(message, history)

    # Sample one continuation; max_length counts prompt plus generated tokens
    sequences = pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1050,
    )

    generated_text = sequences[0]['generated_text']
    response = generated_text[len(query):]  # Remove the prompt from the output

    print("Chatbot:", response.strip())
    return response.strip()
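
# Optional sanity check (illustrative, commented out so the script only serves
# the web UI): the responder can also be called directly, e.g.
#   print(llama_response("Tell me a joke.", history=[]))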


# Launch the Gradio chat UI backed by llama_response
gr.ChatInterface(llama_response).launch()