Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,144 +1,128 @@
|
|
1 |
-
import gradio as gr
|
2 |
import os
|
|
|
3 |
import spaces
|
4 |
-
|
5 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
6 |
-
from threading import Thread
|
7 |
-
|
8 |
-
# Set an environment variable
|
9 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
10 |
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
<div>
|
14 |
-
<h1 style="text-align: center;">Meta Llama3 8B</h1>
|
15 |
-
<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
|
16 |
-
<p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
|
17 |
-
<p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
|
18 |
-
</div>
|
19 |
-
'''
|
20 |
|
21 |
-
|
22 |
-
<p/>
|
23 |
-
---
|
24 |
-
Built with Meta Llama 3
|
25 |
-
"""
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
</div>
|
33 |
-
"""
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
display: block;
|
40 |
-
}
|
41 |
-
#duplicate-button {
|
42 |
-
margin: auto;
|
43 |
-
color: white;
|
44 |
-
background: #1565c0;
|
45 |
-
border-radius: 100vh;
|
46 |
-
}
|
47 |
-
"""
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
terminators = [
|
53 |
-
tokenizer.eos_token_id,
|
54 |
-
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
55 |
-
]
|
56 |
|
57 |
-
@spaces.GPU(duration=120)
|
58 |
-
def chat_llama3_8b(message: str,
|
59 |
-
history: list,
|
60 |
-
temperature: float,
|
61 |
-
max_new_tokens: int
|
62 |
-
) -> str:
|
63 |
-
"""
|
64 |
-
Generate a streaming response using the llama3-8b model.
|
65 |
-
Args:
|
66 |
-
message (str): The input message.
|
67 |
-
history (list): The conversation history used by ChatInterface.
|
68 |
-
temperature (float): The temperature for generating the response.
|
69 |
-
max_new_tokens (int): The maximum number of new tokens to generate.
|
70 |
-
Returns:
|
71 |
-
str: The generated response.
|
72 |
-
"""
|
73 |
-
conversation = []
|
74 |
-
for user, assistant in history:
|
75 |
-
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
|
76 |
conversation.append({"role": "user", "content": message})
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
generate_kwargs = dict(
|
83 |
-
input_ids
|
84 |
streamer=streamer,
|
85 |
-
max_new_tokens=max_new_tokens,
|
86 |
do_sample=True,
|
87 |
temperature=temperature,
|
|
|
88 |
eos_token_id=terminators,
|
|
|
|
|
|
|
89 |
)
|
90 |
-
|
91 |
-
if temperature == 0:
|
92 |
-
generate_kwargs['do_sample'] = False
|
93 |
-
|
94 |
t = Thread(target=model.generate, kwargs=generate_kwargs)
|
95 |
t.start()
|
96 |
-
|
97 |
outputs = []
|
98 |
-
for
|
99 |
-
outputs.append(
|
100 |
-
#print(outputs)
|
101 |
yield "".join(outputs)
|
102 |
-
|
103 |
|
104 |
-
# Gradio block
|
105 |
-
chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

import torch
import spaces
import gradio as gr
from threading import Thread
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Authenticate only when a token is actually configured. Calling
# login(None) makes huggingface_hub fall back to an interactive prompt,
# which crashes in a headless Space; skipping login is the correct
# behavior for public checkpoints.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token)

# Gated instruction-tuned checkpoint served by this Space.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"  # let accelerate place the weights (GPU when available)
)
|
|
|
|
|
19 |
|
20 |
+
@spaces.GPU()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """Stream a chat completion from the Llama-3 model.

    Args:
        message: The latest user message.
        chat_history: Prior (user, assistant) turn pairs supplied by
            gr.ChatInterface.
        system_prompt: Optional system instruction prepended to the
            conversation (skipped when empty).
        max_new_tokens: Cap on the number of generated tokens.
        temperature: Sampling temperature (the UI slider keeps it >= 0.1,
            so do_sample=True is always valid here).
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Penalty applied to already-generated tokens.

    Yields:
        The response text accumulated so far, once per streamed token.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Index the BatchEncoding by key rather than unpacking .values(),
    # which silently depends on the dict's key order.
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Llama-3 ends a turn with <|eot_id|> in addition to the regular eos.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generate_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # model.generate blocks until completion, so run it on a worker
    # thread and stream partial output from the main one.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        yield "".join(outputs)
|
|
|
73 |
|
|
|
|
|
74 |
|
75 |
+
# Gradio UI: chat front-end for generate() with sampling controls.
demo = gr.ChatInterface(
    fn=generate,
    title="🦙 Llama-3 8B Chat",
    description="",
    additional_inputs=[
        gr.Textbox(
            label="System prompt",
            lines=5,
            value="Anda adalah asisten cerdas yang mahir berbahasa Indonesia. Anda dapat memahami dan merespons pertanyaan dalam berbagai bahasa, tetapi selalu menggunakan bahasa Indonesia yang baik dan benar dalam merespons. Anda ramah, sopan, dan berusaha memberikan jawaban yang jelas dan bermanfaat bagi pengguna. Jangan merespon dengan bahasa selain bahasa Indonesia!"
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=1024,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Halo apa kabar?"],
        ["Apa manfaat berolahraga secara teratur?"],
        ["Jika Budi berjalan sejauh 5 meter, berapa jumlah anak ayam bapaknya Budi?"],
        ["Siapa presiden pertama Indonesia?"]
    ],
)

# Guarding launch() keeps the module importable (e.g. by the Spaces
# runtime, which discovers `demo` itself) while still serving when the
# script is run directly.
if __name__ == "__main__":
    demo.queue().launch()
|