SarowarSaurav committed on
Commit
370110e
·
verified ·
1 Parent(s): 25e01a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -119
app.py CHANGED
@@ -1,144 +1,128 @@
1
- import gradio as gr
2
  import os
 
3
  import spaces
4
- from transformers import GemmaTokenizer, AutoModelForCausalLM
5
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
6
- from threading import Thread
7
-
8
- # Read the Hugging Face access token from the environment (None when unset)
9
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
10
 
 
 
 
11
 
12
# HTML header for the demo page; rendered with gr.Markdown(DESCRIPTION) inside the Blocks layout.
- DESCRIPTION = '''
13
- <div>
14
- <h1 style="text-align: center;">Meta Llama3 8B</h1>
15
- <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
16
- <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
17
- <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
18
- </div>
19
- '''
20
 
21
# Footer text; rendered with gr.Markdown(LICENSE) after the chat interface.
- LICENSE = """
22
- <p/>
23
- ---
24
- Built with Meta Llama 3
25
- """
26
 
27
# HTML passed as the `placeholder` argument of gr.Chatbot.
- PLACEHOLDER = """
28
- <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
29
- <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/8e75e61cc9bab22b7ce3dec85ab0e6db1da5d107/Meta_lockup_positive%20primary_RGB.jpg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
30
- <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Meta llama3</h1>
31
- <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
32
- </div>
33
- """
34
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
# Stylesheet passed to gr.Blocks(css=css); the #duplicate-button rule matches the
# elem_id given to gr.DuplicateButton.
- css = """
37
- h1 {
38
- text-align: center;
39
- display: block;
40
- }
41
- #duplicate-button {
42
- margin: auto;
43
- color: white;
44
- background: #1565c0;
45
- border-radius: 100vh;
46
- }
47
- """
48
 
49
# Tokenizer and model for the instruction-tuned Llama 3 8B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",  # device placement is delegated; alternative noted originally: to("cuda:0")
)

# Token ids passed to generate() as `eos_token_id`: the tokenizer's EOS token
# plus the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
56
 
57
@spaces.GPU(duration=120)
def chat_llama3_8b(message: str,
                   history: list,
                   temperature: float,
                   max_new_tokens: int
                   ) -> str:
    """
    Stream a response from the llama3-8b model.

    This is a generator: each yielded value is the full response text
    accumulated so far, which is the shape gr.ChatInterface expects for
    streaming output.

    Args:
        message (str): The new user message.
        history (list): Prior (user, assistant) turns supplied by ChatInterface.
        temperature (float): Sampling temperature; 0 selects greedy decoding.
        max_new_tokens (int): The maximum number of new tokens to generate.
    Yields:
        str: The response text generated so far.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model writes a reply instead of continuing the user's turn.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # A temperature of 0 is not a valid sampling temperature, so fall back to
    # greedy decoding and do not forward the temperature at all in that case.
    greedy = temperature == 0
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=not greedy,
        eos_token_id=terminators,
    )
    if not greedy:
        generate_kwargs["temperature"] = temperature

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for text in streamer:
        partial_text += text
        yield partial_text

    # The streamer is exhausted, so generation has finished; reap the worker.
    t.join()
102
-
103
 
104
# Gradio UI: header, duplicate button, chat interface with tunable parameters, footer.
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

with gr.Blocks(fill_height=True, css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")

    # These components are built with render=False and handed to ChatInterface below.
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        label="Temperature",
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.95,
        render=False,
    )
    max_new_tokens_slider = gr.Slider(
        label="Max new tokens",
        minimum=128,
        maximum=4096,
        step=1,
        value=512,
        render=False,
    )

    gr.ChatInterface(
        fn=chat_llama3_8b,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        # Order matters: extra inputs are passed to fn after (message, history).
        additional_inputs=[temperature_slider, max_new_tokens_slider],
        examples=[
            ['How to setup a human base on Mars? Give short answer.'],
            ['Explain theory of relativity to me like I’m 8 years old.'],
            ['What is 9,000 * 9,000?'],
            ['Write a pun-filled happy birthday message to my friend Alex.'],
            ['Justify why a penguin might make a good king of the jungle.'],
        ],
        cache_examples=False,
    )

    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.launch()
144
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import torch
3
  import spaces
4
+ import gradio as gr
 
 
 
 
 
5
 
6
+ from threading import Thread
7
+ from huggingface_hub import login
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
9
 
10
# Authenticate with the Hugging Face Hub only when a token is configured.
# Passing None to login() is avoided on purpose: without a token it would try
# to obtain one interactively, which cannot work in a headless deployment.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

# Checkpoint used for both the tokenizer and the model.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)
 
 
19
 
20
@spaces.GPU()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """Stream a chat reply from the loaded model.

    This is a generator: each yielded value is the full reply text
    accumulated so far.

    Args:
        message: The new user message.
        chat_history: Prior (user, assistant) turns.
        system_prompt: Optional system message; skipped when empty.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        repetition_penalty: Penalty factor forwarded to ``model.generate``
            (a float; the UI slider ranges over 1.0-2.0).

    Yields:
        str: The reply text generated so far.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})

    conversation.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    encoded = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generate_kwargs = dict(
        # Look the tensors up by key instead of unpacking .values(), so the
        # call does not depend on the order of entries in the encoding.
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_token in streamer:
        partial_text += new_token
        yield partial_text

    # The streamer is exhausted, so generation has finished; reap the worker.
    t.join()
 
73
 
 
 
74
 
75
# Default system prompt (Indonesian): tells the assistant to always answer in Indonesian.
_DEFAULT_SYSTEM_PROMPT = "Anda adalah asisten cerdas yang mahir berbahasa Indonesia. Anda dapat memahami dan merespons pertanyaan dalam berbagai bahasa, tetapi selalu menggunakan bahasa Indonesia yang baik dan benar dalam merespons. Anda ramah, sopan, dan berusaha memberikan jawaban yang jelas dan bermanfaat bagi pengguna. Jangan merespon dengan bahasa selain bahasa Indonesia!"

# Extra inputs, in the order generate() receives them after (message, chat_history):
# system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty.
_generation_controls = [
    gr.Textbox(label="System prompt", lines=5, value=_DEFAULT_SYSTEM_PROMPT),
    gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=1024),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
]

# Example prompts offered in the UI.
_example_prompts = [
    ["Halo apa kabar?"],
    ["Apa manfaat berolahraga secara teratur?"],
    ["Jika Budi berjalan sejauh 5 meter, berapa jumlah anak ayam bapaknya Budi?"],
    ["Siapa presiden pertama Indonesia?"],
]

# Build the chat UI around generate(), enable queuing and start the app.
gr.ChatInterface(
    fn=generate,
    title="🦙 Llama-3 8B Chat",
    description="",
    additional_inputs=_generation_controls,
    stop_btn=None,
    examples=_example_prompts,
).queue().launch()