Sergidev committed
Commit 577b78e · verified · 1 Parent(s): 25e5204
Files changed (1)
  1. app.py +10 -11
app.py CHANGED
@@ -5,11 +5,10 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
 DESCRIPTION = """\
-# Qwen2 0.5B Instruct Text Completion
+# Llama-3.1-70B-Instruct Text Completion
 
-This is a demo of [`Qwen/Qwen2-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct), fine-tuned for instruction following.
+This is a demo of [`meta-llama/Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct), a large language model fine-tuned for instruction following.
 
 Enter your text in the box below and click "Complete" to have the AI generate a completion for your input. The generated text will be appended to your input. You can stop the generation at any time by clicking the "Stop" button.
 """
@@ -19,26 +18,26 @@ DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-model_id = "Qwen/Qwen2-0.5B-Instruct"
+model_id = "meta-llama/Llama-3.1-70B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.float16,
+    load_in_8bit=True,
 )
 model.eval()
-
 @spaces.GPU(duration=90)
 def generate(
     message: str,
-    max_new_tokens: int = 420,
-    temperature: float = 0,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
-    input_ids = tokenizer.encode(message, return_tensors="pt")
+    prompt = f"[INST] {message} [/INST]"
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
@@ -90,7 +89,7 @@ with gr.Blocks(css="style.css", fill_height=True) as demo:
             minimum=0.1,
             maximum=1.0,
             step=0.1,
-            value=0,
+            value=0.6,
         )
         top_p = gr.Slider(
             label="Top-p (nucleus sampling)",
 