bobber committed (verified)
Commit 5609bb1 · 1 Parent(s): 19d9fe4

Update app.py

Files changed (1)
  1. app.py +38 -26
app.py CHANGED
@@ -41,20 +41,21 @@ model = Llama4ForConditionalGeneration.from_pretrained(
     # quantization_config=bnb_config,
     device_map="auto",
 )
-
+# processor = AutoProcessor.from_pretrained(model_name, cache_dir = cache_dir)
 tokenizer = AutoTokenizer.from_pretrained(model_name
     # , gguf_file=filename
     # , subfolder=subfolder
 )
-SYSTEM_PROMPT = """
-Respond in the following format:
-<reasoning>
-...
-</reasoning>
-<answer>
-...
-</answer>
-"""
+SYSTEM_PROMPT = "You are a friendly Chatbot."
+# """
+# Respond in the following format:
+# <reasoning>
+# ...
+# </reasoning>
+# <answer>
+# ...
+# </answer>
+# """
 
 @spaces.GPU
 def generate(prompt, history):
@@ -62,25 +63,36 @@ def generate(prompt, history):
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user", "content": prompt}
     ]
-    text = tokenizer.apply_chat_template(
+    # text = tokenizer.apply_chat_template(
+    #     messages,
+    #     # tokenize=False,
+    #     tokenize=True,
+    #     add_generation_prompt=True
+    # )
+    # model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    # generated_ids = model.generate(
+    #     **model_inputs,
+    #     max_new_tokens=512
+    # )
+    # generated_ids = [
+    #     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    # ]
+
+    # response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    # return response
+    inputs = tokenizer.apply_chat_template(
         messages,
-        # tokenize=False,
+        add_generation_prompt=True,
         tokenize=True,
-        add_generation_prompt=True
-    )
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-    generated_ids = model.generate(
-        **model_inputs,
-        max_new_tokens=512
+        return_dict=True,
+        return_tensors="pt",
+    ).to(gpu_model.device)
+    outputs = gpu_model.generate(
+        **inputs,
+        max_new_tokens=512,
     )
-    generated_ids = [
-        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-    ]
-
-    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return response
-
+    response = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
 
 
 chat_interface = gr.ChatInterface(
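
For reference, the generation path this commit switches to boils down to the standalone sketch below. It is an illustration only, not the exact app: the small stand-in checkpoint is hypothetical, gpu_model is assumed to be the model handle that app.py defines elsewhere (its definition is outside this diff), and the sketch keeps a return statement so it is self-contained.

# Minimal sketch of the new generate() flow, under assumptions: any chat
# model with a chat template works here; the real app loads
# Llama4ForConditionalGeneration with device_map="auto".
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # hypothetical stand-in checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
gpu_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

SYSTEM_PROMPT = "You are a friendly Chatbot."

def generate(prompt, history):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    # return_dict=True makes apply_chat_template return input_ids plus
    # attention_mask, so the result can be splatted straight into generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(gpu_model.device)
    outputs = gpu_model.generate(**inputs, max_new_tokens=512)
    # Slice off the prompt tokens so only the newly generated text is decoded;
    # like the committed code, this omits skip_special_tokens, so an EOS
    # marker may appear in the output string.
    response = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
    return response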