jedick committed on
Commit
f42e9e5
·
1 Parent(s): 09d7140

Add padding="longest"

Browse files
Files changed (2) hide show
  1. main.py +3 -0
  2. pipeline.py +0 -1
main.py CHANGED
@@ -164,6 +164,9 @@ def GetChatModel(compute_mode, ckpt_dir=None):
164
  return_full_text=False,
165
  # It seems that max_new_tokens has to be specified here, not in .invoke()
166
  max_new_tokens=2000,
 
 
 
167
  )
168
  # We need the task so HuggingFacePipeline can deal with our class
169
  pipe.task = "text-generation"
 
164
  return_full_text=False,
165
  # It seems that max_new_tokens has to be specified here, not in .invoke()
166
  max_new_tokens=2000,
167
+ # Use padding for FlashAttention alignment
168
+ # https://github.com/google-deepmind/gemma/issues/169
169
+ padding="longest",
170
  )
171
  # We need the task so HuggingFacePipeline can deal with our class
172
  pipe.task = "text-generation"
pipeline.py CHANGED
@@ -22,7 +22,6 @@ class MyTextGenerationPipeline(TextGenerationPipeline):
22
  continue_final_message=None,
23
  **generate_kwargs,
24
  ):
25
- print(f"PADDING: {padding}")
26
  # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
27
  tokenizer_kwargs = {
28
  "add_special_tokens": add_special_tokens,
 
22
  continue_final_message=None,
23
  **generate_kwargs,
24
  ):
 
25
  # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
26
  tokenizer_kwargs = {
27
  "add_special_tokens": add_special_tokens,