Tu2003716
/

COCOM_disabled_flash_attn

Model card Files and versions Community

Tu2003716 committed on Dec 7, 2024

Commit

00a78e8

·

verified ·

1 Parent(s): 0706b6b

Upload modeling_cocom.py

Files changed (1) hide show

modeling_cocom.py +5 -1

modeling_cocom.py CHANGED Viewed

@@ -61,7 +61,7 @@ class COCOMConfig(PretrainedConfig):
     model_type = "COCOM"
     def __init__(self,
-                decoder_model_name="google-t5/t5-base",
                 quantization = 'no',
                 generation_top_k = 1,
                 sep = False,
@@ -100,6 +100,7 @@ class COCOM(PreTrainedModel):
                 torch_dtype=torch.float16,
                 attn_implementation=attn_impl,
                 low_cpu_mem_usage = True,
                 )
         elif cfg.quantization == "int4":
             quant_config = BitsAndBytesConfig(
@@ -116,6 +117,7 @@ class COCOM(PreTrainedModel):
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
             )
         elif cfg.quantization == "int8":
             quant_config = BitsAndBytesConfig(
@@ -132,6 +134,7 @@ class COCOM(PreTrainedModel):
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
             )
         else:
             raise NotImplementedError()
@@ -237,6 +240,7 @@ class COCOM(PreTrainedModel):
         # Perform compression with gradient tracking
         inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids)
         # if training_form is compressor, then detach the inputs_embeds, to make gradient not count in decoder
         if (self.training_form == "compressor") and (self.compr is None):
             inputs_embeds  = inputs_embeds.detach()

     model_type = "COCOM"
     def __init__(self,
+                decoder_model_name="meta-llama/Llama-2-7b-chat-hf",
                 quantization = 'no',
                 generation_top_k = 1,
                 sep = False,
                 torch_dtype=torch.float16,
                 attn_implementation=attn_impl,
                 low_cpu_mem_usage = True,
+                device_map='auto'
                 )
         elif cfg.quantization == "int4":
             quant_config = BitsAndBytesConfig(
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
+                device_map='auto'
             )
         elif cfg.quantization == "int8":
             quant_config = BitsAndBytesConfig(
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
+                device_map='auto'
             )
         else:
             raise NotImplementedError()
         # Perform compression with gradient tracking
         inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids)
         # if training_form is compressor, then detach the inputs_embeds, to make gradient not count in decoder
         if (self.training_form == "compressor") and (self.compr is None):
             inputs_embeds  = inputs_embeds.detach()