Shiro2 committed
Commit f322d23
1 Parent(s): 00a78e8
Files changed (1):
  1. modeling_cocom.py +9 -7
modeling_cocom.py CHANGED

@@ -72,6 +72,7 @@ class COCOMConfig(PretrainedConfig):
                  training_form="both",
                  lora_r=16,
                  attn_implementation="eager",
+                 device_map = "cuda",
                  **kwargs):
         super().__init__(**kwargs)
 
@@ -86,6 +87,7 @@ class COCOMConfig(PretrainedConfig):
         self.training_form = training_form # training form, could be compressor: training only comprssor; both:
         self.lora_r = lora_r # lora_r for lora training, we use 16 throughout the experiment.
         self.attn_implementation = attn_implementation
+        self.device_map = device_map
 
 class COCOM(PreTrainedModel):
     config_class = COCOMConfig
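
The first two hunks add a device_map field to COCOMConfig: a new keyword argument, defaulting to "cuda", that is simply stored on the config so the loading code further down can read it. Below is a self-contained sketch of that pattern using a trimmed stand-in class, not the real COCOMConfig, which has many more fields.

# Trimmed stand-in for COCOMConfig; it only illustrates how the new
# device_map kwarg is stored as a config attribute.
from transformers import PretrainedConfig

class TinyCOCOMConfig(PretrainedConfig):
    model_type = "cocom_sketch"  # placeholder, not the real model_type

    def __init__(self, attn_implementation="eager", device_map="cuda", **kwargs):
        super().__init__(**kwargs)
        self.attn_implementation = attn_implementation
        self.device_map = device_map  # new in this commit

cfg = TinyCOCOMConfig()
print(cfg.device_map)  # -> "cuda"
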
@@ -100,7 +102,7 @@ class COCOM(PreTrainedModel):
                 torch_dtype=torch.float16,
                 attn_implementation=attn_impl,
                 low_cpu_mem_usage = True,
-                device_map='auto'
+                device_map =cfg.device_map
             )
         elif cfg.quantization == "int4":
             quant_config = BitsAndBytesConfig(
@@ -117,7 +119,7 @@ class COCOM(PreTrainedModel):
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
-                device_map='auto'
+                device_map =cfg.device_map
             )
         elif cfg.quantization == "int8":
             quant_config = BitsAndBytesConfig(
@@ -134,7 +136,7 @@ class COCOM(PreTrainedModel):
                 resume_download=True,
                 low_cpu_mem_usage = True,
                 trust_remote_code=True,
-                device_map='auto'
+                device_map =cfg.device_map
             )
         else:
             raise NotImplementedError()
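
Each quantization branch previously hard-coded device_map='auto'; after this commit the value comes from cfg.device_map, so one config setting controls placement for the fp16, int4 and int8 paths alike. A hedged sketch of the resulting loading pattern follows; load_decoder and its arguments are illustrative, not COCOM's exact code.

import torch
from transformers import AutoModelForCausalLM

def load_decoder(model_name: str, device_map: str = "cuda"):
    # device_map accepts "auto", "cuda", "cpu" or an explicit layer->device dict,
    # exactly as transformers' from_pretrained does; before this commit the value
    # was always 'auto', now it is whatever the config carries.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map=device_map,
    )
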
@@ -300,10 +302,10 @@ class COCOM(PreTrainedModel):
 
         # generate
         model_input = {
-            'enc_input_ids': enc_input['input_ids'],
-            'enc_attention_mask': enc_input['attention_mask'],
-            'dec_input_ids': inp_dec['input_ids'],
-            'dec_attention_mask': inp_dec['attention_mask']
+            'enc_input_ids': enc_input['input_ids'].to(self.decoder.device),
+            'enc_attention_mask': enc_input['attention_mask'].to(self.decoder.device),
+            'dec_input_ids': inp_dec['input_ids'].to(self.decoder.device),
+            'dec_attention_mask': inp_dec['attention_mask'].to(self.decoder.device)
         }
 
         return self.generate(model_input, max_new_tokens)
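
The last hunk moves every tokenized input tensor onto self.decoder.device before calling generate, which becomes necessary once device_map can pin the decoder to a specific device while the tokenizer still produces CPU tensors. A minimal, self-contained sketch of that alignment step; move_to_device and the dummy batch are illustrative, only self.decoder.device comes from the diff.

import torch

def move_to_device(batch: dict, device: torch.device) -> dict:
    # Move every tensor in a tokenizer batch to the target device, mirroring the
    # .to(self.decoder.device) calls added in the hunk above.
    return {k: v.to(device) for k, v in batch.items()}

batch = {"input_ids": torch.ones(1, 4, dtype=torch.long),
         "attention_mask": torch.ones(1, 4, dtype=torch.long)}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = move_to_device(batch, device)
assert all(v.device.type == device.type for v in batch.values())
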
 