anicolson committed on
Commit 1084f69
1 Parent(s): 2ac23b6

Upload folder using huggingface_hub

Files changed (1):
  1. modelling_medicap.py +21 -264
modelling_medicap.py CHANGED
@@ -136,98 +136,6 @@ class MedICapEncoderDecoderModel(VisionEncoderDecoderModel):
         self.encoder.config = self.config.encoder
         self.decoder.config = self.config.decoder
 
-    @classmethod
-    def from_encoder_decoder_pretrained(
-        cls,
-        encoder_pretrained_model_name_or_path: str = None,
-        decoder_pretrained_model_name_or_path: str = None,
-        *model_args,
-        **kwargs,
-    ) -> PreTrainedModel:
-        kwargs_encoder = {
-            argument[len("encoder_"):]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
-        }
-
-        kwargs_decoder = {
-            argument[len("decoder_"):]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
-        }
-
-        # Remove the encoder and decoder kwargs from kwargs:
-        for key in kwargs_encoder.keys():
-            del kwargs["encoder_" + key]
-        for key in kwargs_decoder.keys():
-            del kwargs["decoder_" + key]
-
-        # Load and initialize the encoder and decoder. The distinction between encoder
-        # and decoder at the model level is made by the value of the flag `is_decoder`,
-        # which we need to set correctly.
-        encoder = kwargs_encoder.pop("model", None)
-        if encoder is None:
-            if encoder_pretrained_model_name_or_path is None:
-                raise ValueError(
-                    "If `encoder_model` is not defined as an argument, an `encoder_pretrained_model_name_or_path` has "
-                    "to be defined."
-                )
-
-            if "config" not in kwargs_encoder:
-                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
-                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
-                )
-
-                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
-                    logger.info(
-                        f"Initializing {encoder_pretrained_model_name_or_path} as an encoder model "
-                        "from a decoder model. Cross-attention and causal mask are disabled."
-                    )
-                    encoder_config.is_decoder = False
-                    encoder_config.add_cross_attention = False
-
-                kwargs_encoder["config"] = encoder_config
-
-            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
-
-        decoder = kwargs_decoder.pop("model", None)
-        if decoder is None:
-            if decoder_pretrained_model_name_or_path is None:
-                raise ValueError(
-                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
-                    "to be defined."
-                )
-
-            if "config" not in kwargs_decoder:
-                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
-                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
-                )
-
-                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
-                    logger.info(
-                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross-attention"
-                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
-                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross-attention layers."
-                    )
-                    decoder_config.is_decoder = True
-                    decoder_config.add_cross_attention = True
-
-                kwargs_decoder["config"] = decoder_config
-
-            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
-                logger.warning(
-                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
-                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
-                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
-                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True`, or do not pass a "
-                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`."
-                )
-
-            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
-
-        # Instantiate the config with the corresponding kwargs:
-        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
-
-        # Make sure that the input and output embeddings are not tied:
-        config.tie_word_embeddings = False
-        return cls(encoder=encoder, decoder=decoder, config=config)
-
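The classmethod removed above routes keyword arguments by prefix before loading each side of the model: anything starting with encoder_ goes to the encoder load, anything starting with decoder_ to the decoder load. A minimal, self-contained sketch of that prefix-splitting pattern (the function name and example kwargs are hypothetical):

def split_prefixed_kwargs(kwargs):
    # Route kwargs by prefix, then strip the routed entries from the original dict:
    kwargs_encoder = {k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")}
    kwargs_decoder = {k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")}
    for key in kwargs_encoder:
        del kwargs["encoder_" + key]
    for key in kwargs_decoder:
        del kwargs["decoder_" + key]
    return kwargs_encoder, kwargs_decoder, kwargs

enc, dec, rest = split_prefixed_kwargs(
    {"encoder_output_attentions": True, "decoder_use_cache": False, "tie_word_embeddings": False}
)
print(enc)   # {'output_attentions': True}
print(dec)   # {'use_cache': False}
print(rest)  # {'tie_word_embeddings': False}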
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,
@@ -265,11 +173,6 @@ class MedICapEncoderDecoderModel(VisionEncoderDecoderModel):
         elif isinstance(encoder_outputs, tuple):
             encoder_outputs = BaseModelOutput(*encoder_outputs)
 
-        # encoder_hidden_states = encoder_outputs[0]
-        # encoder_attention_mask = None
-
-        # image_features = self.encoder(images).projected_last_hidden_state
-
         embeddings = self.decoder.transformer.wte(decoder_input_ids)
         embeddings = torch.cat([encoder_outputs[0], embeddings], dim=1)
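Here the encoder's output features are simply prepended to the word-token embeddings, so the image conditions the GPT-style decoder as a prefix. A shape sketch of that concatenation (batch size, token counts, and hidden size are hypothetical):

import torch

image_features = torch.randn(2, 50, 768)    # encoder_outputs[0]: (batch, image tokens, hidden)
token_embeddings = torch.randn(2, 20, 768)  # wte(decoder_input_ids): (batch, text tokens, hidden)
embeddings = torch.cat([image_features, token_embeddings], dim=1)
print(embeddings.shape)                     # torch.Size([2, 70, 768])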
 
@@ -314,143 +217,43 @@ class MedICapEncoderDecoderModel(VisionEncoderDecoderModel):
             decoder_attentions=decoder_outputs.attentions,
             cross_attentions=decoder_outputs.cross_attentions,
             encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            # encoder_hidden_states=encoder_outputs.hidden_states,
-            # encoder_attentions=encoder_outputs.attentions,
         )
 
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        special_token_ids,
-        past_key_values=None,
-        attention_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs,
+    def tokenize_captions_teacher_forcing(
+        self,
+        captions: str,
+        tokenizer: PreTrainedTokenizerFast,
+        max_len: int,
     ):
         """
-        Modification of:
-        https://github.com/huggingface/transformers/blob/main/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L660
-        """
-
-        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
-        decoder_attention_mask = decoder_inputs['attention_mask'] if 'attention_mask' in decoder_inputs else None
-
-        if not past_key_values:
-            token_type_ids = self.token_ids_to_token_type_ids(input_ids, special_token_ids)
-        else:
-            token_type_ids = self.token_ids_to_token_type_ids_past(input_ids, special_token_ids)
-
-        input_dict = {
-            'attention_mask': attention_mask,
-            'decoder_attention_mask': decoder_attention_mask,
-            'decoder_input_ids': decoder_inputs['input_ids'],
-            'decoder_token_type_ids': token_type_ids,
-            'encoder_outputs': encoder_outputs,
-            'past_key_values': decoder_inputs['past_key_values'],
-            'use_cache': use_cache,
-        }
-        return input_dict
-
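During generation, the removed method delegated input trimming to the decoder: on the first step the whole prefix is fed, and once past_key_values is populated only the newest token is, with matching token type ids computed by the two helpers below. A sketch of that trimming pattern (shapes and token ids are hypothetical):

import torch

input_ids = torch.tensor([[50256, 7, 8, 9]])
past_key_values = None  # populated by the decoder after the first step

if past_key_values is None:
    step_input_ids = input_ids          # first step: the full prefix
else:
    step_input_ids = input_ids[:, -1:]  # later steps: only the newest token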
-    def token_ids_to_token_type_ids(self, token_ids, special_token_ids, token_type_id_sections=None):
-        """
-        Extract token type identifiers from the token identifiers.
-
-        Argument/s:
-            token_ids - token identifiers.
-            special_token_ids - special token identifiers that indicate the separation between sections.
-            token_type_id_sections - token type identifier for each section.
-
-        Returns:
-            token_type_ids - token type identifiers.
-        """
-
-        token_type_id_sections = token_type_id_sections if token_type_id_sections is not None else list(range(len(special_token_ids) + 1))
-
-        mbatch_size, seq_len = token_ids.shape
-        token_type_ids = torch.full_like(token_ids, token_type_id_sections[0], dtype=torch.long, device=token_ids.device)
-
-        for i, j in enumerate(special_token_ids):
-            # Find the first occurrence of the special token that indicates the boundary between sections:
-            cols = (token_ids == j).int().argmax(dim=1)
-            rows = torch.arange(mbatch_size, device=token_ids.device)
-
-            # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer.create_token_type_ids_from_sequences.example
-            cols += 1
-
-            # Ensure that the column index is not out of bounds. If 0, the token_id is not present.
-            # This is safe, as index 0 is always a special token (now equal to 1 due to the +1):
-            rows = rows[torch.logical_and(cols != 1, cols < seq_len)]
-            cols = cols[torch.logical_and(cols != 1, cols < seq_len)]
-
-            # Indices that correspond to the second sequence:
-            if rows.nelement() != 0:
-                ids = torch.stack([
-                    torch.stack([x, z]) for (x, y) in zip(rows, cols) for z in torch.arange(
-                        y, seq_len, device=token_ids.device,
-                    )
-                ])
-
-                token_type_ids[ids[:, 0], ids[:, 1]] = token_type_id_sections[i + 1]
-
-        return token_type_ids
-
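The method removed above labels every position up to and including the first occurrence of each separator with the current section id, and every position after it with the next one. A compact worked example of that labelling (the token ids and the single separator are hypothetical; 2 stands in for a SEP token):

import torch

token_ids = torch.tensor([
    [1, 11, 12, 2, 21, 22],
    [1, 11, 12, 13, 2, 21],
])
special_token_ids = [2]

token_type_ids = torch.zeros_like(token_ids)
for section, sep in enumerate(special_token_ids):
    # First occurrence of the separator, shifted one position right:
    boundary = (token_ids == sep).int().argmax(dim=1) + 1
    for row in range(token_ids.shape[0]):
        col = int(boundary[row])
        if col > 1:  # argmax of 0 (+1) would mean the separator was not found
            token_type_ids[row, col:] = section + 1

print(token_type_ids)
# tensor([[0, 0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 0, 1]])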
-    def token_ids_to_token_type_ids_past(self, token_ids, special_token_ids, token_type_id_sections=None):
-        """
-        Extract token type identifiers from the token identifiers if past != None.
+        Tokenizes the captions and creates the inputs and targets for teacher forcing.
 
         Argument/s:
-            token_ids - token identifiers.
-            special_token_ids - special token identifiers that indicate the separation between sections.
-
-        Returns:
-            token_type_ids - token type identifiers.
-        """
-
-        token_type_id_sections = token_type_id_sections if token_type_id_sections is not None else list(range(len(special_token_ids) + 1))
-        token_type_ids = torch.full([token_ids.shape[0], 1], token_type_id_sections[0], dtype=torch.long, device=token_ids.device)
-
-        # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer.create_token_type_ids_from_sequences.example
-        token_ids = token_ids[:, :-1]
-
-        for i, j in enumerate(special_token_ids):
-
-            # Find the first occurrence of the special token, which indicates the boundary between sections:
-            exists = torch.any(token_ids == j, dim=1, keepdim=True)
-            token_type_ids[exists] = token_type_id_sections[i + 1]
-
-        return token_type_ids
-
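The cached-generation variant removed above only needs the type id of the newest token, so it is enough to check which separators have already been generated. A sketch of that check (token ids are hypothetical; 2 again stands in for SEP):

import torch

token_ids = torch.tensor([
    [1, 11, 12, 2, 21],   # SEP already generated -> the newest token is in section 1
    [1, 11, 12, 13, 14],  # no SEP yet            -> the newest token is in section 0
])
special_token_ids = [2]

token_type_ids = torch.zeros([token_ids.shape[0], 1], dtype=torch.long)
for section, sep in enumerate(special_token_ids):
    exists = torch.any(token_ids[:, :-1] == sep, dim=1, keepdim=True)
    token_type_ids[exists] = section + 1

print(token_type_ids)  # tensor([[1], [0]])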
-    def tokenize_report_teacher_forcing(self, findings: str, impression: str, tokenizer: PreTrainedTokenizerFast, max_len: int):
-        """
-        Tokenizes the reports and creates the inputs and targets for teacher forcing.
-
-        Argument/s:
-            findings - findings section.
-            impression - impression section.
-            return_token_type_ids - return the token type identifiers.
+            captions - the captions.
             tokenizer - Hugging Face tokenizer.
             max_len - maximum number of tokens.
 
         Returns:
-            decoder_input_ids - the token identifiers for the input of the decoder.
-            decoder_attention_mask - the attention mask for the decoder_input_ids.
-            label_ids - the label token identifiers for the decoder.
+            batch_dict = {
+                decoder_input_ids - the token identifiers for the input of the decoder.
+                decoder_attention_mask - the attention mask for the decoder_input_ids.
+                decoder_token_type_ids - the token type identifiers for the decoder_input_ids.
+                label_ids - the label token identifiers for the decoder.
+            }
         """
 
-        # Prepare the sections for the tokenizer by placing special tokens between each section:
-        report = [f'{tokenizer.bos_token}{i}{tokenizer.sep_token}{j}{tokenizer.eos_token}' for i, j in
-                  zip(findings, impression)]
+        # Prepare the caption for the tokenizer by placing the special tokens:
+        caption = [f'{tokenizer.bos_token}{i}{tokenizer.eos_token}' for i in captions]
 
-        # Tokenize the report:
-        tokenized = tokenizer(
-            report,
+        # Tokenize the caption:
+        tokenized = self.tokenizer(
+            caption,
             padding='longest',
             truncation=True,
-            max_length=max_len + 1,  # +1 to account for the bias between input and target.
+            max_length=max_len + 1,  # +1 to account for the shift between input and target.
             return_tensors='pt',
             return_token_type_ids=False,
-            add_special_tokens=False,
+            add_special_tokens=False,  # Done in prepare_sections_for_tokenizer()
         ).to(self.device)
 
         # Modify for language modelling:
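Both the removed report tokenizer and its caption replacement tokenize to max_len + 1 and then offset the decoder input and the labels by one position, which is what the "Modify for language modelling" step and the [:, 1:] slice in the next hunk do. The shift in miniature (token ids are hypothetical; 50256 stands in for BOS/EOS):

import torch

input_ids = torch.tensor([[50256, 7, 8, 9, 50256]])  # [BOS] w1 w2 w3 [EOS]

decoder_input_ids = input_ids[:, :-1]  # [[50256, 7, 8, 9]]: the decoder sees [BOS] w1 w2 w3
label_ids = input_ids[:, 1:]           # [[7, 8, 9, 50256]]: and must predict w1 w2 w3 [EOS]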
@@ -466,50 +269,4 @@ class MedICapEncoderDecoderModel(VisionEncoderDecoderModel):
             'decoder_attention_mask': tokenized['attention_mask'][:, 1:],
         }
 
-        return batch_dict
-
-    def split_and_decode_sections(self, token_ids, special_token_ids, tokenizer: PreTrainedTokenizerFast):
-        """
-        Split the token identifiers into sections, then convert the token identifiers into strings.
-
-        Argument/s:
-            token_ids - token identifiers.
-            special_token_ids - special token identifiers that indicate the end of each section.
-            tokenizer - Hugging Face tokenizer.
-
-        Returns:
-            sections - a tuple of decoded section strings.
-        """
-
-        _, seq_len = token_ids.shape
-
-        # The number of sections is the same as the number of special_token_ids:
-        num_sections = len(special_token_ids)
-
-        sections = {k: [] for k in range(num_sections)}
-
-        for i in token_ids:
-            prev_col = 0
-            for j, k in enumerate(special_token_ids):
-
-                # The maximum sequence length was exceeded, thus there are no more tokens:
-                if prev_col >= seq_len:
-                    sections[j].append('')
-                    continue
-
-                # Find the first occurrence of the special token that indicates the boundary between sections:
-                col = (i == k).int().argmax().item()
-
-                # If equal to 0, the token was not found; set the column to the sequence length (as the decoder
-                # exceeded the maximum sequence length):
-                if col == 0:
-                    col = seq_len
-
-                # Extract the section token identifiers:
-                section_token_ids = i[prev_col:col]
-                prev_col = col
-                section_string = tokenizer.decode(section_token_ids, skip_special_tokens=True)
-
-                sections[j].append(section_string)
-
-        return tuple(sections.values())
+        return batch_dict
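split_and_decode_sections, removed here, cuts the generated ids at the first occurrence of each section-terminating special token and decodes each span separately. A toy version with a stand-in decoder (all ids and the vocabulary are hypothetical; 2 ends section 0 and 3 ends section 1):

import torch

vocab = {7: 'no', 8: 'acute', 9: 'findings', 21: 'normal'}

def decode(ids):
    # Stand-in for tokenizer.decode(..., skip_special_tokens=True):
    return ' '.join(vocab[int(t)] for t in ids if int(t) in vocab)

token_ids = torch.tensor([[7, 8, 9, 2, 21, 3]])
special_token_ids = [2, 3]

sections = {j: [] for j in range(len(special_token_ids))}
for row in token_ids:
    prev_col = 0
    for j, sep in enumerate(special_token_ids):
        col = (row == sep).int().argmax().item()  # first occurrence; 0 means not found
        col = col if col != 0 else len(row)
        sections[j].append(decode(row[prev_col:col]))
        prev_col = col

print(tuple(sections.values()))  # (['no acute findings'], ['normal'])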
 