# transformers/models/dpr/modeling_dpr.py
# coding=utf-8
# Copyright 2018 DPR Authors, The Hugging Face Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DPR model for Open Domain Question Answering."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from torch import Tensor, nn

from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ..bert.modeling_bert import BertModel
from .configuration_dpr import DPRConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DPRConfig"
_CHECKPOINT_FOR_DOC = "facebook/dpr-ctx_encoder-single-nq-base"
##########
# Outputs
##########


@dataclass
class DPRContextEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRContextEncoder`].

    Args:
        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear
            layer. This output is to be used to embed contexts for nearest-neighbor queries with question embeddings.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DPRQuestionEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].

    Args:
        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear
            layer. This output is to be used to embed questions for nearest-neighbor queries with context embeddings.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class DPRReaderOutput(ModelOutput):
    """
    Class for outputs of [`DPRReader`].

    Args:
        start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
            Logits of the start index of the span for each passage.
        end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
            Logits of the end index of the span for each passage.
        relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
            Outputs of the QA classifier of the DPRReader that correspond to the scores of each passage to answer the
            question, compared to all the other passages.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    start_logits: torch.FloatTensor
    end_logits: torch.FloatTensor = None
    relevance_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
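

# Retrieval note: the two encoders' `pooler_output` vectors live in a shared
# embedding space, and passages are ranked by the dot product between a
# question embedding and each context embedding. A minimal usage sketch (not
# part of this module; assumes the public facebook/dpr-*-single-nq-base
# checkpoints are available):
#
#     import torch
#     from transformers import (
#         DPRContextEncoder, DPRContextEncoderTokenizer,
#         DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
#     )
#
#     ctx_tok = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
#     ctx_enc = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
#     q_tok = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
#     q_enc = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
#
#     passages = ["Paris is the capital of France.", "The Nile is a river in Africa."]
#     ctx_emb = ctx_enc(**ctx_tok(passages, padding=True, return_tensors="pt")).pooler_output
#     q_emb = q_enc(**q_tok("What is the capital of France?", return_tensors="pt")).pooler_output
#
#     scores = q_emb @ ctx_emb.T    # (1, n_passages) dot-product relevance scores
#     best = scores.argmax(dim=-1)  # index of the highest-scoring passage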
class DPRPreTrainedModel(PreTrainedModel):
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
class DPREncoder(DPRPreTrainedModel):
    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.bert_model = BertModel(config, add_pooling_layer=False)
        if self.bert_model.config.hidden_size <= 0:
            raise ValueError("Encoder hidden_size can't be zero")
        self.projection_dim = config.projection_dim
        if self.projection_dim > 0:
            self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
        outputs = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        # DPR uses the [CLS] token's last hidden state as the pooled representation
        pooled_output = sequence_output[:, 0, :]

        if self.projection_dim > 0:
            pooled_output = self.encode_proj(pooled_output)

        if not return_dict:
            return (sequence_output, pooled_output) + outputs[2:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @property
    def embeddings_size(self) -> int:
        # read as an attribute by DPRSpanPredictor, so this must be a property
        if self.projection_dim > 0:
            return self.encode_proj.out_features
        return self.bert_model.config.hidden_size
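

# Sizing note: `embeddings_size` depends on whether a projection head is
# configured. A small sketch (not part of this module; builds random-weight
# encoders from configs, no checkpoint needed):
#
#     from transformers import DPRConfig
#
#     cfg = DPRConfig(projection_dim=128)  # hidden_size defaults to 768
#     enc = DPREncoder(cfg)
#     assert enc.embeddings_size == 128    # size after encode_proj
#
#     cfg = DPRConfig()                    # projection_dim == 0: no projection
#     enc = DPREncoder(cfg)
#     assert enc.embeddings_size == cfg.hidden_size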
class DPRSpanPredictor(DPRPreTrainedModel):
    base_model_prefix = "encoder"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.encoder = DPREncoder(config)
        self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
        self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
        # notation: N - number of questions in a batch, M - number of passages per question, L - sequence length
        n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
        # feed encoder
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        # compute logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])

        # resize
        start_logits = start_logits.view(n_passages, sequence_length)
        end_logits = end_logits.view(n_passages, sequence_length)
        relevance_logits = relevance_logits.view(n_passages)

        if not return_dict:
            return (start_logits, end_logits, relevance_logits) + outputs[2:]

        return DPRReaderOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            relevance_logits=relevance_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
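

# Decoding note: turning `start_logits`/`end_logits` into an answer span is
# post-processing that lives outside this module (see
# `DPRReaderTokenizer.decode_best_spans`). A minimal argmax-over-valid-pairs
# sketch for a single passage, shown here only to make the logits concrete:
#
#     import torch
#
#     def best_span(start_logits, end_logits, max_answer_len=30):
#         n = start_logits.size(0)
#         # scores[i, j] = start_logits[i] + end_logits[j]
#         scores = start_logits[:, None] + end_logits[None, :]
#         valid = torch.triu(torch.ones(n, n, dtype=torch.bool))  # enforce start <= end
#         valid &= ~torch.triu(torch.ones(n, n, dtype=torch.bool), diagonal=max_answer_len)
#         scores = scores.masked_fill(~valid, float("-inf"))
#         idx = scores.argmax()
#         return (idx // n).item(), (idx % n).item()  # (start, end) token indices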
##################
# PreTrainedModel
##################


class DPRPretrainedContextEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "ctx_encoder"


class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "question_encoder"


class DPRPretrainedReader(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "span_predictor"


###############
# Actual Models
###############
DPR_START_DOCSTRING = r""" | |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the | |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads | |
etc.) | |
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. | |
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage | |
and behavior. | |
Parameters: | |
config ([`DPRConfig`]): Model configuration class with all the parameters of the model. | |
Initializing with a config file does not load the weights associated with the model, only the | |
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. | |
""" | |
DPR_ENCODERS_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequences should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0    0    0    0      0    0   0   1  1  1  1  1   1
            ```

            (b) For single sequences (for a question for example):

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0    0   0  0    0   0   0
            ```

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
DPR_READER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`Tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question,
            2) the passage titles, and 3) the passage texts. To match pretraining, the DPR `input_ids` sequence should
            be formatted with [CLS] and [SEP] tokens as follows:

            `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `(n_passages, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
    DPR_START_DOCSTRING,
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.ctx_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        Return:

        Examples:

        ```python
        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            # by default, attend to everything when only embeddings are given,
            # otherwise mask out the padding tokens
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs[1:]
        return DPRContextEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
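

# Indexing note: in practice, context embeddings are precomputed once for the
# whole corpus and searched with an inner-product index (e.g. FAISS). A
# dependency-free sketch with plain torch, where `ctx_emb` is assumed to be
# the stacked pooler_output of many passages:
#
#     import torch
#
#     def top_k_passages(q_emb, ctx_emb, k=5):
#         # q_emb: (1, dim); ctx_emb: (n_passages, dim)
#         scores = q_emb @ ctx_emb.T            # inner-product relevance
#         return torch.topk(scores, k, dim=-1)  # (scores, passage indices)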
@add_start_docstrings(
    "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
    DPR_START_DOCSTRING,
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.question_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        Return:

        Examples:

        ```python
        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs[1:]
        return DPRQuestionEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
@add_start_docstrings(
    "The bare DPRReader transformer outputting span predictions.",
    DPR_START_DOCSTRING,
)
class DPRReader(DPRPretrainedReader):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.span_predictor = DPRSpanPredictor(config)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
        r"""
        Return:

        Examples:

        ```python
        >>> from transformers import DPRReader, DPRReaderTokenizer

        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> encoded_inputs = tokenizer(
        ...     questions=["What is love ?"],
        ...     titles=["Haddaway"],
        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        ...     return_tensors="pt",
        ... )
        >>> outputs = model(**encoded_inputs)
        >>> start_logits = outputs.start_logits
        >>> end_logits = outputs.end_logits
        >>> relevance_logits = outputs.relevance_logits
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)

        return self.span_predictor(
            input_ids,
            attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
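

# End-to-end reading sketch: `DPRReaderTokenizer.decode_best_spans` combines
# the three logits above into ranked answer spans (assumes the public reader
# checkpoint; `num_spans=1` keeps only the best span):
#
#     from transformers import DPRReader, DPRReaderTokenizer
#
#     tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
#     model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
#     inputs = tokenizer(
#         questions=["What is love ?"],
#         titles=["Haddaway"],
#         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
#         return_tensors="pt",
#     )
#     outputs = model(**inputs)
#     spans = tokenizer.decode_best_spans(inputs, outputs, num_spans=1)
#     print(spans[0].text)  # best answer span as a string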