import numpy as np
from .constants import (
QUESTION_COLUMN_NAME,
CONTEXT_COLUMN_NAME,
ANSWER_COLUMN_NAME,
ANSWERABLE_COLUMN_NAME,
ID_COLUMN_NAME,
)
def get_sketch_features(tokenizer, mode, data_args):
pad_on_right = tokenizer.padding_side == "right"
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
def tokenize_fn(examples):
"""Tokenize questions and contexts
Args:
            examples (Dict): a batch of examples (column name -> list of values)
Returns:
Dict: Tokenized examples
"""
        # Tokenize with truncation and padding, keeping overflowing tokens via a stride
        # so that each overflow span overlaps the previous context slightly.
        # When overflow occurs, more samples than the given batch size may be produced
        # -> acts as data augmentation. (A small illustration follows this function.)
tokenized_examples = tokenizer(
examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME],
examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME],
            # Truncate only the context when it is too long
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
            # Needed to map overflowed features back to their original example index
return_overflowing_tokens=True,
return_offsets_mapping=False,
            # Distinguish the two sentences of a pair with token type ids 0 and 1
return_token_type_ids=data_args.return_token_type_ids,
padding="max_length" if data_args.pad_to_max_length else False,
# return_tensors='pt'
)
return tokenized_examples
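
    # Illustration of the overflow behavior above -- a hedged sketch, not executed:
    # with return_overflowing_tokens=True and doc_stride, one long example can be
    # split into several features. Assuming max_seq_length=384, doc_stride=128 and
    # a batch keyed by QUESTION_COLUMN_NAME / CONTEXT_COLUMN_NAME, roughly:
    #
    #     out = tokenize_fn({QUESTION_COLUMN_NAME: [q], CONTEXT_COLUMN_NAME: [long_ctx]})
    #     len(out["input_ids"])               # e.g. 3 features built from 1 example
    #     out["overflow_to_sample_mapping"]   # e.g. [0, 0, 0] -> all map to example 0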
def prepare_train_features(examples):
tokenized_examples = tokenize_fn(examples)
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
tokenized_examples["labels"] = []
for i in range(len(tokenized_examples["input_ids"])):
            # One example can yield several spans
sample_index = sample_mapping[i]
            # Build the answerability label
            # answerable: 0, unanswerable: 1
is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
tokenized_examples["labels"].append(0 if not is_impossible else 1)
return tokenized_examples
def prepare_eval_features(examples):
tokenized_examples = tokenize_fn(examples)
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
tokenized_examples["example_id"] = []
tokenized_examples["labels"] = []
for i in range(len(tokenized_examples["input_ids"])):
            # One example can yield several spans
sample_index = sample_mapping[i]
id_col = examples[ID_COLUMN_NAME][sample_index]
tokenized_examples["example_id"].append(id_col)
            # Build the answerability label
            # answerable: 0, unanswerable: 1
is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
tokenized_examples["labels"].append(0 if not is_impossible else 1)
return tokenized_examples
def prepare_test_features(examples):
tokenized_examples = tokenize_fn(examples)
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
tokenized_examples["example_id"] = []
for i in range(len(tokenized_examples["input_ids"])):
            # One example can yield several spans
sample_index = sample_mapping[i]
id_col = examples[ID_COLUMN_NAME][sample_index]
tokenized_examples["example_id"].append(id_col)
return tokenized_examples
if mode == "train":
get_features_fn = prepare_train_features
elif mode == "eval":
get_features_fn = prepare_eval_features
elif mode == "test":
get_features_fn = prepare_test_features
return get_features_fn, True
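
# Usage sketch for get_sketch_features -- hedged; `train_dataset`, `tokenizer` and
# `data_args` are assumed to exist (a Hugging Face datasets.Dataset whose columns
# match the names imported from .constants, a tokenizer, and an object providing
# max_seq_length, doc_stride, return_token_type_ids and pad_to_max_length):
#
#     prepare_fn, batched = get_sketch_features(tokenizer, "train", data_args)
#     train_features = train_dataset.map(
#         prepare_fn,
#         batched=batched,  # the second return value is presumably the `batched` flag
#         remove_columns=train_dataset.column_names,
#     )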
def get_intensive_features(tokenizer, mode, data_args):
pad_on_right = tokenizer.padding_side == "right"
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
beam_based = data_args.intensive_model_type in ["xlnet", "xlm"]
def tokenize_fn(examples):
"""Tokenize questions and contexts
Args:
            examples (Dict): a batch of examples (column name -> list of values)
Returns:
Dict: Tokenized examples
"""
        # Tokenize with truncation and padding, keeping overflowing tokens via a stride
        # so that each overflow span overlaps the previous context slightly.
        # When overflow occurs, more samples than the given batch size may be produced.
tokenized_examples = tokenizer(
examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME],
examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME],
            # Truncate only the context when it is too long
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
            # Needed to map overflowed features back to their original example index
return_overflowing_tokens=True,
            # Return offsets that map each token to its character positions in the text,
            # which helps compute the start and end positions
return_offsets_mapping=True,
            # Distinguish the two sentences of a pair with token type ids 0 and 1
return_token_type_ids=data_args.return_token_type_ids,
padding="max_length" if data_args.pad_to_max_length else False,
# return_tensors='pt'
)
return tokenized_examples
def prepare_train_features(examples):
tokenized_examples = tokenize_fn(examples)
# Since one example might give us several features if it has a long context,
# we need a map from a feature to its corresponding example.
# This key gives us just that.
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# The offset mappings will give us a map from token to character position in the original context
# This will help us compute the start_positions and end_positions.
offset_mapping = tokenized_examples.pop("offset_mapping")
        # Let's label those examples!
tokenized_examples["start_positions"] = []
tokenized_examples["end_positions"] = []
tokenized_examples["is_impossibles"] = []
if beam_based:
tokenized_examples["cls_index"] = []
tokenized_examples["p_mask"] = []
for i, offsets in enumerate(offset_mapping):
# We will label impossible answers with the index of the CLS token.
input_ids = tokenized_examples["input_ids"][i]
cls_index = input_ids.index(tokenizer.cls_token_id)
# Grab the sequence corresponding to that example
# (to know what is the context and what is the question.)
sequence_ids = tokenized_examples.sequence_ids(i)
context_index = 1 if pad_on_right else 0
            # Build the `p_mask`, which marks tokens that cannot be part of an answer:
            # non-special context tokens get 0.0, all other tokens get 1.0.
            # The CLS token also gets 0.0 (used for predicting empty answers).
            # Inspired by XLNet. (A small worked example follows the append below.)
if beam_based:
tokenized_examples["cls_index"].append(cls_index)
tokenized_examples["p_mask"].append(
[
0.0 if s == context_index or k == cls_index else 1.0
for k, s in enumerate(sequence_ids)
]
)
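                # Worked example (illustrative): for input [CLS] q1 q2 [SEP] c1 c2 c3 [SEP]
                # with pad_on_right=True, sequence_ids is [None, 0, 0, None, 1, 1, 1, None],
                # so p_mask becomes [0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0]
                # (CLS and context tokens -> 0.0, question and special tokens -> 1.0).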
# One example can give several spans,
# this is the index of the example containing this span of text.
sample_index = sample_mapping[i]
answers = examples[ANSWER_COLUMN_NAME][sample_index]
is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
# If no answers are given, set the cls_index as answer.
if is_impossible or len(answers["answer_start"]) == 0:
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
tokenized_examples["is_impossibles"].append(1.0) # unanswerable
else:
# Start/end character index of the answer in the text.
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])
                # sequence_ids contains only the values 0, 1, and None, e.g.:
                # None 0 0 ... 0 None 1 1 ... 1 None
# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != context_index:
token_start_index += 1
# End token index of the current span in the text.
token_end_index = len(input_ids) - 1
while sequence_ids[token_end_index] != context_index:
token_end_index -= 1
# Detect if the answer is out of the span
# (in which case this feature is labeled with the CLS index.)
if not (
offsets[token_start_index][0] <= start_char and
offsets[token_end_index][1] >= end_char
):
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
tokenized_examples["is_impossibles"].append(1.0) # unanswerable
else:
# Otherwise move the token_start_index and token_end_index to the two ends of the answer.
# Note: we could go after the last offset if the answer is the last word (edge case).
while (
token_start_index < len(offsets) and
offsets[token_start_index][0] <= start_char
):
token_start_index += 1
tokenized_examples["start_positions"].append(token_start_index - 1)
while offsets[token_end_index][1] >= end_char:
token_end_index -= 1
tokenized_examples["end_positions"].append(token_end_index + 1)
tokenized_examples["is_impossibles"].append(0.0) # answerable
return tokenized_examples
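
    # Worked mini-example of the span labeling above (illustrative numbers):
    # suppose the answer covers characters [57, 65) of the context and nearby
    # context tokens have offsets ... (50, 55) (55, 58) (58, 61) (61, 65) ...
    # Then start_positions lands on the token with offset (55, 58) -- the last
    # token that starts at or before char 57 -- and end_positions on the token
    # with offset (61, 65) -- the first token that ends at or after char 65.
    # Spans that do not contain the answer at all get the CLS index and
    # is_impossibles = 1.0 instead.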
def prepare_eval_features(examples):
tokenized_examples = tokenize_fn(examples)
# Since one example might give us several features if it has a long context,
# we need a map from a feature to its corresponding example.
# This key gives us just that.
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# For evaluation, we will need to convert our predictions to substrings of the context,
# so we keep the corresponding example_id and we will store the offset mappings.
tokenized_examples["example_id"] = []
        # We will provide the index of the CLS token and the p_mask to the model,
# but not the is_impossible label.
if beam_based:
tokenized_examples["cls_index"] = []
tokenized_examples["p_mask"] = []
for i, input_ids in enumerate(tokenized_examples["input_ids"]):
# Find the CLS token in the input ids.
cls_index = input_ids.index(tokenizer.cls_token_id)
# Grab the sequence corresponding to that example
# (to know what is the context and what is the question.)
sequence_ids = tokenized_examples.sequence_ids(i)
context_index = 1 if pad_on_right else 0
            # Build the `p_mask`, which marks tokens that cannot be part of an answer:
            # non-special context tokens get 0.0, all other tokens get 1.0.
            # The CLS token also gets 0.0 (used for predicting empty answers).
            # Inspired by XLNet.
if beam_based:
tokenized_examples["cls_index"].append(cls_index)
tokenized_examples["p_mask"].append(
[
0.0 if s == context_index or k == cls_index else 1.0
for k, s in enumerate(sequence_ids)
]
)
# One example can give several spans,
# this is the index of the example containing this span of text.
sample_index = sample_mapping[i]
id_col = examples[ID_COLUMN_NAME][sample_index]
tokenized_examples["example_id"].append(id_col)
            # Set the offset_mapping entries that are not part of the context to None,
            # so it's easy to determine whether a token position is part of the context.
tokenized_examples["offset_mapping"][i] = [
(o if sequence_ids[k] == context_index else None)
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
]
return tokenized_examples
if mode == "train":
get_features_fn = prepare_train_features
elif mode == "eval":
get_features_fn = prepare_eval_features
elif mode == "test":
get_features_fn = prepare_eval_features
    return get_features_fn, True
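

# Minimal usage sketch (assumptions: the checkpoint name, the shape of `data_args`
# and the Dataset.map call shown in the comment are illustrative, not part of this
# module). Run as `python -m <package>.<this_module>` so the relative import resolves.
if __name__ == "__main__":
    from types import SimpleNamespace

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")  # assumed checkpoint
    data_args = SimpleNamespace(
        max_seq_length=384,
        doc_stride=128,
        return_token_type_ids=False,     # RoBERTa-style models do not use token type ids
        pad_to_max_length=True,
        intensive_model_type="roberta",  # not "xlnet"/"xlm" -> no cls_index / p_mask
    )

    prepare_fn, batched = get_intensive_features(tokenizer, "train", data_args)
    # With Hugging Face Datasets the returned function is typically applied as:
    #     features = dataset.map(prepare_fn, batched=batched, remove_columns=dataset.column_names)
    print(prepare_fn, batched)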