"""Transforms relate to hamming distance sampling."""
import random
import numpy as np
from onmt.constants import DefaultTokens
from onmt.transforms import register_transform
from .transform import Transform, ObservableStats
class HammingDistanceSampling(object):
"""Functions related to (negative) Hamming Distance Sampling."""
    def _softmax(self, x):
        return np.exp(x) / np.sum(np.exp(x))
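    # Illustrative values (hypothetical input, not from the module):
    # _softmax(np.array([0., -1.])) ~= array([0.7311, 0.2689])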
def _sample_replace(self, vocab, reject):
"""Sample a token from `vocab` other than `reject`."""
token = reject
while token == reject:
token = random.choice(vocab)
return token
def _sample_distance(self, tokens, temperature):
"""Sample number of tokens to corrupt from `tokens`."""
n_tokens = len(tokens)
indices = np.arange(n_tokens)
logits = indices * -1 * temperature
probs = self._softmax(logits)
distance = np.random.choice(indices, p=probs)
return distance
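    # Worked example (hypothetical 4-token input, temperature=1.0):
    # logits = [0, -1, -2, -3] -> probs ~= [0.644, 0.237, 0.087, 0.032],
    # i.e. small Hamming distances are most likely, and a larger
    # temperature shifts even more mass toward distance 0.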
def _sample_position(self, tokens, distance):
n_tokens = len(tokens)
chosen_indices = random.sample(range(n_tokens), k=distance)
return chosen_indices
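# Minimal sketch of using the mixin on its own (toy sentence, for
# illustration only; not part of the module API):
#   sampler = HammingDistanceSampling()
#   tokens = ["a", "b", "c", "d", "e"]
#   distance = sampler._sample_distance(tokens, temperature=1.0)
#   positions = sampler._sample_position(tokens, distance=distance)
#   # `positions` holds `distance` distinct indices into `tokens`.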
class HammingDistanceSamplingTransform(Transform, HammingDistanceSampling):
"""Abstract Transform class based on HammingDistanceSampling."""
def _set_seed(self, seed):
"""set seed to ensure reproducibility."""
np.random.seed(seed)
random.seed(seed)
class SwitchOutStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["changed", "total"]
def __init__(self, changed: int, total: int):
self.changed = changed
self.total = total
def update(self, other: "SwitchOutStats"):
self.changed += other.changed
self.total += other.total
@register_transform(name='switchout')
class SwitchOutTransform(HammingDistanceSamplingTransform):
"""
SwitchOut.
:cite:`DBLP:journals/corr/abs-1808-07512`
"""
def __init__(self, opts):
super().__init__(opts)
@classmethod
def require_vocab(cls):
"""Override this method to inform it need vocab to start."""
return True
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/SwitchOut")
group.add("-switchout_temperature", "--switchout_temperature",
type=float, default=1.0,
help="Sampling temperature for SwitchOut. :math:`\\tau^{-1}`"
" in :cite:`DBLP:journals/corr/abs-1808-07512`. "
"Smaller value makes data more diverse.")
def _parse_opts(self):
self.temperature = self.opts.switchout_temperature
def _switchout(self, tokens, vocab, stats=None):
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
        # 2. sample positions to corrupt
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. sample corrupted values
for i in chosen_indices:
tokens[i] = self._sample_replace(vocab, reject=tokens[i])
if stats is not None:
stats.update(SwitchOutStats(n_chosen, len(tokens)))
return tokens
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply switchout to both src and tgt side tokens."""
if is_train:
example['src'] = self._switchout(
example['src'], self.vocabs['src'].itos, stats)
example['tgt'] = self._switchout(
example['tgt'], self.vocabs['tgt'].itos, stats)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return '{}={}'.format('switchout_temperature', self.temperature)
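# Hypothetical effect of SwitchOut on one example (toy tokens and vocab,
# for illustration only): sequence length is preserved and a sampled
# subset of positions is replaced with random different vocab tokens, e.g.
#   ['we', 'like', 'cats'] -> ['we', 'like', 'dogs']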
class TokenDropStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["dropped", "total"]
def __init__(self, dropped: int, total: int):
self.dropped = dropped
self.total = total
def update(self, other: "TokenDropStats"):
self.dropped += other.dropped
self.total += other.total
@register_transform(name='tokendrop')
class TokenDropTransform(HammingDistanceSamplingTransform):
"""Random drop tokens from sentence."""
def __init__(self, opts):
super().__init__(opts)
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/Token_Drop")
group.add("-tokendrop_temperature", "--tokendrop_temperature",
type=float, default=1.0,
help="Sampling temperature for token deletion.")
def _parse_opts(self):
self.temperature = self.opts.tokendrop_temperature
def _token_drop(self, tokens, stats=None):
n_items = len(tokens)
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
        # 2. sample positions to corrupt
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. Drop token on chosen position
out = [tok for (i, tok) in enumerate(tokens)
if i not in chosen_indices]
if stats is not None:
stats.update(TokenDropStats(n_chosen, n_items))
return out
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply token drop to both src and tgt side tokens."""
if is_train:
example['src'] = self._token_drop(example['src'], stats)
example['tgt'] = self._token_drop(example['tgt'], stats)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return '{}={}'.format('tokendrop_temperature', self.temperature)
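# Hypothetical effect of token drop on one example (toy tokens, for
# illustration only): unlike SwitchOut, the output is shorter than the
# input, since chosen positions are removed rather than replaced, e.g.
#   ['we', 'like', 'cats'] -> ['we', 'cats']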
class TokenMaskStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["masked", "total"]
def __init__(self, masked: int, total: int):
self.masked = masked
self.total = total
def update(self, other: "TokenMaskStats"):
self.masked += other.masked
self.total += other.total
@register_transform(name='tokenmask')
class TokenMaskTransform(HammingDistanceSamplingTransform):
"""Random mask tokens from src sentence."""
MASK_TOK = DefaultTokens.MASK
def __init__(self, opts):
super().__init__(opts)
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/Token_Mask")
group.add('-tokenmask_temperature', '--tokenmask_temperature',
type=float, default=1.0,
help="Sampling temperature for token masking.")
def _parse_opts(self):
self.temperature = self.opts.tokenmask_temperature
@classmethod
def get_specials(cls, opts):
"""Get special vocabs added by prefix transform."""
return ({cls.MASK_TOK}, set())
def _token_mask(self, tokens, stats=None):
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
        # 2. sample positions to corrupt
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. mask word on chosen position
for i in chosen_indices:
tokens[i] = self.MASK_TOK
if stats is not None:
            stats.update(TokenMaskStats(n_chosen, len(tokens)))
return tokens
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply word drop to both src and tgt side tokens."""
if is_train:
example['src'] = self._token_mask(example['src'], stats)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return '{}={}'.format('tokenmask_temperature', self.temperature)
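# Minimal usage sketch (hypothetical YAML training config; the transform
# names come from the @register_transform decorators above and the option
# names from each add_options):
#   transforms: [switchout, tokendrop, tokenmask]
#   switchout_temperature: 1.0
#   tokendrop_temperature: 1.0
#   tokenmask_temperature: 1.0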