Spaces:
Running
Running
File size: 5,485 Bytes
650c5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
from typing import Optional
import torch
from torch import Tensor
from examples.simultaneous_translation.utils.functions import (
exclusive_cumprod,
prob_check,
moving_sum,
)
def expected_alignment_from_p_choose(
    p_choose: Tensor,
    padding_mask: Optional[Tensor] = None,
    eps: float = 1e-6
):
    """
    Compute the expected monotonic alignment from stepwise selection
    probabilities.

    Reference:
    Online and Linear-Time Attention by Enforcing Monotonic Alignments
    https://arxiv.org/pdf/1704.00784.pdf

    Recurrence:
        q_{i,j} = (1 - p_{i,j-1}) * q_{i,j-1} + a_{i-1,j}
        a_{i,j} = p_{i,j} * q_{i,j}

    Parallel solution along the source axis, applied one target step
    at a time (cumprod is the exclusive cumulative product):
        a_i = p_i * cumprod(1 - p_i) * cumsum(a_{i-1} / cumprod(1 - p_i))

    ============================================================
    Expected input size
    p_choose: bsz, tgt_len, src_len
    padding_mask: unsqueezed at dim 1 and applied to p_choose, so it is
        expected to be bsz, src_len; masked positions get probability 0
    eps: forwarded to exclusive_cumprod and used as the lower clamp bound
        of the denominator

    Returns alpha: bsz, tgt_len, src_len, in the dtype of the input p_choose.
    """
    prob_check(p_choose)

    bsz, tgt_len, src_len = p_choose.size()
    input_dtype = p_choose.dtype

    # The recurrence runs in fp32; the result is cast back before returning.
    p_choose = p_choose.float()
    if padding_mask is not None:
        p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0.0)

    # Both products are bsz, tgt_len, src_len. The clamped copy is used
    # only as a denominator so the division never sees a value below eps.
    excl_cumprod = exclusive_cumprod(1 - p_choose, dim=2, eps=eps)
    safe_cumprod = torch.clamp(excl_cumprod, eps, 1.0)

    # Alignment "before" the first target step: all mass on source index 0.
    # prev_alpha: bsz, src_len
    prev_alpha = p_choose.new_zeros([bsz, src_len])
    prev_alpha[:, 0] = 1.0

    per_step = []
    for step in range(tgt_len):
        running_sum = torch.cumsum(prev_alpha / safe_cumprod[:, step], dim=1)
        prev_alpha = (
            p_choose[:, step] * excl_cumprod[:, step] * running_sum
        ).clamp(0, 1.0)
        per_step.append(prev_alpha.unsqueeze(1))

    # alpha: bsz, tgt_len, src_len
    alpha = torch.cat(per_step, dim=1)

    # Mixed precision: cast back to the caller's dtype only after the
    # fp32 computation is done, to avoid fp16 overflow inside the loop.
    alpha = alpha.type(input_dtype)

    prob_check(alpha)
    return alpha
def expected_soft_attention(
    alpha: Tensor,
    soft_energy: Tensor,
    padding_mask: Optional[Tensor] = None,
    chunk_size: Optional[int] = None,
    eps: float = 1e-10
):
    """
    Function to compute expected soft attention for
    monotonic infinite lookback attention from
    expected alignment and soft energy.

    Reference:
    Monotonic Chunkwise Attention
    https://arxiv.org/abs/1712.05382

    Monotonic Infinite Lookback Attention for Simultaneous Machine Translation
    https://arxiv.org/abs/1906.05218

    alpha: bsz, tgt_len, src_len
    soft_energy: bsz, tgt_len, src_len
    padding_mask: bsz, src_len (optional); converted to bool internally,
        masked source positions receive zero attention
    chunk_size: if given, use chunkwise attention with this window;
        if None, use infinite lookback
    eps: added to the exponentiated energies and to the denominators

    Returns beta: bsz, tgt_len, src_len, in the dtype of the input alpha,
    clamped to [0, 1].
    """
    pad = None
    if padding_mask is not None:
        # Build the broadcastable boolean mask once so that every
        # masked_fill in this function treats the mask the same way.
        pad = padding_mask.unsqueeze(1).to(torch.bool)
        alpha = alpha.masked_fill(pad, 0.0)
        soft_energy = soft_energy.masked_fill(pad, -float("inf"))

    prob_check(alpha)

    dtype = alpha.dtype

    # Compute in fp32; cast back at the end.
    alpha = alpha.float()
    soft_energy = soft_energy.float()

    # Subtract the per-row maximum before exponentiating.
    soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0]
    exp_soft_energy = torch.exp(soft_energy) + eps

    if chunk_size is not None:
        # Chunkwise
        beta = (
            exp_soft_energy
            * moving_sum(
                alpha / (eps + moving_sum(exp_soft_energy, chunk_size, 1)),
                1, chunk_size
            )
        )
    else:
        # Infinite lookback
        # Notice that infinite lookback is a special case of chunkwise
        # where chunksize = inf
        inner_items = alpha / (eps + torch.cumsum(exp_soft_energy, dim=2))

        # flip -> cumsum -> flip gives the reverse (suffix) cumulative sum
        # along the source axis.
        beta = (
            exp_soft_energy
            * torch.cumsum(inner_items.flip(dims=[2]), dim=2)
            .flip(dims=[2])
        )

    if pad is not None:
        beta = beta.masked_fill(pad, 0.0)

    # Mix precision to prevent overflow for fp16
    beta = beta.type(dtype)

    beta = beta.clamp(0, 1)

    prob_check(beta)

    return beta
def mass_preservation(
    alpha: Tensor,
    padding_mask: Optional[Tensor] = None,
    left_padding: bool = False
):
    """
    Function to compute the mass preservation for alpha.
    This means that the residual weights of alpha will be assigned
    to the last token.

    Reference:
    Monotonic Infinite Lookback Attention for Simultaneous Machine Translation
    https://arxiv.org/abs/1906.05218

    alpha: bsz, tgt_len, src_len
    padding_mask: bsz, src_len
    left_padding: bool

    Returns a new tensor of the same size as alpha; the input tensor is
    never modified in place.

    Raises AssertionError if padding_mask marks the first source position
    as padding while left_padding is False.
    """

    prob_check(alpha)

    if padding_mask is not None:
        if not left_padding:
            assert not padding_mask[:, 0].any(), (
                "Find padding on the beginning of the sequence."
            )
        # Out-of-place: from here on alpha is a private copy.
        alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0)

    if left_padding or padding_mask is None:
        # The last real token is at the last index: it receives whatever
        # mass the preceding positions left over.
        residuals = 1 - alpha[:, :, :-1].sum(dim=-1).clamp(0, 1)
        if padding_mask is None:
            # No masked_fill copy was made above, so alpha is still the
            # caller's tensor. Clone it before the indexed write to avoid
            # mutating the argument (which would also fail on a leaf
            # tensor that requires grad).
            alpha = alpha.clone()
        alpha[:, :, -1] = residuals
    else:
        # right padding
        _, tgt_len, src_len = alpha.size()
        residuals = 1 - alpha.sum(dim=-1, keepdim=True).clamp(0, 1)

        # Number of non-padded source tokens per sentence, repeated for
        # every target step: bsz, tgt_len
        src_lens = src_len - padding_mask.sum(dim=1, keepdim=True)
        src_lens = src_lens.expand(-1, tgt_len).contiguous()

        # add back the last value
        residuals += alpha.gather(2, src_lens.unsqueeze(2) - 1)
        alpha = alpha.scatter(2, src_lens.unsqueeze(2) - 1, residuals)

    prob_check(alpha)

    return alpha
|