Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import numpy as np | |
from transformers.tokenization_bert import BertTokenizer | |
from .f import flatten_, assoc, memoize, GetAttr | |
from typing import List | |
def fix_byte_spaces(toks: List[str]) -> List[str]:
    """Make the whitespace marker characters in each token readable.

    Every U+0120 ("Ġ") becomes a plain space, and every U+010A ("Ċ") becomes
    the two visible characters backslash + "n" (not an actual line break).

    Args:
        toks: Token strings to clean up.

    Returns:
        A new list, same length and order as ``toks``, with the
        substitutions applied to every element.
    """
    # Neither replacement text contains a marker character, so a single
    # simultaneous translation pass is unambiguous.
    marker_table = str.maketrans({"\u0120": " ", "\u010A": "\\n"})
    return [tok.translate(marker_table) for tok in toks]
def get_bpe(bpe_pretrained_name_or_path):
    """Build a tokenizer from a pretrained name or path.

    Args:
        bpe_pretrained_name_or_path: Identifier forwarded unchanged to
            ``BertTokenizer.from_pretrained``.

    Returns:
        Whatever ``BertTokenizer.from_pretrained`` returns for that identifier.
    """
    tokenizer = BertTokenizer.from_pretrained(bpe_pretrained_name_or_path)
    return tokenizer
# [String] -> [String]
def remove_CLS_SEP(toks):
    """Return the tokens with every "[CLS]" and "[SEP]" entry removed.

    Matching is exact and case-sensitive; the relative order of the
    remaining tokens is preserved.

    Args:
        toks: Iterable of token strings.

    Returns:
        A new list containing only the tokens that are neither "[CLS]"
        nor "[SEP]".
    """
    # Build the lookup set once rather than once per token in the filter.
    special_tokens = {"[CLS]", "[SEP]"}
    return [t for t in toks if t not in special_tokens]
# torch.Tensor -> np.Array
def process_hidden_tensors(t):
    """Strip the batch axis and boundary positions, then convert to NumPy.

    Steps:
      1. ``squeeze(0)`` removes axis 0 when (and only when) its size is 1.
      2. Along the new leading axis, the first entry and the last two
         entries are dropped.
      3. The result is detached from autograd, moved to host memory if
         needed, and returned as a NumPy array.

    Args:
        t: Tensor to process. Assumed to be shaped (1, seq_len, ...) with the
            first position holding "[CLS]" and the last two holding "[SEP]"
            markers -- TODO confirm against the caller.

    Returns:
        NumPy array holding the remaining positions.
    """
    # Equivalent to slicing [:-1] and then [1:-1]: keep positions 1 .. -3.
    t = t.squeeze(0)[1:-2]

    # `.detach()` replaces the legacy `.data` accessor, and `.cpu()` makes the
    # conversion work for tensors that live on an accelerator as well.
    return t.detach().cpu().numpy()
# np.Array -> np.Array
def normalize(a):
    """Scale every vector along the last axis by its Euclidean norm.

    The norm is computed over the last axis with ``keepdims=True`` so that it
    broadcasts against ``a``. Vectors whose norm is zero are not treated
    specially.

    Args:
        a: Array-like input.

    Returns:
        ``a`` divided element-wise by the per-vector norms.
    """
    return a / np.linalg.norm(a, axis=-1, keepdims=True)
# np.Array:<a,b,c,d> -> np.Array<a,b,c*d>
def reshape(a):
    """Merge the two trailing axes of an array into a single axis.

    All leading axes are kept as they are; the new last axis has length
    ``a.shape[-2] * a.shape[-1]``.

    Args:
        a: Array with at least two dimensions.

    Returns:
        ``a`` reshaped so that its last two axes are flattened into one.
    """
    leading = a.shape[:-2]
    rows, cols = a.shape[-2], a.shape[-1]
    return a.reshape(leading + (rows * cols,))