# Page header captured along with this file: "Spaces: Build error" (not part of the program).
import gradio as gr | |
import numpy as np | |
import torch | |
from transformers import pipeline | |
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline | |
from transformers import AutoModelForMaskedLM | |
# unmasker = pipeline("temp-scale", model="anferico/bert-for-patents") | |
# Sample rows offered in the demo UI. Each row is
# [sentence containing a [MASK] token, value for the second input (temperature)].
examples = [
    ["A crustless [MASK] made from two slices of baked bread.", 1.2],
    [
        "The invention provides a method for altering or modifying [MASK] of one or more gene products.",
        1.1,
    ],
    [
        "The graphite [MASK] is composed of a two-dimensional hexagonal lattice of carbon atoms.",
        1.4,
    ],
]
def add_mask(text, size=1):
    """Replace randomly chosen words of ``text`` with ``[MASK]``.

    Args:
        text: Whitespace-separated input sentence.
        size: Number of random positions to draw. Positions are drawn with
            replacement, so fewer than ``size`` distinct words may be masked.

    Returns:
        ``text`` unchanged when it is blank or already contains ``[MASK]``;
        otherwise the words re-joined with single spaces after masking.
    """
    words = text.split()
    # Blank input has nothing to mask; np.random.randint(0, ...) would raise.
    if not words:
        return text
    # If the user supplies a mask, don't add more. A substring test is used so
    # a mask glued to punctuation (e.g. "[MASK].") is still recognised.
    if '[MASK]' in text:
        return text
    for position in np.random.randint(len(words), size=size):
        words[position] = '[MASK]'
    return ' '.join(words)
class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline that divides the mask logits by a temperature before the softmax."""

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Sort call-time keyword arguments into per-stage parameter dicts.

        Args:
            top_k: Optional number of predictions to return.
            targets: Optional target words; converted with ``get_target_ids``.
            temp: Optional temperature the mask logits are divided by.

        Returns:
            A 3-tuple of dicts; only the last one (postprocess parameters)
            is ever populated here.

        Raises:
            ValueError: If ``temp`` is given and is not strictly positive
                (a zero temperature would divide the logits by zero).
        """
        postprocess_params = {}
        if targets is not None:
            postprocess_params["target_ids"] = self.get_target_ids(targets, top_k)
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if temp is not None:
            if temp <= 0:
                raise ValueError(f"temp must be a positive number, got {temp!r}")
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            args:
                Accepted for signature compatibility; not forwarded to the parent call.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                Temperature the mask logits are divided by before the softmax. Must be positive.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        outputs = super().__call__(inputs, **kwargs)
        # A one-element list of inputs is unwrapped to that element's result.
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn raw model outputs into ranked fill-in candidates.

        Args:
            model_outputs: Mapping with ``"input_ids"`` and ``"logits"`` entries;
                only the first sequence (index 0) of each is used.
            top_k: Number of candidates kept per mask position.
            target_ids: Optional vocabulary ids the candidates are restricted to.
            temp: Temperature the mask logits are divided by before the softmax.

        Returns:
            A list of candidate dicts (keys ``score``, ``token``, ``token_str``,
            ``sequence``) when the input holds exactly one mask token, otherwise
            one such list per mask position.
        """
        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]

        # Positions of every mask token in the sequence.
        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)

        # Temperature scaling: larger temp flattens the distribution.
        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]

        values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # p indexes into target_ids here; map it back to a vocab id.
                    p = target_ids[p].tolist()

                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result
# Register the custom task name so that `pipeline(...)` can resolve it.
PIPELINE_REGISTRY.register_pipeline(
    task="temp-scale",
    pipeline_class=TempScalePipe,
    pt_model=AutoModelForMaskedLM,
)

# Single pipeline instance shared by every call of the demo callback.
scrambler = pipeline(task="temp-scale", model="anferico/bert-for-patents")
def unmask(text, temp, rounds):
    """Repeatedly mask one word of ``text`` and refill it with a sampled prediction.

    Each round masks a random word (unless the text already contains a mask),
    asks ``scrambler`` for its top 10 candidates at temperature ``temp`` and
    picks one of them at random, weighted by candidate score.

    Args:
        text: Input sentence; may already contain ``[MASK]``.
        temp: Temperature forwarded to the pipeline.
        rounds: Number of edit rounds; coerced to ``int`` because UI sliders
            may deliver a float.

    Returns:
        The edited text, or ``text`` unchanged when it is blank.
    """
    # Blank input has no word to mask.
    if not text.split():
        return text
    for _ in range(int(rounds)):
        text = add_mask(text, size=1)
        words = text.split()
        candidates = scrambler(text, temp=temp, top_k=10)
        # With several masks the pipeline returns one candidate list per mask;
        # fill the first mask now, later rounds pick up the remaining ones.
        if candidates and isinstance(candidates[0], list):
            candidates = candidates[0]
        # Match the literal mask token so ordinary words such as "MASKS" are
        # not mistaken for it.
        mask_pos = next(i for i, word in enumerate(words) if '[MASK]' in word)
        tokens = [item["token_str"] for item in candidates]
        weights = np.asarray([item["score"] for item in candidates], dtype=float)
        # The top-k scores do not sum to 1. Renormalise them: handing the raw
        # scores to np.random.multinomial would give all the leftover
        # probability mass to the last (least likely) candidate.
        pick = np.random.choice(len(tokens), p=weights / weights.sum())
        # Replace only the mask itself so attached punctuation survives.
        words[mask_pos] = words[mask_pos].replace('[MASK]', tokens[pick], 1)
        text = ' '.join(words)
    return text
# Input widgets (text, temperature, number of rounds) and the output box.
textbox = gr.Textbox(lines=5, label="Type language here")
textbox2 = gr.Textbox(lines=4, placeholder="")
temp_slider = gr.Slider(minimum=1.0, maximum=2.0, value=1.0, label='Creativity')
edit_slider = gr.Slider(minimum=1, maximum=50, value=1.0, step=1, label='Number of edits')

# Wire the widgets to `unmask` and start the app.
demo = gr.Interface(
    fn=unmask,
    inputs=[textbox, temp_slider, edit_slider],
    outputs=[textbox2],
    examples=examples,
)
demo.launch()