Spaces:

B-patents
/

patent-bert

Build error

patent-bert / app.py

danseith

Added edit slider and changed sampling back to multinomial.

c16370c over 2 years ago

6.25 kB

	import gradio as gr
	import numpy as np
	import torch
	from transformers import pipeline
	from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
	from transformers import AutoModelForMaskedLM

	# unmasker = pipeline("temp-scale", model="anferico/bert-for-patents")
	# Clickable sample inputs for the demo: each row is [text, creativity temperature].
	examples = [
	    [
	        'A crustless [MASK] made from two slices of baked bread.',
	        1.2,
	    ],
	    [
	        'The invention provides a method for altering or modifying [MASK] of one or more gene products.',
	        1.1,
	    ],
	    [
	        'The graphite [MASK] is composed of a two-dimensional hexagonal lattice of carbon atoms.',
	        1.4,
	    ],
	]

	def add_mask(text, size=1):
	    """Replace randomly chosen words of ``text`` with the ``[MASK]`` token.

	    The text is split on whitespace; ``size`` distinct words (capped at the
	    number of words available) are replaced and the words are re-joined with
	    single spaces.

	    Args:
	        text: Input string.
	        size: Number of words to mask.

	    Returns:
	        The masked string. ``text`` is returned unchanged when it contains no
	        words, or when it already contains ``[MASK]`` anywhere (including
	        with punctuation attached, e.g. ``"[MASK]."``).
	    """
	    split_text = text.split()

	    # Nothing to mask, or the user supplied a mask already: don't add more.
	    # Checking the raw text (not the word list) also catches "[MASK]," etc.
	    if not split_text or '[MASK]' in text:
	        return text

	    # Sample without replacement so that exactly `n_masks` words are masked;
	    # independent draws could hit the same position more than once.
	    n_masks = min(size, len(split_text))
	    idx = np.random.choice(len(split_text), size=n_masks, replace=False)
	    for i in idx:
	        split_text[i] = '[MASK]'
	    return ' '.join(split_text)


	class TempScalePipe(FillMaskPipeline):
	    """Fill-mask pipeline that applies temperature scaling to the mask logits.

	    The logits at each masked position are divided by ``temp`` before the
	    softmax, so ``temp > 1`` flattens the distribution over candidate tokens
	    and ``temp < 1`` sharpens it.
	    """

	    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
	        """Sort call-time keyword arguments into per-stage parameter dicts.

	        Returns:
	            A ``(preprocess_params, forward_params, postprocess_params)``
	            tuple. Only the postprocess stage receives options here
	            (``target_ids``, ``top_k``, ``temp``); the other two are empty.
	        """
	        postprocess_params = {}

	        if targets is not None:
	            postprocess_params["target_ids"] = self.get_target_ids(targets, top_k)

	        if top_k is not None:
	            postprocess_params["top_k"] = top_k

	        if temp is not None:
	            postprocess_params["temp"] = temp
	        return {}, {}, postprocess_params

	    def __call__(self, inputs, *args, **kwargs):
	        """
	        Fill the masked token in the text(s) given as inputs.

	        Args:
	            inputs (`str` or `List[str]`):
	                One or several texts (or one list of prompts) with masked tokens.
	            targets (`str` or `List[str]`, optional):
	                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
	                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
	                resulting token will be used (with a warning, and that might be slower).
	            top_k (`int`, optional):
	                When passed, overrides the number of predictions to return.
	            temp (`float`, optional):
	                When passed, the mask logits are divided by this temperature before the softmax.

	        Extra positional arguments are accepted but not forwarded to the parent call.

	        Return:
	            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

	            - sequence (`str`) -- The corresponding input with the mask token prediction.
	            - score (`float`) -- The corresponding probability.
	            - token (`int`) -- The predicted token id (to replace the masked one).
	            - token_str (`str`) -- The predicted token (to replace the masked one).
	        """
	        outputs = super().__call__(inputs, **kwargs)
	        # A one-element list of inputs is unwrapped to its single result.
	        if isinstance(inputs, list) and len(inputs) == 1:
	            return outputs[0]
	        return outputs

	    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
	        """Turn raw model outputs into ranked, temperature-scaled predictions.

	        Args:
	            model_outputs: Mapping providing ``"input_ids"`` and ``"logits"``;
	                only the first sample of each is used.
	            top_k: Number of candidates to return per masked position.
	            target_ids: Optional token ids to restrict the candidates to.
	            temp: Temperature the mask logits are divided by; must be > 0.

	        Returns:
	            For a single masked position, a list of dicts with the keys
	            ``score``, ``token``, ``token_str`` and ``sequence``. For several
	            masked positions, a list of such lists, one per position.

	        Raises:
	            ValueError: If ``temp`` is not strictly positive.
	        """
	        if temp <= 0:
	            # Zero would divide by zero; a negative value would invert the ranking.
	            raise ValueError(f"temp must be > 0, got {temp!r}")

	        # Cap top_k if there are targets
	        if target_ids is not None and target_ids.shape[0] < top_k:
	            top_k = target_ids.shape[0]
	        input_ids = model_outputs["input_ids"][0]
	        outputs = model_outputs["logits"]

	        # Positions of every mask token in the (first) sample.
	        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)

	        # Temperature scaling happens on the logits, before normalisation.
	        logits = outputs[0, masked_index, :] / temp
	        probs = logits.softmax(dim=-1)
	        if target_ids is not None:
	            probs = probs[..., target_ids]
	        values, predictions = probs.topk(top_k)

	        result = []
	        # One row of `values` per masked position.
	        single_mask = values.shape[0] == 1
	        base_tokens = input_ids.numpy()
	        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
	            row = []
	            for v, p in zip(_values, _predictions):
	                # Copy is important since we're going to modify this array in place
	                tokens = base_tokens.copy()
	                if target_ids is not None:
	                    # `p` indexes into the restricted target list; map it back to a vocab id.
	                    p = target_ids[p].tolist()

	                tokens[masked_index[i]] = p
	                # Filter padding out:
	                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
	                # Originally we skip special tokens to give readable output.
	                # For multi masks though, the other [MASK] would be removed otherwise
	                # making the output look odd, so we add them back
	                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
	                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
	                row.append(proposition)
	            result.append(row)
	        if single_mask:
	            return result[0]
	        return result


	# Expose TempScalePipe under the custom task name "temp-scale" so that
	# `pipeline(...)` can build it for masked-LM checkpoints.
	PIPELINE_REGISTRY.register_pipeline(
	    task="temp-scale",
	    pipeline_class=TempScalePipe,
	    pt_model=AutoModelForMaskedLM,
	)

	# Module-level pipeline instance used by the demo callback.
	scrambler = pipeline(
	    task="temp-scale",
	    model="anferico/bert-for-patents",
	)


	def unmask(text, temp, rounds, sampling='multi'):
	    """Rewrite ``text`` by repeatedly masking one word and refilling it.

	    Each round masks a word via ``add_mask`` (unless the text already holds a
	    ``[MASK]``), asks ``scrambler`` for its top 10 candidates at temperature
	    ``temp``, picks one and substitutes it for the first mask.

	    Args:
	        text: Text to edit; may already contain a ``[MASK]``.
	        temp: Temperature forwarded to the pipeline.
	        rounds: Number of edit rounds; coerced to ``int``.
	        sampling: ``'multi'`` draws a candidate in proportion to its score;
	            any other value draws uniformly among the candidates.

	    Returns:
	        The edited text, re-joined with single spaces.
	    """
	    # UI sliders may deliver the count as a float.
	    for _ in range(int(rounds)):
	        text = add_mask(text, size=1)
	        split_text = text.split()

	        # First word carrying a mask (possibly with punctuation attached).
	        mask_pos = next((i for i, t in enumerate(split_text) if '[MASK]' in t), None)
	        if mask_pos is None:
	            # Nothing to fill (e.g. empty input): stop instead of failing.
	            break

	        res = scrambler(text, temp=temp, top_k=10)
	        # With several masks the pipeline returns one candidate list per mask;
	        # use the list for the first mask, which is the one replaced below.
	        if res and isinstance(res[0], list):
	            res = res[0]
	        if not res:
	            break

	        candidates = [item["token_str"] for item in res]
	        if sampling == 'multi':
	            # Top-k scores sum to less than 1. Renormalise them; handing the raw
	            # scores to np.random.multinomial would credit all of the missing
	            # probability mass to the last (least likely) candidate.
	            scores = np.asarray([item["score"] for item in res], dtype=np.float64)
	            total = scores.sum()
	            if total > 0:
	                idx = np.random.choice(len(candidates), p=scores / total)
	            else:
	                idx = np.random.randint(0, len(candidates))
	        else:
	            idx = np.random.randint(0, len(candidates))

	        # Swap only the mask token so adjacent punctuation is preserved.
	        split_text[mask_pos] = split_text[mask_pos].replace('[MASK]', candidates[idx], 1)
	        text = ' '.join(split_text)
	    return text

	# Input text area.
	textbox = gr.Textbox(
	    label="Type language here",
	    lines=5,
	)
	# Output text area showing the edited text.
	textbox2 = gr.Textbox(
	    placeholder="",
	    lines=4,
	)
	# Temperature passed to `unmask` as `temp`.
	temp_slider = gr.Slider(
	    1.0,
	    2.0,
	    value=1.0,
	    label='Creativity',
	)
	# Number of mask-and-refill rounds passed to `unmask` as `rounds`.
	edit_slider = gr.Slider(
	    1,
	    50,
	    step=1,
	    value=1.0,
	    label='Number of edits',
	)

	# Wire the components to the callback and start the app.
	demo = gr.Interface(
	    fn=unmask,
	    inputs=[textbox, temp_slider, edit_slider],
	    outputs=[textbox2],
	    examples=examples,
	)

	demo.launch()