# NOTE: scraped from a Hugging Face Space page; the Space status banner read "Build error".
import gradio as gr | |
import numpy as np | |
import torch | |
import re | |
from nltk.stem import PorterStemmer | |
from collections import defaultdict | |
from transformers import pipeline | |
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline | |
from transformers import AutoModelForMaskedLM | |
# Example patent passages, each paired with the space-separated keywords to analyse.
ex_str1 = (
    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
    "crimped together at the outer peripheral area."
)
ex_key1 = "sandwich bread crimped"

ex_str2 = (
    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
    " associated with the target DNA. "
)
ex_key2 = "DNA target modification"

ex_str3 = (
    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
)
ex_key3 = "graphite lattice orthogonal "

# Rows of [sentences, keywords] used as the clickable examples of the interface.
tab_two_examples = [
    [ex_str1, ex_key1],
    [ex_str2, ex_key2],
    [ex_str3, ex_key3],
]
# | |
# tab_one_examples = [['A crustless _ made from two slices of baked bread.'], | |
# ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'], | |
# ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.'] | |
# ] | |
# Tokens that are never chosen for masking nor reported as keyword candidates:
# a few stop words followed by single punctuation characters.
ignore_str = ['a', 'an', 'the', 'is', 'and', 'or'] + list(r"""!()-[]{};:'"\,<>./?@#$%^&*_~""")
def add_mask(text, lower_bound=0, index=None):
    """Replace one whitespace-delimited token of ``text`` with ``[MASK]``.

    Args:
        text: Sentence(s) to mask; tokens are obtained with ``str.split()``.
        lower_bound: Smallest token position eligible for random masking.
        index: When given, the token at this position is masked and no random
            choice is made.

    Returns:
        A ``(masked_text, masked_token)`` tuple. ``masked_token`` is ``None``
        when ``index`` is given, ``'[MASK]'`` when the user supplied their own
        ``_`` placeholder, and otherwise the token that was replaced.

    Raises:
        IndexError: If ``index`` is outside the token list.
        ValueError: Raised by ``np.random.randint`` when there is no token at
            or after ``lower_bound`` (this includes empty ``text``).
    """
    split_text = text.split()

    if index is not None:
        split_text[index] = '[MASK]'
        return ' '.join(split_text), None

    # If the user supplies a mask, don't add more
    if '_' in split_text:
        u_pos = next(i for i, s in enumerate(split_text) if '_' in s)
        split_text[u_pos] = '[MASK]'
        return ' '.join(split_text), '[MASK]'

    def draw():
        # Every draw honours lower_bound, so a retry can never land on a
        # token that the caller excluded from masking.
        return int(np.random.randint(low=lower_bound, high=len(split_text)))

    idx = draw()
    # Don't mask certain words; give up after a bounded number of redraws and
    # accept whatever token was drawn last.
    num_iters = 0
    while split_text[idx].lower() in ignore_str:
        num_iters += 1
        idx = draw()
        if num_iters > 10:
            break

    masked_string = split_text[idx]
    split_text[idx] = '[MASK]'
    return ' '.join(split_text), masked_string
class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline whose mask logits are divided by a temperature before the softmax."""

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Route call-time keyword arguments to ``postprocess``.

        Returns the three parameter dicts the pipeline expects; the first two
        are always empty, so only ``postprocess`` receives parameters.
        """
        postprocess_params = {}
        if targets is not None:
            target_ids = self.get_target_ids(targets, top_k)
            postprocess_params["target_ids"] = target_ids
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if temp is not None:
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            args (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                When passed, the mask logits are divided by this value before the softmax. Must be positive.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        # Extra positional arguments are accepted for signature compatibility
        # but are not forwarded.
        outputs = super().__call__(inputs, **kwargs)
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn model logits into the ``top_k`` most probable fillers for each mask.

        Args:
            model_outputs: Mapping providing ``"input_ids"`` and ``"logits"``.
            top_k: Number of predictions per mask; capped to the number of
                targets when ``target_ids`` is given.
            target_ids: Optional vocabulary ids the predictions are restricted to.
            temp: Positive divisor applied to the logits before the softmax.

        Returns:
            A list of proposition dicts (keys ``score``, ``token``,
            ``token_str``, ``sequence``) when the input holds one mask,
            otherwise one such list per mask.

        Raises:
            ValueError: If ``temp`` is not strictly positive.
        """
        if temp <= 0:
            # Dividing by zero or a negative value would yield NaNs or invert
            # the ranking instead of rescaling it.
            raise ValueError(f"temp must be a positive number, got {temp!r}")

        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]

        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
        # Fill mask pipeline supports only one ${mask_token} per sample

        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]

        values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # p indexes into the restricted target list; map it back to a vocab id
                    p = target_ids[p].tolist()

                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result
# Expose the temperature-scaled fill-mask pipeline under its own task name
# so that `pipeline(...)` can build it.
PIPELINE_REGISTRY.register_pipeline(
    "temp-scale",
    pt_model=AutoModelForMaskedLM,
    pipeline_class=TempScalePipe,
)

# Masked-token predictor (patent BERT) and free-text generator (GPT-2).
scrambler = pipeline(task="temp-scale", model="anferico/bert-for-patents")
generator = pipeline(task="text-generation", model="gpt2")
def sample_output(out, sampling):
    """Pick one token from a ``{token: score}`` mapping.

    Args:
        out: Mapping from candidate token to a non-negative score.
        sampling: ``'multi'`` draws a token with probability proportional to
            its score; any other value draws uniformly.

    Returns:
        The selected token (a key of ``out``).

    Raises:
        ValueError: If ``out`` is empty.
    """
    tokens = list(out)
    if not tokens:
        raise ValueError("cannot sample from an empty set of candidates")

    if sampling == 'multi':
        weights = np.asarray([out[token] for token in tokens], dtype=float)
        total = weights.sum()
        if total > 0:
            # Top-k scores do not sum to one; normalise so that no candidate
            # silently absorbs the missing probability mass.
            idx = np.random.choice(len(tokens), p=weights / total)
        else:
            # All scores are zero: nothing to prefer, fall back to uniform.
            idx = np.random.randint(0, len(tokens))
    else:
        idx = np.random.randint(0, len(tokens))

    # Index tokens directly (not via a score -> token map) so that candidates
    # sharing the same score all remain selectable.
    return tokens[int(idx)]
def unmask_single(text, temp=1):
    """Mask one token of ``text`` and return ``{token_str: score}`` for the top 10 fillers.

    The masked position is chosen by ``add_mask``; ``temp`` is forwarded to
    the ``scrambler`` pipeline.
    """
    masked_text = add_mask(text)[0]
    predictions = scrambler(masked_text, temp=temp, top_k=10)
    scores = {}
    for prediction in predictions:
        scores[prediction["token_str"]] = prediction["score"]
    return scores
def unmask(text, temp, rounds, lower_bound=0):
    """Repeatedly mask one token of ``text`` and substitute a sampled prediction.

    Args:
        text: Sentence(s) to remix.
        temp: Temperature forwarded to the ``scrambler`` pipeline.
        rounds: Number of mask-and-replace passes; coerced with ``int`` so a
            float such as ``3.0`` is accepted.
        lower_bound: Forwarded to ``add_mask`` as the first maskable position.

    Returns:
        The edited text with its first character upper-cased. Tokens that
        were substituted are wrapped in asterisks (``*word*``).
    """
    sampling = 'multi'
    for _ in range(int(rounds)):
        masked_text, masked = add_mask(text, lower_bound)
        split_text = masked_text.split()

        res = scrambler(masked_text, temp=temp, top_k=15)
        mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
        out = {item["token_str"]: item["score"] for item in res}

        new_token = sample_output(out, sampling)
        # Resample a bounded number of times while the prediction merely
        # reproduces (contains) the word that was masked.
        unsuccessful_iters = 0
        while masked in new_token:
            if unsuccessful_iters > 5:
                break
            print('skipped', new_token)
            new_token = sample_output(out, sampling=sampling)
            unsuccessful_iters += 1

        if masked in new_token:
            # No different word was found: insert it without change markers.
            split_text[mask_pos] = new_token
        else:
            split_text[mask_pos] = '*' + new_token + '*'
        text = ' '.join(split_text)

    # Slicing (rather than indexing) keeps an empty string from raising.
    return text[:1].upper() + text[1:]
def autocomplete(text, temp):
    """Continue ``text`` with GPT-2, then remix the continuation with the patent model.

    The generated text goes through five mask-and-replace rounds of
    ``unmask``; the number of whitespace tokens in the prompt is passed as
    ``lower_bound``.
    """
    prompt_length = len(text.split())
    generations = generator(text, max_length=30, num_return_sequences=1)
    gpt_out = generations[0]['generated_text']
    # Take the output from gpt-2 and randomly mask, if a mask is confident, swap it in. Iterate 5 times
    return unmask(gpt_out, temp=temp, rounds=5, lower_bound=prompt_length)
def _rank_candidates(text, indices, temp=1, top_k_range=30, keep=3):
    """Score stemmed replacement candidates for the tokens of ``text`` at ``indices``.

    Each position is masked in turn and the ``top_k_range`` predictions of
    ``scrambler`` are ranked (most probable = highest rank). Ranks are
    averaged per stem, normalised to sum to one, and only the ``keep``
    best-scoring stems (plus ties) are returned as ``{stem: score}``.
    """
    stemmer = PorterStemmer()
    rank_lists = defaultdict(list)

    for i in indices:
        masked_text, _ = add_mask(text, index=i)
        res = scrambler(masked_text, temp=temp, top_k=top_k_range)
        out = {item["token_str"]: item["score"] for item in res}

        seen = set()
        # Ascending sort: the most probable token receives the highest rank.
        for rank, token_str in enumerate(sorted(out, key=out.get)):
            if token_str in ignore_str:
                continue
            stemmed = stemmer.stem(token_str)
            rank_lists[stemmed].append(rank / top_k_range)
            seen.add(stemmed)

        # If a known candidate does not appear, floor its rank for that round.
        # Compare stems with stems: raw tokens would never match a stem that
        # differs from its surface form.
        for stemmed, ranks in rank_lists.items():
            if stemmed not in seen:
                ranks.append(0)

    # Calc mean
    scores = {stemmed: float(np.mean(ranks)) for stemmed, ranks in rank_lists.items()}

    # Normalize with a total computed once, before any value is rescaled.
    total = sum(scores.values())
    if total > 0:
        scores = {stemmed: score / total for stemmed, score in scores.items()}

    # Get top_k; with `keep` or fewer candidates there is nothing to drop.
    if len(scores) > keep:
        cutoff = sorted(scores.values())[-keep]
        scores = {stemmed: score for stemmed, score in scores.items() if score >= cutoff}
    return scores


def extract_keywords(text, queries):
    """Suggest context-dependent alternatives for each keyword found in ``text``.

    Args:
        text: Sentences to analyse.
        queries: Space-separated keywords; matching against the tokens of
            ``text`` ignores case and punctuation.

    Returns:
        A list holding one ``{stem: score}`` dict per distinct keyword (empty
        when the keyword does not occur in ``text``), followed by a search
        string of the form ``[a OR b OR c] AND [d OR e]``.
    """
    # Normalise token by token so positions stay aligned with `text.split()`,
    # which is what `add_mask` indexes into.
    normalised = [re.sub(r'[^\w\s]', '', token).lower() for token in text.split()]

    q_dict = {}
    for query in queries.split():
        wanted = query.lower()
        indices = [i for i, token in enumerate(normalised) if token == wanted]
        q_dict[query] = _rank_candidates(text, indices) if indices else {}

    keywords = ' AND '.join('[' + ' OR '.join(q_dict[q]) + ']' for q in q_dict)

    output = [q_dict[q] for q in q_dict]
    output.append(keywords)
    return output
# fig, ax = plt.subplots(nrows=1, ncols=3) | |
# for q in q_dict: | |
# ax.bar(q_dict[q]) | |
# return fig | |
# Output widgets: one label per keyword, each showing its three best candidates.
label0, label01, label02 = (
    gr.Label(label=f'keyword {position}', num_top_classes=3) for position in (1, 2, 3)
)

# Input widgets of the keyword generator.
textbox02 = gr.Textbox(lines=3, label="Input Keywords")
textbox01 = gr.Textbox(
    lines=1,
    label="Input Keywords",
    placeholder="Type keywords here",
)
textbox0 = gr.Textbox(
    lines=5,
    label="Input Sentences",
    placeholder="Type sentences here",
)

# Text output holding the generated boolean search string.
output_textbox0 = gr.Textbox(
    lines=4,
    label='Search String of Keywords',
    placeholder="Output will appear here",
)
# temp_slider0 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
textbox1 = gr.Textbox(lines=5, label="Input Sentence")
# output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4) | |
# Heading and HTML description rendered above the keyword generator.
title1 = "Patent-BERT: Context-Dependent Synonym Generator"
# The paragraph is closed with </p>; "<p/>" would open a second, empty paragraph.
description1 = """<p>
Try inserting a few sentences from a patent, and pick keywords for the model to analyze. The model will analyze the
context of the keywords in the sentences and generate the top three most likely candidates for each word.
This can be used for more creative patent drafting or patent searches using the generated search string. The base model is
<a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created and trained by Google.
<strong>Note:</strong> Current pipeline only allows for <strong>three</strong> keyword submissions. Stemming (e.g., altering -> alter) is built into the output for
broader search string. <br/>
Beta features (currently work-in-progress) include: (<strong>A</strong>) adjustment options for (i) the number of keywords, (ii) the number of context-dependent synonyms,
and (iii) a 'creativity' parameter of the model; (<strong>B</strong>) analysis of where these words appear in the patent (e.g.,
claim, summary, etc.); and (<strong>C</strong>) a stemming option for input keywords.
<br/>
</p>"""
# textbox2 = gr.Textbox(label="Input Sentences", lines=5) | |
# output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4) | |
# temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity') | |
# edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits') | |
# title2 = "Patent-BERT Sentence Remix-er: Multiple Edits" | |
# description2 = """<p> | |
# | |
# Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change the | |
# the model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want | |
# the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*). | |
# <br/> <p/> """ | |
# Keyword generator tab: sentences + keywords in, three candidate labels and
# one search string out.
demo0 = gr.Interface(
    title=title1,
    description=description1,
    fn=extract_keywords,
    inputs=[textbox0, textbox01],
    outputs=[label0, label01, label02, output_textbox0],
    examples=tab_two_examples,
    allow_flagging='never',
)
# demo1 = gr.Interface( | |
# fn=unmask_single, | |
# inputs=[textbox1], | |
# outputs='label', | |
# examples=tab_one_examples, | |
# allow_flagging='never', | |
# title=title1, | |
# description=description1 | |
# ) | |
# demo2 = gr.Interface( | |
# fn=unmask, | |
# inputs=[textbox2, temp_slider2, edit_slider2], | |
# outputs=[output_textbox2], | |
# examples=tab_two_examples, | |
# allow_flagging='never', | |
# title=title2, | |
# description=description2 | |
# ) | |
# Serve the single active demo inside a tabbed layout.
gr.TabbedInterface(interface_list=[demo0], tab_names=["Keyword generator"]).launch()