# NOTE: This file was extracted from a Hugging Face Spaces page. At the time of
# extraction the Space reported "Build error" and a file size of 14,882 bytes.
# The page's per-line commit hashes and line-number gutter were extraction
# residue, not part of the program, and are omitted here.
import gradio as gr
import numpy as np
import torch
import re
from nltk.stem import PorterStemmer
from collections import defaultdict
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
from transformers import AutoModelForMaskedLM
# Example inputs for the keyword-generator tab: each ex_strN is a short patent
# passage and the matching ex_keyN holds the space-separated keywords to analyse.
ex_str1 = (
    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
    "crimped together at the outer peripheral area."
)
ex_key1 = "sandwich bread crimped"

ex_str2 = (
    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
    " associated with the target DNA. "
)
ex_key2 = "DNA target modification"

ex_str3 = (
    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
)
ex_key3 = "graphite lattice orthogonal "

# One [sentences, keywords] row per example.
tab_two_examples = [
    [ex_str1, ex_key1],
    [ex_str2, ex_key2],
    [ex_str3, ex_key3],
]
#
# tab_one_examples = [['A crustless _ made from two slices of baked bread.'],
# ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'],
# ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.']
# ]

# Tokens that are never chosen for masking and never reported as keywords:
# a handful of function words followed by single punctuation characters.
ignore_str = ['a', 'an', 'the', 'is', 'and', 'or'] + list('!()-[]{};:\'"\\,<>./?@#$%^&*_~')
def add_mask(text, lower_bound=0, index=None):
    """Replace one whitespace-delimited token of ``text`` with ``[MASK]``.

    The token to mask is chosen as follows:

    1. If ``index`` is given, that token is masked.
    2. Otherwise, if the text contains a stand-alone ``_`` token, the first
       such token is treated as a user-supplied mask.
    3. Otherwise a random token at position ``>= lower_bound`` is masked,
       re-drawing up to 11 times to avoid tokens listed in ``ignore_str``.

    Args:
        text: Input sentence(s); tokens are obtained with ``str.split()``.
        lower_bound: Smallest token position eligible for random masking.
        index: Explicit token position to mask, or ``None``.

    Returns:
        A ``(masked_text, masked_token)`` tuple. ``masked_token`` is ``None``
        when ``index`` was given, ``'[MASK]'`` when the user supplied ``_``,
        and the replaced word otherwise.

    Raises:
        ValueError: If a random position is needed but no token exists at or
            after ``lower_bound`` (for example, empty text).
        IndexError: If ``index`` is out of range.
    """
    split_text = text.split()
    if index is not None:
        split_text[index] = '[MASK]'
        return ' '.join(split_text), None
    # If the user supplies a mask, don't add more
    if '_' in split_text:
        # Mask the stand-alone '_' itself, not an earlier token that merely
        # contains an underscore (e.g. 'foo_bar').
        split_text[split_text.index('_')] = '[MASK]'
        return ' '.join(split_text), '[MASK]'
    num_tokens = len(split_text)
    if lower_bound >= num_tokens:
        raise ValueError(
            "no token available to mask: lower_bound={} but text has {} token(s)".format(lower_bound, num_tokens)
        )
    idx = int(np.random.randint(low=lower_bound, high=num_tokens))
    # Don't mask certain words
    num_iters = 0
    while split_text[idx].lower() in ignore_str:
        num_iters += 1
        # Re-draw within the same window so lower_bound is always honoured.
        idx = int(np.random.randint(low=lower_bound, high=num_tokens))
        if num_iters > 10:
            # Give up and mask the ignorable token rather than loop forever.
            break
    masked_string = split_text[idx]
    split_text[idx] = '[MASK]'
    masked_output = ' '.join(split_text)
    return masked_output, masked_string
class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline that applies temperature scaling to the mask logits.

    The logits at each mask position are divided by ``temp`` before the
    softmax, so ``temp > 1`` flattens the distribution over candidate tokens
    and ``temp < 1`` sharpens it. ``temp=1`` leaves the scores unscaled.
    """

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Collect the call-time options that ``postprocess`` understands.

        Args:
            top_k: Number of predictions to return, if overridden.
            targets: Optional target words; converted to ids with
                ``self.get_target_ids``.
            temp: Softmax temperature, if overridden.

        Returns:
            A 3-tuple of dicts. The first two are always empty; the third
            holds the keyword arguments for ``postprocess``.
        """
        postprocess_params = {}
        if targets is not None:
            target_ids = self.get_target_ids(targets, top_k)
            postprocess_params["target_ids"] = target_ids
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if temp is not None:
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                When passed, the mask logits are divided by this value before the softmax.

        Extra positional arguments are accepted but not forwarded.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        outputs = super().__call__(inputs, **kwargs)
        # A one-element list of inputs is unwrapped to that element's result.
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn raw model outputs into ranked fill-in candidates.

        Args:
            model_outputs: Mapping providing ``"input_ids"`` and ``"logits"``.
            top_k: Number of candidates to return per mask; capped at the
                number of targets when ``target_ids`` is given.
            target_ids: Optional collection of vocabulary ids to restrict the
                candidates to (must expose ``.shape`` and integer indexing).
            temp: Positive softmax temperature applied to the mask logits.

        Returns:
            For a single mask, a list of dicts with the keys ``score``,
            ``token``, ``token_str`` and ``sequence``; for several masks, a
            list of such lists (one per mask).

        Raises:
            ValueError: If ``temp`` is not strictly positive.
        """
        # Dividing by zero or a negative temperature would yield inf/NaN or
        # an inverted ranking, so reject it explicitly.
        if temp <= 0:
            raise ValueError("temp must be a positive number, got {!r}".format(temp))
        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]
        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
        # Fill mask pipeline supports only one ${mask_token} per sample
        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]
        values, predictions = probs.topk(top_k)
        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # p indexes into target_ids here; map it back to a vocab id.
                    p = target_ids[p].tolist()
                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result
# Register TempScalePipe under the custom task name "temp-scale" so that
# pipeline("temp-scale", ...) below resolves to it. This must run before the
# scrambler pipeline is created.
PIPELINE_REGISTRY.register_pipeline(
"temp-scale",
pipeline_class=TempScalePipe,
pt_model=AutoModelForMaskedLM,
)
# Masked-token predictor used by unmask_single, unmask and extract_keywords.
scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
# GPT-2 text generator used by autocomplete.
generator = pipeline('text-generation', model='gpt2')
def sample_output(out, sampling):
    """Randomly pick one candidate token from a ``{token: score}`` mapping.

    Args:
        out: Mapping from candidate token to a non-negative score.
        sampling: ``'multi'`` draws a token with probability proportional to
            its score; any other value draws uniformly.

    Returns:
        The selected token (a key of ``out``).

    Raises:
        ValueError: If ``out`` is empty.
    """
    # Sample over the tokens themselves: keying by score would silently drop
    # every token that shares a score with another one.
    tokens = list(out)
    if not tokens:
        raise ValueError("cannot sample from an empty candidate mapping")
    if sampling == 'multi':
        weights = np.asarray([out[token] for token in tokens], dtype=float)
        total = weights.sum()
        if total > 0:
            # Normalise explicitly. Top-k scores usually sum to less than 1,
            # and an unnormalised multinomial draw would hand all of the
            # missing mass to the last candidate.
            idx = np.random.choice(len(tokens), p=weights / total)
        else:
            # All-zero scores carry no preference; fall back to uniform.
            idx = np.random.randint(0, len(tokens))
    else:
        idx = np.random.randint(0, len(tokens))
    return tokens[idx]
def unmask_single(text, temp=1):
    """Mask one token of ``text`` and return the model's candidate fill-ins.

    The token is chosen by ``add_mask``; the ten best candidates are requested
    from ``scrambler`` at temperature ``temp``.

    Returns:
        Dict mapping each candidate's ``token_str`` to its ``score``.
    """
    masked_sentence = add_mask(text)[0]
    scores_by_token = {}
    for candidate in scrambler(masked_sentence, temp=temp, top_k=10):
        scores_by_token[candidate["token_str"]] = candidate["score"]
    return scores_by_token
def unmask(text, temp, rounds, lower_bound=0):
    """Iteratively mask one word of ``text`` and substitute a sampled prediction.

    Each round masks one token via ``add_mask``, asks ``scrambler`` for 15
    candidates and samples one of them. If the sample still contains the
    original word, up to six re-draws are attempted. A replacement that
    differs from the original is wrapped in asterisks (``*word*``); one that
    still contains the original is inserted unmarked.

    Args:
        text: Sentence(s) to remix.
        temp: Softmax temperature forwarded to ``scrambler``.
        rounds: Number of mask-and-replace passes; truncated to an integer so
            a float value is accepted.
        lower_bound: First token position eligible for masking, forwarded to
            ``add_mask``.

    Returns:
        The edited text with its first character upper-cased (an empty string
        is returned unchanged).
    """
    sampling = 'multi'
    for _ in range(int(rounds)):
        masked_text, masked = add_mask(text, lower_bound)
        split_text = masked_text.split()
        res = scrambler(masked_text, temp=temp, top_k=15)
        mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
        out = {item["token_str"]: item["score"] for item in res}
        new_token = sample_output(out, sampling)
        unsuccessful_iters = 0
        # Try to avoid "replacing" the word with itself (or a superstring).
        while masked in new_token:
            if unsuccessful_iters > 5:
                break
            print('skipped', new_token)
            new_token = sample_output(out, sampling=sampling)
            unsuccessful_iters += 1
        if masked in new_token:
            split_text[mask_pos] = new_token
        else:
            # Highlight genuine substitutions for the reader.
            split_text[mask_pos] = '*' + new_token + '*'
        text = ' '.join(split_text)
    # Slicing keeps this safe for empty text, where text[0] would raise.
    return text[:1].upper() + text[1:]
def autocomplete(text, temp):
    """Continue ``text`` with GPT-2, then remix the continuation with the patent model.

    ``generator`` produces one continuation of at most 30 tokens. ``unmask``
    then performs five mask-and-replace rounds restricted to the words after
    the prompt, so the user's own words are left untouched.

    Args:
        text: Prompt to continue.
        temp: Softmax temperature forwarded to ``unmask``.

    Returns:
        The generated text with substituted words marked by asterisks. If the
        generator added no words, its output is returned with only the first
        character upper-cased.
    """
    output = generator(text, max_length=30, num_return_sequences=1)
    gpt_out = output[0]['generated_text']
    # diff = gpt_out.replace(text, '')
    prompt_len = len(text.split())
    if len(gpt_out.split()) <= prompt_len:
        # Nothing was generated past the prompt, so there is no token at or
        # after lower_bound to mask; skip the remix instead of failing.
        return gpt_out[:1].upper() + gpt_out[1:]
    # Take the output from gpt-2 and randomly mask, if a mask is confident, swap it in. Iterate 5 times
    patent_bert_out = unmask(gpt_out, temp=temp, rounds=5, lower_bound=prompt_len)
    return patent_bert_out
def extract_keywords(text, queries):
    """Suggest context-dependent alternatives for each query word found in ``text``.

    For every whitespace-separated word in ``queries``, each occurrence of
    that word in ``text`` (case-insensitive, punctuation ignored) is masked in
    turn and ``scrambler`` is asked for its 30 best candidates. Candidates are
    stemmed and scored by normalised rank (higher is better). Scores are
    averaged over the occurrences and normalised to sum to 1, and the three
    best stems are kept (more when scores tie at the cut-off).

    Args:
        text: Passage that provides the context.
        queries: Space-separated keywords to analyse.

    Returns:
        A list holding one ``{stem: score}`` dict per distinct query word (in
        first-seen order) followed by a search string of the form
        ``[a OR b OR c] AND [d OR e OR f]``. A query that does not occur in
        ``text`` yields an empty dict and is left out of the search string.
    """
    q_dict = {}
    temp = 1  # set temperature to 1
    top_k_range = 30  # candidates requested per masked occurrence
    num_keep = 3  # alternatives reported per query word
    # Strip punctuation token by token (not on the whole text) so positions
    # stay aligned with text.split(), which add_mask uses to place the mask,
    # even when a token consists of punctuation only.
    bare_tokens = [re.sub(r'[^\w\s]', '', token).lower() for token in text.split()]
    for query in queries.split():
        # Iterate through text and mask each token
        ps = PorterStemmer()
        rank_lists = defaultdict(list)
        wanted = query.lower()
        indices = [i for i, token in enumerate(bare_tokens) if token == wanted]
        for i in indices:
            masked_text, _ = add_mask(text, index=i)
            res = scrambler(masked_text, temp=temp, top_k=top_k_range)
            out = {item["token_str"]: item["score"] for item in res}
            # Ascending by score: the best candidate gets the highest rank.
            sorted_keys = sorted(out, key=out.get)
            seen_stems = set()
            for rank, token_str in enumerate(sorted_keys):
                if token_str in ignore_str:
                    continue
                stemmed = ps.stem(token_str)
                rank_lists[stemmed].append(rank / top_k_range)
                seen_stems.add(stemmed)
            # If the key does not appear, floor its rank for that round.
            # Compare stems with stems: checking against the raw tokens would
            # also penalise every stem that differs from its surface form.
            for key in rank_lists:
                if key not in seen_stems:
                    rank_lists[key].append(0)
        # Calc mean
        top_scores = {stem: float(np.mean(ranks)) for stem, ranks in rank_lists.items()}
        # Normalize against a total computed once, before any value changes.
        total = sum(top_scores.values())
        if total > 0:
            top_scores = {stem: score / total for stem, score in top_scores.items()}
        # Get top_k; with num_keep or fewer candidates everything is kept.
        if len(top_scores) > num_keep:
            cutoff = sorted(top_scores.values())[-num_keep]
            top_scores = {stem: score for stem, score in top_scores.items() if score >= cutoff}
        q_dict[query] = top_scores
    # Build "[a OR b] AND [c OR d]"; skip empty groups so brackets stay balanced.
    groups = ['[' + ' OR '.join(scores) + ']' for scores in q_dict.values() if scores]
    keywords = ' AND '.join(groups)
    # keywords = set([k for q in q_dict.keys() for k in q_dict[q].keys()])
    # search_str = ' OR '.join(keywords)
    output = list(q_dict.values())
    output.append(keywords)
    return output
    # fig, ax = plt.subplots(nrows=1, ncols=3)
    # for q in q_dict:
    # ax.bar(q_dict[q])
    # return fig
# Output widgets: one label per analysed keyword, each showing its top three classes.
label0 = gr.Label(label='keyword 1', num_top_classes=3)
label01 = gr.Label(label='keyword 2', num_top_classes=3)
label02 = gr.Label(label='keyword 3', num_top_classes=3)
# NOTE(review): textbox02 is not referenced by any interface visible in this file.
textbox02 = gr.Textbox(label="Input Keywords", lines=3)
# Input widgets for the keyword generator: keywords first, then the sentences.
textbox01 = gr.Textbox(label="Input Keywords", placeholder="Type keywords here", lines=1)
textbox0 = gr.Textbox(label="Input Sentences", placeholder="Type sentences here", lines=5)
# Receives the generated "[a OR b] AND [c OR d]" search string.
output_textbox0 = gr.Textbox(label='Search String of Keywords', placeholder="Output will appear here", lines=4)
# temp_slider0 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
# NOTE(review): textbox1 is only used by the commented-out demo1 interface.
textbox1 = gr.Textbox(label="Input Sentence", lines=5)
# output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4)
# Title and HTML description shown on the keyword-generator tab.
title1 = "Patent-BERT: Context-Dependent Synonym Generator"
description1 = """<p>
Try inserting a few sentences from a patent, and pick keywords for the model to analyze. The model will analyze the
context of the keywords in the sentences and generate the top three most likely candidates for each word.
This can be used for more creative patent drafting or patent searches using the generated search string. The base model is
<a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created and trained by Google.
<strong>Note:</strong> Current pipeline only allows for <strong>three</strong> keyword submissions. Stemming (e.g., altering -> alter) is built into the output for
broader search string. <br/>
Beta features (currently work-in-progress) include: (<strong>A</strong>) adjustment options for (i) the number of keywords, (ii) the number of context-dependent synonyms,
and (iii) a 'creativity' parameter of the model; (<strong>B</strong>) analysis of where these words appear in the patent (e.g.,
claim, summary, etc.); and (<strong>C</strong>) a stemming option for input keywords.
<br/>
<p/>"""
# Disabled widgets and copy for a multi-edit "remix" tab (see the commented-out demo2).
# textbox2 = gr.Textbox(label="Input Sentences", lines=5)
# output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4)
# temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
# edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits')
# title2 = "Patent-BERT Sentence Remix-er: Multiple Edits"
# description2 = """<p>
#
# Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change the
# the model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want
# the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*).
# <br/> <p/> """
# Keyword-generator interface: (sentences, keywords) in; three keyword labels
# plus the search string out.
# NOTE(review): extract_keywords returns one dict per distinct query word plus
# the search string, so its output count matches these four components only
# when exactly three distinct keywords are submitted.
demo0 = gr.Interface(
fn=extract_keywords,
inputs=[textbox0, textbox01],
outputs=[label0, label01, label02, output_textbox0],
examples=tab_two_examples,
allow_flagging='never',
title=title1,
description=description1
)
# Disabled: single-mask prediction tab built on unmask_single.
# demo1 = gr.Interface(
# fn=unmask_single,
# inputs=[textbox1],
# outputs='label',
# examples=tab_one_examples,
# allow_flagging='never',
# title=title1,
# description=description1
# )
# Disabled: multi-edit sentence remix tab built on unmask.
# demo2 = gr.Interface(
# fn=unmask,
# inputs=[textbox2, temp_slider2, edit_slider2],
# outputs=[output_textbox2],
# examples=tab_two_examples,
# allow_flagging='never',
# title=title2,
# description=description2
# )
# Serve the single active tab; launch() runs at import time.
gr.TabbedInterface(
[demo0], ["Keyword generator"]
).launch()
# End of file (a stray table-cell pipe from the page extraction was removed).