File size: 14,882 Bytes
8c0b646
 
d5cc744
f023836
4db26d9
 
ca69fee
d5cc744
ca69fee
8c0b646
ba4ce98
19d0d27
ba4ce98
 
4db26d9
ba4ce98
 
 
 
4db26d9
ba4ce98
 
 
 
4db26d9
ba4ce98
4db26d9
 
 
 
 
 
 
 
a95bc58
f023836
 
ba4ce98
fce4c33
4db26d9
fce4c33
4db26d9
 
 
c16370c
a95bc58
 
 
 
616c8c6
4db26d9
616c8c6
 
f023836
616c8c6
 
 
 
 
 
 
eabdff9
616c8c6
fce4c33
 
d5cc744
ca69fee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cc744
 
 
 
 
 
 
 
 
2ce1788
d5cc744
ca69fee
 
 
 
d5cc744
 
ca69fee
 
d5cc744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca69fee
fce4c33
4db26d9
c16370c
cd3e092
 
 
 
 
 
 
 
 
 
 
a95bc58
616c8c6
a95bc58
 
 
 
 
4db26d9
c16370c
cd3e092
4db26d9
eabdff9
cd3e092
c16370c
 
cd3e092
 
f51fca8
cd3e092
 
470c4c2
a95bc58
cd3e092
f51fca8
470c4c2
 
 
c16370c
cd3e092
ba4ce98
 
1ca245c
ba4ce98
616c8c6
4db26d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f023836
 
 
4db26d9
 
 
 
 
 
 
f023836
 
4db26d9
 
 
f023836
 
 
4db26d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a95bc58
4db26d9
 
 
b7321ed
4db26d9
9580f49
4b598b3
 
4db26d9
4b598b3
187c035
 
4b598b3
 
 
a95bc58
 
4db26d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a95bc58
 
 
 
3566540
4db26d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c0b646
a95bc58
4db26d9
a95bc58
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import gradio as gr
import numpy as np
import torch
import re
from nltk.stem import PorterStemmer
from collections import defaultdict
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
from transformers import AutoModelForMaskedLM

# Example patent abstracts, each paired with the keyword query shown next to
# it in the demo's "examples" table.
ex_str1 = (
    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
    "crimped together at the outer peripheral area."
)
ex_key1 = "sandwich bread crimped"

ex_str2 = (
    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
    " associated with the target DNA. "
)
ex_key2 = "DNA target modification"

ex_str3 = (
    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
)
ex_key3 = "graphite lattice orthogonal "

# One [sentences, keywords] row per example.
tab_two_examples = [
    [abstract, keys]
    for abstract, keys in (
        (ex_str1, ex_key1),
        (ex_str2, ex_key2),
        (ex_str3, ex_key3),
    )
]
#
# tab_one_examples = [['A crustless _ made from two slices of baked bread.'],
#                     ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'],
#                     ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.']
#                     ]

# Tokens that should never be chosen for masking or reported as keyword
# candidates: a handful of English stop words followed by single punctuation
# characters.
_STOP_WORDS = ['a', 'an', 'the', 'is', 'and', 'or']
_PUNCTUATION_CHARS = "!()-[]{};:'\"\\,<>./?@#$%^&*_~"
ignore_str = _STOP_WORDS + list(_PUNCTUATION_CHARS)


def add_mask(text, lower_bound=0, index=None):
    """Replace one whitespace-delimited token of ``text`` with ``[MASK]``.

    The token to mask is chosen as follows, in priority order:

    1. ``index`` is given: that token is masked.
    2. ``text`` contains a stand-alone ``_`` token: the first such token is
       treated as a user-supplied mask.
    3. Otherwise a random token at position ``>= lower_bound`` is masked,
       re-drawing a limited number of times to avoid tokens listed in
       ``ignore_str``.

    Args:
        text: Input sentence(s); split on whitespace.
        lower_bound: Smallest token position eligible for random masking.
        index: Explicit token position to mask, or ``None``.

    Returns:
        A ``(masked_text, masked_token)`` tuple. ``masked_token`` is ``None``
        when ``index`` was given, ``'[MASK]'`` when the user supplied ``_``,
        and the original token otherwise.

    Raises:
        ValueError: If random masking is requested but no token lies at or
            after ``lower_bound`` (including empty ``text``).
        IndexError: If ``index`` is outside the token list.
    """
    split_text = text.split()
    if index is not None:
        split_text[index] = '[MASK]'
        return ' '.join(split_text), None
    # If the user supplies a mask, don't add more
    if '_' in split_text:
        # Mask the stand-alone '_' itself, not an earlier token that merely
        # contains an underscore (e.g. 'snake_case').
        split_text[split_text.index('_')] = '[MASK]'
        return ' '.join(split_text), '[MASK]'

    num_tokens = len(split_text)
    if lower_bound >= num_tokens:
        raise ValueError(
            f"no maskable token: text has {num_tokens} token(s) but lower_bound is {lower_bound}"
        )

    idx = int(np.random.randint(low=lower_bound, high=num_tokens))
    # Don't mask certain words. Re-draws honour lower_bound as well, so the
    # protected prefix is never masked; give up after 11 re-draws and accept
    # whatever token is current.
    num_iters = 0
    while split_text[idx].lower() in ignore_str and num_iters <= 10:
        idx = int(np.random.randint(low=lower_bound, high=num_tokens))
        num_iters += 1

    masked_string = split_text[idx]
    split_text[idx] = '[MASK]'
    return ' '.join(split_text), masked_string


class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline whose mask logits are divided by a temperature.

    Behaves like the parent ``FillMaskPipeline`` except that ``postprocess``
    divides the logits at the mask position(s) by ``temp`` before the softmax,
    and the default ``top_k`` is 10.
    """

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Route call-time keyword arguments to the postprocess step.

        Args:
            top_k: Number of predictions to return, or ``None`` for the default.
            targets: Optional candidate tokens; resolved to ids through the
                inherited ``get_target_ids``.
            temp: Temperature divisor for the mask logits, or ``None`` for the
                default.

        Returns:
            ``(preprocess_params, forward_params, postprocess_params)``; only
            the last is ever non-empty.

        Raises:
            ValueError: If ``temp`` is not strictly positive.
        """
        postprocess_params = {}

        if targets is not None:
            target_ids = self.get_target_ids(targets, top_k)
            postprocess_params["target_ids"] = target_ids

        if top_k is not None:
            postprocess_params["top_k"] = top_k

        if temp is not None:
            # A zero temperature would divide the logits by zero and silently
            # produce inf/nan scores; reject it before running the model.
            if temp <= 0:
                raise ValueError(f"temp must be a positive number, got {temp!r}")
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                When passed, the mask logits are divided by this value before the softmax.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        # Extra positional arguments are accepted for signature compatibility
        # but are not forwarded.
        outputs = super().__call__(inputs, **kwargs)
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn raw model outputs into ranked replacement candidates.

        Args:
            model_outputs: Mapping with ``"input_ids"`` and ``"logits"``; only
                the first sample of each is used.
            top_k: Number of candidates to keep per mask position.
            target_ids: Optional tensor of vocabulary ids to restrict scoring to.
            temp: Divisor applied to the mask logits before the softmax.

        Returns:
            A list of candidate dicts (keys ``score``, ``token``, ``token_str``,
            ``sequence``) when the input has one mask, otherwise one such list
            per mask position.
        """
        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]

        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
        # Fill mask pipeline supports only one ${mask_token} per sample

        # Temperature scaling: dividing by temp > 1 flattens the distribution,
        # temp < 1 sharpens it.
        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]
        values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # p indexes into target_ids here; map it back to a vocabulary id.
                    p = target_ids[p].tolist()

                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result


# Register the temperature-scaled fill-mask pipeline under the task name
# "temp-scale". This must run before pipeline("temp-scale", ...) below.
PIPELINE_REGISTRY.register_pipeline(
    "temp-scale",
    pipeline_class=TempScalePipe,
    pt_model=AutoModelForMaskedLM,
)
# Masked-LM pipeline used by every unmasking helper in this file.
# NOTE: constructed at import time.
scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")

# Text-generation pipeline used only by autocomplete(); also constructed at
# import time.
generator = pipeline('text-generation', model='gpt2')

def sample_output(out, sampling):
    """Pick one token from a ``{token: score}`` mapping.

    Args:
        out: Mapping from candidate token to a non-negative score.
        sampling: ``'multi'`` draws a token with probability proportional to
            its score; any other value draws uniformly at random.

    Returns:
        The selected token (a key of ``out``).

    Raises:
        ValueError: If ``out`` is empty.
    """
    tokens = list(out)
    if not tokens:
        raise ValueError("cannot sample from an empty set of candidates")

    if sampling == 'multi':
        scores = np.asarray([out[token] for token in tokens], dtype=float)
        total = scores.sum()
        if total > 0:
            # Top-k scores rarely sum to 1, so renormalize explicitly instead
            # of letting the sampler dump the missing mass on the last entry.
            idx = np.random.choice(len(tokens), p=scores / total)
        else:
            # All scores are zero: nothing to weight by, fall back to uniform.
            idx = np.random.randint(0, len(tokens))
    else:
        idx = np.random.randint(0, len(tokens))
    # Index by position rather than by score so tokens with equal scores stay
    # distinct candidates.
    return tokens[idx]


def unmask_single(text, temp=1):
    """Mask one token of ``text`` and return the model's top candidates.

    The token is chosen by ``add_mask`` (a user-supplied ``_`` token if
    present, otherwise a random one). Returns a ``{token_str: score}`` dict
    built from the 10 predictions requested from ``scrambler``.
    """
    masked_text, _unused_original = add_mask(text)
    predictions = scrambler(masked_text, temp=temp, top_k=10)
    candidates = {}
    for prediction in predictions:
        candidates[prediction["token_str"]] = prediction["score"]
    return candidates


def unmask(text, temp, rounds, lower_bound=0):
    """Iteratively replace tokens of ``text`` with model-sampled alternatives.

    Each round masks one token (see ``add_mask``), asks ``scrambler`` for 15
    candidates and samples one in proportion to its score. Candidates that
    contain the original token as a substring are re-sampled up to six times.
    A replacement that differs from the original is wrapped in asterisks
    (``*word*``) so edits are visible in the result.

    Args:
        text: Sentence(s) to rewrite.
        temp: Temperature passed to ``scrambler``.
        rounds: Number of mask-and-replace rounds; coerced to ``int`` so a
            float-valued UI slider can be passed directly.
        lower_bound: Token positions below this are never randomly masked.

    Returns:
        The rewritten text with its first character upper-cased.
    """
    sampling = 'multi'
    for _ in range(int(rounds)):
        masked_text, masked = add_mask(text, lower_bound)
        split_text = masked_text.split()
        res = scrambler(masked_text, temp=temp, top_k=15)
        mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
        out = {item["token_str"]: item["score"] for item in res}
        new_token = sample_output(out, sampling)
        unsuccessful_iters = 0
        # Try to avoid proposing the word that was just masked.
        while masked in new_token:
            if unsuccessful_iters > 5:
                break
            print('skipped', new_token)
            new_token = sample_output(out, sampling=sampling)
            unsuccessful_iters += 1
        if masked in new_token:
            # No distinct alternative found: insert it unmarked.
            split_text[mask_pos] = new_token
        else:
            split_text[mask_pos] = '*' + new_token + '*'
        text = ' '.join(split_text)

    # Slicing (rather than indexing) keeps empty text from raising.
    return text[:1].upper() + text[1:]


def autocomplete(text, temp):
    """Continue ``text`` with the text generator, then remix the continuation.

    The generator output (``max_length=30``, one sequence) is passed through
    five rounds of ``unmask``. ``lower_bound`` is set to the number of
    whitespace tokens in the prompt so that random masking only targets the
    generated part.
    """
    generations = generator(text, max_length=30, num_return_sequences=1)
    generated = generations[0]['generated_text']
    prompt_token_count = len(text.split())
    return unmask(generated, temp=temp, rounds=5, lower_bound=prompt_token_count)


def extract_keywords(text, queries):
    """Suggest context-dependent alternatives for each query word in ``text``.

    For every whitespace-separated word in ``queries``, each occurrence of that
    word in ``text`` is masked in turn and ``scrambler`` is asked for the 30
    most likely fillers. Fillers are stemmed, scored by their normalized rank
    (averaged over occurrences, with 0 for occurrences where the stem was not
    proposed), normalized to sum to 1, and the top three (ties included) are
    kept.

    Args:
        text: Sentences in which the query words appear.
        queries: Whitespace-separated query words. Matching ignores case and
            punctuation attached to the tokens of ``text``.

    Returns:
        A list holding one ``{stem: score}`` dict per distinct query word (in
        input order) followed by a search string of the form
        ``[a OR b OR c] AND [d OR e OR f]``. A query that does not occur in
        ``text`` yields an empty dict and an empty ``[]`` group.
    """
    temp = 1  # set temperature to 1
    top_k_range = 30
    num_keep = 3
    ps = PorterStemmer()
    tokens = text.split()

    q_dict = {}
    for query in queries.split():
        target = query.lower()
        # Strip punctuation per token so the positions found here line up with
        # the text.split() positions that add_mask(index=...) operates on.
        indices = [i for i, t in enumerate(tokens)
                   if re.sub(r'[^\w\s]', '', t).lower() == target]

        rank_lists = defaultdict(list)
        # Iterate through text and mask each occurrence of the query
        for i in indices:
            masked_text, _ = add_mask(text, index=i)
            res = scrambler(masked_text, temp=temp, top_k=top_k_range)
            out = {item["token_str"]: item["score"] for item in res}
            seen_stems = set()
            # Ascending score order: the most likely filler gets the highest rank.
            for rank, token_str in enumerate(sorted(out, key=out.get)):
                if token_str in ignore_str:
                    continue
                stemmed = ps.stem(token_str)
                rank_lists[stemmed].append(rank / top_k_range)
                seen_stems.add(stemmed)
            # If the key does not appear, floor its rank for that round
            for stem, ranks in rank_lists.items():
                if stem not in seen_stems:
                    ranks.append(0)

        # Calc mean
        scores = {stem: float(np.mean(ranks)) for stem, ranks in rank_lists.items()}
        # Normalize with a denominator fixed before any value is rewritten
        total = sum(scores.values())
        if total > 0:
            scores = {stem: score / total for stem, score in scores.items()}
        # Get top_k: keep everything scoring at least as high as the
        # num_keep-th best (or all of them when there are fewer).
        if scores:
            cutoff = sorted(scores.values())[-min(num_keep, len(scores))]
            scores = {stem: score for stem, score in scores.items() if score >= cutoff}
        q_dict[query] = scores

    keywords = ' AND '.join('[' + ' OR '.join(scores) + ']' for scores in q_dict.values())
    output = list(q_dict.values())
    output.append(keywords)
    return output

# Output widgets: one label per query keyword, each limited to its three
# highest-scoring classes.
label0 = gr.Label(label='keyword 1', num_top_classes=3)
label01 = gr.Label(label='keyword 2', num_top_classes=3)
label02 = gr.Label(label='keyword 3', num_top_classes=3)
# Input widgets. NOTE(review): textbox02 is not referenced by any interface
# visible in this file.
textbox02 = gr.Textbox(label="Input Keywords", lines=3)
textbox01 = gr.Textbox(label="Input Keywords", placeholder="Type keywords here", lines=1)
textbox0 = gr.Textbox(label="Input Sentences", placeholder="Type sentences here", lines=5)

# Receives the generated boolean search string.
output_textbox0 = gr.Textbox(label='Search String of Keywords', placeholder="Output will appear here", lines=4)
# temp_slider0 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')

# NOTE(review): textbox1 is only referenced by a commented-out interface below.
textbox1 = gr.Textbox(label="Input Sentence", lines=5)
# output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4)
# Page title of the keyword-generator interface.
title1 = "Patent-BERT: Context-Dependent Synonym Generator"
# HTML blurb shown under the title of the keyword-generator interface.
# The paragraph is closed with a real </p> end tag.
description1 = """<p>
Try inserting a few sentences from a patent, and pick keywords for the model to analyze. The model will analyze the 
context of the keywords in the sentences and generate the top three most likely candidates for each word. 
This can be used for more creative patent drafting or patent searches using the generated search string. The base model is 
<a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created and trained by Google.

<strong>Note:</strong> Current pipeline only allows for <strong>three</strong> keyword submissions. Stemming (e.g., altering -> alter) is built into the output for 
broader search string. <br/>

Beta features (currently work-in-progress) include: (<strong>A</strong>) adjustment options for (i) the number of keywords, (ii) the number of context-dependent synonyms, 
and (iii) a 'creativity' parameter of the model; (<strong>B</strong>) analysis of where these words appear in the patent (e.g., 
claim, summary, etc.); and (<strong>C</strong>) a stemming option for input keywords. 
<br/>
</p>"""

# textbox2 = gr.Textbox(label="Input Sentences", lines=5)
# output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4)
# temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
# edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits')


# title2 = "Patent-BERT Sentence Remix-er: Multiple Edits"
# description2 = """<p>
#
# Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change the
# the model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want
# the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*).
# <br/> <p/> """

# Keyword-generator interface: (sentences, keywords) in; three keyword labels
# plus the search string out. extract_keywords returns one dict per distinct
# query word followed by the search string, so the four outputs line up only
# when exactly three distinct keywords are submitted.
demo0 = gr.Interface(
    fn=extract_keywords,
    inputs=[textbox0, textbox01],
    outputs=[label0, label01, label02, output_textbox0],
    examples=tab_two_examples,
    allow_flagging='never',
    title=title1,
    description=description1
)

# demo1 = gr.Interface(
#     fn=unmask_single,
#     inputs=[textbox1],
#     outputs='label',
#     examples=tab_one_examples,
#     allow_flagging='never',
#     title=title1,
#     description=description1
# )

# demo2 = gr.Interface(
#     fn=unmask,
#     inputs=[textbox2, temp_slider2, edit_slider2],
#     outputs=[output_textbox2],
#     examples=tab_two_examples,
#     allow_flagging='never',
#     title=title2,
#     description=description2
# )

# Wrap the single interface in a tabbed layout and start the app.
# NOTE: this runs at import time; there is no __main__ guard.
gr.TabbedInterface(
    [demo0], ["Keyword generator"]
).launch()