Spaces:

B-patents
/

patent-bert

Build error

App Files Files Community

danseith committed on Feb 27, 2023

Commit

4db26d9

1 Parent(s): b7321ed

search string generator

Browse files

Files changed (1) hide show

app.py +146 -47

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import gradio as gr
 import numpy as np
 import torch
 from transformers import pipeline
 from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
 from transformers import AutoModelForMaskedLM
@@ -9,38 +11,45 @@ ex_str1 = "A crustless sandwich made from two slices of baked bread. The sandwic
           "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery " \
           "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially " \
           "crimped together at the outer peripheral area."
 ex_str2 = "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with" \
           " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide" \
           " associated with the target DNA. "
 ex_str3 = "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a " \
           "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at " \
           "least one of the length, width, and thickness values being 100 nanometers or smaller. "
-tab_two_examples = [[ex_str1, 1.2, 1],
-                    [ex_str2, 1.5, 10],
-                    [ex_str3, 1.4, 5]]
-tab_one_examples = [['A crustless _ made from two slices of baked bread.'],
-                    ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'],
-                    ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.']
-                    ]
-def add_mask(text):
     split_text = text.split()
     # If the user supplies a mask, don't add more
     if '_' in split_text:
         u_pos = [i for i, s in enumerate(split_text) if '_' in s][0]
         split_text[u_pos] = '[MASK]'
         return ' '.join(split_text), '[MASK]'
-    idx = np.random.randint(len(split_text), size=1).astype(int)[0]
     # Don't mask certain words
     num_iters = 0
-    while split_text[idx].lower() in ['a', 'an', 'the', 'is', 'and', 'or']:
         num_iters += 1
         idx = np.random.randint(len(split_text), size=1).astype(int)[0]
         if num_iters > 10:
@@ -148,6 +157,7 @@ PIPELINE_REGISTRY.register_pipeline(
 )
 scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
 def sample_output(out, sampling):
     score_to_str = {out[k]: k for k in out.keys()}
@@ -167,10 +177,10 @@ def unmask_single(text, temp=1):
     return out
-def unmask(text, temp, rounds):
     sampling = 'multi'
     for _ in range(rounds):
-        masked_text, masked = add_mask(text)
         split_text = masked_text.split()
         res = scrambler(masked_text, temp=temp, top_k=15)
         mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
@@ -194,51 +204,140 @@ def unmask(text, temp, rounds):
     return ''.join(text)
-textbox1 = gr.Textbox(label="Input Sentence", lines=5)
-output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4)
-textbox2 = gr.Textbox(label="Input Sentences", lines=5)
-output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4)
-temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
-edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits')
-title1 = "Patent-BERT Sentence Remix-er: Single Edit"
 description1 = """<p>
 This is a model based on
 <a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created by Google.
-<br/>
-Try inserting a '_' where you want the model to generate a list of likely words.
-<strong>Note:</strong> You can only add one '_' per submission.
 <br/>
 <p/>"""
-title2 = "Patent-BERT Sentence Remix-er: Multiple Edits"
-description2 = """<p>
-Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change the
-the model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want
-the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*).
-<br/> <p/> """
-demo1 = gr.Interface(
-    fn=unmask_single,
-    inputs=[textbox1],
-    outputs='label',
-    examples=tab_one_examples,
     allow_flagging='never',
     title=title1,
     description=description1
 )
-demo2 = gr.Interface(
-    fn=unmask,
-    inputs=[textbox2, temp_slider2, edit_slider2],
-    outputs=[output_textbox2],
-    examples=tab_two_examples,
-    allow_flagging='never',
-    title=title2,
-    description=description2
-)
 gr.TabbedInterface(
-    [demo1, demo2], ["Single edit", "Multiple Edits"]
 ).launch()

 import gradio as gr
 import numpy as np
 import torch
+from nltk.stem import PorterStemmer
+from collections import defaultdict
 from transformers import pipeline
 from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
 from transformers import AutoModelForMaskedLM
           "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery " \
           "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially " \
           "crimped together at the outer peripheral area."
+ex_key1 = "sandwich bread crimped"
 ex_str2 = "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with" \
           " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide" \
           " associated with the target DNA. "
+ex_key2 = "DNA target modification"
 ex_str3 = "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a " \
           "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at " \
           "least one of the length, width, and thickness values being 100 nanometers or smaller. "
+ex_key3 = "graphite lattice orthogonal "
+tab_two_examples = [[ex_str1, ex_key1],
+                    [ex_str2, ex_key2],
+                    [ex_str3, ex_key3]]
+#
+# tab_one_examples = [['A crustless _ made from two slices of baked bread.'],
+#                     ['The present disclosure provides a DNA-targeting RNA that comprises a targeting _.'],
+#                     ['The _ plane is composed of a two-dimensional hexagonal lattice of carbon atoms.']
+#                     ]
+ignore = ['a', 'an', 'the', 'is', 'and', 'or']
+def add_mask(text, lower_bound=0, index=None):
     split_text = text.split()
+    if index is not None:
+        split_text[index] = '[MASK]'
+        return ' '.join(split_text), None
     # If the user supplies a mask, don't add more
     if '_' in split_text:
         u_pos = [i for i, s in enumerate(split_text) if '_' in s][0]
         split_text[u_pos] = '[MASK]'
         return ' '.join(split_text), '[MASK]'
+    idx = np.random.randint(low=lower_bound, high=len(split_text), size=1).astype(int)[0]
     # Don't mask certain words
     num_iters = 0
+    while split_text[idx].lower() in ignore:
         num_iters += 1
         idx = np.random.randint(len(split_text), size=1).astype(int)[0]
         if num_iters > 10:
 )
 scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
+generator = pipeline('text-generation', model='gpt2')
 def sample_output(out, sampling):
     score_to_str = {out[k]: k for k in out.keys()}
     return out
+def unmask(text, temp, rounds, lower_bound=0):
     sampling = 'multi'
     for _ in range(rounds):
+        masked_text, masked = add_mask(text, lower_bound)
         split_text = masked_text.split()
         res = scrambler(masked_text, temp=temp, top_k=15)
         mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
     return ''.join(text)
def autocomplete(text, temp):
    """Continue *text* with the text-generation pipeline, then remix the result.

    The prompt is first extended by ``generator`` (a single sequence, capped at
    ``max_length=30``). The generated text is then handed to ``unmask`` for
    five mask-and-replace rounds at temperature *temp*.

    Args:
        text: Prompt to extend.
        temp: Temperature forwarded to ``unmask``.

    Returns:
        Whatever ``unmask`` returns for the generated text.
    """
    generations = generator(text, max_length=30, num_return_sequences=1)
    generated = generations[0]['generated_text']
    # The prompt's word count is passed as the lower bound so that the random
    # masking is aimed at the generated continuation rather than the prompt.
    prompt_word_count = len(text.split())
    return unmask(generated, temp=temp, rounds=5, lower_bound=prompt_word_count)
def _score_candidates(text, indices, top_k, temp, top_n):
    """Score stemmed fill-mask candidates for one keyword over all its occurrences.

    Each index in *indices* is masked in turn ("a round"). Within a round the
    predictions are ranked by model score and the rank is scaled by *top_k*;
    a stem that is absent from a round contributes 0 for that round. The
    per-stem means are normalised to sum to 1 and only the *top_n* best
    (ties included) are returned.
    """
    stemmer = PorterStemmer()
    totals = defaultdict(float)
    for position in indices:
        masked_text, _ = add_mask(text, index=position)
        predictions = scrambler(masked_text, temp=temp, top_k=top_k)
        token_scores = {item["token_str"]: item["score"] for item in predictions}
        # Ascending by model score: a higher rank means a more likely token.
        ordered_tokens = sorted(token_scores, key=token_scores.get)
        round_best = {}
        for rank, token_str in enumerate(ordered_tokens):
            stem = stemmer.stem(token_str)
            # Several surface forms can share a stem; keep the best one.
            round_best[stem] = max(round_best.get(stem, 0.0), rank / top_k)
        for stem, value in round_best.items():
            totals[stem] += value

    # Dividing by the number of rounds (not the number of appearances) is what
    # floors a stem's rank to 0 in the rounds where it was not predicted.
    rounds = len(indices)
    means = {stem: total / rounds for stem, total in totals.items()}

    # Normalise against a denominator computed once, before any value changes.
    norm = sum(means.values())
    if norm > 0:
        means = {stem: value / norm for stem, value in means.items()}

    if len(means) > top_n:
        threshold = sorted(means.values())[-top_n]
        means = {stem: value for stem, value in means.items() if value >= threshold}
    return {stem: float(value) for stem, value in means.items()}


def extract_keywords(text, queries):
    """Build context-dependent alternatives and a search string for keywords.

    Every whitespace-separated word in *queries* is looked up
    (case-insensitively, whole-token match) in *text*. Each occurrence is
    masked and the masked-LM candidates are ranked, stemmed and averaged; the
    three best stems are kept per keyword. A keyword that does not occur in
    *text* has no context to analyse, so it falls back to itself with a score
    of 1.0 instead of raising.

    Args:
        text: Sentences that provide the context for the keywords.
        queries: Whitespace-separated keywords.

    Returns:
        A list with one ``{candidate: score}`` dict per distinct keyword (in
        input order) followed by the search string, e.g.
        ``"[a OR b] AND [c]"``. The list length therefore depends on the
        number of keywords supplied.
    """
    temp = 1  # set temperature to 1
    top_k_range = 10
    top_n = 3

    lowered_tokens = [token.lower() for token in text.split()]
    q_dict = {}
    for query in queries.split():
        target = query.lower()
        indices = [i for i, token in enumerate(lowered_tokens) if token == target]
        if not indices:
            q_dict[query] = {query: 1.0}
            continue
        scored = _score_candidates(text, indices, top_k_range, temp, top_n)
        # Guard against an empty prediction list so every group stays non-empty.
        q_dict[query] = scored or {query: 1.0}

    groups = ['[' + ' OR '.join(candidates) + ']' for candidates in q_dict.values()]
    keywords = ' AND '.join(groups)

    output = list(q_dict.values())
    output.append(keywords)
    return output
+    # fig, ax = plt.subplots(nrows=1, ncols=3)
+    # for q in q_dict:
+    #     ax.bar(q_dict[q])
+    # return fig
+label0 = gr.Label(label='keyword 1', num_top_classes=3)
+label01 = gr.Label(label='keyword 2', num_top_classes=3)
+label02 = gr.Label(label='keyword 3', num_top_classes=3)
+textbox02 = gr.Textbox(label="Input Keywords", lines=3)
+textbox01 = gr.Textbox(label="Input Keywords", placeholder="Type keywords here", lines=1)
+textbox0 = gr.Textbox(label="Input Sentences", placeholder="Type sentences here", lines=5)
+output_textbox0 = gr.Textbox(label='Search String of Keywords', placeholder="Output will appear here", lines=4)
+# temp_slider0 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
+textbox1 = gr.Textbox(label="Input Sentence", lines=5)
+# output_textbox1 = gr.Textbox(placeholder="Output will appear here", lines=4)
+title1 = "Patent-BERT: Context-Dependent Synonym Generator"
 description1 = """<p>
+Try inserting a few sentences from a patent, and pick keywords for the model to analyze. The model will analyze the
+context of the keywords in the sentences and generate the top five most likely candidates for each word.
+Can be used for more creative patent drafting or patent searches using the generated search string.
 This is a model based on
 <a href= "https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md">Patent BERT</a> created by Google.
+<strong>Note:</strong> Current pipeline only allows for three keyword submission.
 <br/>
 <p/>"""
+# textbox2 = gr.Textbox(label="Input Sentences", lines=5)
+# output_textbox2 = gr.Textbox(placeholder="Output will appear here", lines=4)
+# temp_slider2 = gr.Slider(1.0, 3.0, value=1.0, label='Creativity')
+# edit_slider2 = gr.Slider(1, 20, step=1, value=1.0, label='Number of edits')
+# title2 = "Patent-BERT Sentence Remix-er: Multiple Edits"
+# description2 = """<p>
+#
+# Try typing in a sentence for the model to remix. Adjust the 'creativity' scale bar to change the
+# model's confidence in its likely substitutions and the 'number of edits' for the number of edits you want
+# the model to attempt to make. The words substituted in the output sentence will be enclosed in asterisks (e.g., *word*).
+# <br/> <p/> """
+demo0 = gr.Interface(
+    fn=extract_keywords,
+    inputs=[textbox0, textbox01],
+    outputs=[label0, label01, label02, output_textbox0],
+    examples=tab_two_examples,
     allow_flagging='never',
     title=title1,
     description=description1
 )
+# demo1 = gr.Interface(
+#     fn=unmask_single,
+#     inputs=[textbox1],
+#     outputs='label',
+#     examples=tab_one_examples,
+#     allow_flagging='never',
+#     title=title1,
+#     description=description1
+# )
+# demo2 = gr.Interface(
+#     fn=unmask,
+#     inputs=[textbox2, temp_slider2, edit_slider2],
+#     outputs=[output_textbox2],
+#     examples=tab_two_examples,
+#     allow_flagging='never',
+#     title=title2,
+#     description=description2
+# )
 gr.TabbedInterface(
+    [demo0], ["Keyword generator"]
 ).launch()