jannisborn committed
Commit 6938961 · unverified · 1 Parent(s): d320ce9
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: GT4SD - Patent Generative Transformers
+title: GT4SD - KeyBERT
 emoji: 💡
 colorFrom: green
 colorTo: blue
app.py CHANGED
@@ -2,11 +2,9 @@ import logging
 import pathlib
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.generation.pgt import (
-    PGT,
-    PGTCoherenceChecker,
-    PGTEditor,
-    PGTGenerator,
+from gt4sd.algorithms.conditional_generation.key_bert import (
+    KeywordBERTGenerationAlgorithm,
+    KeyBERTGenerator,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
 
@@ -14,38 +12,33 @@ from gt4sd.algorithms.registry import ApplicationsRegistry
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
-MODEL_FN = {
-    "PGTGenerator": PGTGenerator,
-    "PGTEditor": PGTEditor,
-    "PGTCoherenceChecker": PGTCoherenceChecker,
-}
-
 
 def run_inference(
-    model_type: str,
-    generator_task: str,
-    editor_task: str,
-    checker_task: str,
-    prompt: str,
-    second_prompt: str,
-    length: int,
-    k: int,
-    p: float,
+    algorithm_version: str,
+    text: str,
+    minimum_keyphrase_ngram: int,
+    maximum_keyphrase_ngram: int,
+    stop_words: str,
+    use_maxsum: bool,
+    number_of_candidates: int,
+    use_mmr: bool,
+    diversity: float,
+    number_of_keywords: int,
 ):
 
-    kwargs = {"max_length": length, "top_k": k, "top_p": p}
-
-    if model_type == "PGTGenerator":
-        config = PGTGenerator(task=generator_task, input_text=prompt, **kwargs)
-    elif model_type == "PGTEditor":
-        config = PGTEditor(input_type=editor_task, input_text=prompt, **kwargs)
-    elif model_type == "PGTCoherenceChecker":
-        config = PGTCoherenceChecker(
-            coherence_type=checker_task, input_a=prompt, input_b=second_prompt, **kwargs
-        )
-
-    model = PGT(config)
-    text = list(model.sample(1))[0]
+    config = KeyBERTGenerator(
+        algorithm_version=algorithm_version,
+        minimum_keyphrase_ngram=minimum_keyphrase_ngram,
+        maximum_keyphrase_ngram=maximum_keyphrase_ngram,
+        stop_words=stop_words,
+        top_n=number_of_keywords,
+        use_maxsum=use_maxsum,
+        use_mmr=use_mmr,
+        diversity=diversity,
+        number_of_candidates=number_of_candidates,
+    )
+    model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
+    text = list(model.sample(number_of_keywords))
 
     return text
 
@@ -55,57 +48,52 @@ if __name__ == "__main__":
     # Preparation (retrieve all available algorithms)
    all_algos = ApplicationsRegistry.list_available()
     algos = [
-        x["algorithm_application"]
-        for x in list(filter(lambda x: "PGT" in x["algorithm_name"], all_algos))
+        x["algorithm_version"]
+        for x in list(filter(lambda x: "KeywordBERT" in x["algorithm_name"], all_algos))
     ]
 
     # Load metadata
     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
 
     examples = pd.read_csv(
-        metadata_root.joinpath("examples.csv"), sep="|", header=None
+        metadata_root.joinpath("examples.csv"), sep=",", header=None
     ).fillna("")
-    print("Examples: ", examples.values.tolist())
 
     with open(metadata_root.joinpath("article.md"), "r") as f:
         article = f.read()
     with open(metadata_root.joinpath("description.md"), "r") as f:
         description = f.read()
 
-    gen_tasks = [
-        "title-to-abstract",
-        "abstract-to-title",
-        "abstract-to-claim",
-        "claim-to-abstract",
-    ]
-
     demo = gr.Interface(
         fn=run_inference,
-        title="Patent Generative Transformer",
+        title="KeywordBERT",
         inputs=[
-            gr.Dropdown(algos, label="Model type", value="PGTGenerator"),
-            gr.Dropdown(gen_tasks, label="Generator task", value="title-to-abstract"),
-            gr.Dropdown(["abstract", "claim"], label="Editor task", value="abstract"),
-            gr.Dropdown(
-                ["title-abstract", "title-claim", "abstract-claim"],
-                label="Checker task",
-                value="title-abstract",
-            ),
+            gr.Dropdown(algos, label="Algorithm version", value="circa_bert_v2"),
             gr.Textbox(
-                label="Primary Text prompt",
-                placeholder="Artificial intelligence and machine learning infrastructure",
+                label="Text prompt",
+                placeholder="This is a text I want to understand better",
                 lines=5,
             ),
-            gr.Textbox(
-                label="Secondary text prompt (only coherence checker)",
-                placeholder="",
-                lines=1,
+            gr.Slider(
+                minimum=1, maximum=5, value=1, label="Minimum keyphrase ngram", step=1
+            ),
+            gr.Slider(
+                minimum=2, maximum=10, value=1, label="Maximum keyphrase ngram", step=1
+            ),
+            gr.Textbox(label="Stop words", placeholder="english", lines=1),
+            gr.Radio(choices=[True, False], label="MaxSum", value=False),
+            gr.Slider(
+                minimum=5, maximum=100, value=20, label="MaxSum candidates", step=1
+            ),
+            gr.Radio(
+                choices=[True, False],
+                label="Max. marginal relevance control",
+                value=False,
             ),
+            gr.Slider(minimum=0.1, maximum=1, value=0.5, label="Diversity"),
             gr.Slider(
-                minimum=5, maximum=1024, value=512, label="Maximal length", step=1
+                minimum=1, maximum=50, value=10, label="Number of keywords", step=1
             ),
-            gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
-            gr.Slider(minimum=0.5, maximum=1.0, value=0.95, label="Top-p"),
         ],
         outputs=gr.Textbox(label="Output"),
         article=article,
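
For readers who want to exercise the new entry point outside of Gradio, here is a minimal sketch that mirrors the updated `run_inference` body one-to-one, filled with the values of the first row of `model_cards/examples.csv`. It assumes a local `gt4sd` installation with the KeyBERT models available; it is an illustration, not part of the commit.

```python
# Sketch only: mirrors app.py's run_inference; assumes gt4sd and its KeyBERT models are installed.
from gt4sd.algorithms.conditional_generation.key_bert import (
    KeywordBERTGenerationAlgorithm,
    KeyBERTGenerator,
)

text = "This is a text I would like to understand better"
config = KeyBERTGenerator(
    algorithm_version="distilbert-base-nli-mean-tokens",  # value from the first example row
    minimum_keyphrase_ngram=1,
    maximum_keyphrase_ngram=2,
    stop_words="english",
    top_n=3,
    use_maxsum=False,
    use_mmr=False,
    diversity=0.5,
    number_of_candidates=20,
)
model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
keywords = list(model.sample(3))  # three keyphrases for the input text
print(keywords)
```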
model_cards/article.md CHANGED
@@ -1,57 +1,43 @@
 # Model documentation & parameters
 
-**Model type**: Type of PGT model to be used:
-- `PGTGenerator`: A model for part-of-patent generator
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence check
+**Algorithm version**: The model version to use. Note that *any* HF model can be wrapped into a `KeyBERT` model.
 
-**Generator task**: Task in case the `PGTGenerator` model is used. Options are:
-- `title-to-abstract`
-- `abstract-to-title`
-- `abstract-to-claim`
-- `claim-to-abstract`
+**Text**: The main text prompt to "understand", i.e., the document from which keywords are generated.
 
-**Editor task**: Task in case the `PGTEditor` model is used. Options are:
-- `abstract`
-- `claim`
+**Minimum keyphrase ngram**: Lower bound for phrase size. Each keyword will have at least this many words.
 
-**Coherence task**: Task in case the `PGTCoherenceChecker` model is used. Options are:
-- `title-abstract`
-- `title-claim`
-- `abstract-claim`
+**Maximum keyphrase ngram**: Upper bound for phrase size. Each keyword will have at most this many words.
 
-**Primary text prompt**: The main text prompt for the model
+**Stop words**: Stop words to remove from the document. If not provided, no stop words are removed.
 
-**Secondary text prompt**: The secondary text prompt for the model (only used for `PGTCoherenceChecker`).
+**Use MaxSum**: Controls whether Max Sum Similarity is used for the generated keywords. To diversify the results, we take the `2 x MaxSum candidates` words/phrases most similar to the document, form all `top_n` combinations from them, and extract the combination whose members are least similar to each other by cosine similarity.
 
-**Maximal length**: The maximal number of tokens in the generated sequences.
+**MaxSum candidates**: Number of candidates considered when `Use MaxSum` is enabled.
 
-**Top-k**: Number of top-k probability tokens to keep.
+**Use Max. marginal relevance**: To diversify the results, we can use Maximal Marginal Relevance (MMR) to create keywords/keyphrases; this is also based on cosine similarity.
 
-**Top-p**: Only tokens with cumulative probabilities summing up to this value are kept.
+**Diversity**: Diversity of the results when `Max. marginal relevance` is enabled.
+
+**Number of keywords**: How many keywords should be generated (at most 50).
 
 
-# Model card -- PatentGenerativeTransformer
+# Model card -- KeywordBERT
 
-**Model Details**: Patent Generative Transformer (PGT), a transformer-based multitask language model trained to facilitate the patent generation process. Published by [Christofidellis et al. (*ICML 2022 Workshop KRLM*)](https://openreview.net/forum?id=dLHtwZKvJmE)
+**Model Details**: KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.
 
-**Developers**: Dimitrios Christofidellis and colleagues at IBM Research.
+**Developers**: Maarten Grootendorst.
 
-**Distributors**: Model natively integrated into GT4SD.
+**Distributors**: Original developer's code from [https://github.com/MaartenGr/KeyBERT](https://github.com/MaartenGr/KeyBERT).
 
-**Model date**: 2022.
+**Model date**: 2020.
 
-**Model type**:
-- `PGTGenerator`: A model for part-of-patent generator
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence check
+**Model type**: Different BERT and SciBERT models, trained on [CIRCA data](https://circa.res.ibm.com/index.html).
 
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
 N.A.
 
 **Paper or other resource for more information**:
-The Patent Generative Transformer (PGT) [paper by Christofidellis et al. (*ICML 2022 Workshop KRLM*)](https://openreview.net/forum?id=dLHtwZKvJmE).
+The [KeyBERT GitHub repo](https://github.com/MaartenGr/KeyBERT).
 
 **License**: MIT
 
@@ -61,7 +47,7 @@ The Patent Generative Transformer (PGT) [paper by Christofidellis et al. (*ICML
 
 **Primary intended uses/users**: N.A.
 
-**Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
+**Out-of-scope use cases**: Production-level inference.
 
 **Metrics**: N.A.
 
@@ -75,10 +61,13 @@ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi
 
 ## Citation
 ```bib
-@inproceedings{christofidellis2022pgt,
-title={PGT: a prompt based generative transformer for the patent domain},
-author={Christofidellis, Dimitrios and Torres, Antonio Berrios and Dave, Ashish and Roveri, Manuel and Schmidt, Kristin and Swaminathan, Sarath and Vandierendonck, Hans and Zubarev, Dmitry and Manica, Matteo},
-booktitle={ICML 2022 Workshop on Knowledge Retrieval and Language Models},
-year={2022}
+@misc{grootendorst2020keybert,
+  author = {Maarten Grootendorst},
+  title = {KeyBERT: Minimal keyword extraction with BERT.},
+  year = 2020,
+  publisher = {Zenodo},
+  version = {v0.3.0},
+  doi = {10.5281/zenodo.4461265},
+  url = {https://doi.org/10.5281/zenodo.4461265}
 }
 ```
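
The parameter documentation above maps directly onto the standalone `keybert` package that this algorithm wraps. As a point of reference, here is a minimal sketch using that package; the model name and document are placeholders, and the GT4SD wrapper shown in `app.py` remains the interface actually used by this Space.

```python
# Illustrative sketch of the underlying keybert API; not part of the commit.
from keybert import KeyBERT

doc = "KeyBERT is a minimal keyword extraction technique that leverages BERT embeddings."
kw_model = KeyBERT(model="distilbert-base-nli-mean-tokens")

# MaxSum: pick top_n keyphrases out of `nr_candidates` that are least similar to each other.
keywords_maxsum = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),  # minimum/maximum keyphrase ngram
    stop_words="english",
    use_maxsum=True,
    nr_candidates=20,  # "MaxSum candidates"
    top_n=5,  # "Number of keywords"
)

# MMR: trade relevance against diversity via the `diversity` parameter.
keywords_mmr = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    use_mmr=True,
    diversity=0.5,
    top_n=5,
)
print(keywords_maxsum, keywords_mmr)
```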
model_cards/description.md CHANGED
@@ -1,6 +1,6 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
 
-[Patent Generative Transformers (PGT)](https://openreview.net/forum?id=dLHtwZKvJmE): A prompt based generative transformer for the patent domain (Christofidellis et al., 2022; *ICLR Workshop KRLM*).
+[KeywordBERT](https://github.com/MaartenGr/KeyBERT) is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.
 
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv CHANGED
@@ -1,5 +1,2 @@
-PGTGenerator|title-to-abstract|||Artificial intelligence and machine learning infrastructure||512|50|1.0
-PGTGenerator|title-to-abstract|||Artificial intelligence and machine learning infrastructure||756|20|0.95
-PGTEditor||abstract||In one step of a method for infusing an [MASK], the infusion fluid is pumped through a fluid delivery line of an infusion system. In another step, measurements are taken with at least one sensor connected to the infusion system. In an additional step, an air determination is determined with at least one processor. The air determination is related to air in the fluid delivery line. The air determination is based on the measurements taken by the at least one sensor. The air determination is further based on: (1) [MASK] information regarding the infusion of the infusion fluid; or (2) multi-channel filtering of the measurements from the at least one sensor or non-linear mapping of the measurements from the at least one sensor; and statistical process control charts applied to the multi-channel filtered measurements or applied to the non-linear mapped measurements.||512|50|1
-PGTCoherenceChecker|||title-abstract|Artificial intelligence and machine learning infrastructure|An artificial intelligence and machine learning infrastructure system, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
-PGTCoherenceChecker|||title-abstract|Analog image processing|An artificial intelligence and machine learning infrastructure system for image classification, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
+distilbert-base-nli-mean-tokens,This is a text I would like to understand better,1,2,english,False,20,False,0.5,3
+distilbert-base-nli-mean-tokens,KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.,1,2,english,False,20,False,0.5,3