jannisborn committed
Commit 6938961 · unverified · 1 Parent(s): d320ce9
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: GT4SD - Patent Generative Transformers
+title: GT4SD - KeyBERT
 emoji: 💡
 colorFrom: green
 colorTo: blue
app.py CHANGED
@@ -2,11 +2,9 @@ import logging
 import pathlib
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.generation.pgt import (
-    PGT,
-    PGTCoherenceChecker,
-    PGTEditor,
-    PGTGenerator,
+from gt4sd.algorithms.conditional_generation.key_bert import (
+    KeywordBERTGenerationAlgorithm,
+    KeyBERTGenerator,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
 
@@ -14,38 +12,33 @@ from gt4sd.algorithms.registry import ApplicationsRegistry
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
-MODEL_FN = {
-    "PGTGenerator": PGTGenerator,
-    "PGTEditor": PGTEditor,
-    "PGTCoherenceChecker": PGTCoherenceChecker,
-}
-
 
 def run_inference(
-    model_type: str,
-    generator_task: str,
-    editor_task: str,
-    checker_task: str,
-    prompt: str,
-    second_prompt: str,
-    length: int,
-    k: int,
-    p: float,
+    algorithm_version: str,
+    text: str,
+    minimum_keyphrase_ngram: int,
+    maximum_keyphrase_ngram: int,
+    stop_words: str,
+    use_maxsum: bool,
+    number_of_candidates: int,
+    use_mmr: bool,
+    diversity: float,
+    number_of_keywords: int,
 ):
 
-    kwargs = {"max_length": length, "top_k": k, "top_p": p}
-
-    if model_type == "PGTGenerator":
-        config = PGTGenerator(task=generator_task, input_text=prompt, **kwargs)
-    elif model_type == "PGTEditor":
-        config = PGTEditor(input_type=editor_task, input_text=prompt, **kwargs)
-    elif model_type == "PGTCoherenceChecker":
-        config = PGTCoherenceChecker(
-            coherence_type=checker_task, input_a=prompt, input_b=second_prompt, **kwargs
-        )
-
-    model = PGT(config)
-    text = list(model.sample(1))[0]
+    config = KeyBERTGenerator(
+        algorithm_version=algorithm_version,
+        minimum_keyphrase_ngram=minimum_keyphrase_ngram,
+        maximum_keyphrase_ngram=maximum_keyphrase_ngram,
+        stop_words=stop_words,
+        top_n=number_of_keywords,
+        use_maxsum=use_maxsum,
+        use_mmr=use_mmr,
+        diversity=diversity,
+        number_of_candidates=number_of_candidates,
+    )
+    model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
+    text = list(model.sample(number_of_keywords))
 
     return text
 
@@ -55,57 +48,52 @@ if __name__ == "__main__":
     # Preparation (retrieve all available algorithms)
    all_algos = ApplicationsRegistry.list_available()
     algos = [
-        x["algorithm_application"]
-        for x in list(filter(lambda x: "PGT" in x["algorithm_name"], all_algos))
+        x["algorithm_version"]
+        for x in list(filter(lambda x: "KeywordBERT" in x["algorithm_name"], all_algos))
     ]
 
     # Load metadata
     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
 
     examples = pd.read_csv(
-        metadata_root.joinpath("examples.csv"), sep="|", header=None
+        metadata_root.joinpath("examples.csv"), sep=",", header=None
     ).fillna("")
-    print("Examples: ", examples.values.tolist())
 
     with open(metadata_root.joinpath("article.md"), "r") as f:
         article = f.read()
     with open(metadata_root.joinpath("description.md"), "r") as f:
         description = f.read()
 
-    gen_tasks = [
-        "title-to-abstract",
-        "abstract-to-title",
-        "abstract-to-claim",
-        "claim-to-abstract",
-    ]
-
     demo = gr.Interface(
         fn=run_inference,
-        title="Patent Generative Transformer",
+        title="KeywordBERT",
         inputs=[
-            gr.Dropdown(algos, label="Model type", value="PGTGenerator"),
-            gr.Dropdown(gen_tasks, label="Generator task", value="title-to-abstract"),
-            gr.Dropdown(["abstract", "claim"], label="Editor task", value="abstract"),
-            gr.Dropdown(
-                ["title-abstract", "title-claim", "abstract-claim"],
-                label="Checker task",
-                value="title-abstract",
-            ),
+            gr.Dropdown(algos, label="Algorithm version", value="circa_bert_v2"),
             gr.Textbox(
-                label="Primary Text prompt",
-                placeholder="Artificial intelligence and machine learning infrastructure",
+                label="Text prompt",
+                placeholder="This is a text I want to understand better",
                 lines=5,
             ),
-            gr.Textbox(
-                label="Secondary text prompt (only coherence checker)",
-                placeholder="",
-                lines=1,
+            gr.Slider(
+                minimum=1, maximum=5, value=1, label="Minimum keyphrase ngram", step=1
+            ),
+            gr.Slider(
+                minimum=2, maximum=10, value=1, label="Maximum keyphrase ngram", step=1
+            ),
+            gr.Textbox(label="Stop words", placeholder="english", lines=1),
+            gr.Radio(choices=[True, False], label="MaxSum", value=False),
+            gr.Slider(
+                minimum=5, maximum=100, value=20, label="MaxSum candidates", step=1
+            ),
+            gr.Radio(
+                choices=[True, False],
+                label="Max. marginal relevance control",
+                value=False,
             ),
+            gr.Slider(minimum=0.1, maximum=1, value=0.5, label="Diversity"),
             gr.Slider(
-                minimum=5, maximum=1024, value=512, label="Maximal length", step=1
+                minimum=1, maximum=50, value=10, label="Number of keywords", step=1
             ),
-            gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
-            gr.Slider(minimum=0.5, maximum=1.0, value=0.95, label="Top-p"),
         ],
         outputs=gr.Textbox(label="Output"),
         article=article,
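
For readers who want to exercise the new entry point outside of Gradio, here is a minimal sketch that mirrors the updated `run_inference` body one-to-one, filled with the values of the first row of `model_cards/examples.csv`. It assumes a local `gt4sd` installation with the KeyBERT models available; it is an illustration, not part of the commit.

```python
# Sketch only: mirrors app.py's run_inference; assumes gt4sd and its KeyBERT models are installed.
from gt4sd.algorithms.conditional_generation.key_bert import (
    KeywordBERTGenerationAlgorithm,
    KeyBERTGenerator,
)

text = "This is a text I would like to understand better"
config = KeyBERTGenerator(
    algorithm_version="distilbert-base-nli-mean-tokens",  # value from the first example row
    minimum_keyphrase_ngram=1,
    maximum_keyphrase_ngram=2,
    stop_words="english",
    top_n=3,
    use_maxsum=False,
    use_mmr=False,
    diversity=0.5,
    number_of_candidates=20,
)
model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
keywords = list(model.sample(3))  # three keyphrases for the input text
print(keywords)
```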
model_cards/article.md CHANGED
@@ -1,57 +1,43 @@
 # Model documentation & parameters
 
-**Model type**: Type of PGT model to be used:
-- `PGTGenerator`: A model for part-of-patent generator
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence check
+**Algorithm version**: The model version to use. Note that *any* HF model can be wrapped into a `KeyBERT` model.
 
-**Generator task**: Task in case the `PGTGenerator` model is used. Options are:
-- `title-to-abstract`
-- `abstract-to-title`
-- `abstract-to-claim`
-- `claim-to-abstract`
+**Text**: The main text prompt to "understand", i.e., the document from which keywords are generated.
 
-**Editor task**: Task in case the `PGTEditor` model is used. Options are:
-- `abstract`
-- `claim`
+**Minimum keyphrase ngram**: Lower bound for phrase size. Each keyword will have at least this many words.
 
-**Coherence task**: Task in case the `PGTCoherenceChecker` model is used. Options are:
-- `title-abstract`
-- `title-claim`
-- `abstract-claim`
+**Maximum keyphrase ngram**: Upper bound for phrase size. Each keyword will have at most this many words.
 
-**Primary text prompt**: The main text prompt for the model
+**Stop words**: Stop words to remove from the document. If not provided, no stop words are removed.
 
-**Secondary text prompt**: The secondary text prompt for the model (only used for `PGTCoherenceChecker`).
+**Use MaxSum**: Controls whether Max Sum Similarity is used for the generated keywords. To diversify the results, we take the `2 x MaxSum candidates` words/phrases most similar to the document, form all `top_n` combinations from them, and extract the combination whose members are least similar to each other by cosine similarity.
 
-**Maximal length**: The maximal number of tokens in the generated sequences.
+**MaxSum candidates**: Number of candidates considered when `Use MaxSum` is enabled.
 
-**Top-k**: Number of top-k probability tokens to keep.
+**Use Max. marginal relevance**: To diversify the results, we can use Maximal Marginal Relevance (MMR) to create keywords/keyphrases; this is also based on cosine similarity.
 
-**Top-p**: Only tokens with cumulative probabilities summing up to this value are kept.
+**Diversity**: Diversity of the results when `Max. marginal relevance` is enabled.
+
+**Number of keywords**: How many keywords should be generated (at most 50).
 
 
-# Model card -- PatentGenerativeTransformer
+# Model card -- KeywordBERT
 
-**Model Details**: Patent Generative Transformer (PGT), a transformer-based multitask language model trained to facilitate the patent generation process. Published by [Christofidellis et al. (*ICML 2022 Workshop KRLM*)](https://openreview.net/forum?id=dLHtwZKvJmE)
+**Model Details**: KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.
 
-**Developers**: Dimitrios Christofidellis and colleagues at IBM Research.
+**Developers**: Maarten Grootendorst.
 
-**Distributors**: Model natively integrated into GT4SD.
+**Distributors**: Original developer's code from [https://github.com/MaartenGr/KeyBERT](https://github.com/MaartenGr/KeyBERT).
 
-**Model date**: 2022.
+**Model date**: 2020.
 
-**Model type**:
-- `PGTGenerator`: A model for part-of-patent generator
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence check
+**Model type**: Different BERT and SciBERT models, trained on [CIRCA data](https://circa.res.ibm.com/index.html).
 
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
 N.A.
 
 **Paper or other resource for more information**:
-The Patent Generative Transformer (PGT) [paper by Christofidellis et al. (*ICML 2022 Workshop KRLM*)](https://openreview.net/forum?id=dLHtwZKvJmE).
+The [KeyBERT GitHub repo](https://github.com/MaartenGr/KeyBERT).
 
 **License**: MIT
 
@@ -61,7 +47,7 @@ The Patent Generative Transformer (PGT) [paper by Christofidellis et al. (*ICML
 
 **Primary intended uses/users**: N.A.
 
-**Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
+**Out-of-scope use cases**: Production-level inference.
 
 **Metrics**: N.A.
 
@@ -75,10 +61,13 @@ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi
 
 ## Citation
 ```bib
-@inproceedings{christofidellis2022pgt,
-title={PGT: a prompt based generative transformer for the patent domain},
-author={Christofidellis, Dimitrios and Torres, Antonio Berrios and Dave, Ashish and Roveri, Manuel and Schmidt, Kristin and Swaminathan, Sarath and Vandierendonck, Hans and Zubarev, Dmitry and Manica, Matteo},
-booktitle={ICML 2022 Workshop on Knowledge Retrieval and Language Models},
-year={2022}
+@misc{grootendorst2020keybert,
+  author = {Maarten Grootendorst},
+  title = {KeyBERT: Minimal keyword extraction with BERT.},
+  year = 2020,
+  publisher = {Zenodo},
+  version = {v0.3.0},
+  doi = {10.5281/zenodo.4461265},
+  url = {https://doi.org/10.5281/zenodo.4461265}
 }
 ```
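
The parameter documentation above maps directly onto the standalone `keybert` package that this algorithm wraps. As a point of reference, here is a minimal sketch using that package; the model name and document are placeholders, and the GT4SD wrapper shown in `app.py` remains the interface actually used by this Space.

```python
# Illustrative sketch of the underlying keybert API; not part of the commit.
from keybert import KeyBERT

doc = "KeyBERT is a minimal keyword extraction technique that leverages BERT embeddings."
kw_model = KeyBERT(model="distilbert-base-nli-mean-tokens")

# MaxSum: pick top_n keyphrases out of `nr_candidates` that are least similar to each other.
keywords_maxsum = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),  # minimum/maximum keyphrase ngram
    stop_words="english",
    use_maxsum=True,
    nr_candidates=20,  # "MaxSum candidates"
    top_n=5,  # "Number of keywords"
)

# MMR: trade relevance against diversity via the `diversity` parameter.
keywords_mmr = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    use_mmr=True,
    diversity=0.5,
    top_n=5,
)
print(keywords_maxsum, keywords_mmr)
```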
model_cards/description.md CHANGED
@@ -1,6 +1,6 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
 
-[Patent Generative Transformers (PGT)](https://openreview.net/forum?id=dLHtwZKvJmE): A prompt based generative transformer for the patent domain (Christofidellis et al., 2022; *ICLR Workshop KRLM*).
+[KeywordBERT](https://github.com/MaartenGr/KeyBERT) is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.
 
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv CHANGED
@@ -1,5 +1,2 @@
-PGTGenerator|title-to-abstract|||Artificial intelligence and machine learning infrastructure||512|50|1.0
-PGTGenerator|title-to-abstract|||Artificial intelligence and machine learning infrastructure||756|20|0.95
-PGTEditor||abstract||In one step of a method for infusing an [MASK], the infusion fluid is pumped through a fluid delivery line of an infusion system. In another step, measurements are taken with at least one sensor connected to the infusion system. In an additional step, an air determination is determined with at least one processor. The air determination is related to air in the fluid delivery line. The air determination is based on the measurements taken by the at least one sensor. The air determination is further based on: (1) [MASK] information regarding the infusion of the infusion fluid; or (2) multi-channel filtering of the measurements from the at least one sensor or non-linear mapping of the measurements from the at least one sensor; and statistical process control charts applied to the multi-channel filtered measurements or applied to the non-linear mapped measurements.||512|50|1
-PGTCoherenceChecker|||title-abstract|Artificial intelligence and machine learning infrastructure|An artificial intelligence and machine learning infrastructure system, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
-PGTCoherenceChecker|||title-abstract|Analog image processing|An artificial intelligence and machine learning infrastructure system for image classification, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
+distilbert-base-nli-mean-tokens,This is a text I would like to understand better,1,2,english,False,20,False,0.5,3
+distilbert-base-nli-mean-tokens,KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.,1,2,english,False,20,False,0.5,3