update
- README.md +1 -1
- app.py +50 -62
- model_cards/article.md +26 -37
- model_cards/description.md +1 -1
- model_cards/examples.csv +2 -5
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: GT4SD - …
+title: GT4SD - KeyBERT
 emoji: 💡
 colorFrom: green
 colorTo: blue
app.py
CHANGED
@@ -2,11 +2,9 @@ import logging
 import pathlib
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.…
-    …
-    PGTEditor,
-    PGTGenerator,
+from gt4sd.algorithms.conditional_generation.key_bert import (
+    KeywordBERTGenerationAlgorithm,
+    KeyBERTGenerator,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry

@@ -14,38 +12,33 @@ from gt4sd.algorithms.registry import ApplicationsRegistry
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())

-MODEL_FN = {
-    "PGTGenerator": PGTGenerator,
-    "PGTEditor": PGTEditor,
-    "PGTCoherenceChecker": PGTCoherenceChecker,
-}

 def run_inference(
-    …
+    algorithm_version: str,
+    text: str,
+    minimum_keyphrase_ngram: int,
+    maximum_keyphrase_ngram: int,
+    stop_words: str,
+    use_maxsum: bool,
+    number_of_candidates: int,
+    use_mmr: bool,
+    diversity: float,
+    number_of_keywords: int,
 ):

-    …
-    model = …
-    text = list(model.sample(…
+    config = KeyBERTGenerator(
+        algorithm_version=algorithm_version,
+        minimum_keyphrase_ngram=minimum_keyphrase_ngram,
+        maximum_keyphrase_ngram=maximum_keyphrase_ngram,
+        stop_words=stop_words,
+        top_n=number_of_keywords,
+        use_maxsum=use_maxsum,
+        use_mmr=use_mmr,
+        diversity=diversity,
+        number_of_candidates=number_of_candidates,
+    )
+    model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
+    text = list(model.sample(number_of_keywords))

     return text

@@ -55,57 +48,52 @@ if __name__ == "__main__":
     # Preparation (retrieve all available algorithms)
     all_algos = ApplicationsRegistry.list_available()
     algos = [
-        x["…
-        for x in list(filter(lambda x: "…
+        x["algorithm_version"]
+        for x in list(filter(lambda x: "KeywordBERT" in x["algorithm_name"], all_algos))
     ]

     # Load metadata
     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

     examples = pd.read_csv(
-        metadata_root.joinpath("examples.csv"), sep="…
+        metadata_root.joinpath("examples.csv"), sep=",", header=None
     ).fillna("")
-    print("Examples: ", examples.values.tolist())

     with open(metadata_root.joinpath("article.md"), "r") as f:
         article = f.read()
     with open(metadata_root.joinpath("description.md"), "r") as f:
         description = f.read()

-    gen_tasks = [
-        "title-to-abstract",
-        "abstract-to-title",
-        "abstract-to-claim",
-        "claim-to-abstract",
-    ]

     demo = gr.Interface(
         fn=run_inference,
-        title="…
+        title="KeywordBERT",
         inputs=[
-            gr.Dropdown(algos, label="…
-            gr.Dropdown(gen_tasks, label="Generator task", value="title-to-abstract"),
-            gr.Dropdown(["abstract", "claim"], label="Editor task", value="abstract"),
-            gr.Dropdown(
-                ["title-abstract", "title-claim", "abstract-claim"],
-                label="Checker task",
-                value="title-abstract",
-            ),
+            gr.Dropdown(algos, label="Algorithm version", value="circa_bert_v2"),
             gr.Textbox(
-                label="…
-                placeholder="…
+                label="Text prompt",
+                placeholder="This is a text I want to understand better",
                 lines=5,
             ),
-            gr.…
+            gr.Slider(
+                minimum=1, maximum=5, value=1, label="Minimum keyphrase ngram", step=1
+            ),
+            gr.Slider(
+                minimum=2, maximum=10, value=1, label="Maximum keyphrase ngram", step=1
+            ),
+            gr.Textbox(label="Stop words", placeholder="english", lines=1),
+            gr.Radio(choices=[True, False], label="MaxSum", value=False),
+            gr.Slider(
+                minimum=5, maximum=100, value=20, label="MaxSum candidates", step=1
+            ),
+            gr.Radio(
+                choices=[True, False],
+                label="Max. marginal relevance control",
+                value=False,
             ),
+            gr.Slider(minimum=0.1, maximum=1, value=0.5, label="Diversity"),
             gr.Slider(
-                minimum=…
+                minimum=1, maximum=50, value=10, label="Number of keywords", step=1
             ),
-            gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
-            gr.Slider(minimum=0.5, maximum=1.0, value=0.95, label="Top-p"),
         ],
         outputs=gr.Textbox(label="Output"),
         article=article,
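For reference, the rewritten `run_inference` reduces to a plain GT4SD call chain: build a `KeyBERTGenerator` configuration, wrap it in a `KeywordBERTGenerationAlgorithm` together with the target text, and sample keywords. The sketch below reproduces that flow outside Gradio; it assumes `gt4sd` is installed, and the algorithm version string and input text are illustrative placeholders, not values fixed by this commit.

```python
# Minimal sketch of the new inference path, without the Gradio UI.
# Assumes gt4sd is installed; version string and input text are illustrative.
from gt4sd.algorithms.conditional_generation.key_bert import (
    KeywordBERTGenerationAlgorithm,
    KeyBERTGenerator,
)

text = "KeyBERT leverages BERT embeddings to extract keywords from a document."

config = KeyBERTGenerator(
    algorithm_version="distilbert-base-nli-mean-tokens",  # illustrative version
    minimum_keyphrase_ngram=1,
    maximum_keyphrase_ngram=2,
    stop_words="english",
    top_n=3,                  # number of keywords to return
    use_maxsum=False,         # MaxSum diversification off
    use_mmr=False,            # maximal marginal relevance off
    diversity=0.5,            # only used when use_mmr=True
    number_of_candidates=20,  # only used when use_maxsum=True
)
model = KeywordBERTGenerationAlgorithm(configuration=config, target=text)
keywords = list(model.sample(3))  # materialize three sampled keywords
print(keywords)
```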
model_cards/article.md
CHANGED
@@ -1,57 +1,43 @@
 # Model documentation & parameters

-**…
-- `PGTGenerator`: A model for part-of-patent generation.
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence checking.
+**Algorithm version**: The model version to use. Note that *any* HF model can be wrapped into a `KeyBERT` model.

-**…
-- `title-to-abstract`
-- `abstract-to-title`
-- `abstract-to-claim`
-- `claim-to-abstract`
+**Text**: The main text prompt to "understand", i.e., to generate keywords for.

-**…
-- `abstract`
-- `claim`
+**Minimum keyphrase ngram**: Lower bound for phrase size. Each keyword will have at least this many words.

-**…
-- `title-abstract`
-- `title-claim`
-- `abstract-claim`
+**Maximum keyphrase ngram**: Upper bound for phrase size. Each keyword will have at most this many words.

-**…
+**Stop words**: Stopwords to remove from the document. If not provided, no stop words are removed.

-**…
+**Use MaxSum**: Controls whether MaxSum similarity is used to diversify the results: we take the `2 x MaxSum candidates` words/phrases most similar to the document, form all `top_n` combinations among them, and extract the combination whose members are least similar to each other by cosine similarity.

-**…
+**MaxSum candidates**: Number of candidates considered when `Use MaxSum` is enabled.

-**…
+**Use Max. marginal relevance**: Controls whether Maximal Marginal Relevance (MMR), which is likewise based on cosine similarity, is used to diversify the results.

-**…
+**Diversity**: Diversity of the results when `Max. marginal relevance` is enabled.
+
+**Number of keywords**: How many keywords should be generated (at most 50).

-# Model card -- …
+# Model card -- KeywordBERT

-**Model Details**: …
+**Model Details**: KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.

-**Developers**: …
+**Developers**: Maarten Grootendorst.

-**Distributors**: …
+**Distributors**: Original developer's code from [https://github.com/MaartenGr/KeyBERT](https://github.com/MaartenGr/KeyBERT).

-**Model date**: …
+**Model date**: 2020.

-**Model type**:
-- `PGTGenerator`: A model for part-of-patent generation.
-- `PGTEditor`: An algorithm for part-of-patent editing.
-- `PGTCoherenceChecker`: An algorithm for patent coherence checking.
+**Model type**: Different BERT and SciBERT models, trained on [CIRCA data](https://circa.res.ibm.com/index.html).

 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
 N.A.

 **Paper or other resource for more information**:
-The …
+The [KeyBERT GitHub repo](https://github.com/MaartenGr/KeyBERT).

 **License**: MIT

@@ -61,7 +47,7 @@ The Patent Generative Transformer (PGT) [paper by Christofidellis et al. (*ICML…

 **Primary intended uses/users**: N.A.

-**Out-of-scope use cases**: Production-level inference
+**Out-of-scope use cases**: Production-level inference.

 **Metrics**: N.A.

@@ -75,10 +61,13 @@ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi…

 ## Citation
 ```bib
-@…
+@misc{grootendorst2020keybert,
+  author = {Maarten Grootendorst},
+  title = {KeyBERT: Minimal keyword extraction with BERT.},
+  year = 2020,
+  publisher = {Zenodo},
+  version = {v0.3.0},
+  doi = {10.5281/zenodo.4461265},
+  url = {https://doi.org/10.5281/zenodo.4461265}
 }
 ```
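The parameters documented in the new article.md map directly onto the underlying `keybert` library, so a direct `keybert` call can make the MaxSum and MMR switches concrete. This is a hedged sketch, assuming `keybert` is installed; the sentence-transformers model name, document, and parameter values are illustrative.

```python
# Illustrative keybert calls mirroring the parameters documented above.
# Assumes `pip install keybert`; model name and document are placeholders.
from keybert import KeyBERT

doc = (
    "KeyBERT is a minimal and easy-to-use keyword extraction technique that "
    "leverages BERT embeddings to create keywords and keyphrases that are "
    "most similar to a document."
)
kw_model = KeyBERT(model="distilbert-base-nli-mean-tokens")

# Plain cosine-similarity ranking of 1-2 word keyphrases.
plain = kw_model.extract_keywords(
    doc, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=3
)

# MaxSum: among nr_candidates phrases, pick the top_n combination whose
# members are least similar to each other.
maxsum = kw_model.extract_keywords(
    doc, keyphrase_ngram_range=(1, 2), stop_words="english",
    top_n=3, use_maxsum=True, nr_candidates=20,
)

# MMR: trade relevance to the document against diversity (0 to 1).
mmr = kw_model.extract_keywords(
    doc, keyphrase_ngram_range=(1, 2), stop_words="english",
    top_n=3, use_mmr=True, diversity=0.5,
)

# Each call returns a list of (keyphrase, score) tuples.
print(plain, maxsum, mmr)
```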
model_cards/description.md
CHANGED
@@ -1,6 +1,6 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >

-[…
+[KeywordBERT](https://github.com/MaartenGr/KeyBERT) is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.

 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv
CHANGED
@@ -1,5 +1,2 @@
-…
-…
-PGTEditor||abstract||In one step of a method for infusing an [MASK], the infusion fluid is pumped through a fluid delivery line of an infusion system. In another step, measurements are taken with at least one sensor connected to the infusion system. In an additional step, an air determination is determined with at least one processor. The air determination is related to air in the fluid delivery line. The air determination is based on the measurements taken by the at least one sensor. The air determination is further based on: (1) [MASK] information regarding the infusion of the infusion fluid; or (2) multi-channel filtering of the measurements from the at least one sensor or non-linear mapping of the measurements from the at least one sensor; and statistical process control charts applied to the multi-channel filtered measurements or applied to the non-linear mapped measurements.||512|50|1
-PGTCoherenceChecker|||title-abstract|Artificial intelligence and machine learning infrastructure|An artificial intelligence and machine learning infrastructure system, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
-PGTCoherenceChecker|||title-abstract|Analog image processing|An artificial intelligence and machine learning infrastructure system for image classification, including: one or more storage systems comprising, respectively, one or more storage devices; and one or more graphical processing units, wherein the graphical processing units are configured to communicate with the one or more storage systems over a communication fabric; where the one or more storage systems, the one or more graphical processing units, and the communication fabric are implemented within a single chassis.|512|50|1
+distilbert-base-nli-mean-tokens,This is a text I would like to understand better,1,2,english,False,20,False,0.5,3
+distilbert-base-nli-mean-tokens,KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and keyphrases that are most similar to a document.,1,2,english,False,20,False,0.5,3
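The new example rows are plain comma-separated values with no header row; their fields line up positionally with `run_inference`'s arguments. A quick sketch of that mapping, with column names inferred from the function signature rather than stored in the file:

```python
# Load examples.csv the way app.py does, then label the columns.
# Column names are inferred from run_inference's signature (an assumption,
# not something the headerless CSV states).
import pandas as pd

columns = [
    "algorithm_version", "text",
    "minimum_keyphrase_ngram", "maximum_keyphrase_ngram",
    "stop_words", "use_maxsum", "number_of_candidates",
    "use_mmr", "diversity", "number_of_keywords",
]
examples = pd.read_csv(
    "model_cards/examples.csv", sep=",", header=None, names=columns
).fillna("")
print(examples.iloc[0].to_dict())
```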