Alain Vaucher committed
Commit 4d89100 (1 parent: 0c13a31)

Implement the app
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 rxn4chemistry
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,38 @@
  ---
- title: Synthesis Protocol Extraction
- emoji: 🐨
- colorFrom: red
- colorTo: green
+ title: Extraction of synthesis protocols from paragraphs in machine readable format
+ emoji: 💡
+ colorFrom: green
+ colorTo: blue
  sdk: gradio
  sdk_version: 3.27.0
  app_file: app.py
  pinned: false
  license: mit
+ python_version: 3.8.13
+ pip_version: 23.0.1
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ ## Local testing
+
+ Set up an environment (Note: use Python 3.8 or lower):
+ ```bash
+ # Option 1: conda
+ conda create -n gradio python=3.8 -y
+ conda activate gradio
+ # Option 2: venv
+ python3.8 -m venv venv
+ source venv/bin/activate
+ ```
+
+ Install the dependencies:
+ ```bash
+ pip install -r pre-requirements.txt
+ pip install -r requirements.txt
+ ```
+
+ Run:
+ ```bash
+ gradio app.py
+ ```
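The Python ceiling noted above is most likely imposed by the pinned torch==1.5.1 in pre-requirements.txt, which does not ship wheels for interpreters newer than 3.8. A hypothetical pre-flight check (not part of this commit) to fail fast before installing:

```python
# Hypothetical pre-flight check (not part of this commit): the pinned
# dependencies, notably torch==1.5.1, require Python 3.8 or lower.
import sys

if sys.version_info[:2] > (3, 8):
    raise SystemExit(
        f"Python {sys.version_info[0]}.{sys.version_info[1]} detected; "
        "this Space targets Python 3.8 or lower."
    )
print("Python version OK")
```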
app.py ADDED
@@ -0,0 +1,134 @@
+ import functools
+ import html
+ import logging
+ import traceback
+ from pathlib import Path
+ from typing import List
+
+ import gradio as gr
+ import pandas as pd
+ from rxn.utilities.logging import setup_console_logger
+ from rxn.utilities.strings import remove_postfix
+
+ from utils import TranslatorWithSentencePiece, split_into_sentences
+
+ logger = logging.getLogger(__name__)
+ logger.addHandler(logging.NullHandler())
+
+ _SAC_MODEL_TAG = "Heterogenous SAC model (ACE)"
+ _ORGANIC_MODEL_TAG = "Organic chemistry model"
+
+ model_type_to_models = {
+     _SAC_MODEL_TAG: ["sac.pt"],
+     _ORGANIC_MODEL_TAG: ["organic-1.pt", "organic-2.pt", "organic-3.pt"],
+ }
+
+
+ @functools.lru_cache
+ def load_model(model_type: str) -> TranslatorWithSentencePiece:
+     logger.info(f"Loading model {model_type}...")
+
+     model_files = model_type_to_models[model_type]
+     sp_model = "sp_model.model"
+
+     model = TranslatorWithSentencePiece(
+         translation_model=model_files,
+         sentencepiece_model=sp_model,
+     )
+     logger.info(f"Loading model {model_type}... Done.")
+
+     return model
+
+
+ def sentence_and_actions_to_html(
+     sentence: str, action_string: str, show_sentences: bool
+ ) -> str:
+     output = ""
+     li_start = '<li style="margin-left: 12px;">' if show_sentences else "<li>"
+     li_end = "</li>"
+
+     action_string = remove_postfix(action_string, ".")
+
+     if show_sentences:
+         output += f"<p>{sentence}</p>"
+
+     actions = [f"{li_start}{action}{li_end}" for action in action_string.split("; ")]
+     output += "".join(actions)
+
+     if show_sentences:
+         # If we show the sentence, we need the list/enumeration delimiters,
+         # as there is one list per sentence.
+         output = f"<ol>{output}</ol>"
+
+     return output
+
+
+ def try_action_extraction(model_type: str, text: str, show_sentences: bool) -> str:
+     model = load_model(model_type)
+     sentences = split_into_sentences(text)
+     action_strings = model.translate(sentences)
+
+     output = ""
+     for sentence, action_string in zip(sentences, action_strings):
+         output += sentence_and_actions_to_html(sentence, action_string, show_sentences)
+
+     if not show_sentences:
+         # If the sentences were not shown, we need to add the list/enumeration
+         # delimiters here (globally).
+         output = f"<ol>{output}</ol>"
+
+     # PostTreatment was renamed to ThermalTreatment; the old model still
+     # relies on the former.
+     output = output.replace("POSTTREATMENT", "THERMALTREATMENT")
+
+     return output
+
+
+ def action_extraction(model_type: str, text: str, show_sentences: bool) -> str:
+     try:
+         return try_action_extraction(model_type, text, show_sentences)
+     except Exception as e:
+         tb = "".join(traceback.TracebackException.from_exception(e).format())
+         tb_html = f"<pre>{html.escape(tb)}</pre>"
+         return f"<p><b>Error!</b> The action extraction failed: {e}</p>{tb_html}"
+
+
+ def launch() -> gr.Interface:
+     logger.info("Launching the Gradio app")
+
+     metadata_dir = Path(__file__).parent / "model_cards"
+
+     examples_df: pd.DataFrame = pd.read_csv(
+         metadata_dir / "sac_synthesis_mining_examples.csv", header=None
+     ).fillna("")
+     examples: List[List[str]] = examples_df.to_numpy().tolist()
+
+     with open(metadata_dir / "sac_synthesis_mining_article.md", "r") as f:
+         article = f.read()
+     with open(metadata_dir / "sac_synthesis_mining_description.md", "r") as f:
+         description = f.read()
+
+     demo = gr.Interface(
+         fn=action_extraction,
+         title="Extraction of synthesis protocols from paragraphs in machine readable format",
+         inputs=[
+             gr.Dropdown(
+                 [_SAC_MODEL_TAG, _ORGANIC_MODEL_TAG],
+                 label="Model",
+                 value=_SAC_MODEL_TAG,
+             ),
+             gr.Textbox(label="Synthesis text", lines=7, placeholder=examples[0][1]),
+             gr.Checkbox(label="Show sentences in the output"),
+         ],
+         outputs=gr.HTML(label="Output"),
+         article=article,
+         description=description,
+         examples=examples,
+         allow_flagging="never",
+         theme="gradio/base",
+     )
+     demo.launch(debug=True, show_error=True)
+     return demo
+
+
+ setup_console_logger(level="INFO")
+ demo = launch()
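A note on the `@functools.lru_cache` decorator on `load_model` above: it memoizes the loaded translator per model tag, so switching back and forth in the dropdown never reloads the roughly 90 MB checkpoints. A toy sketch of the same pattern (names here are illustrative only, not from this repo):

```python
# Toy illustration of the caching pattern used by load_model(): with
# functools.lru_cache, the expensive load runs once per distinct tag,
# and every later call returns the cached object.
import functools


@functools.lru_cache
def load(tag: str) -> str:
    print(f"loading {tag}")  # the expensive part; runs once per tag
    return f"<model {tag}>"


load("sac")                        # prints "loading sac"
load("sac")                        # cache hit: no print
assert load("sac") is load("sac")  # the very same object is returned
```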
model_cards/sac_synthesis_mining.png ADDED
model_cards/sac_synthesis_mining_article.md ADDED
@@ -0,0 +1,81 @@
+ # Model documentation & parameters
+
+ ## Parameters
+
+ ### Model
+ Whether to use the model trained (1) on procedures for heterogeneous single-atom catalyst synthesis or (2) on organic chemistry procedures.
+
+ ### Synthesis text
+ The synthesis procedure (in English prose) to extract actions from.
+
+
+ # Model card -- Text mining synthesis protocols of heterogeneous single-atom catalysts
+
+ **Model Details**:
+ Sequence-to-sequence transformer model.
+
+ **Developers**:
+ Manu Suvarna, Alain C. Vaucher, Sharon Mitchell, Teodoro Laino, and Javier Pérez-Ramírez.
+
+ **Distributors**:
+ Same as the *developers*.
+
+ **Model date**:
+ April 2023.
+
+ **Algorithm version**:
+ Details in the source code and in the paper.
+
+ **Model type**:
+ A Transformer-based sequence-to-sequence language model that extracts synthesis actions from procedure text.
+ The model relies on the [OpenNMT](https://github.com/OpenNMT/OpenNMT-py) library.
+
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
+ Details in the source code and in the paper.
+
+ **Paper or other resource for more information**:
+ Currently under review.
+
+ **License**: MIT
+
+ **Where to send questions or comments about the model**:
+ Contact one of the *developers*.
+
+ **Intended Use. Use cases that were envisioned during development**:
+ Chemical research, in particular in the field of heterogeneous single-atom catalysts.
+
+ **Primary intended uses/users**:
+ Researchers and computational chemists using the model for model comparison or research exploration purposes.
+
+ **Out-of-scope use cases**:
+ Production-level inference.
+
+ **Factors**:
+ Not applicable.
+
+ **Metrics**:
+ Details in the source code and in the paper.
+
+ **Datasets**:
+ Details in the source code and in the paper.
+
+ **Ethical Considerations**:
+ No specific considerations, as no private or personal data is involved.
+ Please consult with the authors in case of questions.
+
+ **Caveats and Recommendations**:
+ Please consult with the original authors in case of questions.
+
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596).
+
+
+ ## Citation
+
+ ```bib
+ @article{suvarna2023textmining,
+     title={Text mining and standardization of single-atom catalyst protocols to foster digital synthesis},
+     author={Manu Suvarna and Alain C. Vaucher and Sharon Mitchell and Teodoro Laino and Javier Pérez-Ramírez},
+     journal={under review},
+ }
+ ```
model_cards/sac_synthesis_mining_description.md ADDED
@@ -0,0 +1,9 @@
+ <img align="right" src="https://to/be/completed/sac_synthesis_mining.png" alt="logo" width="240" >
+
+ ### Text mining synthesis protocols of heterogeneous catalysts
+
+ The following lets you test the model presented in the paper "Text mining and standardization of single-atom catalyst protocols to foster digital synthesis". The model is a transformer-based sequence-to-sequence language model that extracts synthesis actions from procedure text.
+
+ For **examples** and **documentation** of the model parameters, please see the model card below.
+
+ Disclaimer: The model has been primarily trained to capture synthesis protocols for SACs prepared by solution-phase and thermal treatments. It can also extract, with acceptable accuracy, synthesis details of other families of heterogeneous catalysts prepared by wet-chemistry routes, including precipitation, impregnation, and sol-gel.
model_cards/sac_synthesis_mining_examples.csv ADDED
@@ -0,0 +1,2 @@
+ "Heterogenous SAC model (ACE)","In a typical procedure, nickel nitrate hexahydrate (3.48 g) and sodium citrate (4.71 g) were dissolved in 400 mL water to form clear solution A. In the meantime, potassium hexacyanocobaltate (III) (2.66 g) was dissolved into water (200 mL) to form clear solution B. Then, solutions A and B were mixed under magnetic stirring. Stirring was stopped after combining the component solutions. After 24 h, the solid was collected by centrifugation, washed with water and ethanol, and then dried at room temperature. Then, the dried sample was annealed at 500 °C in NH3 for 1 h with a slow heating rate of 2°C min−1 , and immersed in 5 M H2SO4 solution and stirred at 80°C for 24 h to form CoNi-SAs/NC hollow cubes."
+ "Organic chemistry model","A solution of ((1S,2S)-1-{[(4'-methoxymethyl-biphenyl-4-yl)-(2-pyridin-2-yl-cyclopropanecarbonyl)-amino]-methyl}-2-methyl-butyl)-carbamic acid tert-butyl ester (25 mg, 0.045 mmol) and dichloromethane (4 mL) was treated with a solution of HCl in dioxane (4 N, 0.5 mL) and the resulting reaction mixture was maintained at room temperature for 12 h. The reaction was then concentrated to dryness to afford (1R,2R)-2-pyridin-2-yl-cyclopropanecarboxylic acid ((2S,3S)-2-amino-3-methylpentyl)-(4'-methoxymethyl-biphenyl-4-yl)-amide (18 mg, 95% yield) as a white solid."
organic-1.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0288cf80f5e3c73168c17284e6ea066a529d83e411c26e448ced2fdde6d7852e
+ size 89958148
organic-2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1671e0030de5389bba43043acee36f91aaba87d29a9a1fedb1a4e2772fb42924
+ size 89958148
organic-3.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a00a0faf86978990fa2074d5d35564b5653a4464ea49b5c11f5cbd6ff962da98
+ size 89958148
pre-requirements.txt ADDED
@@ -0,0 +1,4 @@
+ future==0.18.3
+ numpy==1.24.2
+ pip==23.0.1
+ torch==1.5.1
requirements.txt ADDED
@@ -0,0 +1,89 @@
+ aiofiles==23.1.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==4.2.2
+ anyio==3.6.2
+ appdirs==1.4.4
+ async-timeout==4.0.2
+ attrs==22.2.0
+ beautifulsoup4==4.12.2
+ cffi==1.15.1
+ charset-normalizer==3.1.0
+ ChemDataExtractor==1.3.0
+ click==8.1.3
+ ConfigArgParse==1.5.3
+ contourpy==1.0.7
+ cryptography==40.0.2
+ cssselect==1.2.0
+ cycler==0.11.0
+ DAWG==0.8.0
+ diskcache==5.5.1
+ dnspython==2.3.0
+ entrypoints==0.4
+ fastapi==0.95.1
+ ffmpy==0.3.0
+ filelock==3.11.0
+ fonttools==4.39.3
+ frozenlist==1.3.3
+ fsspec==2023.4.0
+ gradio==3.25.0
+ gradio_client==0.1.2
+ h11==0.14.0
+ httpcore==0.17.0
+ httpx==0.24.0
+ huggingface-hub==0.13.4
+ idna==3.4
+ importlib-resources==5.12.0
+ Jinja2==3.1.2
+ joblib==1.2.0
+ jsonschema==4.17.3
+ kiwisolver==1.4.4
+ linkify-it-py==2.0.0
+ lxml==4.9.2
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ multidict==6.0.4
+ nltk==3.8.1
+ orjson==3.8.10
+ packaging==23.1
+ pandas==2.0.0
+ pdfminer.six==20221105
+ Pillow==9.5.0
+ pkgutil_resolve_name==1.3.10
+ pycparser==2.21
+ pydantic==1.10.7
+ pydub==0.25.1
+ pymongo==4.3.3
+ pyparsing==3.0.9
+ pyrsistent==0.19.3
+ python-crfsuite==0.9.9
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-multipart==0.0.6
+ pytz==2023.3
+ PyYAML==6.0
+ regex==2023.3.23
+ requests==2.28.2
+ rxn-onmt-utils==1.0.2
+ rxn-opennmt-py==1.1.4
+ rxn-utils==1.1.9
+ semantic-version==2.10.0
+ sentencepiece==0.1.98
+ six==1.16.0
+ sniffio==1.3.0
+ soupsieve==2.4
+ starlette==0.26.1
+ toolz==0.12.0
+ torchtext==0.4.0
+ tqdm==4.65.0
+ typing_extensions==4.5.0
+ tzdata==2023.3
+ uc-micro-py==1.0.1
+ urllib3==1.26.15
+ uvicorn==0.21.1
+ websockets==11.0.1
+ yarl==1.8.2
+ zipp==3.15.0
sac.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc8a569656c093246f639ac55e6890bea70a070d8b1ea6bd3e32cdbf7486c3a7
+ size 89958148
sp_model.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7e903009fc018b4de56efa71028eea13aa523513a1a0bad14f027dff36cb587
+ size 265630
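The `*.pt` and `sp_model.model` entries above are Git LFS pointer files rather than the binaries themselves: each records the artifact's sha256 digest (`oid`) and byte `size`. A hedged sketch of a helper (`verify_lfs_pointer` is hypothetical, not part of this repo) that checks a downloaded artifact against its pointer:

```python
# Hypothetical helper (not part of this repo): verify a downloaded
# artifact against the oid/size recorded in a Git LFS pointer file.
import hashlib
from pathlib import Path


def verify_lfs_pointer(pointer_path: str, artifact_path: str) -> bool:
    # Pointer files consist of "key value" lines: version, oid, size.
    fields = dict(
        line.split(" ", 1) for line in Path(pointer_path).read_text().splitlines()
    )
    expected_oid = fields["oid"].split(":", 1)[1]  # strip the "sha256:" prefix
    expected_size = int(fields["size"])

    data = Path(artifact_path).read_bytes()  # ~90 MB per checkpoint: fine for a sketch
    return len(data) == expected_size and hashlib.sha256(data).hexdigest() == expected_oid
```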
sp_model.vocab ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,57 @@
+ import logging
+ from typing import Iterable, Iterator, List, Union
+
+ import chemdataextractor
+ import sentencepiece as spm
+ from rxn.onmt_utils.internal_translation_utils import TranslationResult
+ from rxn.onmt_utils.translator import Translator
+
+ logger = logging.getLogger(__name__)
+ logger.addHandler(logging.NullHandler())
+
+
+ def split_into_sentences(text: str) -> List[str]:
+     paragraph = chemdataextractor.doc.Paragraph(text)
+     return [sentence.text for sentence in paragraph.sentences]
+
+
+ class SentencePieceTokenizer:
+     def __init__(self, model_file: str):
+         self.sp = spm.SentencePieceProcessor()
+         self.sp.Load(model_file)
+
+     def tokenize(self, sentence: str) -> str:
+         tokens = self.sp.EncodeAsPieces(sentence)
+         tokenized = " ".join(tokens)
+         return tokenized
+
+     def detokenize(self, sentence: str) -> str:
+         tokens = sentence.split(" ")
+         detokenized = self.sp.DecodePieces(tokens)
+         return detokenized
+
+
+ class TranslatorWithSentencePiece:
+     def __init__(
+         self, translation_model: Union[str, Iterable[str]], sentencepiece_model: str
+     ):
+         self.sp = SentencePieceTokenizer(sentencepiece_model)
+         self.translator = Translator.from_model_path(translation_model)
+
+     def translate(self, sentences: List[str]) -> List[str]:
+         translations = self.translate_multiple_with_scores(sentences)
+         return [t[0].text for t in translations]
+
+     def translate_multiple_with_scores(
+         self, sentences: List[str], n_best: int = 1
+     ) -> Iterator[List[TranslationResult]]:
+         tokenized_sentences = [self.sp.tokenize(s) for s in sentences]
+
+         translations = self.translator.translate_multiple_with_scores(
+             tokenized_sentences, n_best
+         )
+
+         for translation_group in translations:
+             for t in translation_group:
+                 t.text = self.sp.detokenize(t.text)
+             yield translation_group
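Putting the pieces together, the round trip in `TranslatorWithSentencePiece` is tokenize → translate → detokenize, producing one action string per input sentence. A minimal usage sketch, assuming the checkpoints from this commit sit in the working directory and the rxn/ChemDataExtractor dependencies are installed:

```python
# Minimal end-to-end sketch of the utilities above; assumes sac.pt and
# sp_model.model (from this commit) are in the current directory.
from utils import TranslatorWithSentencePiece, split_into_sentences

model = TranslatorWithSentencePiece(
    translation_model=["sac.pt"],          # one or more OpenNMT checkpoints
    sentencepiece_model="sp_model.model",  # shared subword tokenizer
)

text = (
    "The solid was collected by centrifugation, washed with water and "
    "ethanol, and then dried at room temperature."
)

# translate() returns one semicolon-separated action string per sentence
# (app.py splits on "; " when rendering the HTML list).
sentences = split_into_sentences(text)
for sentence, action_string in zip(sentences, model.translate(sentences)):
    print(sentence)
    print("  ->", action_string)
```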