Alain Vaucher
committed on
Commit
•
4d89100
1
Parent(s):
0c13a31
Implement the app
Browse files- LICENSE +21 -0
- README.md +29 -4
- app.py +134 -0
- model_cards/sac_synthesis_mining.png +0 -0
- model_cards/sac_synthesis_mining_article.md +81 -0
- model_cards/sac_synthesis_mining_description.md +9 -0
- model_cards/sac_synthesis_mining_examples.csv +2 -0
- organic-1.pt +3 -0
- organic-2.pt +3 -0
- organic-3.pt +3 -0
- pre-requirements.txt +4 -0
- requirements.txt +89 -0
- sac.pt +3 -0
- sp_model.model +3 -0
- sp_model.vocab +0 -0
- utils.py +57 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 rxn4chemistry
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1,38 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.27.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Extraction of synthesis protocols from paragraphs in machine readable format
|
3 |
+
emoji: 💡
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.27.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
+
python_version: 3.8.13
|
12 |
+
pip_version: 23.0.1
|
13 |
---
|
14 |
|
15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
16 |
+
|
17 |
+
## Local testing
|
18 |
+
|
19 |
+
Set up an environment (Note: use Python 3.8 or lower):
|
20 |
+
```bash
|
21 |
+
# Option 1: conda
|
22 |
+
conda create -n gradio python=3.8 -y
|
23 |
+
conda activate gradio
|
24 |
+
# Option 2: venv
|
25 |
+
python3.8 -m venv venv
|
26 |
+
source venv/bin/activate
|
27 |
+
```
|
28 |
+
|
29 |
+
Install the dependencies:
|
30 |
+
```bash
|
31 |
+
pip install -r pre-requirements.txt
|
32 |
+
pip install -r requirements.txt
|
33 |
+
```
|
34 |
+
|
35 |
+
Run:
|
36 |
+
```bash
|
37 |
+
gradio app.py
|
38 |
+
```
|
app.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import functools
|
2 |
+
import html
|
3 |
+
import logging
|
4 |
+
import traceback
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import List
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import pandas as pd
|
10 |
+
from rxn.utilities.logging import setup_console_logger
|
11 |
+
from rxn.utilities.strings import remove_postfix
|
12 |
+
|
13 |
+
from utils import TranslatorWithSentencePiece, split_into_sentences
|
14 |
+
|
15 |
+
# Module-level logger; handler configuration happens at app startup
# (setup_console_logger at the bottom of this file).
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Display names for the two selectable models, shown in the gr.Dropdown.
# NOTE(review): "Heterogenous" is a spelling slip, but the same string is
# used verbatim in model_cards/sac_synthesis_mining_examples.csv — changing
# it here alone would break the example rows, so it is kept as-is.
_SAC_MODEL_TAG = "Heterogenous SAC model (ACE)"
_ORGANIC_MODEL_TAG = "Organic chemistry model"

# Display name -> checkpoint file(s). The organic-chemistry model is an
# ensemble of three checkpoints; the SAC model is a single checkpoint.
model_type_to_models = {
    _SAC_MODEL_TAG: ["sac.pt"],
    _ORGANIC_MODEL_TAG: ["organic-1.pt", "organic-2.pt", "organic-3.pt"],
}
|
25 |
+
|
26 |
+
|
27 |
+
@functools.lru_cache
def load_model(model_type: str) -> TranslatorWithSentencePiece:
    """Instantiate the translator for the given model tag (cached per tag).

    Args:
        model_type: one of the keys of ``model_type_to_models``.

    Returns:
        A ``TranslatorWithSentencePiece`` combining the checkpoint(s) for
        that tag with the shared SentencePiece model. Thanks to
        ``lru_cache``, each model is loaded at most once per process.
    """
    logger.info(f"Loading model {model_type}... ")

    translator = TranslatorWithSentencePiece(
        translation_model=model_type_to_models[model_type],
        sentencepiece_model="sp_model.model",
    )

    logger.info(f"Loading model {model_type}... Done.")
    return translator
|
41 |
+
|
42 |
+
|
43 |
+
def sentence_and_actions_to_html(
    sentence: str, action_string: str, show_sentences: bool
) -> str:
    """Render one sentence and its extracted actions as an HTML fragment.

    Args:
        sentence: the original procedure sentence.
        action_string: the model output for that sentence, actions joined
            by "; " and possibly ending with a period.
        show_sentences: if True, the sentence itself is rendered in a <p>
            before its actions and the action list is wrapped in its own
            <ol>; if False, only bare <li> items are produced and the
            caller wraps everything in a single global <ol>.

    Returns:
        An HTML fragment (string).
    """
    li_start = '<li style="margin-left: 12px;">' if show_sentences else "<li>"
    li_end = "</li>"

    # Inline equivalent of rxn.utilities.strings.remove_postfix(s, "."):
    # drop one trailing period so it does not end up inside the last <li>.
    if action_string.endswith("."):
        action_string = action_string[:-1]

    output = ""
    if show_sentences:
        # Escape the raw sentence: procedure text may contain characters
        # such as '<' or '&' that would otherwise break the markup.
        output += f"<p>{html.escape(sentence)}</p>"

    output += "".join(
        f"{li_start}{html.escape(action)}{li_end}"
        for action in action_string.split("; ")
    )

    if show_sentences:
        # One enumeration per sentence when sentences are displayed.
        output = f"<ol>{output}</ol>"

    return output
|
64 |
+
|
65 |
+
|
66 |
+
def try_action_extraction(model_type: str, text: str, show_sentences: bool) -> str:
    """Run the full extraction pipeline and return the result as HTML.

    Splits ``text`` into sentences, translates each sentence into an
    action string with the selected model, and renders everything as HTML.
    May raise; ``action_extraction`` wraps this with error reporting.
    """
    model = load_model(model_type)
    sentences = split_into_sentences(text)
    action_strings = model.translate(sentences)

    fragments = [
        sentence_and_actions_to_html(sentence, actions, show_sentences)
        for sentence, actions in zip(sentences, action_strings)
    ]
    output = "".join(fragments)

    if not show_sentences:
        # Without per-sentence display there is one global enumeration,
        # so the list delimiters are added here instead.
        output = f"<ol>{output}</ol>"

    # PostTreatment was renamed to ThermalTreatment, old model still relies on the former
    return output.replace("POSTTREATMENT", "THERMALTREATMENT")
|
84 |
+
|
85 |
+
|
86 |
+
def action_extraction(model_type: str, text: str, show_sentences: bool) -> str:
    """Entry point used by the Gradio interface.

    Delegates to ``try_action_extraction`` and, instead of letting any
    exception propagate to the UI, renders the error message and its
    (HTML-escaped) traceback as the HTML output.
    """
    try:
        return try_action_extraction(model_type, text, show_sentences)
    except Exception as exc:
        trace = "".join(traceback.TracebackException.from_exception(exc).format())
        trace_html = f"<pre>{html.escape(trace)}</pre>"
        return f"<p><b>Error!</b> The action extraction failed: {exc}</p>{trace_html}"
|
93 |
+
|
94 |
+
|
95 |
+
def launch() -> gr.Interface:
    """Build the Gradio interface, start it, and return it."""
    logger.info("Launching the Gradio app")

    cards_dir = Path(__file__).parent / "model_cards"

    # The examples CSV has no header row: column 0 is the model tag,
    # column 1 the example procedure text. NaNs become empty strings.
    examples: List[List[str]] = (
        pd.read_csv(cards_dir / "sac_synthesis_mining_examples.csv", header=None)
        .fillna("")
        .to_numpy()
        .tolist()
    )

    article = (cards_dir / "sac_synthesis_mining_article.md").read_text()
    description = (cards_dir / "sac_synthesis_mining_description.md").read_text()

    interface = gr.Interface(
        fn=action_extraction,
        title="Extraction of synthesis protocols from paragraphs in machine readable format",
        inputs=[
            gr.Dropdown(
                [_SAC_MODEL_TAG, _ORGANIC_MODEL_TAG],
                label="Model",
                value=_SAC_MODEL_TAG,
            ),
            gr.Textbox(label="Synthesis text", lines=7, placeholder=examples[0][1]),
            gr.Checkbox(label="Show sentences in the output"),
        ],
        outputs=gr.HTML(label="Output"),
        article=article,
        description=description,
        examples=examples,
        allow_flagging="never",
        theme="gradio/base",
    )
    interface.launch(debug=True, show_error=True)
    return interface
|
131 |
+
|
132 |
+
|
133 |
+
# Configure console logging before the app starts so model-loading
# progress is visible in the Space logs.
setup_console_logger(level="INFO")

# A module-level `demo` is what `gradio app.py` (and HF Spaces) picks up
# and serves.
demo = launch()
|
model_cards/sac_synthesis_mining.png
ADDED
model_cards/sac_synthesis_mining_article.md
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model documentation & parameters
|
2 |
+
|
3 |
+
## Parameters
|
4 |
+
|
5 |
+
### Model
|
6 |
+
Whether to use the model trained 1) on procedures for heterogeneous single-atom catalyst synthesis, or 2) on organic chemistry procedures.
|
7 |
+
|
8 |
+
### Synthesis text
|
9 |
+
Synthesis procedure (in English prose) to extract actions from.
|
10 |
+
|
11 |
+
|
12 |
+
# Model card -- Text mining synthesis protocols of heterogeneous single-atom catalysts
|
13 |
+
|
14 |
+
**Model Details**:
|
15 |
+
Sequence-to-sequence transformer model
|
16 |
+
|
17 |
+
**Developers**:
|
18 |
+
Manu Suvarna, Alain C. Vaucher, Sharon Mitchell, Teodoro Laino, and Javier Pérez-Ramírez.
|
19 |
+
|
20 |
+
**Distributors**:
|
21 |
+
Same as the *developers*.
|
22 |
+
|
23 |
+
**Model date**:
|
24 |
+
April 2023.
|
25 |
+
|
26 |
+
**Algorithm version**:
|
27 |
+
Details in the source code and in the paper.
|
28 |
+
|
29 |
+
**Model type**:
|
30 |
+
A Transformer-based sequence-to-sequence language model that extracts synthesis actions from procedure text.
|
31 |
+
The model relies on the [OpenNMT](https://github.com/OpenNMT/OpenNMT-py) library.
|
32 |
+
|
33 |
+
**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
|
34 |
+
Details in the source code and in the paper.
|
35 |
+
|
36 |
+
**Paper or other resource for more information**:
|
37 |
+
Currently under review.
|
38 |
+
|
39 |
+
**License**: MIT
|
40 |
+
|
41 |
+
**Where to send questions or comments about the model**:
|
42 |
+
Contact one of the *developers*.
|
43 |
+
|
44 |
+
**Intended Use. Use cases that were envisioned during development**:
|
45 |
+
Chemical research, in particular in the field of heterogeneous single-atom catalysts.
|
46 |
+
|
47 |
+
**Primary intended uses/users**:
|
48 |
+
Researchers and computational chemists using the model for model comparison or research exploration purposes.
|
49 |
+
|
50 |
+
**Out-of-scope use cases**:
|
51 |
+
Production-level inference.
|
52 |
+
|
53 |
+
**Factors**:
|
54 |
+
Not applicable.
|
55 |
+
|
56 |
+
**Metrics**:
|
57 |
+
Details in the source code and in the paper.
|
58 |
+
|
59 |
+
**Datasets**:
|
60 |
+
Details in the source code and in the paper.
|
61 |
+
|
62 |
+
**Ethical Considerations**:
|
63 |
+
No specific considerations as no private/personal data is involved.
|
64 |
+
Please consult with the authors in case of questions.
|
65 |
+
|
66 |
+
**Caveats and Recommendations**:
|
67 |
+
Please consult with original authors in case of questions.
|
68 |
+
|
69 |
+
Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596).
|
70 |
+
|
71 |
+
|
72 |
+
## Citation
|
73 |
+
|
74 |
+
```bib
|
75 |
+
@article{suvarna2023textmining,
|
76 |
+
title={Text mining and standardization of single-atom catalyst protocols to foster digital synthesis},
|
77 |
+
author={Manu Suvarna and Alain C. Vaucher and Sharon Mitchell and Teodoro Laino and Javier Pérez-Ramírez},
|
78 |
+
journal={under review},
|
79 |
+
}
|
80 |
+
```
|
81 |
+
|
model_cards/sac_synthesis_mining_description.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<img align="right" src="https://to/be/completed/sac_synthesis_mining.png" alt="logo" width="240" >
|
2 |
+
|
3 |
+
### Text mining synthesis protocols of heterogeneous catalysts
|
4 |
+
|
5 |
+
The following lets you test the model presented in the paper "Text mining and standardization of single-atom catalyst protocols to foster digital synthesis". The model is a transformer-based sequence-to-sequence language model that extracts synthesis actions from procedure text.
|
6 |
+
|
7 |
+
For **examples** and **documentation** of the model parameters, please see the model card below.
|
8 |
+
|
9 |
+
Disclaimer: The model has been primarily trained to capture synthesis protocols for SACs by solution-phase and thermal treatments. It can also extract synthesis details of other families of heterogeneous catalysts prepared by wet chemistry routes, including precipitation, impregnation, and sol-gel to an acceptable accuracy.
|
model_cards/sac_synthesis_mining_examples.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
"Heterogenous SAC model (ACE)","In a typical procedure, nickel nitrate hexahydrate (3.48 g) and sodium citrate (4.71 g) were dissolved in 400 mL water to form clear solution A. In the meantime, potassium hexacyanocobaltate (III) (2.66 g) was dissolved into water (200 mL) to form clear solution B. Then, solutions A and B were mixed under magnetic stirring. Stirring was stopped after combining the component solutions. After 24 h, the solid was collected by centrifugation, washed with water and ethanol, and then dried at room temperature. Then, the dried sample was annealed at 500 °C in NH3 for 1 h with a slow heating rate of 2°C min−1 , and immersed in 5 M H2SO4 solution and stirred at 80°C for 24 h to form CoNi-SAs/NC hollow cubes."
|
2 |
+
"Organic chemistry model","A solution of ((1S,2S)-1-{[(4'-methoxymethyl-biphenyl-4-yl)-(2-pyridin-2-yl-cyclopropanecarbonyl)-amino]-methyl}-2-methyl-butyl)-carbamic acid tert-butyl ester (25 mg, 0.045 mmol) and dichloromethane (4 mL) was treated with a solution of HCl in dioxane (4 N, 0.5 mL) and the resulting reaction mixture was maintained at room temperature for 12 h. The reaction was then concentrated to dryness to afford (1R,2R)-2-pyridin-2-yl-cyclopropanecarboxylic acid ((2S,3S)-2-amino-3-methylpentyl)-(4'-methoxymethyl-biphenyl-4-yl)-amide (18 mg, 95% yield) as a white solid."
|
organic-1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0288cf80f5e3c73168c17284e6ea066a529d83e411c26e448ced2fdde6d7852e
|
3 |
+
size 89958148
|
organic-2.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1671e0030de5389bba43043acee36f91aaba87d29a9a1fedb1a4e2772fb42924
|
3 |
+
size 89958148
|
organic-3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a00a0faf86978990fa2074d5d35564b5653a4464ea49b5c11f5cbd6ff962da98
|
3 |
+
size 89958148
|
pre-requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
future==0.18.3
|
2 |
+
numpy==1.24.2
|
3 |
+
pip==23.0.1
|
4 |
+
torch==1.5.1
|
requirements.txt
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.1.0
|
2 |
+
aiohttp==3.8.4
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==4.2.2
|
5 |
+
anyio==3.6.2
|
6 |
+
appdirs==1.4.4
|
7 |
+
async-timeout==4.0.2
|
8 |
+
attrs==22.2.0
|
9 |
+
beautifulsoup4==4.12.2
|
10 |
+
cffi==1.15.1
|
11 |
+
charset-normalizer==3.1.0
|
12 |
+
ChemDataExtractor==1.3.0
|
13 |
+
click==8.1.3
|
14 |
+
ConfigArgParse==1.5.3
|
15 |
+
contourpy==1.0.7
|
16 |
+
cryptography==40.0.2
|
17 |
+
cssselect==1.2.0
|
18 |
+
cycler==0.11.0
|
19 |
+
DAWG==0.8.0
|
20 |
+
diskcache==5.5.1
|
21 |
+
dnspython==2.3.0
|
22 |
+
entrypoints==0.4
|
23 |
+
fastapi==0.95.1
|
24 |
+
ffmpy==0.3.0
|
25 |
+
filelock==3.11.0
|
26 |
+
fonttools==4.39.3
|
27 |
+
frozenlist==1.3.3
|
28 |
+
fsspec==2023.4.0
|
29 |
+
gradio==3.25.0
|
30 |
+
gradio_client==0.1.2
|
31 |
+
h11==0.14.0
|
32 |
+
httpcore==0.17.0
|
33 |
+
httpx==0.24.0
|
34 |
+
huggingface-hub==0.13.4
|
35 |
+
idna==3.4
|
36 |
+
importlib-resources==5.12.0
|
37 |
+
Jinja2==3.1.2
|
38 |
+
joblib==1.2.0
|
39 |
+
jsonschema==4.17.3
|
40 |
+
kiwisolver==1.4.4
|
41 |
+
linkify-it-py==2.0.0
|
42 |
+
lxml==4.9.2
|
43 |
+
markdown-it-py==2.2.0
|
44 |
+
MarkupSafe==2.1.2
|
45 |
+
matplotlib==3.7.1
|
46 |
+
mdit-py-plugins==0.3.3
|
47 |
+
mdurl==0.1.2
|
48 |
+
multidict==6.0.4
|
49 |
+
nltk==3.8.1
|
50 |
+
orjson==3.8.10
|
51 |
+
packaging==23.1
|
52 |
+
pandas==2.0.0
|
53 |
+
pdfminer.six==20221105
|
54 |
+
Pillow==9.5.0
|
55 |
+
pkgutil_resolve_name==1.3.10
|
56 |
+
pycparser==2.21
|
57 |
+
pydantic==1.10.7
|
58 |
+
pydub==0.25.1
|
59 |
+
pymongo==4.3.3
|
60 |
+
pyparsing==3.0.9
|
61 |
+
pyrsistent==0.19.3
|
62 |
+
python-crfsuite==0.9.9
|
63 |
+
python-dateutil==2.8.2
|
64 |
+
python-dotenv==1.0.0
|
65 |
+
python-multipart==0.0.6
|
66 |
+
pytz==2023.3
|
67 |
+
PyYAML==6.0
|
68 |
+
regex==2023.3.23
|
69 |
+
requests==2.28.2
|
70 |
+
rxn-onmt-utils==1.0.2
|
71 |
+
rxn-opennmt-py==1.1.4
|
72 |
+
rxn-utils==1.1.9
|
73 |
+
semantic-version==2.10.0
|
74 |
+
sentencepiece==0.1.98
|
75 |
+
six==1.16.0
|
76 |
+
sniffio==1.3.0
|
77 |
+
soupsieve==2.4
|
78 |
+
starlette==0.26.1
|
79 |
+
toolz==0.12.0
|
80 |
+
torchtext==0.4.0
|
81 |
+
tqdm==4.65.0
|
82 |
+
typing_extensions==4.5.0
|
83 |
+
tzdata==2023.3
|
84 |
+
uc-micro-py==1.0.1
|
85 |
+
urllib3==1.26.15
|
86 |
+
uvicorn==0.21.1
|
87 |
+
websockets==11.0.1
|
88 |
+
yarl==1.8.2
|
89 |
+
zipp==3.15.0
|
sac.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc8a569656c093246f639ac55e6890bea70a070d8b1ea6bd3e32cdbf7486c3a7
|
3 |
+
size 89958148
|
sp_model.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f7e903009fc018b4de56efa71028eea13aa523513a1a0bad14f027dff36cb587
|
3 |
+
size 265630
|
sp_model.vocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|
utils.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from typing import Iterable, Iterator, List, Union
|
3 |
+
|
4 |
+
import sentencepiece as spm
|
5 |
+
import chemdataextractor
|
6 |
+
from rxn.onmt_utils.internal_translation_utils import TranslationResult
|
7 |
+
from rxn.onmt_utils.translator import Translator
|
8 |
+
|
9 |
+
# Library-style logger: attach a NullHandler so importing this module never
# configures logging on behalf of the application.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
|
11 |
+
|
12 |
+
|
13 |
+
def split_into_sentences(text: str) -> List[str]:
    """Split a procedure paragraph into sentences with ChemDataExtractor.

    ChemDataExtractor's sentence tokenizer is chemistry-aware, so it does
    not break on abbreviations such as "e.g." or chemical names.
    """
    paragraph = chemdataextractor.doc.Paragraph(text)
    return [s.text for s in paragraph.sentences]
|
16 |
+
|
17 |
+
|
18 |
+
class SentencePieceTokenizer:
    """Thin wrapper around a SentencePiece model for (de)tokenization."""

    def __init__(self, model_file: str):
        """Load the SentencePiece model from ``model_file``."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_file)

    def tokenize(self, sentence: str) -> str:
        """Encode ``sentence`` into space-separated SentencePiece pieces."""
        return " ".join(self.sp.EncodeAsPieces(sentence))

    def detokenize(self, sentence: str) -> str:
        """Inverse of :meth:`tokenize`: join pieces back into plain text."""
        return self.sp.DecodePieces(sentence.split(" "))
|
32 |
+
|
33 |
+
|
34 |
+
class TranslatorWithSentencePiece:
    """OpenNMT translator that tokenizes/detokenizes with SentencePiece."""

    def __init__(
        self, translation_model: Union[str, Iterable[str]], sentencepiece_model: str
    ):
        """Set up the tokenizer and the underlying translator.

        Args:
            translation_model: path(s) to the OpenNMT checkpoint(s); several
                paths form an ensemble.
            sentencepiece_model: path to the SentencePiece model file.
        """
        self.sp = SentencePieceTokenizer(sentencepiece_model)
        self.translator = Translator.from_model_path(translation_model)

    def translate(self, sentences: List[str]) -> List[str]:
        """Return the single best translation for each input sentence."""
        best_per_sentence = self.translate_multiple_with_scores(sentences)
        return [candidates[0].text for candidates in best_per_sentence]

    def translate_multiple_with_scores(
        self, sentences: List[str], n_best=1
    ) -> Iterator[List[TranslationResult]]:
        """Yield, per input sentence, its ``n_best`` translation candidates.

        The candidates' ``text`` attributes are detokenized in place before
        each group is yielded.
        """
        tokenized = [self.sp.tokenize(sentence) for sentence in sentences]

        for candidate_group in self.translator.translate_multiple_with_scores(
            tokenized, n_best
        ):
            for candidate in candidate_group:
                candidate.text = self.sp.detokenize(candidate.text)
            yield candidate_group
|