jinysun committed
Commit 36c5570 · 1 Parent(s): 6edf9da

Upload 10 files
Files changed (10)
  1. .gitignore +160 -0
  2. 15data.h5 +3 -0
  3. LICENSE +201 -0
  4. abcBERT.py +96 -0
  5. app.py +37 -0
  6. compound_constants.py +156 -0
  7. dataset.py +497 -0
  8. model.py +280 -0
  9. requirements.txt +10 -0
  10. utils.py +696 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
15data.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ec80795633fe96e7226a7e63909138e6f4fc37654dcff6831627b1670986497
+ size 17610752
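
This is a Git LFS pointer, not the HDF5 weights themselves; the 17 MB 15data.h5 that abcBERT.py loads lives in LFS storage. After cloning, it can be materialized with the standard LFS commands:

    git lfs install
    git lfs pull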
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
abcBERT.py ADDED
@@ -0,0 +1,96 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Thu Mar 2 15:05:03 2023
+
+ @author: BM109X32G-10GPU-02
+ """
+
+ import os
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ import tensorflow.keras as keras
+ from sklearn.metrics import r2_score, roc_auc_score
+
+ from dataset import predict_smiles
+ from model import PredictModel, BertModel
+
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+
+
+ def main(smiles):
+     keras.backend.clear_session()
+     os.environ['CUDA_VISIBLE_DEVICES'] = "-1"  # force CPU inference
+
+     # Architecture presets (num_layers / num_heads / d_model).
+     small = {'name': 'Small', 'num_layers': 3, 'num_heads': 4, 'd_model': 128, 'path': 'small_weights', 'addH': True}
+     medium = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_weights2', 'addH': True}
+     medium3 = {'name': 'Medium', 'num_layers': 8, 'num_heads': 8, 'd_model': 256, 'path': 'medium_weights2',
+                'addH': True}
+     large = {'name': 'Large', 'num_layers': 12, 'num_heads': 12, 'd_model': 576, 'path': 'large_weights', 'addH': True}
+     medium_without_H = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'weights_without_H', 'addH': False}
+     medium_without_pretrain = {'name': 'Medium', 'num_layers': 6, 'num_heads': 8, 'd_model': 256, 'path': 'medium_without_pretraining_weights', 'addH': True}
+
+     arch = medium3  # small: 3/4/128, medium: 6/8/256, large: 12/12/576
+
+     pretraining = False
+     pretraining_str = 'pretraining' if pretraining else ''
+
+     trained_epoch = 80
+     task = 'data'
+     seed = 14
+     num_layers = arch['num_layers']
+     num_heads = arch['num_heads']
+     d_model = arch['d_model']
+     addH = arch['addH']
+     dff = d_model * 2
+     vocab_size = 60
+     dropout_rate = 0.1
+
+     tf.random.set_seed(seed=seed)
+     graph_dataset = predict_smiles(smiles, addH=addH)
+     # graph_dataset = Graph_Regression_Dataset('data/reg/{}.csv', smiles_field='SMILES',
+     #                                          label_field='PCE', addH=addH)
+     test_dataset = graph_dataset.get_data()
+
+     # value_range = graph_dataset.value_range()
+
+     # Run one batch through the model so the variables exist before loading weights.
+     x, adjoin_matrix, y = next(iter(test_dataset.take(1)))
+     seq = tf.cast(tf.math.equal(x, 0), tf.float32)  # padding mask: 1 where x is <pad>
+     mask = seq[:, tf.newaxis, tf.newaxis, :]
+
+     model = PredictModel(num_layers=num_layers, d_model=d_model, dff=dff, num_heads=num_heads, vocab_size=vocab_size,
+                          dense_dropout=0.2)
+     preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
+     model.load_weights('{}.h5'.format('15data'))
+
+     class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+         """Linear warm-up then linear decay (kept from training; unused at inference)."""
+
+         def __init__(self, d_model, total_steps=4000):
+             super(CustomSchedule, self).__init__()
+             self.d_model = tf.cast(d_model, tf.float32)
+             self.total_step = total_steps
+             self.warmup_steps = total_steps * 0.10
+
+         def __call__(self, step):
+             arg1 = step / self.warmup_steps
+             arg2 = 1 - (step - self.warmup_steps) / (self.total_step - self.warmup_steps)
+             return 10e-5 * tf.math.minimum(arg1, arg2)
+
+     steps_per_epoch = len(test_dataset)
+     value_range = 1
+     y_true = []
+     y_preds = []
+
+     for x, adjoin_matrix, y in test_dataset:
+         seq = tf.cast(tf.math.equal(x, 0), tf.float32)
+         mask = seq[:, tf.newaxis, tf.newaxis, :]
+         preds = model(x, mask=mask, adjoin_matrix=adjoin_matrix, training=False)
+         y_true.append(y.numpy())
+         y_preds.append(preds.numpy())
+     y_true = np.concatenate(y_true, axis=0).reshape(-1)
+     y_preds = np.concatenate(y_preds, axis=0).reshape(-1)
+
+     return y_preds
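
For orientation, abcBERT.main is the inference entry point that app.py calls: it tokenizes one SMILES string via predict_smiles, rebuilds the medium3 encoder, loads the 15data.h5 weights from the working directory, and returns the predicted PCE values as a NumPy array. A minimal usage sketch (the SMILES string below is a hypothetical example, not from this repository):

    import abcBERT

    # One forward pass over a single molecule; requires 15data.h5 next to the script.
    pce = abcBERT.main('CC1=CC=C(C=C1)C1=CC=CC=C1')
    print(float(pce[0]))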
app.py ADDED
@@ -0,0 +1,37 @@
+ import streamlit as st
+ import pandas as pd
+ import rdkit
+ from streamlit_ketcher import st_ketcher
+
+ import abcBERT
+
+ # Page setup
+ st.set_page_config(page_title="DeepAcceptor", page_icon="🔋", layout="wide")
+ st.title("DeepAcceptor")
+
+ # Read the public Google Sheet as CSV
+ url1 = r"https://docs.google.com/spreadsheets/d/1YOEIg0nMTSPkAOr8wkqxQRLuUhys3-J0I-KPEpmzPLw/gviz/tq?tqx=out:csv&sheet=accept"
+ url = r"https://docs.google.com/spreadsheets/d/1YOEIg0nMTSPkAOr8wkqxQRLuUhys3-J0I-KPEpmzPLw/gviz/tq?tqx=out:csv&sheet=111"
+ df1 = pd.read_csv(url1, dtype=str, encoding='utf-8')
+
+ text_search = st.text_input("Search papers or molecules", value="")
+ m1 = df1["name"].str.contains(text_search, na=False)
+ m2 = df1["reference"].str.contains(text_search, na=False)
+ df_search = df1[m1 | m2]
+ if text_search:
+     st.write(df_search)
+     st.download_button("Download search results as .csv", df_search.to_csv(), "df_search.csv", use_container_width=True)
+ edited_df = st.data_editor(df1, num_rows="dynamic")
+ # edited_df.to_csv(url)  # disabled: pandas cannot write back to a Google Sheets URL
+ st.download_button(
+     "⬇️ Download edited files as .csv", edited_df.to_csv(), "edited_df.csv", use_container_width=True
+ )
+
+ molecule = st.text_input("Molecule")
+ smile_code = st_ketcher(molecule)
+ st.markdown(f"SMILES: ``{smile_code}``")
+ try:
+     pce = abcBERT.main(str(smile_code))
+     st.markdown(f"PCE: ``{pce}``")
+ except Exception:
+     st.markdown("PCE: None")
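
Assuming the dependencies in requirements.txt are installed, this is a standard single-file Streamlit app and can be served locally with Streamlit's runner:

    streamlit run app.py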
compound_constants.py ADDED
@@ -0,0 +1,156 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Thu Jul 28 21:20:20 2022
+
+ @author: BM109X32G-10GPU-02
+ """
+
+
+ """
+ | Compound constants.
+ """
+
+
+ # functional groups from https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html
+ DAY_LIGHT_FG_SMARTS_LIST = [
+     # C
+     "[CX4]",
+     "[$([CX2](=C)=C)]",
+     "[$([CX3]=[CX3])]",
+     "[$([CX2]#C)]",
+     # C & O
+     "[CX3]=[OX1]",
+     "[$([CX3]=[OX1]),$([CX3+]-[OX1-])]",
+     "[CX3](=[OX1])C",
+     "[OX1]=CN",
+     "[CX3](=[OX1])O",
+     "[CX3](=[OX1])[F,Cl,Br,I]",
+     "[CX3H1](=O)[#6]",
+     "[CX3](=[OX1])[OX2][CX3](=[OX1])",
+     "[NX3][CX3](=[OX1])[#6]",
+     "[NX3][CX3]=[NX3+]",
+     "[NX3,NX4+][CX3](=[OX1])[OX2,OX1-]",
+     "[NX3][CX3](=[OX1])[OX2H0]",
+     "[NX3,NX4+][CX3](=[OX1])[OX2H,OX1-]",
+     "[CX3](=O)[O-]",
+     "[CX3](=[OX1])(O)O",
+     "[CX3](=[OX1])([OX2])[OX2H,OX1H0-1]",
+     "C[OX2][CX3](=[OX1])[OX2]C",
+     "[CX3](=O)[OX2H1]",
+     "[CX3](=O)[OX1H0-,OX2H1]",
+     "[NX3][CX2]#[NX1]",
+     "[#6][CX3](=O)[OX2H0][#6]",
+     "[#6][CX3](=O)[#6]",
+     "[OD2]([#6])[#6]",
+     # H
+     "[H]",
+     "[!#1]",
+     "[H+]",
+     "[+H]",
+     "[!H]",
+     # N
+     "[NX3;H2,H1;!$(NC=O)]",
+     "[NX3][CX3]=[CX3]",
+     "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6]",
+     "[NX3;H2,H1;!$(NC=O)].[NX3;H2,H1;!$(NC=O)]",
+     "[NX3][$(C=C),$(cc)]",
+     "[NX3,NX4+][CX4H]([*])[CX3](=[OX1])[O,N]",
+     "[NX3H2,NH3X4+][CX4H]([*])[CX3](=[OX1])[NX3,NX4+][CX4H]([*])[CX3](=[OX1])[OX2H,OX1-]",
+     "[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([*])[CX3](=[OX1])[OX2H,OX1-,N]",
+     "[CH3X4]",
+     "[CH2X4][CH2X4][CH2X4][NHX3][CH0X3](=[NH2X3+,NHX2+0])[NH2X3]",
+     "[CH2X4][CX3](=[OX1])[NX3H2]",
+     "[CH2X4][CX3](=[OX1])[OH0-,OH]",
+     "[CH2X4][SX2H,SX1H0-]",
+     "[CH2X4][CH2X4][CX3](=[OX1])[OH0-,OH]",
+     "[$([$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H2][CX3](=[OX1])[OX2H,OX1-,N])]",
+     "[CH2X4][#6X3]1:[$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]:\
+ [$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]1",
+     "[CHX4]([CH3X4])[CH2X4][CH3X4]",
+     "[CH2X4][CHX4]([CH3X4])[CH3X4]",
+     "[CH2X4][CH2X4][CH2X4][CH2X4][NX4+,NX3+0]",
+     "[CH2X4][CH2X4][SX2][CH3X4]",
+     "[CH2X4][cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1",
+     "[$([NX3H,NX4H2+]),$([NX3](C)(C)(C))]1[CX4H]([CH2][CH2][CH2]1)[CX3](=[OX1])[OX2H,OX1-,N]",
+     "[CH2X4][OX2H]",
+     "[NX3][CX3]=[SX1]",
+     "[CHX4]([CH3X4])[OX2H]",
+     "[CH2X4][cX3]1[cX3H][nX3H][cX3]2[cX3H][cX3H][cX3H][cX3H][cX3]12",
+     "[CH2X4][cX3]1[cX3H][cX3H][cX3]([OHX2,OH0X1-])[cX3H][cX3H]1",
+     "[CHX4]([CH3X4])[CH3X4]",
+     "N[CX4H2][CX3](=[OX1])[O,N]",
+     "N1[CX4H]([CH2][CH2][CH2]1)[CX3](=[OX1])[O,N]",
+     "[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]",
+     "[$([NX1-]=[NX2+]=[NX1-]),$([NX1]#[NX2+]-[NX1-2])]",
+     "[#7]",
+     "[NX2]=N",
+     "[NX2]=[NX2]",
+     "[$([NX2]=[NX3+]([O-])[#6]),$([NX2]=[NX3+0](=[O])[#6])]",
+     "[$([#6]=[N+]=[N-]),$([#6-]-[N+]#[N])]",
+     "[$([nr5]:[nr5,or5,sr5]),$([nr5]:[cr5]:[nr5,or5,sr5])]",
+     "[NX3][NX3]",
+     "[NX3][NX2]=[*]",
+     "[CX3;$([C]([#6])[#6]),$([CH][#6])]=[NX2][#6]",
+     "[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]",
+     "[NX3+]=[CX3]",
+     "[CX3](=[OX1])[NX3H][CX3](=[OX1])",
+     "[CX3](=[OX1])[NX3H0]([#6])[CX3](=[OX1])",
+     "[CX3](=[OX1])[NX3H0]([NX3H0]([CX3](=[OX1]))[CX3](=[OX1]))[CX3](=[OX1])",
+     "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]",
+     "[$([OX1]=[NX3](=[OX1])[OX1-]),$([OX1]=[NX3+]([OX1-])[OX1-])]",
+     "[NX1]#[CX2]",
+     "[CX1-]#[NX2+]",
+     "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",
+     "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8].[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",
+     "[NX2]=[OX1]",
+     "[$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])]",
+     # O
+     "[OX2H]",
+     "[#6][OX2H]",
+     "[OX2H][CX3]=[OX1]",
+     "[OX2H]P",
+     "[OX2H][#6X3]=[#6]",
+     "[OX2H][cX3]:[c]",
+     "[OX2H][$(C=C),$(cc)]",
+     "[$([OH]-*=[!#6])]",
+     "[OX2,OX1-][OX2,OX1-]",
+     # P
+     "[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),\
+ $([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-])\
+ ,$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]",
+     "[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),\
+ $([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),\
+ $([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]",
+     # S
+     "[S-][CX3](=S)[#6]",
+     "[#6X3](=[SX1])([!N])[!N]",
+     "[SX2]",
+     "[#16X2H]",
+     "[#16!H0]",
+     "[#16X2H0]",
+     "[#16X2H0][!#16]",
+     "[#16X2H0][#16X2H0]",
+     "[#16X2H0][!#16].[#16X2H0][!#16]",
+     "[$([#16X3](=[OX1])[OX2H0]),$([#16X3+]([OX1-])[OX2H0])]",
+     "[$([#16X3](=[OX1])[OX2H,OX1H0-]),$([#16X3+]([OX1-])[OX2H,OX1H0-])]",
+     "[$([#16X4](=[OX1])=[OX1]),$([#16X4+2]([OX1-])[OX1-])]",
+     "[$([#16X4](=[OX1])(=[OX1])([#6])[#6]),$([#16X4+2]([OX1-])([OX1-])([#6])[#6])]",
+     "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H,OX1H0-]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H,OX1H0-])]",
+     "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H0]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H0])]",
+     "[$([#16X4]([NX3])(=[OX1])(=[OX1])[#6]),$([#16X4+2]([NX3])([OX1-])([OX1-])[#6])]",
+     "[SX4](C)(C)(=O)=N",
+     "[$([SX4](=[OX1])(=[OX1])([!O])[NX3]),$([SX4+2]([OX1-])([OX1-])([!O])[NX3])]",
+     "[$([#16X3]=[OX1]),$([#16X3+][OX1-])]",
+     "[$([#16X3](=[OX1])([#6])[#6]),$([#16X3+]([OX1-])([#6])[#6])]",
+     "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",
+     "[$([SX4](=O)(=O)(O)O),$([SX4+2]([O-])([O-])(O)O)]",
+     "[$([#16X4](=[OX1])(=[OX1])([OX2][#6])[OX2][#6]),$([#16X4](=[OX1])(=[OX1])([OX2][#6])[OX2][#6])]",
+     "[$([#16X4]([NX3])(=[OX1])(=[OX1])[OX2][#6]),$([#16X4+2]([NX3])([OX1-])([OX1-])[OX2][#6])]",
+     "[$([#16X4]([NX3])(=[OX1])(=[OX1])[OX2H,OX1H0-]),$([#16X4+2]([NX3])([OX1-])([OX1-])[OX2H,OX1H0-])]",
+     "[#16X2][OX2H,OX1H0-]",
+     "[#16X2][OX2H0]",
+     # X
+     "[#6][F,Cl,Br,I]",
+     "[F,Cl,Br,I]",
+     "[F,Cl,Br,I].[F,Cl,Br,I].[F,Cl,Br,I]",
+ ]
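
Since DAY_LIGHT_FG_SMARTS_LIST is a plain list of SMARTS strings, it can be paired directly with RDKit's substructure matching. A minimal sketch (the helper name is illustrative, not part of this repository):

    from rdkit import Chem

    from compound_constants import DAY_LIGHT_FG_SMARTS_LIST

    def functional_group_flags(smiles):
        # Return one 0/1 flag per SMARTS pattern for a single molecule.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        flags = []
        for smarts in DAY_LIGHT_FG_SMARTS_LIST:
            patt = Chem.MolFromSmarts(smarts)
            # A pattern that fails to parse is simply counted as absent.
            flags.append(int(patt is not None and mol.HasSubstructMatch(patt)))
        return flags

    print(functional_group_flags('CC(=O)O'))  # acetic acid: the carbonyl/carboxyl patterns fire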
dataset.py ADDED
@@ -0,0 +1,497 @@
+ import pandas as pd
+ import numpy as np
+ from utils import mol_to_geognn_graph_data_MMFF3d as smiles2adjoin
+ import tensorflow as tf
+
+ str2num = {'<pad>': 0, 'H': 1, 'C': 2, 'N': 3, 'O': 4, 'F': 5, 'S': 6, 'Cl': 7, 'P': 8, 'Br': 9,
+            'B': 10, 'I': 11, 'Si': 12, 'Se': 13, '<unk>': 14, '<mask>': 15, '<global>': 16}
+
+ num2str = {i: j for j, i in str2num.items()}
+
+
+ class Graph_Bert_Dataset(object):
+     def __init__(self, path, smiles_field=['0'], adj=['1'], addH=True):
+         if path.endswith('.txt') or path.endswith('.tsv'):
+             self.df = pd.read_csv(path, sep='\t')  # fixed: sep='\n\t' is not a valid field separator
+         else:
+             self.df = pd.read_csv(path)
+         self.smiles_field = smiles_field
+         self.adj = adj
+         self.vocab = str2num
+         self.devocab = num2str
+         self.addH = addH
+
+     def get_data(self):
+         data = self.df
+
+         # Random 90/10 split.
+         train_idx = []
+         idx = data.sample(frac=0.9).index
+         train_idx.extend(idx)
+
+         data1 = data[data.index.isin(train_idx)]
+         data2 = data[~data.index.isin(train_idx)]
+
+         self.dataset1 = tf.data.Dataset.from_tensor_slices((data1[self.smiles_field], data1[self.adj]))
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).padded_batch(256, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([None]), tf.TensorShape([None]))).prefetch(50)
+
+         self.dataset2 = tf.data.Dataset.from_tensor_slices((data2[self.smiles_field], data2[self.adj]))
+         self.dataset2 = self.dataset2.map(self.tf_numerical_smiles).padded_batch(512, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([None]),
+             tf.TensorShape([None]))).prefetch(50)
+         return self.dataset1, self.dataset2
+
+     def numerical_smiles(self, atom, adj):
+         # The atom column stores a stringified array of element symbols, e.g. "['C' 'N' 'O']".
+         atom = np.array(atom)
+         atom = atom[0].decode()
+         atom = atom.replace('\n', '')
+         atom = atom.replace('[', ' ')
+         atom = atom.replace(']', ' ')
+         atom = atom.split("'")
+
+         atoms_list = []
+         for i in atom:
+             if i not in [' ']:
+                 atoms_list.append(i)
+
+         # The adj column stores the path of a saved .npy adjacency matrix.
+         adj = np.array(adj)[0].decode()
+         adjoin_matrix = np.load(adj)
+
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+         # adjoin_matrix = (1 - temp) * (-1e9)
+
+         # BERT-style masking: 15% of positions; 80% -> <mask>, 10% -> random atom, 10% unchanged.
+         choices = np.random.permutation(len(nums_list) - 1)[:max(int(len(nums_list) * 0.15), 1)] + 1
+         y = np.array(nums_list).astype('int64')
+         weight = np.zeros(len(nums_list))
+         for i in choices:
+             rand = np.random.rand()
+             weight[i] = 1
+             if rand < 0.8:
+                 nums_list[i] = str2num['<mask>']
+             elif rand < 0.9:
+                 nums_list[i] = int(np.random.rand() * 14 + 1)
+
+         x = np.array(nums_list).astype('int64')
+         weight = weight.astype('float32')
+         return x, adjoin_matrix, y, weight
+
+     def tf_numerical_smiles(self, atom, adj):
+         x, adjoin_matrix, y, weight = tf.py_function(self.numerical_smiles, (atom, adj),
+                                                      [tf.int64, tf.float32, tf.int64, tf.float32])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         y.set_shape([None])
+         weight.set_shape([None])
+         return x, adjoin_matrix, y, weight
+
+
+ class Graph_Regression_Dataset_test(object):
+     def __init__(self, path, smiles_field='SMILES', label_field='PCE', normalize=False, max_len=1000, addH=True):
+         if path.endswith('.txt') or path.endswith('.tsv'):
+             self.df = pd.read_csv(path.format('test'), sep='\t')
+         else:
+             self.df = pd.read_csv(path.format('test'))
+
+         self.smiles_field = smiles_field
+         self.label_field = label_field
+         self.vocab = str2num
+         self.devocab = num2str
+         self.df = self.df[self.df[smiles_field].str.len() <= max_len]
+         self.addH = addH
+         if normalize:
+             self.max = self.df[self.label_field].max()
+             self.min = self.df[self.label_field].min()
+             self.df[self.label_field] = (self.df[self.label_field] - self.min) / (self.max - self.min) - 0.5
+             self.value_range = self.max - self.min
+
+     def get_data(self):
+         train_data = self.df
+         self.dataset1 = tf.data.Dataset.from_tensor_slices((train_data[self.smiles_field], train_data[self.label_field]))
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1])))
+         return self.dataset1
+
+     def numerical_smiles(self, smiles, label):
+         smiles = smiles.numpy().decode()
+         atoms_list, adjoin_matrix = smiles2adjoin(smiles)  # fixed: was the undefined name smiles2adjoins
+         atoms_list = list(atoms_list)
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+         x = np.array(nums_list).astype('int64')
+         y = np.array([label]).astype('float32')
+         return x, adjoin_matrix, y
+
+     def tf_numerical_smiles(self, smiles, label):
+         x, adjoin_matrix, y = tf.py_function(self.numerical_smiles, [smiles, label], [tf.int64, tf.float32, tf.float32])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         y.set_shape([None])
+         return x, adjoin_matrix, y
+
+
+ class predict_smiles(object):
+     def __init__(self, smiles, normalize=False, max_len=1000, addH=True):
+         self.smiles_field = smiles
+         self.label_field = float(0)  # dummy label for inference
+         self.vocab = str2num
+         self.devocab = num2str
+         self.addH = addH
+         if normalize:
+             # Note: this branch expects a DataFrame (self.df) and is unused for single-SMILES prediction.
+             self.max = self.df[self.label_field].max()
+             self.min = self.df[self.label_field].min()
+             self.df[self.label_field] = (self.df[self.label_field] - self.min) / (self.max - self.min) - 0.5
+             self.value_range = self.max - self.min
+
+     def numerical_smiles(self, atoms_list, adj, label):
+         atom = np.array(atoms_list)
+         atoms_list = []
+         for i in atom:
+             # tf.py_function delivers strings as bytes; decode so the vocabulary lookup works.
+             i = i.decode() if isinstance(i, bytes) else i
+             if i not in [' ']:
+                 atoms_list.append(i)
+         label = np.array(label)
+
+         adj = np.array(adj)
+         adjoin_matrix = adj
+
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+
+         x = np.array(nums_list).astype('int64')
+         y = np.array([label]).astype('float32')
+         return x, adjoin_matrix, y
+
+     def get_data(self):
+         atom, adj = smiles2adjoin(self.smiles_field)
+         atom = np.array(atom)
+         atoms_list = []
+         for i in atom:
+             if i not in [' ']:
+                 atoms_list.append(i)
+         adj = np.array(adj)
+         adjoin_matrix = adj
+         self.dataset1 = tf.data.Dataset.from_tensors((atoms_list, adjoin_matrix, self.label_field))
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).cache().padded_batch(1, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1])))
+         return self.dataset1
+
+     def tf_numerical_smiles(self, atoms_list, adj, label):
+         x, adjoin_matrix, y = tf.py_function(self.numerical_smiles, (atoms_list, adj, label), [tf.int64, tf.float32, tf.float32])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         y.set_shape([None])
+         return x, adjoin_matrix, y
+
+
+ class Graph_Regression_test(object):
+     def __init__(self, path, smiles_field=['0'], adj=['1'], label_field=['2'], normalize=False, max_len=1000, addH=True):
+         if path.endswith('.txt') or path.endswith('.tsv'):
+             # self.df = pd.read_csv(path.format('train3'), sep='\t')
+             # self.dt = pd.read_csv(path.format('test3'), sep='\t')
+             self.dv = pd.read_csv(path.format('val3'), sep='\t')
+         else:
+             # self.df = pd.read_csv(path.format('train/train'))
+             # self.dt = pd.read_csv(path.format('test/test'))
+             self.dv = pd.read_csv(path.format('val/val'))
+         self.smiles_field = smiles_field
+         self.adj = adj
+         self.label_field = label_field
+         self.vocab = str2num
+         self.devocab = num2str
+         self.addH = addH
+         if normalize:
+             self.max = self.df[self.label_field].max()
+             self.min = self.df[self.label_field].min()
+             self.df[self.label_field] = (self.df[self.label_field] - self.min) / (self.max - self.min) - 0.5
+             self.value_range = self.max - self.min
+
+     def get_data(self):
+         train_data = self.dv
+
+         self.dataset1 = tf.data.Dataset.from_tensor_slices((train_data[self.smiles_field], train_data[self.adj], train_data[self.label_field]))
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).cache().padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).prefetch(100)
+         return self.dataset1
+
+     def numerical_smiles(self, atom, adj, label):
+         atom = np.array(atom)
+         atom = atom[0].decode()
+         atom = atom.replace('\n', '')
+         atom = atom.replace('[', ' ')
+         atom = atom.replace(']', ' ')
+         atom = atom.split("'")
+
+         atoms_list = []
+         for i in atom:
+             if i not in [' ']:
+                 atoms_list.append(i)
+         label = np.array(label)[0]
+
+         adj = np.array(adj)[0].decode()
+         adjoin_matrix = np.load(adj)
+
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+
+         x = np.array(nums_list).astype('int64')
+         y = np.array([label]).astype('float32')
+         return x, adjoin_matrix, y
+
+     def tf_numerical_smiles(self, smiles, adj, label):
+         x, adjoin_matrix, y = tf.py_function(self.numerical_smiles, (smiles, adj, label), [tf.int64, tf.float32, tf.float32])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         y.set_shape([None])
+         return x, adjoin_matrix, y
+
+
+ class Graph_Regression(object):
+     def __init__(self, path, smiles_field=['0'], adj=['1'], label_field=['2'], normalize=False, max_len=1000, addH=True):
+         if path.endswith('.txt') or path.endswith('.tsv'):
+             self.df = pd.read_csv(path.format('train3'), sep='\t')
+             self.dt = pd.read_csv(path.format('test3'), sep='\t')
+             # self.dv = pd.read_csv(path.format('val3'), sep='\t')
+         else:
+             self.df = pd.read_csv(path.format('train/train'))
+             self.dt = pd.read_csv(path.format('test/test'))
+             # self.dv = pd.read_csv(path.format('val3'))
+         self.smiles_field = smiles_field
+         self.adj = adj
+         self.label_field = label_field
+         self.vocab = str2num
+         self.devocab = num2str
+         self.addH = addH
+         if normalize:
+             self.max = self.df[self.label_field].max()
+             self.min = self.df[self.label_field].min()
+             self.df[self.label_field] = (self.df[self.label_field] - self.min) / (self.max - self.min) - 0.5
+             self.value_range = self.max - self.min
+
+     def get_data(self):
+         train_data = self.df
+         test_data = self.dt
+         data2 = test_data
+
+         self.dataset1 = tf.data.Dataset.from_tensor_slices((train_data[self.smiles_field], train_data[self.adj], train_data[self.label_field]))
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).cache().padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).prefetch(100)
+
+         self.dataset2 = tf.data.Dataset.from_tensor_slices((test_data[self.smiles_field], test_data[self.adj], test_data[self.label_field]))
+         self.dataset2 = self.dataset2.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100)
+
+         self.dataset3 = tf.data.Dataset.from_tensor_slices((data2[self.smiles_field], test_data[self.adj], data2[self.label_field]))
+         self.dataset3 = self.dataset3.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100)
+
+         return self.dataset1, self.dataset2, self.dataset3
+
+     def numerical_smiles(self, atom, adj, label):
+         atom = np.array(atom)
+         atom = atom[0].decode()
+         atom = atom.replace('\n', '')
+         atom = atom.replace('[', ' ')
+         atom = atom.replace(']', ' ')
+         atom = atom.split("'")
+
+         atoms_list = []
+         for i in atom:
+             if i not in [' ']:
+                 atoms_list.append(i)
+         label = np.array(label)[0]
+
+         adj = np.array(adj)[0].decode()
+         adjoin_matrix = np.load(adj)
+
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+
+         x = np.array(nums_list).astype('int64')
+         y = np.array([label]).astype('float32')
+         return x, adjoin_matrix, y
+
+     def tf_numerical_smiles(self, smiles, adj, label):
+         x, adjoin_matrix, y = tf.py_function(self.numerical_smiles, (smiles, adj, label), [tf.int64, tf.float32, tf.float32])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         y.set_shape([None])
+         return x, adjoin_matrix, y
+
+
+ class Inference_Dataset(object):
+     # NOTE: superseded by the second Inference_Dataset below, which shadows this class.
+     def __init__(self, path, smiles_field='Smiles', addH=True):
+         if path.endswith('.txt') or path.endswith('.tsv'):
+             self.df = pd.read_csv(path, sep='\t')
+         else:
+             self.df = pd.read_csv(path)
+         self.smiles_field = smiles_field
+         self.vocab = str2num
+         self.devocab = num2str
+         self.addH = addH
+
+     def get_data(self):
+         data = self.df
+
+         train_idx = []
+         idx = data.sample(frac=0.9).index
+         train_idx.extend(idx)
+
+         data1 = data[data.index.isin(train_idx)]
+         data2 = data[~data.index.isin(train_idx)]
+         print(len(data1))
+         self.dataset1 = tf.data.Dataset.from_tensor_slices(data1[self.smiles_field].tolist())
+         self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).padded_batch(1, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([None]), tf.TensorShape([None]))).prefetch(50)
+         print(self.dataset1)
+         self.dataset2 = tf.data.Dataset.from_tensor_slices(data2[self.smiles_field].tolist())
+         self.dataset2 = self.dataset2.map(self.tf_numerical_smiles).padded_batch(1, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([None]),
+             tf.TensorShape([None]))).prefetch(50)
+         return self.dataset1, self.dataset2
+
+     def numerical_smiles(self, smiles):
+         smiles = smiles.numpy().decode()
+         atoms_list, adjoin_matrix = smiles2adjoin(smiles, explicit_hydrogens=self.addH)  # fixed: was smiles2adjoins
+         print(atoms_list)
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         temp[np.where(temp == 0)] = -1e9
+         adjoin_matrix = temp
+         x = np.array(nums_list).astype('int64')
+         return x, adjoin_matrix, [smiles], atoms_list
+
+     def tf_numerical_smiles(self, data):
+         # fixed: unpack the four outputs actually returned by numerical_smiles
+         x, adjoin_matrix, smiles, atom_list = tf.py_function(self.numerical_smiles, [data],
+                                                              [tf.int64, tf.float32, tf.string, tf.string])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         smiles.set_shape([1])
+         atom_list.set_shape([None])
+         return x, adjoin_matrix, smiles, atom_list
+
+
+ class Inference_Dataset(object):
+     def __init__(self, sml_list, max_len=1000, addH=True):
+         self.vocab = str2num
+         self.devocab = num2str
+         self.sml_list = [i for i in sml_list if len(i) < max_len]
+         self.addH = addH
+
+     def get_data(self):
+         self.dataset = tf.data.Dataset.from_tensor_slices((self.sml_list,))
+         self.dataset = self.dataset.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=(
+             tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]), tf.TensorShape([None]))).cache().prefetch(20)
+         return self.dataset
+
+     def numerical_smiles(self, smiles):
+         smiles_origin = smiles
+         smiles = smiles.numpy().decode()
+         atoms_list, adjoin_matrix = smiles2adjoin(smiles)  # fixed: was smiles2adjoins
+         atoms_list = ['<global>'] + atoms_list
+         nums_list = [str2num.get(i, str2num['<unk>']) for i in atoms_list]
+         temp = np.ones((len(nums_list), len(nums_list)))
+         temp[1:, 1:] = adjoin_matrix
+         adjoin_matrix = (1 - temp) * (-1e9)
+         x = np.array(nums_list).astype('int64')
+         return x, adjoin_matrix, [smiles], atoms_list
+
+     def tf_numerical_smiles(self, smiles):
+         x, adjoin_matrix, smiles, atom_list = tf.py_function(self.numerical_smiles, [smiles], [tf.int64, tf.float32, tf.string, tf.string])
+         x.set_shape([None])
+         adjoin_matrix.set_shape([None, None])
+         smiles.set_shape([1])
+         atom_list.set_shape([None])
+         return x, adjoin_matrix, smiles, atom_list
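
Every class above applies the same transform before batching: a <global> token is prepended, the molecular adjacency matrix is embedded into a matrix of ones (so the global node connects to every atom), and zero entries become -1e9 so they vanish after the attention softmax in model.py. A small self-contained illustration with NumPy:

    import numpy as np

    # Toy 3-atom chain: bonds 0-1 and 1-2, self-loops on the diagonal.
    adjoin_matrix = np.array([[1., 1., 0.],
                              [1., 1., 1.],
                              [0., 1., 1.]])

    n = adjoin_matrix.shape[0] + 1   # +1 row/column for the <global> token
    temp = np.ones((n, n))           # the global token attends to (and is seen by) everything
    temp[1:, 1:] = adjoin_matrix     # embed the molecular graph
    temp[temp == 0] = -1e9           # non-bonded pairs get a huge negative attention logit
    print(temp)

Added to the pre-softmax logits in scaled_dot_product_attention, this matrix restricts each atom's attention to its bonded neighbours plus the global node, which is how the graph structure enters the otherwise order-free Transformer encoder.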
model.py ADDED
@@ -0,0 +1,280 @@
1
+ import tensorflow as tf
2
+
3
+ import time
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+
7
+
8
+
9
+ def gelu(x):
10
+ return 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.)))
11
+
12
+ def scaled_dot_product_attention(q, k, v, mask,adjoin_matrix):
13
+ """Calculate the attention weights.
14
+ q, k, v must have matching leading dimensions.
15
+ k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
16
+ The mask has different shapes depending on its type(padding or look ahead)
17
+ but it must be broadcastable for addition.
18
+
19
+ Args:
20
+ q: query shape == (..., seq_len_q, depth)
21
+ k: key shape == (..., seq_len_k, depth)
22
+ v: value shape == (..., seq_len_v, depth_v)
23
+ mask: Float tensor with shape broadcastable
24
+ to (..., seq_len_q, seq_len_k). Defaults to None.
25
+
26
+ Returns:
27
+ output, attention_weights
28
+ """
29
+
30
+ matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k)
31
+
32
+ # scale matmul_qk
33
+ dk = tf.cast(tf.shape(k)[-1], tf.float32)
34
+ scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
35
+
36
+ # add the mask to the scaled tensor.
37
+ if mask is not None:
38
+ scaled_attention_logits += (mask * -1e9)
39
+ if adjoin_matrix is not None:
40
+ #adjoin_matrix1 =tf.where(adjoin_matrix>0,0.0,-1e9)
41
+ #scaled_attention_logits += adjoin_matrix1
42
+ #scaled_attention_logits = scaled_attention_logits * adjoin_matrix
43
+ scaled_attention_logits += adjoin_matrix
44
+
45
+ # softmax is normalized on the last axis (seq_len_k) so that the scores
46
+ # add up to 1.
47
+ attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k)
48
+
49
+ output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v)
50
+
51
+ return output, attention_weights
52
+
53
+
54
+ class MultiHeadAttention(tf.keras.layers.Layer):
55
+ def __init__(self, d_model, num_heads):
56
+ super(MultiHeadAttention, self).__init__()
57
+ self.num_heads = num_heads
58
+ self.d_model = d_model
59
+
60
+ assert d_model % self.num_heads == 0
61
+
62
+ self.depth = d_model // self.num_heads
63
+
64
+ self.wq = tf.keras.layers.Dense(d_model)
65
+ self.wk = tf.keras.layers.Dense(d_model)
66
+ self.wv = tf.keras.layers.Dense(d_model)
67
+
68
+ self.dense = tf.keras.layers.Dense(d_model)
69
+
70
+ def split_heads(self, x, batch_size):
71
+ """Split the last dimension into (num_heads, depth).
72
+ Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
73
+ """
74
+ x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
75
+ return tf.transpose(x, perm=[0, 2, 1, 3])
76
+
77
+ def call(self, v, k, q, mask,adjoin_matrix):
78
+ batch_size = tf.shape(q)[0]
79
+
80
+ q = self.wq(q) # (batch_size, seq_len, d_model)
81
+ k = self.wk(k) # (batch_size, seq_len, d_model)
82
+ v = self.wv(v) # (batch_size, seq_len, d_model)
83
+
84
+ q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
85
+ k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
86
+ v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
87
+
88
+ # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
89
+ # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
90
+ scaled_attention, attention_weights = scaled_dot_product_attention(
91
+ q, k, v, mask,adjoin_matrix)
92
+
93
+ scaled_attention = tf.transpose(scaled_attention,
94
+ perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
95
+
96
+ concat_attention = tf.reshape(scaled_attention,
97
+ (batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model)
98
+
99
+ output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model)
100
+
101
+ return output, attention_weights
102
+
103
+ def point_wise_feed_forward_network(d_model, dff):
104
+ return tf.keras.Sequential([
105
+ tf.keras.layers.Dense(dff, activation=gelu), # (batch_size, seq_len, dff)tf.keras.layers.LeakyReLU(0.01)
106
+ tf.keras.layers.Dense(d_model) # (batch_size, seq_len, d_model)
107
+ ])
108
+
109
+
110
+ class EncoderLayer(tf.keras.layers.Layer):
111
+ def __init__(self, d_model, num_heads, dff, rate=0.1):
112
+ super(EncoderLayer, self).__init__()
113
+
114
+ self.mha = MultiHeadAttention(d_model, num_heads)
115
+ self.ffn = point_wise_feed_forward_network(d_model, dff)
116
+
117
+ self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
118
+ self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
119
+
120
+ self.dropout1 = tf.keras.layers.Dropout(rate)
121
+ self.dropout2 = tf.keras.layers.Dropout(rate)
122
+
123
+ def call(self, x, training, mask,adjoin_matrix):
124
+ attn_output, attention_weights = self.mha(x, x, x, mask,adjoin_matrix) # (batch_size, input_seq_len, d_model)
125
+ attn_output = self.dropout1(attn_output, training=training)
126
+ out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)
127
+
128
+ ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
129
+ ffn_output = self.dropout2(ffn_output, training=training)
130
+ out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)
131
+
132
+ return out2,attention_weights
133
+
134
+
135
+ class Encoder(tf.keras.Model):
136
+ def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
137
+ maximum_position_encoding, rate=0.1):
138
+ super(Encoder, self).__init__()
139
+
140
+ self.d_model = d_model
141
+ self.num_layers = num_layers
142
+
143
+ self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
144
+ # self.pos_encoding = positional_encoding(maximum_position_encoding,
145
+ # self.d_model)
146
+
147
+ self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
148
+ for _ in range(num_layers)]
149
+
150
+ self.dropout = tf.keras.layers.Dropout(rate)
151
+
152
+ def call(self, x, training, mask,adjoin_matrix):
153
+ seq_len = tf.shape(x)[1]
154
+ adjoin_matrix = adjoin_matrix[:,tf.newaxis,:,:]
155
+ # adding embedding and position encoding.
156
+ x = self.embedding(x) # (batch_size, input_seq_len, d_model)
157
+ x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
158
+
159
+ x = self.dropout(x, training=training)
160
+
161
+ for i in range(self.num_layers):
162
+ x,attention_weights = self.enc_layers[i](x, training, mask,adjoin_matrix)
163
+ return x # (batch_size, input_seq_len, d_model)
164
+
165
+ class Encoder_test(tf.keras.Model):
+     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                  maximum_position_encoding, rate=0.1):
+         super(Encoder_test, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
+         # self.pos_encoding = positional_encoding(maximum_position_encoding,
+         #                                         self.d_model)
+
+         self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
+                            for _ in range(num_layers)]
+
+         self.dropout = tf.keras.layers.Dropout(rate)
+
+     def call(self, x, training, mask, adjoin_matrix):
+         seq_len = tf.shape(x)[1]  # only needed if positional encoding is re-enabled
+         adjoin_matrix = adjoin_matrix[:, tf.newaxis, :, :]  # add a head axis for broadcasting
+         # embedding only; positional encoding is intentionally disabled
+         x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         # x += self.pos_encoding[:, :seq_len, :]
+
+         x = self.dropout(x, training=training)
+         attention_weights_list = []
+         xs = []
+
+         for i in range(self.num_layers):
+             x, attention_weights = self.enc_layers[i](x, training, mask, adjoin_matrix)
+             attention_weights_list.append(attention_weights)
+             xs.append(x)
+
+         return x, attention_weights_list, xs
+
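Encoder_test is the introspection twin of Encoder: the same forward pass, but it also returns the per-layer attention maps and hidden states. A sketch of pulling one head out for inspection, reusing the hypothetical inputs from the sketch above:

    enc = Encoder_test(num_layers=6, d_model=256, num_heads=8, dff=512,
                       input_vocab_size=17, maximum_position_encoding=200)
    h, att_list, states = enc(tokens, training=False, mask=mask, adjoin_matrix=adjoin)
    first_layer_head0 = att_list[0][:, 0, :, :]  # attention of head 0 in layer 1
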
+ class BertModel_test(tf.keras.Model):
+     def __init__(self, num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=17, dropout_rate=0.1):
+         super(BertModel_test, self).__init__()
+         self.encoder = Encoder_test(num_layers=num_layers, d_model=d_model,
+                                     num_heads=num_heads, dff=dff, input_vocab_size=vocab_size,
+                                     maximum_position_encoding=200, rate=dropout_rate)
+         self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu)
+         self.layernorm = tf.keras.layers.LayerNormalization(axis=-1)
+         self.fc2 = tf.keras.layers.Dense(vocab_size)
+
+     def call(self, x, adjoin_matrix, mask, training=False):
+         x, att, xs = self.encoder(x, training=training, mask=mask, adjoin_matrix=adjoin_matrix)
+         x = self.fc1(x)
+         x = self.layernorm(x)
+         x = self.fc2(x)  # per-position logits over the atom vocabulary
+         return x, att, xs
+
+
+ class BertModel(tf.keras.Model):
+     def __init__(self, num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=17, dropout_rate=0.1):
+         super(BertModel, self).__init__()
+         self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
+                                num_heads=num_heads, dff=dff, input_vocab_size=vocab_size,
+                                maximum_position_encoding=200, rate=dropout_rate)
+         self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu)
+         self.layernorm = tf.keras.layers.LayerNormalization(axis=-1)
+         self.fc2 = tf.keras.layers.Dense(vocab_size)
+
+     def call(self, x, adjoin_matrix, mask, training=False):
+         x = self.encoder(x, training=training, mask=mask, adjoin_matrix=adjoin_matrix)
+         x = self.fc1(x)
+         x = self.layernorm(x)
+         x = self.fc2(x)  # per-position logits over the atom vocabulary
+         return x
+
+
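BertModel is the pretraining head: every position is mapped back to vocabulary logits, so a masked-atom objective can be trained on top. A minimal sketch reusing the hypothetical inputs above (the loss wiring is an assumption; it is not part of this file):

    model = BertModel(num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=17)
    logits = model(tokens, adjoin_matrix=adjoin, mask=mask, training=True)  # (2, 30, 17)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(tokens, logits)
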
+ class PredictModel(tf.keras.Model):
+     def __init__(self, num_layers=8, d_model=256, dff=512, num_heads=8, vocab_size=17, dropout_rate=0.1, dense_dropout=0.1):
+         super(PredictModel, self).__init__()
+         self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
+                                num_heads=num_heads, dff=dff, input_vocab_size=vocab_size,
+                                maximum_position_encoding=200, rate=dropout_rate)
+
+         self.fc1 = tf.keras.layers.Dense(256, activation=tf.keras.layers.LeakyReLU(0.25))
+         self.fc2 = tf.keras.layers.Dense(256, activation=tf.keras.layers.LeakyReLU(0.25))
+         self.dropout = tf.keras.layers.Dropout(dense_dropout)
+         self.fc3 = tf.keras.layers.Dense(1)
+
+     def call(self, x, adjoin_matrix, mask, training=False):
+         x = self.encoder(x, training=training, mask=mask, adjoin_matrix=adjoin_matrix)
+         x = x[:, 0, :]  # pool the first (global) token as the molecule embedding
+         x = self.fc1(x)
+         x = self.dropout(x, training=training)
+         x = self.fc2(x)
+         x = self.fc3(x)  # single regression output
+         return x
+
+
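PredictModel reuses the same encoder but reads only the first token's state, treating it as a CLS-style molecule summary, and regresses a single value (here, PCE). A usage sketch with the hypothetical inputs from above:

    reg = PredictModel(num_layers=8, d_model=256, dff=512, num_heads=8)
    y_hat = reg(tokens, adjoin_matrix=adjoin, mask=mask, training=False)  # (2, 1)
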
+ class PredictModel_test(tf.keras.Model):
+     def __init__(self, num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=17, dropout_rate=0.1, dense_dropout=0.5):
+         super(PredictModel_test, self).__init__()
+         self.encoder = Encoder_test(num_layers=num_layers, d_model=d_model,
+                                     num_heads=num_heads, dff=dff, input_vocab_size=vocab_size,
+                                     maximum_position_encoding=200, rate=dropout_rate)
+
+         self.fc1 = tf.keras.layers.Dense(256, activation=tf.keras.layers.LeakyReLU(0.1))
+         self.dropout = tf.keras.layers.Dropout(dense_dropout)
+         self.fc2 = tf.keras.layers.Dense(1)
+
+     def call(self, x, adjoin_matrix, mask, training=False):
+         x, att, xs = self.encoder(x, training=training, mask=mask, adjoin_matrix=adjoin_matrix)
+         x = x[:, 0, :]  # pool the first (global) token as the molecule embedding
+         x = self.fc1(x)
+         x = self.dropout(x, training=training)
+         x = self.fc2(x)
+         return x, att, xs
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ altair
+ streamlit
+ streamlit-ketcher
+ tensorflow
+ pandas
+ rdkit
+ scikit-learn
+ matplotlib
+
+
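The pins are unversioned, so any recent release of each package is assumed to work; a typical setup is simply:

    pip install -r requirements.txt
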
utils.py ADDED
@@ -0,0 +1,696 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Thu Jul 28 14:40:59 2022
+
+ @author: BM109X32G-10GPU-02
+ """
+
+ import os
+ from collections import OrderedDict
+
+ import numpy as np
+ from rdkit import Chem
+ from rdkit.Chem import AllChem
+ from rdkit.Chem import rdchem
+
+ from compound_constants import DAY_LIGHT_FG_SMARTS_LIST
+
+
+ def get_gasteiger_partial_charges(mol, n_iter=12):
+     """
+     Calculate the Gasteiger partial charge of each atom in a mol object.
+     Args:
+         mol: RDKit mol object.
+         n_iter(int): number of iterations. Default 12.
+     Returns:
+         list of computed partial charges, one per atom.
+     """
+     Chem.rdPartialCharges.ComputeGasteigerCharges(mol, nIter=n_iter,
+                                                   throwOnParamFailure=True)
+     partial_charges = [float(a.GetProp('_GasteigerCharge'))
+                        for a in mol.GetAtoms()]
+     return partial_charges
+
+
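A minimal sketch of the charge helper on a small molecule (ethanol is an arbitrary choice):

    mol = Chem.MolFromSmiles('CCO')
    charges = get_gasteiger_partial_charges(mol)  # one float per atom (heavy atoms only, since no Hs were added)
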
+ def create_standardized_mol_id(smiles):
+     """
+     Args:
+         smiles: SMILES string.
+     Returns:
+         InChI of the (largest) molecule, or None if the SMILES is invalid.
+     """
+     if check_smiles_validity(smiles):
+         # remove stereochemistry
+         smiles = AllChem.MolToSmiles(AllChem.MolFromSmiles(smiles),
+                                      isomericSmiles=False)
+         mol = Chem.AddHs(AllChem.MolFromSmiles(smiles))
+
+         if mol is not None:  # to catch weird issue with O=C1O[al]2oc(=O)c3ccc(cn3)c3ccccc3c3cccc(c3)c3ccccc3c3cc(C(F)(F)F)c(cc3o2)-c2ccccc2-c2cccc(c2)-c2ccccc2-c2cccnc21
+             if '.' in smiles:  # if multiple species, pick largest molecule
+                 mol_species_list = split_rdkit_mol_obj(mol)
+                 largest_mol = get_largest_mol(mol_species_list)
+                 inchi = AllChem.MolToInchi(largest_mol)
+             else:
+                 inchi = AllChem.MolToInchi(mol)
+             return inchi
+         else:
+             return
+     else:
+         return
+
+
+ def check_smiles_validity(smiles):
+     """
+     Check whether a SMILES string can be parsed into an RDKit mol object.
+     """
+     try:
+         m = Chem.MolFromSmiles(smiles)
+         if m:
+             return True
+         else:
+             return False
+     except Exception:
+         return False
+
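These two helpers usually travel together; a minimal sketch of the intended flow:

    if check_smiles_validity('CCO'):
        mol_id = create_standardized_mol_id('CCO')  # stereochemistry-stripped InChI
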
+
+ def split_rdkit_mol_obj(mol):
+     """
+     Split an RDKit mol object containing one or more species into a list of
+     mol objects, one per species.
+     Args:
+         mol: RDKit mol object.
+     """
+     smiles = AllChem.MolToSmiles(mol, isomericSmiles=True)
+     smiles_list = smiles.split('.')
+     mol_species_list = []
+     for s in smiles_list:
+         if check_smiles_validity(s):
+             mol_species_list.append(AllChem.MolFromSmiles(s))
+     return mol_species_list
+
+
+ def get_largest_mol(mol_list):
+     """
+     Given a list of RDKit mol objects, return the mol with the largest number
+     of atoms; ties go to the first such mol.
+     Args:
+         mol_list(list): a list of RDKit mol objects.
+     Returns:
+         the largest mol.
+     """
+     num_atoms_list = [len(m.GetAtoms()) for m in mol_list]
+     largest_mol_idx = num_atoms_list.index(max(num_atoms_list))
+     return mol_list[largest_mol_idx]
+
+
+ def rdchem_enum_to_list(values):
+     """e.g. values = {0: rdkit.Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+                       1: rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+                       2: rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+                       3: rdkit.Chem.rdchem.ChiralType.CHI_OTHER}
+     """
+     return [values[i] for i in range(len(values))]
+
+
+ def safe_index(alist, elem):
+     """
+     Return the index of elem in alist; if elem is not present, return the
+     last index (the out-of-vocabulary slot).
+     """
+     try:
+         return alist.index(elem)
+     except ValueError:
+         return len(alist) - 1
+
+
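safe_index is what makes the vocabularies below tolerant of unseen values: anything not in the list lands on the final 'misc' entry. A tiny sketch:

    vocab = [1, 6, 7, 8, 'misc']
    safe_index(vocab, 6)    # -> 1
    safe_index(vocab, 92)   # -> 4, the 'misc' slot
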
+ def get_atom_feature_dims(list_acquired_feature_names):
+     """Return the vocabulary size of each requested atom feature."""
+     return list(map(len, [CompoundKit.atom_vocab_dict[name] for name in list_acquired_feature_names]))
+
+
+ def get_bond_feature_dims(list_acquired_feature_names):
+     """Return the vocabulary size of each requested bond feature."""
+     list_bond_feat_dim = list(map(len, [CompoundKit.bond_vocab_dict[name] for name in list_acquired_feature_names]))
+     # +1 for self loop edges
+     return [_l + 1 for _l in list_bond_feat_dim]
+
+
+ class CompoundKit(object):
+     """
+     CompoundKit
+     """
+     atom_vocab_dict = {
+         "atomic_num": list(range(1, 119)) + ['misc'],
+         "chiral_tag": rdchem_enum_to_list(rdchem.ChiralType.values),
+     }
+     bond_vocab_dict = {
+         "bond_dir": rdchem_enum_to_list(rdchem.BondDir.values),
+         "bond_type": rdchem_enum_to_list(rdchem.BondType.values),
+     }
+     # float features
+     atom_float_names = ["van_der_waals_radis", "partial_charge", 'mass']
+     # bond_float_feats = ["bond_length", "bond_angle"]  # optional
+
+     ### functional groups
+     day_light_fg_smarts_list = DAY_LIGHT_FG_SMARTS_LIST
+     day_light_fg_mo_list = [Chem.MolFromSmarts(smarts) for smarts in day_light_fg_smarts_list]
+
+     morgan_fp_N = 200
+     morgan2048_fp_N = 2048
+     maccs_fp_N = 167
+
+     period_table = Chem.GetPeriodicTable()
+
+     ### atom
+
+     @staticmethod
+     def get_atom_value(atom, name):
+         """get atom values"""
+         if name == 'atomic_num':
+             return atom.GetAtomicNum()
+         elif name == 'chiral_tag':
+             return atom.GetChiralTag()
+         elif name == 'degree':
+             return atom.GetDegree()
+         elif name == 'explicit_valence':
+             return atom.GetExplicitValence()
+         elif name == 'formal_charge':
+             return atom.GetFormalCharge()
+         elif name == 'hybridization':
+             return atom.GetHybridization()
+         elif name == 'implicit_valence':
+             return atom.GetImplicitValence()
+         elif name == 'is_aromatic':
+             return int(atom.GetIsAromatic())
+         elif name == 'mass':
+             return int(atom.GetMass())
+         elif name == 'total_numHs':
+             return atom.GetTotalNumHs()
+         elif name == 'num_radical_e':
+             return atom.GetNumRadicalElectrons()
+         elif name == 'atom_is_in_ring':
+             return int(atom.IsInRing())
+         elif name == 'valence_out_shell':
+             return CompoundKit.period_table.GetNOuterElecs(atom.GetAtomicNum())
+         else:
+             raise ValueError(name)
+
+     @staticmethod
+     def get_atom_feature_id(atom, name):
+         """get atom feature id"""
+         assert name in CompoundKit.atom_vocab_dict, "%s not found in atom_vocab_dict" % name
+         return safe_index(CompoundKit.atom_vocab_dict[name], CompoundKit.get_atom_value(atom, name))
+
+     @staticmethod
+     def get_atom_feature_size(name):
+         """get atom feature size"""
+         assert name in CompoundKit.atom_vocab_dict, "%s not found in atom_vocab_dict" % name
+         return len(CompoundKit.atom_vocab_dict[name])
+
+     ### bond
+
+     @staticmethod
+     def get_bond_value(bond, name):
+         """get bond values"""
+         if name == 'bond_dir':
+             return bond.GetBondDir()
+         elif name == 'bond_type':
+             return bond.GetBondType()
+         elif name == 'is_in_ring':
+             return int(bond.IsInRing())
+         elif name == 'is_conjugated':
+             return int(bond.GetIsConjugated())
+         elif name == 'bond_stereo':
+             return bond.GetStereo()
+         else:
+             raise ValueError(name)
+
+     @staticmethod
+     def get_bond_feature_id(bond, name):
+         """get bond feature id"""
+         assert name in CompoundKit.bond_vocab_dict, "%s not found in bond_vocab_dict" % name
+         return safe_index(CompoundKit.bond_vocab_dict[name], CompoundKit.get_bond_value(bond, name))
+
+     @staticmethod
+     def get_bond_feature_size(name):
+         """get bond feature size"""
+         assert name in CompoundKit.bond_vocab_dict, "%s not found in bond_vocab_dict" % name
+         return len(CompoundKit.bond_vocab_dict[name])
+
+     ### fingerprint
+
+     @staticmethod
+     def get_morgan_fingerprint(mol, radius=2):
+         """get morgan fingerprint"""
+         nBits = CompoundKit.morgan_fp_N
+         mfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
+         return [int(b) for b in mfp.ToBitString()]
+
+     @staticmethod
+     def get_morgan2048_fingerprint(mol, radius=2):
+         """get morgan2048 fingerprint"""
+         nBits = CompoundKit.morgan2048_fp_N
+         mfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
+         return [int(b) for b in mfp.ToBitString()]
+
+     @staticmethod
+     def get_maccs_fingerprint(mol):
+         """get maccs fingerprint"""
+         fp = AllChem.GetMACCSKeysFingerprint(mol)
+         return [int(b) for b in fp.ToBitString()]
+
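A quick sketch of the fingerprint helpers; the bit counts follow the class constants above:

    mol = Chem.MolFromSmiles('c1ccccc1O')
    morgan = CompoundKit.get_morgan_fingerprint(mol)  # 200 bits
    maccs = CompoundKit.get_maccs_fingerprint(mol)    # 167 bits
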
+     ### functional groups
+
+     @staticmethod
+     def get_daylight_functional_group_counts(mol):
+         """get daylight functional group counts"""
+         fg_counts = []
+         for fg_mol in CompoundKit.day_light_fg_mo_list:
+             sub_structs = Chem.Mol.GetSubstructMatches(mol, fg_mol, uniquify=True)
+             fg_counts.append(len(sub_structs))
+         return fg_counts
+
+     @staticmethod
+     def get_ring_size(mol):
+         """return an (N, 6) list: for each atom, the number of rings of size 3..8 it sits in"""
+         rings = mol.GetRingInfo()
+         rings_info = []
+         for r in rings.AtomRings():
+             rings_info.append(r)
+         ring_list = []
+         for atom in mol.GetAtoms():
+             atom_result = []
+             for ringsize in range(3, 9):
+                 num_of_ring_at_ringsize = 0
+                 for r in rings_info:
+                     if len(r) == ringsize and atom.GetIdx() in r:
+                         num_of_ring_at_ringsize += 1
+                 if num_of_ring_at_ringsize > 8:
+                     num_of_ring_at_ringsize = 9
+                 atom_result.append(num_of_ring_at_ringsize)
+
+             ring_list.append(atom_result)
+         return ring_list
+
+     @staticmethod
+     def atom_to_feat_vector(atom):
+         """map one atom to a dict of vocabulary indices (currently atomic_num only)"""
+         atom_names = {
+             "atomic_num": safe_index(CompoundKit.atom_vocab_dict["atomic_num"], atom.GetAtomicNum()),
+         }
+         return atom_names
+
+     @staticmethod
+     def get_atom_names(mol):
+         """get atom name list
+         TODO: to be removed in the future
+         NOTE: the in_num_ring_with_size* entries referenced below are not in the
+         trimmed atom_vocab_dict above; they must be restored before calling this.
+         """
+         atom_features_dicts = []
+         Chem.rdPartialCharges.ComputeGasteigerCharges(mol)
+         for i, atom in enumerate(mol.GetAtoms()):
+             atom_features_dicts.append(CompoundKit.atom_to_feat_vector(atom))
+
+         ring_list = CompoundKit.get_ring_size(mol)
+         for i, atom in enumerate(mol.GetAtoms()):
+             atom_features_dicts[i]['in_num_ring_with_size3'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size3'], ring_list[i][0])
+             atom_features_dicts[i]['in_num_ring_with_size4'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size4'], ring_list[i][1])
+             atom_features_dicts[i]['in_num_ring_with_size5'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size5'], ring_list[i][2])
+             atom_features_dicts[i]['in_num_ring_with_size6'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size6'], ring_list[i][3])
+             atom_features_dicts[i]['in_num_ring_with_size7'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size7'], ring_list[i][4])
+             atom_features_dicts[i]['in_num_ring_with_size8'] = safe_index(
+                 CompoundKit.atom_vocab_dict['in_num_ring_with_size8'], ring_list[i][5])
+
+         return atom_features_dicts
+
+     @staticmethod
+     def check_partial_charge(atom):
+         """clamp a Gasteiger partial charge to a finite, usable value"""
+         pc = atom.GetDoubleProp('_GasteigerCharge')
+         if pc != pc:
+             # unsupported atom, replace nan with 0
+             pc = 0
+         if pc == float('inf'):
+             # max is around 4 for other atoms; cap at 10 if inf is returned
+             pc = 10
+         return pc
+
+
+ class Compound3DKit(object):
+     """the 3D kit of Compound"""
+     @staticmethod
+     def get_atom_poses(mol, conf):
+         """extract (x, y, z) positions from a conformer; any dummy atom zeroes the whole list"""
+         atom_poses = []
+         for i, atom in enumerate(mol.GetAtoms()):
+             if atom.GetAtomicNum() == 0:
+                 return [[0.0, 0.0, 0.0]] * len(mol.GetAtoms())
+             pos = conf.GetAtomPosition(i)
+             atom_poses.append([pos.x, pos.y, pos.z])
+         return atom_poses
+
+     @staticmethod
+     def get_MMFF_atom_poses(mol, numConfs=None, return_energy=False):
+         """the atoms of mol will be changed in some cases."""
+         try:
+             new_mol = Chem.AddHs(mol)
+             res = AllChem.EmbedMultipleConfs(new_mol, numConfs=numConfs)
+             ### MMFF optimizes all embedded conformations; keep the lowest-energy one
+             res = AllChem.MMFFOptimizeMoleculeConfs(new_mol)
+             # new_mol = Chem.RemoveHs(new_mol)
+             index = np.argmin([x[1] for x in res])
+             energy = res[index][1]
+             conf = new_mol.GetConformer(id=int(index))
+         except Exception:
+             # embedding/optimization failed; fall back to 2D coordinates
+             new_mol = Chem.AddHs(mol)
+             AllChem.Compute2DCoords(new_mol)
+             energy = 0
+             conf = new_mol.GetConformer()
+
+         atom_poses = Compound3DKit.get_atom_poses(new_mol, conf)
+         if return_energy:
+             return new_mol, atom_poses, energy
+         else:
+             return new_mol, atom_poses
+
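A sketch of conformer generation: embed, MMFF-optimize, and keep the lowest-energy conformer, with a 2D fallback on failure (numConfs=10 mirrors the call used later in this file):

    mol = Chem.MolFromSmiles('CCO')
    mol_h, poses = Compound3DKit.get_MMFF_atom_poses(mol, numConfs=10)
    # poses is a per-atom [x, y, z] list for the H-added molecule
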
+     @staticmethod
+     def get_2d_atom_poses(mol):
+         """get 2d atom poses"""
+         AllChem.Compute2DCoords(mol)
+         conf = mol.GetConformer()
+         atom_poses = Compound3DKit.get_atom_poses(mol, conf)
+         return atom_poses
+
+     @staticmethod
+     def get_bond_lengths(edges, atom_poses):
+         """get bond lengths"""
+         bond_lengths = []
+         for src_node_i, tar_node_j in edges:
+             bond_lengths.append(np.linalg.norm(atom_poses[tar_node_j] - atom_poses[src_node_i]))
+         bond_lengths = np.array(bond_lengths, 'float32')
+         return bond_lengths
+
+     @staticmethod
+     def get_superedge_angles(edges, atom_poses, dir_type='HT'):
+         """get superedge angles"""
+         def _get_vec(atom_poses, edge):
+             return atom_poses[edge[1]] - atom_poses[edge[0]]
+
+         def _get_angle(vec1, vec2):
+             norm1 = np.linalg.norm(vec1)
+             norm2 = np.linalg.norm(vec2)
+             if norm1 == 0 or norm2 == 0:
+                 return 0
+             vec1 = vec1 / (norm1 + 1e-5)  # 1e-5: prevent numerical errors
+             vec2 = vec2 / (norm2 + 1e-5)
+             angle = np.arccos(np.dot(vec1, vec2))
+             return angle
+
+         E = len(edges)
+         edge_indices = np.arange(E)
+         super_edges = []
+         bond_angles = []
+         bond_angle_dirs = []
+         for tar_edge_i in range(E):
+             tar_edge = edges[tar_edge_i]
+             if dir_type == 'HT':
+                 src_edge_indices = edge_indices[edges[:, 1] == tar_edge[0]]
+             elif dir_type == 'HH':
+                 src_edge_indices = edge_indices[edges[:, 1] == tar_edge[1]]
+             else:
+                 raise ValueError(dir_type)
+             for src_edge_i in src_edge_indices:
+                 if src_edge_i == tar_edge_i:
+                     continue
+                 src_edge = edges[src_edge_i]
+                 src_vec = _get_vec(atom_poses, src_edge)
+                 tar_vec = _get_vec(atom_poses, tar_edge)
+                 super_edges.append([src_edge_i, tar_edge_i])
+                 angle = _get_angle(src_vec, tar_vec)
+                 bond_angles.append(angle)
+                 bond_angle_dirs.append(src_edge[1] == tar_edge[0])  # H -> H or H -> T
+
+         if len(super_edges) == 0:
+             super_edges = np.zeros([0, 2], 'int64')
+             bond_angles = np.zeros([0, ], 'float32')
+         else:
+             super_edges = np.array(super_edges, 'int64')
+             bond_angles = np.array(bond_angles, 'float32')
+         return super_edges, bond_angles, bond_angle_dirs
+
+
+
+ def new_smiles_to_graph_data(smiles, **kwargs):
+     """
+     Convert a SMILES string to graph data.
+     """
+     mol = AllChem.MolFromSmiles(smiles)
+     if mol is None:
+         return None
+     mol = Chem.AddHs(mol)  # parse first, then add Hs, so invalid SMILES return None
+     data = new_mol_to_graph_data(mol)
+     return data
+
+
+ def new_mol_to_graph_data(mol):
+     """
+     mol_to_graph_data
+     Returns a dict of atom features, edge features, Morgan and MACCS
+     fingerprints, and daylight functional group counts.
+     NOTE: relies on get_atom_names(), so every name in atom_id_names must be
+     produced by the (currently trimmed) atom feature helpers above.
+     """
+     if len(mol.GetAtoms()) == 0:
+         return None
+
+     atom_id_names = list(CompoundKit.atom_vocab_dict.keys()) + CompoundKit.atom_float_names
+     bond_id_names = list(CompoundKit.bond_vocab_dict.keys())
+
+     ### atom features
+     data = {name: [] for name in atom_id_names}
+
+     raw_atom_feat_dicts = CompoundKit.get_atom_names(mol)
+     for atom_feat in raw_atom_feat_dicts:
+         for name in atom_id_names:
+             data[name].append(atom_feat[name])
+
+     ### bond and bond features
+     for name in bond_id_names:
+         data[name] = []
+     data['edges'] = []
+
+     for bond in mol.GetBonds():
+         i = bond.GetBeginAtomIdx()
+         j = bond.GetEndAtomIdx()
+         # i->j and j->i
+         data['edges'] += [(i, j), (j, i)]
+         for name in bond_id_names:
+             bond_feature_id = CompoundKit.get_bond_feature_id(bond, name)
+             data[name] += [bond_feature_id] * 2
+
+     #### self loop
+     N = len(data[atom_id_names[0]])
+     for i in range(N):
+         data['edges'] += [(i, i)]
+     for name in bond_id_names:
+         bond_feature_id = get_bond_feature_dims([name])[0] - 1  # self loop: value = len - 1
+         data[name] += [bond_feature_id] * N
+
+     ### make ndarray and check length
+     for name in list(CompoundKit.atom_vocab_dict.keys()):
+         data[name] = np.array(data[name], 'int64')
+     for name in CompoundKit.atom_float_names:
+         data[name] = np.array(data[name], 'float32')
+     for name in bond_id_names:
+         data[name] = np.array(data[name], 'int64')
+     data['edges'] = np.array(data['edges'], 'int64')
+
+     ### morgan fingerprint
+     data['morgan_fp'] = np.array(CompoundKit.get_morgan_fingerprint(mol), 'int64')
+     # data['morgan2048_fp'] = np.array(CompoundKit.get_morgan2048_fingerprint(mol), 'int64')
+     data['maccs_fp'] = np.array(CompoundKit.get_maccs_fingerprint(mol), 'int64')
+     data['daylight_fg_counts'] = np.array(CompoundKit.get_daylight_functional_group_counts(mol), 'int64')
+     return data
+
+
+ def mol_to_graph_data(mol):
+     """
+     mol_to_graph_data
+     Returns:
+         dict with atom feature ids ('atomic_num'), atom masses, bond feature
+         ids ('bond_dir', 'bond_type'), the edge list, and atom symbols.
+     """
+     if len(mol.GetAtoms()) == 0:
+         return None
+
+     atom_id_names = [
+         "atomic_num"
+     ]
+     bond_id_names = [
+         "bond_dir", "bond_type"
+     ]
+
+     data = {}
+     for name in atom_id_names:
+         data[name] = []
+     data['mass'] = []
+     for name in bond_id_names:
+         data[name] = []
+     data['edges'] = []
+
+     ### atom features
+     for i, atom in enumerate(mol.GetAtoms()):
+         if atom.GetAtomicNum() == 0:
+             return None
+         for name in atom_id_names:
+             data[name].append(CompoundKit.get_atom_feature_id(atom, name) + 1)  # 0: OOV
+         data['mass'].append(CompoundKit.get_atom_value(atom, 'mass') * 0.01)
+
+     ### bond features
+     for bond in mol.GetBonds():
+         i = bond.GetBeginAtomIdx()
+         j = bond.GetEndAtomIdx()
+         # i->j and j->i
+         data['edges'] += [(i, j), (j, i)]
+         for name in bond_id_names:
+             bond_feature_id = CompoundKit.get_bond_feature_id(bond, name) + 1  # 0: OOV
+             data[name] += [bond_feature_id] * 2
+     num_atoms = mol.GetNumAtoms()
+     atoms_list = []
+     for i in range(num_atoms):
+         atom = mol.GetAtomWithIdx(i)
+         atoms_list.append(atom.GetSymbol())
+
+     ### self loop (+2)
+     N = len(data[atom_id_names[0]])
+     for i in range(N):
+         data['edges'] += [(i, i)]
+     for name in bond_id_names:
+         bond_feature_id = CompoundKit.get_bond_feature_size(name) + 2  # N + 2: self loop
+         data[name] += [bond_feature_id] * N
+
+     ### check whether edge exists
+     if len(data['edges']) == 0:  # mol has no bonds
+         for name in bond_id_names:
+             data[name] = np.zeros((0,), dtype="int64")
+         data['edges'] = np.zeros((0, 2), dtype="int64")
+
+     ### make ndarray and check length
+     for name in atom_id_names:
+         data[name] = np.array(data[name], 'int64')
+     data['mass'] = np.array(data['mass'], 'float32')
+     for name in bond_id_names:
+         data[name] = np.array(data[name], 'int64')
+     data['edges'] = np.array(data['edges'], 'int64')
+     data['atoms'] = np.array(atoms_list)
+     ### fingerprints (disabled for this pipeline)
+     # data['morgan_fp'] = np.array(CompoundKit.get_morgan_fingerprint(mol), 'int64')
+     # data['morgan2048_fp'] = np.array(CompoundKit.get_morgan2048_fingerprint(mol), 'int64')
+     # data['maccs_fp'] = np.array(CompoundKit.get_maccs_fingerprint(mol), 'int64')
+     # data['daylight_fg_counts'] = np.array(CompoundKit.get_daylight_functional_group_counts(mol), 'int64')
+     # return data['bonds_dir'], data['adj_angle']
+     return data
+
+
+ def mol_to_geognn_graph_data(mol, atom_poses, dir_type):
+     """
+     mol: RDKit molecule
+     dir_type: direction type for the bond_angle graph
+     Returns:
+         (atom symbols, distance-weighted adjacency matrix)
+     """
+     if len(mol.GetAtoms()) == 0:
+         return None
+
+     data = mol_to_graph_data(mol)
+
+     data['atom_pos'] = np.array(atom_poses, 'float32')
+     data['bond_length'] = Compound3DKit.get_bond_lengths(data['edges'], data['atom_pos'])
+     # BondAngleGraph_edges, bond_angles, bond_angle_dirs = \
+     #     Compound3DKit.get_superedge_angles(data['edges'], data['atom_pos'])
+     # data['BondAngleGraph_edges'] = BondAngleGraph_edges
+     # data['bond_angle'] = np.array(bond_angles, 'float32')
+     data['adj_node'] = gen_adj(len(data['atoms']), data['edges'], data['bond_length'])
+     # data['adj_edge'] = gen_adj(len(data['bond_dir']), data['BondAngleGraph_edges'], data['bond_angle'])
+     return data['atoms'], data['adj_node']
+
+
+ def mol_to_geognn_graph_data_MMFF3d(smiles):
+     """build graph data from a SMILES string, using MMFF 3D poses when feasible"""
+     mol = Chem.AddHs(AllChem.MolFromSmiles(smiles))
+     if len(mol.GetAtoms()) <= 400:
+         mol, atom_poses = Compound3DKit.get_MMFF_atom_poses(mol, numConfs=10)
+     else:
+         atom_poses = Compound3DKit.get_2d_atom_poses(mol)  # too large for a conformer search
+     return mol_to_geognn_graph_data(mol, atom_poses, dir_type='HT')
+
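End-to-end, a single SMILES becomes an atom list plus a bond-length-weighted adjacency matrix; a minimal sketch:

    atoms, adj = mol_to_geognn_graph_data_MMFF3d('CCO')
    # atoms: array of element symbols (H-added molecule); adj: (N, N) matrix,
    # zero for non-bonded pairs, bond length on bonded pairs
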
+ def mol_to_geognn_graph_data_raw3d(mol):
+     """build graph data from a mol that already carries a 3D conformer"""
+     atom_poses = Compound3DKit.get_atom_poses(mol, mol.GetConformer())
+     return mol_to_geognn_graph_data(mol, atom_poses, dir_type='HT')
+
+
+ def gen_adj(shape, edges, length):
+     """build a (shape, shape) adjacency matrix weighted by edge length,
+     skipping self loops"""
+     adj = edges
+     e = shape
+     adj_matrix = np.zeros([e, e])
+
+     for i in range(len(length)):
+         if adj[i, 0] != adj[i, 1]:  # skip self loops
+             adj_matrix[adj[i, 0], adj[i, 1]] = round(float(length[i]), 3)
+
+     return adj_matrix
+
+
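A tiny numeric sketch of gen_adj with one bond plus the two self loops appended by mol_to_graph_data (the values are illustrative):

    edges = np.array([(0, 1), (1, 0), (0, 0), (1, 1)], 'int64')
    lengths = np.array([1.54, 1.54, 0.0, 0.0], 'float32')
    gen_adj(2, edges, lengths)
    # -> [[0.    1.54]
    #     [1.54  0.  ]]
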
+ if __name__ == "__main__":
+     import pandas as pd
+     from tqdm import tqdm
+
+     # Precompute atom lists and adjacency matrices for the train/test/val splits.
+     f = pd.read_csv(r"data/reg/train3.csv")
+     re = []
+     pce = f['PCE']
+     for ind, smile in enumerate(f.iloc[:, 1]):
+         print(ind)
+         atom, adj = mol_to_geognn_graph_data_MMFF3d(smile)
+         np.save('data/reg/train/adj' + str(ind) + '.npy', np.array(adj))
+         re.append([atom, 'data/reg/train/adj' + str(ind) + '.npy', pce[ind]])
+     r = pd.DataFrame(re)
+     r.to_csv('data/reg/train/train.csv')
+     re = []
+
+     f = pd.read_csv(r'data/reg/test3.csv')
+     re = []
+     pce = f['PCE']
+
+     for ind, smile in enumerate(f.iloc[:, 1]):
+         print(ind)
+         atom, adj = mol_to_geognn_graph_data_MMFF3d(smile)
+         np.save('data/reg/test/adj' + str(ind) + '.npy', np.array(adj))
+         re.append([atom, 'data/reg/test/adj' + str(ind) + '.npy', pce[ind]])
+     r = pd.DataFrame(re)
+     r.to_csv('data/reg/test/test.csv')
+
+     f = pd.read_csv(r'val.csv')
+     re = []
+     pce = f['PCE']
+
+     for ind, smile in enumerate(f.iloc[:, 1]):
+         print(ind)
+         atom, adj = mol_to_geognn_graph_data_MMFF3d(smile)
+         np.save('data/reg/val/adj' + str(ind) + '.npy', np.array(adj))
+         re.append([atom, 'data/reg/val/adj' + str(ind) + '.npy', pce[ind]])
+     r = pd.DataFrame(re)
+     r.to_csv('data/reg/val/val.csv')