Eachan Johnson committed · Commit 2dceef8 · 0 parent(s)

Initial commit
Files changed:

- .github/workflows/python-package.yml +48 -0
- .github/workflows/python-publish.yml +42 -0
- .gitignore +28 -0
- .readthedocs.yml +25 -0
- LICENSE +21 -0
- README.md +73 -0
- build/lib/schemist/__init__.py +0 -0
- build/lib/schemist/cleaning.py +27 -0
- build/lib/schemist/cli.py +536 -0
- build/lib/schemist/collating.py +315 -0
- build/lib/schemist/converting.py +308 -0
- build/lib/schemist/features.py +149 -0
- build/lib/schemist/generating.py +262 -0
- build/lib/schemist/io.py +149 -0
- build/lib/schemist/rest_lookup.py +118 -0
- build/lib/schemist/splitting.py +204 -0
- build/lib/schemist/tables.py +220 -0
- build/lib/schemist/typing.py +7 -0
- build/lib/schemist/utils.py +1 -0
- docs/requirements.txt +8 -0
- docs/source/conf.py +45 -0
- docs/source/index.md +21 -0
- pyproject.toml +60 -0
- schemist/__init__.py +0 -0
- schemist/cleaning.py +27 -0
- schemist/cli.py +536 -0
- schemist/collating.py +315 -0
- schemist/converting.py +308 -0
- schemist/features.py +149 -0
- schemist/generating.py +262 -0
- schemist/io.py +149 -0
- schemist/rest_lookup.py +118 -0
- schemist/splitting.py +204 -0
- schemist/tables.py +220 -0
- schemist/typing.py +7 -0
- schemist/utils.py +1 -0
- test/data/AmpC_screen_table_10k.csv.gz +0 -0
- test/tests.py +6 -0
.github/workflows/python-package.yml
ADDED
@@ -0,0 +1,48 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest pytest-cov
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        pip install -e .
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest htstools --doctest-modules --junitxml=tests/test-results.xml --cov=com --cov-report=xml --cov-report=html
    - name: Test with scripts
      run: |
        bash test/scripts/test-plate-tab.sh
        bash test/scripts/test-row-xlsx.sh
    - name: Upload pytest test results
      uses: actions/upload-artifact@v3
      with:
        name: pytest-results-${{ matrix.python-version }}
        path: junit/test-results-${{ matrix.python-version }}.xml
      # Use always() to always run this step to publish test results when there are test failures
      if: ${{ always() }}

.github/workflows/python-publish.yml
ADDED
@@ -0,0 +1,42 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build --sdist --wheel --outdir dist
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}

.gitignore
ADDED
@@ -0,0 +1,28 @@
*.pyc
*.so
*.egg-info
*.whl
.DS_Store
.mypy_cache/
.pytype/
.idea
.vscode
.envrc
__pycache__
.pytest_cache

# Sphinx
/docs/build/
/docs/_autosummary/
/docs/make.bat
/docs/Makefile

/test/outputs/
/test/data/collate/

# virtualenv/venv directories
/venv/
/bin/
/include/
/lib/
/share/

.readthedocs.yml
ADDED
@@ -0,0 +1,25 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

build:
  os: "ubuntu-20.04"
  tools:
    python: "3.10"

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/source/conf.py
  fail_on_warning: false

# Optionally build your docs in additional formats such as PDF and ePub
formats:
  - htmlzip

# Optionally set the version of Python and requirements required to build your docs
python:
  install:
    - requirements: docs/requirements.txt

LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md
ADDED
@@ -0,0 +1,73 @@
# ⬢⬢⬢ schemist




Cleaning, collating, and augmenting chemical datasets.

- [Installation](#installation)
- [Command-line usage](#command-line-usage)
- [Example](#example)
- [Other commands](#other-commands)
- [Python API](#python-api)
- [Documentation](#documentation)

## Installation

### The easy way

Install the pre-compiled version from PyPI:

```bash
pip install schemist
```

### From source

Clone the repository, then `cd` into it. Then run:

```bash
pip install -e .
```

## Command-line usage

**schemist** provides command-line utilities to ... The tools complete specific tasks which
can be easily composed into analysis pipelines, because the TSV table output goes to
`stdout` by default so they can be piped from one tool to another.

To get a list of commands (tools), do

```bash
schemist --help
```

And to get help for a specific command, do

```bash
schemist <command> --help
```

For the Python API, [see below](#python-api).

## Example


## Other commands


## Python API

**schemist** can be imported into Python to help make custom analyses.

```python
>>> import schemist as sch
```


## Documentation

Full API documentation is at [ReadTheDocs](https://schemist.readthedocs.org).

build/lib/schemist/__init__.py
ADDED
File without changes
build/lib/schemist/cleaning.py
ADDED
@@ -0,0 +1,27 @@
"""Chemical structure cleaning routines."""

from carabiner.decorators import vectorize

from datamol import sanitize_smiles
import selfies as sf

@vectorize
def clean_smiles(smiles: str,
    *args, **kwargs) -> str:

    """Sanitize a SMILES string or list of SMILES strings.

    """

    return sanitize_smiles(smiles, *args, **kwargs)


@vectorize
def clean_selfies(selfies: str,
    *args, **kwargs) -> str:

    """Sanitize a SELFIES string or list of SELFIES strings.

    """

    return sf.encode(sanitize_smiles(sf.decode(selfies), *args, **kwargs))

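Both helpers above are vectorized by carabiner, so they accept either a single string or a list, as their docstrings state. A minimal usage sketch (assuming the package is installed; return values are illustrative, not verified output):

```python
# Sketch only: clean_smiles wraps datamol.sanitize_smiles and is vectorized,
# so it accepts one SMILES string or a list of them.
from schemist.cleaning import clean_smiles

one = clean_smiles("c1ccccc1O")               # a single sanitized SMILES string
many = clean_smiles(["CCO", "C1=CC=CC=C1"])   # a list of sanitized SMILES
print(one, many)
```
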
build/lib/schemist/cli.py
ADDED
@@ -0,0 +1,536 @@
"""Command-line interface for schemist."""

from typing import Any, Dict, List, Optional

from argparse import FileType, Namespace
from collections import Counter, defaultdict
from functools import partial
import os
import sys
from tempfile import NamedTemporaryFile, TemporaryDirectory

from carabiner import pprint_dict, upper_and_lower, print_err
from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
from carabiner.itertools import tenumerate
from carabiner.pd import get_formats, write_stream

from .collating import collate_inventory, deduplicate_file
from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
from .generating import AA, REACTIONS
from .io import _mutate_df_stream
from .tables import (converter, cleaner, featurizer, assign_groups,
    _assign_splits, splitter, _peptide_table, reactor)
from .splitting import _SPLITTERS, _GROUPED_SPLITTERS

__version__ = '0.0.1'

def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:

    options = {}

    try:
        for opt in x:

            try:
                key, value = opt.split('=')
            except ValueError:
                raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")

            try:
                value = int(value)
            except ValueError:
                try:
                    value = float(value)
                except ValueError:
                    pass

            options[key] = value

    except TypeError:

        pass

    return options


def _sum_tally(tallies: Counter,
    message: str = "Error counts",
    use_length: bool = False):

    total_tally = Counter()

    for tally in tallies:

        if use_length:
            total_tally.update({key: len(value) for key, value in tally.items()})
        else:
            total_tally.update(tally)

    if len(tallies) == 0:
        raise ValueError(f"Nothing generated!")

    pprint_dict(total_tally, message=message)

    return total_tally


@clicommand(message="Cleaning file with the following parameters")
def _clean(args: Namespace) -> None:

    error_tallies = _mutate_df_stream(input_file=args.input,
        output_file=args.output,
        function=partial(cleaner,
            column=args.column,
            input_representation=args.representation,
            prefix=args.prefix),
        file_format=args.format)

    _sum_tally(error_tallies)

    return None


@clicommand(message="Converting between string representations with the following parameters")
def _convert(args: Namespace) -> None:

    options = _option_parser(args.options)

    error_tallies = _mutate_df_stream(input_file=args.input,
        output_file=args.output,
        function=partial(converter,
            column=args.column,
            input_representation=args.representation,
            output_representation=args.to,
            prefix=args.prefix,
            options=options),
        file_format=args.format)

    _sum_tally(error_tallies)

    return None


@clicommand(message="Adding features to files with the following parameters")
def _featurize(args: Namespace) -> None:

    error_tallies = _mutate_df_stream(input_file=args.input,
        output_file=args.output,
        function=partial(featurizer,
            feature_type=args.feature,
            column=args.column,
            ids=args.id,
            input_representation=args.representation,
            prefix=args.prefix),
        file_format=args.format)

    _sum_tally(error_tallies)

    return None


@clicommand(message="Splitting table with the following parameters")
def _split(args: Namespace) -> None:

    split_type = args.type.casefold()

    if split_type in _GROUPED_SPLITTERS:

        chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]

        with TemporaryDirectory() as dir:

            with NamedTemporaryFile("w", dir=dir, delete=False) as f:

                group_idxs = _mutate_df_stream(input_file=args.input,
                    output_file=f,
                    function=partial(assign_groups,
                        grouper=chunk_processor,
                        group_name=split_type,
                        column=args.column,
                        input_representation=args.representation),
                    file_format=args.format)
                f.close()
                new_group_idx = defaultdict(list)

                totals = 0
                for group_idx in group_idxs:
                    these_totals = 0
                    for key, value in group_idx.items():
                        these_totals += len(value)
                        new_group_idx[key] += [idx + totals for idx in value]
                    totals += these_totals

                group_idx = aggregator(new_group_idx,
                    train=args.train,
                    test=args.test)

                split_tallies = _mutate_df_stream(input_file=f.name,
                    output_file=args.output,
                    function=partial(_assign_splits,
                        split_idx=group_idx,
                        use_df_index=True),
                    file_format=args.format)
                if os.path.exists(f.name):
                    os.remove(f.name)

    else:

        split_tallies = _mutate_df_stream(input_file=args.input,
            output_file=args.output,
            function=partial(splitter,
                split_type=args.type,
                column=args.column,
                input_representation=args.representation,
                train=args.train,
                test=args.test,
                set_seed=args.seed),
            file_format=args.format)

    _sum_tally(split_tallies,
        message="Split counts")

    return None


@clicommand(message="Collating files with the following parameters")
def _collate(args: Namespace) -> None:

    root_dir = args.data_dir or '.'

    error_tallies = _mutate_df_stream(input_file=args.input,
        output_file=args.output,
        function=partial(collate_inventory,
            root_dir=root_dir,
            drop_unmapped=not args.keep_extra_columns,
            catalog_smiles_column=args.column,
            id_column_name=args.id_column,
            id_n_digits=args.digits,
            id_prefix=args.prefix),
        file_format=args.format)

    _sum_tally(error_tallies,
        message="Collated chemicals:")

    return None


@clicommand(message="Deduplicating chemical structures with the following parameters")
def _dedup(args: Namespace) -> None:

    report, deduped_df = deduplicate_file(args.input,
        format=args.format,
        column=args.column,
        input_representation=args.representation,
        index_columns=args.indexes)

    if args.prefix is not None and 'inchikey' in deduped_df:
        deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})

    write_stream(deduped_df,
        output=args.output,
        format=args.format)

    pprint_dict(report, message="Finished deduplicating:")

    return None


@clicommand(message="Enumerating peptides with the following parameters")
def _enum(args: Namespace) -> None:

    tables = _peptide_table(max_length=args.max_length,
        min_length=args.min_length,
        n=args.number,
        indexes=args.slice,
        set_seed=args.seed,
        prefix=args.prefix,
        suffix=args.suffix,
        d_aa_only=args.d_aa_only,
        include_d_aa=args.include_d_aa,
        generator=True)

    dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
    dAA_use = dAA_use or args.include_d_aa or args.d_aa_only

    tallies, error_tallies = [], []
    options = _option_parser(args.options)
    _converter = partial(converter,
        column='peptide_sequence',
        input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
        output_representation=args.to,
        options=options)

    for i, table in tenumerate(tables):

        _err_tally, df = _converter(table)

        tallies.append({"Number of peptides": df.shape[0]})
        error_tallies.append(_err_tally)

        write_stream(df,
            output=args.output,
            format=args.format,
            mode='w' if i == 0 else 'a',
            header=i == 0)

    _sum_tally(tallies,
        message="Enumerated peptides")
    _sum_tally(error_tallies,
        message="Conversion errors")

    return None


@clicommand(message="Reacting peptides with the following parameters")
def _react(args: Namespace) -> None:

    error_tallies = _mutate_df_stream(input_file=args.input,
        output_file=args.output,
        function=partial(reactor,
            column=args.column,
            input_representation=args.representation,
            reaction=args.reaction,
            product_name=args.name),
        file_format=args.format)

    _sum_tally(error_tallies)

    return None


def main() -> None:

    inputs = CLIOption('input',
        default=sys.stdin,
        type=FileType('r'),
        nargs='?',
        help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
    representation = CLIOption('--representation', '-r',
        type=str,
        default='SMILES',
        choices=upper_and_lower(_FROM_FUNCTIONS),
        help='Chemical representation to use for input. ')
    column = CLIOption('--column', '-c',
        default='smiles',
        type=str,
        help='Column to use as input string representation. ')
    prefix = CLIOption('--prefix', '-p',
        default=None,
        type=str,
        help='Prefix to add to new column name. Default: no prefix')
    to = CLIOption('--to', '-2',
        type=str,
        default='SMILES',
        nargs='*',
        choices=upper_and_lower(_TO_FUNCTIONS),
        help='Format to convert to.')
    options = CLIOption('--options', '-x',
        type=str,
        default=None,
        nargs='*',
        help='Options to pass to converter, in the format '
            '"keyword1=value1 keyword2=value2"')
    output = CLIOption('--output', '-o',
        type=FileType('w'),
        default=sys.stdout,
        help='Output file. Default: STDOUT')
    formatting = CLIOption('--format', '-f',
        type=str,
        default=None,
        choices=upper_and_lower(get_formats()),
        help='Override file extensions for input and output. '
            'Default: infer from file extension.')

    ## featurize
    id_feat = CLIOption('--id', '-i',
        type=str,
        default=None,
        nargs='*',
        help='Columns to retain in output table. Default: use all')
    feature = CLIOption('--feature', '-t',
        type=str,
        default='2d',
        choices=['2d', 'fp'], ## TODO: implement 3d
        help='Which feature type to generate.')

    ## split
    type_ = CLIOption('--type', '-t',
        type=str,
        default='random',
        choices=upper_and_lower(_SPLITTERS),
        help='Which split type to use.')
    train = CLIOption('--train', '-a',
        type=float,
        default=1.,
        help='Proportion of data to use for training. ')
    test = CLIOption('--test', '-b',
        type=float,
        default=0.,
        help='Proportion of data to use for testing. ')

    ## collate
    data_dir = CLIOption('--data-dir', '-d',
        type=str,
        default=None,
        help='Directory containing data files. '
            'Default: current directory')
    id_column = CLIOption('--id-column', '-s',
        default=None,
        type=str,
        help='If provided, add a structure ID column with this name. '
            'Default: don\'t add structure IDs')
    prefix_collate = CLIOption('--prefix', '-p',
        default='ID-',
        type=str,
        help='Prefix to add to structure IDs. '
            'Default: no prefix')
    digits = CLIOption('--digits', '-n',
        default=8,
        type=int,
        help='Number of digits in structure IDs. ')
    keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
        action='store_true',
        help='Whether to keep columns not mentioned in the catalog. '
            'Default: drop extra columns.')
    keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
        action='store_true',
        help='Whether to keep rows with invalid SMILES. '
            'Default: drop invalid rows.')

    ## dedup
    indexes = CLIOption('--indexes', '-x',
        type=str,
        default=None,
        nargs='*',
        help='Columns to retain and collapse (if multiple values per unique structure). '
            'Default: retain no other columns than structure and InchiKey.')
    drop_inchikey = CLIOption('--drop-inchikey', '-d',
        action='store_true',
        help='Whether to drop the calculated InchiKey column. '
            'Default: keep InchiKey.')

    ### enum
    max_length = CLIOption('--max-length', '-l',
        type=int,
        help='Maximum length of enumerated peptide. '
            'Required.')
    min_length = CLIOption('--min-length', '-m',
        type=int,
        default=None,
        help='Minimum length of enumerated peptide. '
            'Default: same as maximum, i.e. all peptides same length.')
    number_to_gen = CLIOption('--number', '-n',
        type=float,
        default=None,
        help='Number of peptides to sample from all possible '
            'within the constraints. If less than 1, sample '
            'that fraction of all possible. If greater than 1, '
            'sample that number. '
            'Default: return all peptides.')
    slicer = CLIOption('--slice', '-z',
        type=str,
        default=None,
        nargs='*',
        help='Subset of (possibly sampled) population to return, in the format <stop> '
            'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
            'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
            '3 500 2 gives every other from 3 to 500. Default: return all.')
    alphabet = CLIOption('--alphabet', '-b',
        type=str,
        default=''.join(AA),
        help='Alphabet to use in sampling.')
    suffix = CLIOption('--suffix', '-s',
        type=str,
        default='',
        help='Sequence to add to end. Lowercase for D-amino acids. '
            'Default: no suffix.')
    set_seed = CLIOption('--seed', '-e',
        type=int,
        default=None,
        help='Seed to use for reproducible randomness. '
            'Default: don\'t enable reproducibility.')
    d_aa_only = CLIOption('--d-aa-only', '-a',
        action='store_true',
        help='Whether to only use D-amino acids. '
            'Default: don\'t include.')
    include_d_aa = CLIOption('--include-d-aa', '-y',
        action='store_true',
        help='Whether to include D-amino acids in enumeration. '
            'Default: don\'t include.')

    ## reaction
    name = CLIOption('--name', '-n',
        type=str,
        default=None,
        help='Name of column for product. '
            'Default: same as reaction name.')
    reaction_opt = CLIOption('--reaction', '-x',
        type=str,
        nargs='*',
        choices=list(REACTIONS),
        default='N_to_C_cyclization',
        help='Reaction(s) to apply.')

    clean = CLICommand('clean',
        description='Clean and normalize SMILES column of a table.',
        main=_clean,
        options=[output, formatting, inputs, representation, column, prefix])
    convert = CLICommand('convert',
        description='Convert between string representations of chemical structures.',
        main=_convert,
        options=[output, formatting, inputs, representation, column, prefix, to, options])
    featurize = CLICommand('featurize',
        description='Convert between string representations of chemical structures.',
        main=_featurize,
        options=[output, formatting, inputs, representation, column, prefix,
            id_feat, feature])
    collate = CLICommand('collate',
        description='Collect disparate tables or SDF files of libraries into a single table.',
        main=_collate,
        options=[output, formatting, inputs, representation,
            data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
            digits, keep_extra_columns, keep_invalid_smiles])
    dedup = CLICommand('dedup',
        description='Deduplicate chemical structures and retain references.',
        main=_dedup,
        options=[output, formatting, inputs, representation, column, prefix,
            indexes, drop_inchikey])
    enum = CLICommand('enumerate',
        description='Enumerate bio-chemical structures within length and sequence constraints.',
        main=_enum,
        options=[output, formatting, to, options,
            alphabet, max_length, min_length, number_to_gen,
            slicer, set_seed,
            prefix.replace(default='',
                help='Sequence to prepend. Lowercase for D-amino acids. '
                    'Default: no prefix.'),
            suffix,
            type_.replace(default='aa',
                choices=['aa'],
                help='Type of bio sequence to enumerate. '
                    'Default: %(default)s.'),
            d_aa_only, include_d_aa])
    reaction = CLICommand('react',
        description='React compounds in silico in indicated columns using a named reaction.',
        main=_react,
        options=[output, formatting, inputs, representation, column, name,
            reaction_opt])
    split = CLICommand('split',
        description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
        main=_split,
        options=[output, formatting, inputs, representation, column, prefix,
            type_, train, test, set_seed])

    app = CLIApp("schemist",
        version=__version__,
        description="Tools for cleaning, collating, and augmenting chemical datasets.",
        commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])

    app.run()

    return None


if __name__ == "__main__":

    main()

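The `--options` flag handled by `_option_parser` above takes space-separated `keyword=value` pairs and coerces each value first to int, then to float, falling back to a plain string; when no `--options` list is given, the TypeError is swallowed and an empty dict is returned. A small sketch of that behaviour, calling the internal helper directly purely for illustration:

```python
# Illustration of the keyword=value coercion performed by the internal _option_parser.
from schemist.cli import _option_parser

print(_option_parser(["n=8", "prefix=ID-", "cutoff=0.5"]))
# -> {'n': 8, 'prefix': 'ID-', 'cutoff': 0.5}
print(_option_parser(None))  # no --options given: returns {}
```
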
build/lib/schemist/collating.py
ADDED
@@ -0,0 +1,315 @@
"""Tools to collate chemical data files."""

from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union

from collections import Counter
from functools import partial
from glob import glob
import os

from carabiner.pd import read_table, resolve_delim
from carabiner import print_err
import numpy as np
from pandas import DataFrame, concat

from .converting import convert_string_representation, _FROM_FUNCTIONS
from .io import FILE_READERS

GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")

def _column_mapper(df: DataFrame,
    cols: Iterable[str]) -> Tuple[Callable, Dict]:

    basic_map = {column: df[column].tolist()[0] for column in cols}
    inv_basic_map = {value: key for key, value in basic_map.items()}

    def column_mapper(x: DataFrame) -> DataFrame:

        new_df = DataFrame()

        for new_col, old_col in basic_map.items():

            # old_col = str(old_col)

            if old_col is None or str(old_col) in ('None', 'nan', 'NA'):

                new_df[new_col] = None

            elif '+' in old_col:

                splits = old_col.split('+')
                new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
                    for s in splits[1:]])

            elif ';' in old_col:

                col, char, index = old_col.split(';')
                index = [int(i) for i in index.split(':')]

                if len(index) == 1:
                    index = slice(index[0], index[0] + 1)
                else:
                    index = slice(*index)

                try:

                    new_df[new_col] = (x[col]
                        .str.split(char)
                        .map(lambda y: char.join(y[index] if y is not np.nan else []))
                        .str.strip())

                except TypeError as e:

                    print_err(x[col].str.split(char))

                    raise e

            else:

                new_df[new_col] = x[old_col].copy()

        return new_df

    return column_mapper, inv_basic_map


def _check_catalog(catalog: DataFrame,
    catalog_smiles_column: str = 'input_smiles') -> None:

    essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
    missing_essential_cols = [col for col in essential_columns
        if col not in catalog]

    if len(missing_essential_cols) > 0:

        print_err(catalog.columns.tolist())

        raise KeyError("Missing required columns from catalog: " +
            ", ".join(missing_essential_cols))

    return None


def collate_inventory(catalog: DataFrame,
    root_dir: Optional[str] = None,
    drop_invalid: bool = True,
    drop_unmapped: bool = False,
    catalog_smiles_column: str = 'input_smiles',
    id_column_name: Optional[str] = None,
    id_n_digits: int = 8,
    id_prefix: str = '') -> DataFrame:

    f"""Process a catalog of files containing chemical libraries into a uniform dataframe.

    The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:

    - filename is a glob pattern of files to collate
    - file_format is one of {', '.join(FILE_READERS.keys())}
    - smiles_column contains smiles strings

    Other columns are optional and can have any name, but must contain the name or a pattern
    matching a column (for tabular data) or field (for SDF data) in the files
    of the `filename` column. In the output DataFrame, the named column data will be mapped.

    Optional column contents can be either concatenated or split using the following
    pattern:

    - col1+col2: concatenates the contents of `col1` and `col2`
    - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)

    Parameters
    ----------
    catalog : pd.DataFrame
        Table cataloging locations and format of data. Requires
        columns {', '.join(ESSENTIAL_COLUMNS)}.
    root_dir : str, optional
        Path to look for data files. Default: current directory.
    drop_invalid : bool, optional
        Whether to drop rows containing invalid SMILES.


    Returns
    -------
    pd.DataFrame
        Collated chemical data.

    """

    root_dir = root_dir or '.'

    _check_catalog(catalog, catalog_smiles_column)

    nongroup_columns = [col for col in catalog
        if col not in GROUPING_COLUMNS]
    loaded_dataframes = []
    report = Counter({"invalid SMILES": 0,
        "rows processed": 0})

    grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
    for (this_glob, this_filetype,
        this_library_name, this_representation), filename_df in grouped_catalog:

        print_err(f'\nProcessing {this_glob}:')

        this_glob = glob(os.path.join(root_dir, this_glob))

        these_filenames = sorted(f for f in this_glob
            if not os.path.basename(f).startswith('~$'))
        print_err('\t- ' + '\n\t- '.join(these_filenames))

        column_mapper, mapped_cols = _column_mapper(filename_df,
            nongroup_columns)

        reader = FILE_READERS[this_filetype]

        for filename in these_filenames:

            this_data0 = reader(filename)

            if not drop_unmapped:
                unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
                    for col in this_data0 if col not in mapped_cols}
                this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
                this_data = concat([column_mapper(this_data0), this_data],
                    axis=1)
            else:
                this_data = column_mapper(this_data0)

            if this_representation.casefold() not in _FROM_FUNCTIONS:

                raise TypeError(' or '.join(list(set(this_representation, this_representation.casefold()))) +
                    "not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))

            this_converter = partial(convert_string_representation,
                input_representation=this_representation.casefold())

            this_data = (this_data
                .query('compound_collection != "NA"')
                .assign(library_name=this_library_name,
                    input_file_format=this_filetype,
                    input_string_representation=this_representation,
                    plate_id=lambda x: x['plate_id'].astype(str),
                    plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
                    canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
                    is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))

            report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
                "rows processed": this_data.shape[0]})

            if drop_invalid:

                this_data = this_data.query('is_valid_smiles')

            if id_column_name is not None:

                this_converter = partial(convert_string_representation,
                    output_representation='id',
                    options=dict(n=id_n_digits,
                        prefix=id_prefix))
                this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})

            loaded_dataframes.append(this_data)

    collated_df = concat(loaded_dataframes, axis=0)

    return report, collated_df


def collate_inventory_from_file(catalog_path: Union[str, TextIO],
    root_dir: Optional[str] = None,
    format: Optional[str] = None,
    *args, **kwargs) -> DataFrame:

    f"""Process a catalog of files containing chemical libraries into a uniform dataframe.

    The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:

    - filename is a glob pattern of files to collate
    - file_format is one of {', '.join(FILE_READERS.keys())}
    - smiles_column contains smiles strings

    Other columns are optional and can have any name, but must contain the name or a pattern
    matching a column (for tabular data) or field (for SDF data) in the files
    of the `filename` column. In the output DataFrame, the named column data will be mapped.

    Optional column contents can be either concatenated or split using the following
    pattern:

    - col1+col2: concatenates the contents of `col1` and `col2`
    - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)

    Parameters
    ----------
    catalog_path : str
        Path to catalog file in XLSX, TSV or CSV format. Requires
        columns {', '.join(ESSENTIAL_COLUMNS)}.
    format : str, optional
        Format of catalog file. Default: infer from file extension.
    root_dir : str, optional
        Path to look for data files. Default: use directory containing
        the catalog.

    Returns
    -------
    pd.DataFrame
        Collated chemical data.

    """

    root_dir = root_dir or os.path.dirname(catalog_path)

    data_catalog = read_table(catalog_path, format=format)

    return collate_inventory(catalog=data_catalog,
        root_dir=root_dir,
        *args, **kwargs)


def deduplicate(df: DataFrame,
    column: str = 'smiles',
    input_representation: str = 'smiles',
    index_columns: Optional[List[str]] = None,
    drop_inchikey: bool = False) -> DataFrame:

    index_columns = index_columns or []

    inchikey_converter = partial(convert_string_representation,
        input_representation=input_representation,
        output_representation='inchikey')

    df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))

    structure_columns = [column, 'inchikey']
    df_unique = []

    for (string_rep, inchikey), structure_df in df.groupby(structure_columns):

        collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
            for col in structure_df if col in index_columns}
        collapsed_indexes.update({column: [string_rep],
            'inchikey': [inchikey],
            'instance_count': [structure_df.shape[0]]})

        df_unique.append(DataFrame(collapsed_indexes))

    df_unique = concat(df_unique, axis=0)

    if drop_inchikey:

        df_unique = df_unique.drop(columns=['inchikey'])

    report = {'starting rows:': df.shape[0],
        'ending_rows': df_unique.shape[0]}

    return report, df_unique


def deduplicate_file(filename: Union[str, TextIO],
    format: Optional[str] = None,
    *args, **kwargs) -> DataFrame:

    table = read_table(filename)

    return deduplicate(table, *args, **kwargs)

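The `col1+col2` and `col;char;start:stop` mapping patterns described in the `collate_inventory` docstring are easiest to see on a toy catalog. A hedged sketch with made-up file and column names (the `file_format` value must be a key of `FILE_READERS` in `io.py`; "xlsx" is an assumption here):

```python
# Sketch only: a one-row catalog using the mapping patterns from the docstring above.
# The right-hand values ("Plate", "Well", "Name", ...) are hypothetical columns
# inside the vendor files, not names taken from this commit.
from pandas import DataFrame

catalog = DataFrame({
    "filename": ["vendor_a/plates_*.xlsx"],   # glob pattern of files to collate
    "file_format": ["xlsx"],
    "library_name": ["vendor_a"],
    "string_representation": ["smiles"],
    "input_smiles": ["SMILES"],                # column holding the structures
    "compound_collection": ["Collection"],
    "plate_id": ["Plate"],
    "well_id": ["Well"],
    "plate_well": ["Plate+Well"],              # concatenate two source columns
    "series": ["Name;-;0:1"],                  # split "Name" on "-", keep the first piece
})
# collate_inventory(catalog, root_dir="data") would then read the matching files
# and return a tally of processed/invalid rows plus one uniform table.
```
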
build/lib/schemist/converting.py
ADDED
@@ -0,0 +1,308 @@
"""Converting between chemical representation formats."""

from typing import Any, Callable, Dict, Iterable, List, Optional, Union

from functools import wraps

from carabiner import print_err
from carabiner.cast import cast, flatten
from carabiner.decorators import return_none_on_error, vectorize
from carabiner.itertools import batched

from datamol import sanitize_smiles
import nemony as nm
from pandas import DataFrame
from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
    MolFromSmiles, MolToInchi, MolToInchiKey,
    MolToSmiles)
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from requests import Session
import selfies as sf

from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name

@vectorize
@return_none_on_error
def _seq2mol(s: str) -> Union[Mol, None]:

    return MolFromSequence(s, sanitize=True)


@vectorize
@return_none_on_error
def _helm2mol(s: str) -> Union[Mol, None]:

    return MolFromHELM(s, sanitize=True)


def mini_helm2helm(s: str) -> List[str]:

    new_s = []
    token = ''
    between_sq_brackets = False

    for letter in s:

        if letter.islower() and not between_sq_brackets:

            letter = f"[d{letter.upper()}]"

        token += letter

        if letter == '[':
            between_sq_brackets = True
        elif letter == ']':
            between_sq_brackets = False

        if not between_sq_brackets:
            new_s.append(token)
            token = ''

    return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))


@vectorize
@return_none_on_error
def _mini_helm2mol(s: str) -> Mol:

    s = mini_helm2helm(s)

    return MolFromHELM(s, sanitize=True)


@vectorize
@return_none_on_error
def _inchi2mol(s: str) -> Mol:

    return MolFromInchi(s,
        sanitize=True,
        removeHs=True)

@vectorize
# @return_none_on_error
def _smiles2mol(s: str) -> Mol:

    return MolFromSmiles(sanitize_smiles(s))


@vectorize
@return_none_on_error
def _selfies2mol(s: str) -> Mol:

    return MolFromSmiles(sf.decoder(s))


@vectorize
@return_none_on_error
def _mol2nonstandard_inchikey(m: Mol,
    **kwargs) -> str:

    return MolToInchiKey(m,
        options="/FixedH /SUU /RecMet /KET /15T")


@vectorize
@return_none_on_error
def _mol2hash(m: Mol,
    **kwargs) -> str:

    nonstandard_inchikey = _mol2nonstandard_inchikey(m)

    return nm.hash(nonstandard_inchikey)


@vectorize
@return_none_on_error
def _mol2id(m: Mol,
    n: int = 8,
    prefix: str = '',
    **kwargs) -> str:

    return prefix + str(int(_mol2hash(m), 16))[:n]


@vectorize
@return_none_on_error
def _mol2isomeric_canonical_smiles(m: Mol,
    **kwargs) -> str:

    return MolToSmiles(m,
        isomericSmiles=True,
        canonical=True)


@vectorize
@return_none_on_error
def _mol2inchi(m: Mol,
    **kwargs) -> str:

    return MolToInchi(m)


@vectorize
@return_none_on_error
def _mol2inchikey(m: Mol,
    **kwargs) -> str:

    return MolToInchiKey(m)


@vectorize
@return_none_on_error
def _mol2random_smiles(m: Mol,
    **kwargs) -> str:

    return MolToSmiles(m,
        isomericSmiles=True,
        doRandom=True)


@vectorize
@return_none_on_error
def _mol2mnemonic(m: Mol,
    **kwargs) -> str:

    nonstandard_inchikey = _mol2nonstandard_inchikey(m)

    return nm.encode(nonstandard_inchikey)


def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
    session: Optional[Session] = None,
    chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:

    inchikeys = cast(_mol2inchikey(m), to=list)
    pubchem_ids = []

    for _inchikeys in batched(inchikeys, chunksize):

        these_ids = _inchikey2pubchem_name_id(_inchikeys,
            session=session)
        pubchem_ids += these_ids

    return pubchem_ids


@return_none_on_error
def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
    session: Optional[Session] = None,
    chunksize: int = 32,
    **kwargs) -> Union[str, List[str]]:

    return flatten([val['pubchem_id']
        for val in _mol2pubchem(m,
            session=session,
            chunksize=chunksize)])


@return_none_on_error
def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
    session: Optional[Session] = None,
    chunksize: int = 32,
    **kwargs) -> Union[str, List[str]]:

    return flatten([val['pubchem_name']
        for val in _mol2pubchem(m,
            session=session,
            chunksize=chunksize)])

@return_none_on_error
def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
    session: Optional[Session] = None,
    **kwargs) -> Union[str, List[str]]:

    return _inchikey2cactus_name(_mol2inchikey(m),
        session=session)


@vectorize
@return_none_on_error
def _mol2scaffold(m: Mol,
    chiral: bool = True,
    **kwargs) -> str:

    return MurckoScaffoldSmiles(mol=m,
        includeChirality=chiral)


@vectorize
@return_none_on_error
def _mol2selfies(m: Mol,
    **kwargs) -> str:

    s = sf.encoder(_mol2isomeric_canonical_smiles(m))

    return s if s != -1 else None


_TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
    "selfies": _mol2selfies,
    "inchi": _mol2inchi,
    "inchikey": _mol2inchikey,
    "nonstandard_inchikey": _mol2nonstandard_inchikey,
    "hash": _mol2hash,
    "mnemonic": _mol2mnemonic,
    "id": _mol2id,
    "scaffold": _mol2scaffold,
    "permuted_smiles": _mol2random_smiles,
    "pubchem_id": _mol2pubchem_id,
    "pubchem_name": _mol2pubchem_name,
    "cactus_name": _mol2cactus_name}

_FROM_FUNCTIONS = {"smiles": _smiles2mol,
    "selfies": _selfies2mol,
    "inchi": _inchi2mol,
    "aa_seq": _seq2mol,
    "helm": _helm2mol,
    "minihelm": _mini_helm2mol}


def _x2mol(strings: Union[Iterable[str], str],
    input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:

    from_function = _FROM_FUNCTIONS[input_representation.casefold()]

    return from_function(strings)


def _mol2x(mols: Union[Iterable[Mol], Mol],
    output_representation: str = 'smiles',
    **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:

    to_function = _TO_FUNCTIONS[output_representation.casefold()]

    return to_function(mols, **kwargs)


def convert_string_representation(strings: Union[Iterable[str], str],
    input_representation: str = 'smiles',
    output_representation: str = 'smiles',
    **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:

    """Convert between string representations of chemical structures.

    """

    mols = _x2mol(strings, input_representation)
    # print_err(mols)
    outstrings = _mol2x(mols, output_representation, **kwargs)
    # print_err(outstrings)

    return outstrings


def _convert_input_to_smiles(f: Callable) -> Callable:

    @wraps(f)
    def _f(strings: Union[Iterable[str], str],
        input_representation: str = 'smiles',
        *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:

        smiles = convert_string_representation(strings,
            output_representation='smiles',
            input_representation=input_representation)

        return f(strings=smiles,
            *args, **kwargs)

    return _f

build/lib/schemist/features.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools for generating chemical features."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, Iterable, Optional, Union
|
4 |
+
|
5 |
+
from descriptastorus.descriptors import MakeGenerator
|
6 |
+
from pandas import DataFrame, Series
|
7 |
+
import numpy as np
|
8 |
+
from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
|
9 |
+
|
10 |
+
from .converting import _smiles2mol, _convert_input_to_smiles
|
11 |
+
|
12 |
+
def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
|
13 |
+
|
14 |
+
def _f(prefix: Optional[str] = None,
|
15 |
+
*args, **kwargs) -> DataFrame:
|
16 |
+
|
17 |
+
feature_matrix = f(*args, **kwargs)
|
18 |
+
|
19 |
+
if prefix is not None:
|
20 |
+
|
21 |
+
new_cols = {col: f"{prefix}_{col}"
|
22 |
+
for col in feature_matrix.columns
|
23 |
+
if not col.startswith('_meta')}
|
24 |
+
feature_matrix = feature_matrix.rename(columns=new_cols)
|
25 |
+
|
26 |
+
return feature_matrix
|
27 |
+
|
28 |
+
return _f
|
29 |
+
|
30 |
+
|
31 |
+
def _get_descriptastorus_features(smiles: Iterable[str],
|
32 |
+
generator: str) -> DataFrame:
|
33 |
+
|
34 |
+
generator = MakeGenerator((generator, ))
|
35 |
+
smiles = Series(smiles)
|
36 |
+
|
37 |
+
features = smiles.apply(lambda z: np.array(generator.process(z)))
|
38 |
+
matrix = np.stack(features.values, axis=0)
|
39 |
+
|
40 |
+
return DataFrame(matrix,
|
41 |
+
index=smiles.index,
|
42 |
+
columns=[col for col, _ in generator.GetColumns()])
|
43 |
+
|
44 |
+
|
45 |
+
@_feature_matrix
|
46 |
+
@_convert_input_to_smiles
|
47 |
+
def calculate_2d_features(strings: Union[Iterable[str], str],
|
48 |
+
normalized: bool = True,
|
49 |
+
histogram_normalized: bool = True) -> DataFrame:
|
50 |
+
|
51 |
+
"""Calculate 2d features from string representation.
|
52 |
+
|
53 |
+
"""
|
54 |
+
|
55 |
+
if normalized:
|
56 |
+
if histogram_normalized:
|
57 |
+
generator_name = "RDKit2DHistogramNormalized"
|
58 |
+
else:
|
59 |
+
generator_name = "RDKit2DNormalized"
|
60 |
+
else:
|
61 |
+
generator_name = "RDKit2D"
|
62 |
+
|
63 |
+
feature_matrix = _get_descriptastorus_features(strings,
|
64 |
+
generator=generator_name)
|
65 |
+
|
66 |
+
feature_matrix = (feature_matrix
|
67 |
+
.rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
|
68 |
+
.assign(meta_feature_type=generator_name,
|
69 |
+
meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
|
70 |
+
.drop(columns=['meta_feature_valid0']))
|
71 |
+
|
72 |
+
return feature_matrix
|
73 |
+
|
74 |
+
|
75 |
+
def _fast_fingerprint(generator: FingeprintGenerator64,
|
76 |
+
mol: Mol,
|
77 |
+
to_np: bool = True) -> Union[str, np.ndarray]:
|
78 |
+
|
79 |
+
try:
|
80 |
+
fp_string = generator.GetFingerprint(mol).ToBitString()
|
81 |
+
except:
|
82 |
+
return None
|
83 |
+
else:
|
84 |
+
if to_np:
|
85 |
+
return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
|
86 |
+
else:
|
87 |
+
return fp_string
|
88 |
+
|
89 |
+
|
90 |
+
@_feature_matrix
|
91 |
+
@_convert_input_to_smiles
|
92 |
+
def calculate_fingerprints(strings: Union[Iterable[str], str],
|
93 |
+
fp_type: str = 'morgan',
|
94 |
+
radius: int = 2,
|
95 |
+
chiral: bool = True,
|
96 |
+
on_bits: bool = True) -> DataFrame:
|
97 |
+
|
98 |
+
"""
|
99 |
+
|
100 |
+
"""
|
101 |
+
|
102 |
+
if fp_type.casefold() == 'morgan':
|
103 |
+
generator_class = GetMorganGenerator
|
104 |
+
else:
|
105 |
+
raise AttributeError(f"Fingerprint type {fp_type} not supported!")
|
106 |
+
|
107 |
+
fp_generator = generator_class(radius=radius,
|
108 |
+
includeChirality=chiral)
|
109 |
+
mols = (_smiles2mol(s) for s in strings)
|
110 |
+
fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
|
111 |
+
for mol in mols)
|
112 |
+
|
113 |
+
if on_bits:
|
114 |
+
|
115 |
+
fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
|
116 |
+
for fp_string in fp_strings)
|
117 |
+
fingerprints = [';'.join(fp) for fp in fingerprints]
|
118 |
+
validity = [len(fp) > 0 for fp in fingerprints]
|
119 |
+
|
120 |
+
feature_matrix = DataFrame(fingerprints,
|
121 |
+
columns=['fp_bits'])
|
122 |
+
|
123 |
+
else:
|
124 |
+
|
125 |
+
fingerprints = [np.array(int(digit) for digit in fp_string)
|
126 |
+
if fp_string is not None
|
127 |
+
else (-np.ones((fp_generator.GetOptions().fpSize, )))
|
128 |
+
for fp_string in fp_strings]
|
129 |
+
validity = [np.all(fp >= 0) for fp in fingerprints]
|
130 |
+
|
131 |
+
feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
|
132 |
+
columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
|
133 |
+
|
134 |
+
return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
|
135 |
+
meta_feature_valid=validity)
|
136 |
+
|
137 |
+
|
138 |
+
_FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
|
139 |
+
|
140 |
+
def calculate_feature(feature_type: str,
|
141 |
+
*args, **kwargs):
|
142 |
+
|
143 |
+
"""
|
144 |
+
|
145 |
+
"""
|
146 |
+
|
147 |
+
featurizer = _FEATURE_CALCULATORS[feature_type]
|
148 |
+
|
149 |
+
return featurizer(*args, **kwargs)
|
build/lib/schemist/generating.py
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools for enumerating compounds. Currently only works with peptides."""
|
2 |
+
|
3 |
+
from typing import Callable, Iterable, Optional, Tuple, Union
|
4 |
+
|
5 |
+
from functools import partial
|
6 |
+
from itertools import chain, islice, product, repeat
|
7 |
+
from math import ceil, expm1, floor
|
8 |
+
from random import choice, choices, random, seed
|
9 |
+
|
10 |
+
from carabiner import print_err
|
11 |
+
from carabiner.decorators import vectorize, return_none_on_error
|
12 |
+
from carabiner.random import sample_iter
|
13 |
+
from rdkit.Chem import Mol, rdChemReactions
|
14 |
+
import numpy as np
|
15 |
+
|
16 |
+
from .converting import (_x2mol, _mol2x,
|
17 |
+
_convert_input_to_smiles)
|
18 |
+
|
19 |
+
AA = tuple('GALVITSMCPFYWHKRDENQ')
|
20 |
+
dAA = tuple(aa.casefold() for aa in AA)
|
21 |
+
|
22 |
+
REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
|
23 |
+
'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
|
24 |
+
'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
|
25 |
+
|
26 |
+
def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
|
27 |
+
d_aa_only: bool = False,
|
28 |
+
include_d_aa: bool = False) -> Tuple[str]:
|
29 |
+
|
30 |
+
alphabet = alphabet or AA
|
31 |
+
alphabet_lower = tuple(set(aa.casefold() for aa in AA))
|
32 |
+
|
33 |
+
if d_aa_only:
|
34 |
+
alphabet = alphabet_lower
|
35 |
+
elif include_d_aa:
|
36 |
+
alphabet = tuple(set(chain(alphabet, alphabet_lower)))
|
37 |
+
|
38 |
+
return alphabet
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
def all_peptides_of_one_length(length: int,
|
43 |
+
alphabet: Optional[Iterable[str]] = None,
|
44 |
+
d_aa_only: bool = False,
|
45 |
+
include_d_aa: bool = False) -> Iterable[str]:
|
46 |
+
|
47 |
+
"""
|
48 |
+
|
49 |
+
"""
|
50 |
+
|
51 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
52 |
+
d_aa_only=d_aa_only,
|
53 |
+
include_d_aa=include_d_aa)
|
54 |
+
|
55 |
+
return (''.join(peptide)
|
56 |
+
for peptide in product(alphabet, repeat=length))
|
57 |
+
|
58 |
+
|
59 |
+
def all_peptides_in_length_range(max_length: int,
|
60 |
+
min_length: int = 1,
|
61 |
+
by: int = 1,
|
62 |
+
alphabet: Optional[Iterable[str]] = None,
|
63 |
+
d_aa_only: bool = False,
|
64 |
+
include_d_aa: bool = False,
|
65 |
+
*args, **kwargs) -> Iterable[str]:
|
66 |
+
|
67 |
+
"""
|
68 |
+
|
69 |
+
"""
|
70 |
+
|
71 |
+
length_range = range(*sorted([min_length, max_length + 1]), by)
|
72 |
+
peptide_maker = partial(all_peptides_of_one_length,
|
73 |
+
alphabet=alphabet,
|
74 |
+
d_aa_only=d_aa_only,
|
75 |
+
include_d_aa=include_d_aa,
|
76 |
+
*args, **kwargs)
|
77 |
+
|
78 |
+
return chain.from_iterable(peptide_maker(length=length)
|
79 |
+
for length in length_range)
|
80 |
+
|
81 |
+
|
82 |
+
def _number_of_peptides(max_length: int,
|
83 |
+
min_length: int = 1,
|
84 |
+
by: int = 1,
|
85 |
+
alphabet: Optional[Iterable[str]] = None,
|
86 |
+
d_aa_only: bool = False,
|
87 |
+
include_d_aa: bool = False):
|
88 |
+
|
89 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
90 |
+
d_aa_only=d_aa_only,
|
91 |
+
include_d_aa=include_d_aa)
|
92 |
+
n_peptides = [len(alphabet) ** length
|
93 |
+
for length in range(*sorted([min_length, max_length + 1]), by)]
|
94 |
+
|
95 |
+
return n_peptides
|
96 |
+
|
97 |
+
|
98 |
+
def _naive_sample_peptides_in_length_range(max_length: int,
|
99 |
+
min_length: int = 1,
|
100 |
+
by: int = 1,
|
101 |
+
n: Optional[Union[float, int]] = None,
|
102 |
+
alphabet: Optional[Iterable[str]] = None,
|
103 |
+
d_aa_only: bool = False,
|
104 |
+
include_d_aa: bool = False,
|
105 |
+
set_seed: Optional[int] = None):
|
106 |
+
|
107 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
108 |
+
d_aa_only=d_aa_only,
|
109 |
+
include_d_aa=include_d_aa)
|
110 |
+
n_peptides = _number_of_peptides(max_length=max_length,
|
111 |
+
min_length=min_length,
|
112 |
+
by=by,
|
113 |
+
alphabet=alphabet,
|
114 |
+
d_aa_only=d_aa_only,
|
115 |
+
include_d_aa=include_d_aa)
|
116 |
+
lengths = list(range(*sorted([min_length, max_length + 1]), by))
|
117 |
+
weight_per_length = [n / min(n_peptides) for n in n_peptides]
|
118 |
+
weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
|
119 |
+
|
120 |
+
lengths_sample = (choice(weighted_lengths) for _ in range(n))
|
121 |
+
return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
|
122 |
+
|
123 |
+
|
124 |
+
def sample_peptides_in_length_range(max_length: int,
|
125 |
+
min_length: int = 1,
|
126 |
+
by: int = 1,
|
127 |
+
n: Optional[Union[float, int]] = None,
|
128 |
+
alphabet: Optional[Iterable[str]] = None,
|
129 |
+
d_aa_only: bool = False,
|
130 |
+
include_d_aa: bool = False,
|
131 |
+
naive_sampling_cutoff: float = 5e-3,
|
132 |
+
reservoir_sampling: bool = True,
|
133 |
+
indexes: Optional[Iterable[int]] = None,
|
134 |
+
set_seed: Optional[int] = None,
|
135 |
+
*args, **kwargs) -> Iterable[str]:
|
136 |
+
|
137 |
+
"""
|
138 |
+
|
139 |
+
"""
|
140 |
+
|
141 |
+
seed(set_seed)
|
142 |
+
|
143 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
144 |
+
d_aa_only=d_aa_only,
|
145 |
+
include_d_aa=include_d_aa)
|
146 |
+
|
147 |
+
n_peptides = sum(len(alphabet) ** length
|
148 |
+
for length in range(*sorted([min_length, max_length + 1]), by))
|
149 |
+
if n is None:
|
150 |
+
n_requested = n_peptides
|
151 |
+
elif n >= 1.:
|
152 |
+
n_requested = min(floor(n), n_peptides)
|
153 |
+
elif n < 1.:
|
154 |
+
n_requested = floor(n * n_peptides)
|
155 |
+
|
156 |
+
frac_requested = n_requested / n_peptides
|
157 |
+
|
158 |
+
# approximation of birthday problem
|
159 |
+
p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
|
160 |
+
n_collisons = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
|
161 |
+
frac_collisions = n_collisons / n_requested
|
162 |
+
|
163 |
+
print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
|
164 |
+
f"length {min_length} to {max_length} ({n_peptides} combinations). "
|
165 |
+
f"Probability of collision if drawing randomly is {p_any_collision}, "
|
166 |
+
f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
|
167 |
+
|
168 |
+
if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
|
169 |
+
|
170 |
+
print_err("> Executing naive sampling. ")
|
171 |
+
|
172 |
+
peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
|
173 |
+
n=n_requested,
|
174 |
+
alphabet=alphabet,
|
175 |
+
d_aa_only=d_aa_only,
|
176 |
+
include_d_aa=include_d_aa)
|
177 |
+
|
178 |
+
else:
|
179 |
+
|
180 |
+
print_err("> Executing exhaustive sampling.")
|
181 |
+
|
182 |
+
all_peptides = all_peptides_in_length_range(max_length, min_length, by,
|
183 |
+
alphabet=alphabet,
|
184 |
+
d_aa_only=d_aa_only,
|
185 |
+
include_d_aa=include_d_aa,
|
186 |
+
*args, **kwargs)
|
187 |
+
|
188 |
+
if n is None:
|
189 |
+
|
190 |
+
peptides = all_peptides
|
191 |
+
|
192 |
+
elif n >= 1.:
|
193 |
+
|
194 |
+
if reservoir_sampling:
|
195 |
+
peptides = sample_iter(all_peptides, k=n_requested,
|
196 |
+
shuffle_output=False)
|
197 |
+
else:
|
198 |
+
peptides = (pep for pep in all_peptides
|
199 |
+
if random() <= frac_requested)
|
200 |
+
|
201 |
+
elif n < 1.:
|
202 |
+
|
203 |
+
peptides = (pep for pep in all_peptides
|
204 |
+
if random() <= n)
|
205 |
+
|
206 |
+
if indexes is not None:
|
207 |
+
|
208 |
+
indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
|
209 |
+
else None
|
210 |
+
for ix in islice(indexes, 3))
|
211 |
+
indexes = [ix if (ix is None or ix >= 0) else None
|
212 |
+
for ix in indexes]
|
213 |
+
|
214 |
+
if len(indexes) > 1:
|
215 |
+
if n is not None and n >=1. and indexes[0] > n:
|
216 |
+
raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
|
217 |
+
|
218 |
+
peptides = islice(peptides, *indexes)
|
219 |
+
|
220 |
+
return peptides
|
221 |
+
|
222 |
+
|
223 |
+
def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
|
224 |
+
|
225 |
+
rxn = rdChemReactions.ReactionFromSmarts(smarts)
|
226 |
+
reaction_function = rxn.RunReactants
|
227 |
+
|
228 |
+
@vectorize
|
229 |
+
@return_none_on_error
|
230 |
+
def reactor(s: Mol) -> Mol:
|
231 |
+
|
232 |
+
return reaction_function([s])[0][0]
|
233 |
+
|
234 |
+
return reactor
|
235 |
+
|
236 |
+
|
237 |
+
@_convert_input_to_smiles
|
238 |
+
def react(strings: Union[str, Iterable[str]],
|
239 |
+
reaction: str = 'N_to_C_cyclization',
|
240 |
+
output_representation: str = 'smiles',
|
241 |
+
**kwargs) -> Union[str, Iterable[str]]:
|
242 |
+
|
243 |
+
"""
|
244 |
+
|
245 |
+
"""
|
246 |
+
|
247 |
+
try:
|
248 |
+
_this_reaction = REACTIONS[reaction]
|
249 |
+
except KeyError:
|
250 |
+
raise KeyError(f"Reaction {reaction} is not available. Try: " +
|
251 |
+
", ".join(list(REACTIONS)))
|
252 |
+
|
253 |
+
# strings = cast(strings, to=list)
|
254 |
+
# print_err((strings))
|
255 |
+
|
256 |
+
reactor = _reactor(_this_reaction)
|
257 |
+
mols = _x2mol(strings)
|
258 |
+
mols = reactor(mols)
|
259 |
+
|
260 |
+
return _mol2x(mols,
|
261 |
+
output_representation=output_representation,
|
262 |
+
**kwargs)
|
build/lib/schemist/io.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools to facilitate input and output."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
|
4 |
+
|
5 |
+
from collections import defaultdict
|
6 |
+
from functools import partial
|
7 |
+
from string import printable
|
8 |
+
from tempfile import NamedTemporaryFile
|
9 |
+
from xml.etree import ElementTree
|
10 |
+
|
11 |
+
from carabiner import print_err
|
12 |
+
from carabiner.cast import cast
|
13 |
+
from carabiner.itertools import tenumerate
|
14 |
+
from carabiner.pd import read_table, write_stream
|
15 |
+
|
16 |
+
from pandas import DataFrame, read_excel
|
17 |
+
from rdkit.Chem import SDMolSupplier
|
18 |
+
|
19 |
+
from .converting import _mol2isomeric_canonical_smiles
|
20 |
+
|
21 |
+
def _mutate_df_stream(input_file: Union[str, TextIO],
|
22 |
+
output_file: Union[str, TextIO],
|
23 |
+
function: Callable[[DataFrame], Tuple[Any, DataFrame]],
|
24 |
+
file_format: Optional[str] = None,
|
25 |
+
chunksize: int = 1000) -> List[Any]:
|
26 |
+
|
27 |
+
carries = []
|
28 |
+
|
29 |
+
for i, chunk in tenumerate(read_table(input_file,
|
30 |
+
format=file_format,
|
31 |
+
progress=False,
|
32 |
+
chunksize=chunksize)):
|
33 |
+
|
34 |
+
result = function(chunk)
|
35 |
+
|
36 |
+
try:
|
37 |
+
carry, df = result
|
38 |
+
except ValueError:
|
39 |
+
df = result
|
40 |
+
carry = 0
|
41 |
+
|
42 |
+
write_stream(df,
|
43 |
+
output=output_file,
|
44 |
+
format=file_format,
|
45 |
+
header=i == 0,
|
46 |
+
mode='w' if i == 0 else 'a')
|
47 |
+
|
48 |
+
carries.append(carry)
|
49 |
+
|
50 |
+
return carries
|
51 |
+
|
52 |
+
|
53 |
+
def read_weird_xml(filename: Union[str, TextIO],
|
54 |
+
header: bool = True,
|
55 |
+
namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
|
56 |
+
|
57 |
+
"""
|
58 |
+
|
59 |
+
"""
|
60 |
+
|
61 |
+
with cast(filename, TextIOWrapper, mode='r') as f:
|
62 |
+
|
63 |
+
xml_string = ''.join(filter(printable.__contains__, f.read()))
|
64 |
+
|
65 |
+
try:
|
66 |
+
|
67 |
+
root = ElementTree.fromstring(xml_string)
|
68 |
+
|
69 |
+
except Exception as e:
|
70 |
+
|
71 |
+
print_err('\n!!! ' + xml_string.split('\n')[1184][377:380])
|
72 |
+
|
73 |
+
raise e
|
74 |
+
|
75 |
+
for i, row in enumerate(root.iter(f'{namespace}Row') ):
|
76 |
+
|
77 |
+
this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
|
78 |
+
|
79 |
+
if i == 0:
|
80 |
+
|
81 |
+
if header:
|
82 |
+
|
83 |
+
heading = this_row
|
84 |
+
df = {colname: [] for colname in heading}
|
85 |
+
|
86 |
+
else:
|
87 |
+
|
88 |
+
heading = [f'X{j}' for j, _ in enumerate(this_row)]
|
89 |
+
df = {colname: [datum] for colname, datum in zip(heading, this_row)}
|
90 |
+
|
91 |
+
else:
|
92 |
+
|
93 |
+
for colname, datum in zip(heading, this_row):
|
94 |
+
|
95 |
+
df[colname].append(datum)
|
96 |
+
|
97 |
+
return DataFrame(df)
|
98 |
+
|
99 |
+
|
100 |
+
def read_sdf(filename: Union[str, TextIO]):
|
101 |
+
|
102 |
+
"""
|
103 |
+
|
104 |
+
"""
|
105 |
+
|
106 |
+
filename = cast(filename, str)
|
107 |
+
|
108 |
+
with open(filename, 'r', errors='replace') as f:
|
109 |
+
with NamedTemporaryFile("w") as o:
|
110 |
+
|
111 |
+
o.write(f.read())
|
112 |
+
o.seek(0)
|
113 |
+
|
114 |
+
df = defaultdict(list)
|
115 |
+
|
116 |
+
for i, mol in enumerate(SDMolSupplier(o.name)):
|
117 |
+
|
118 |
+
if mol is None:
|
119 |
+
|
120 |
+
continue
|
121 |
+
|
122 |
+
propdict = mol.GetPropsAsDict()
|
123 |
+
propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
|
124 |
+
|
125 |
+
for colname in propdict:
|
126 |
+
|
127 |
+
df[colname].append(propdict[colname])
|
128 |
+
|
129 |
+
for colname in df:
|
130 |
+
|
131 |
+
if colname not in propdict:
|
132 |
+
|
133 |
+
df[colname].append(None)
|
134 |
+
|
135 |
+
col_lengths = {col: len(val) for col, val in df.items()}
|
136 |
+
|
137 |
+
if len(set(col_lengths.values())) > 1:
|
138 |
+
|
139 |
+
raise ValueError(f"Column lengths not all the same:\n\t" +
|
140 |
+
'\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
|
141 |
+
|
142 |
+
return DataFrame(df)
|
143 |
+
|
144 |
+
|
145 |
+
FILE_READERS = {
|
146 |
+
'bad_xml': read_weird_xml,
|
147 |
+
'xlsx': partial(read_excel, engine='openpyxl'),
|
148 |
+
'sdf': read_sdf
|
149 |
+
}
|
build/lib/schemist/rest_lookup.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools for querying PubChem."""
|
2 |
+
|
3 |
+
from typing import Dict, Iterable, List, Optional, Union
|
4 |
+
from time import sleep
|
5 |
+
from xml.etree import ElementTree
|
6 |
+
|
7 |
+
from carabiner import print_err
|
8 |
+
from carabiner.cast import cast
|
9 |
+
from carabiner.decorators import vectorize
|
10 |
+
from requests import Response, Session
|
11 |
+
|
12 |
+
_PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
|
13 |
+
_CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
|
14 |
+
|
15 |
+
_OVERLOAD_CODES = {500, 501, 503, 504}
|
16 |
+
|
17 |
+
|
18 |
+
def _url_request(inchikeys: Union[str, Iterable[str]],
|
19 |
+
url: str,
|
20 |
+
session: Optional[Session] = None,
|
21 |
+
**kwargs) -> Response:
|
22 |
+
|
23 |
+
if session is None:
|
24 |
+
session = Session()
|
25 |
+
|
26 |
+
inchikeys = cast(inchikeys, to=list)
|
27 |
+
|
28 |
+
return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
|
29 |
+
|
30 |
+
|
31 |
+
def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
|
32 |
+
session: Optional[Session] = None,
|
33 |
+
counter: int = 0,
|
34 |
+
max_tries: int = 10,
|
35 |
+
namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
|
36 |
+
|
37 |
+
r = _url_request(inchikeys, url=_PUBCHEM_URL,
|
38 |
+
session=session,
|
39 |
+
get="Title,InchiKey", format="XML")
|
40 |
+
|
41 |
+
if r.status_code == 200:
|
42 |
+
|
43 |
+
root = ElementTree.fromstring(r.text)
|
44 |
+
compounds = root.iter(f'{namespace}Properties')
|
45 |
+
|
46 |
+
result_dict = dict()
|
47 |
+
|
48 |
+
for cmpd in compounds:
|
49 |
+
|
50 |
+
cmpd_dict = dict()
|
51 |
+
|
52 |
+
for child in cmpd:
|
53 |
+
cmpd_dict[child.tag.split(namespace)[1]] = child.text
|
54 |
+
|
55 |
+
try:
|
56 |
+
inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
|
57 |
+
except KeyError:
|
58 |
+
print(cmpd_dict)
|
59 |
+
else:
|
60 |
+
result_dict[inchikey] = {'pubchem_name': name.casefold(),
|
61 |
+
'pubchem_id': pcid}
|
62 |
+
|
63 |
+
print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
|
64 |
+
|
65 |
+
result_list = [result_dict[inchikey]
|
66 |
+
if inchikey in result_dict
|
67 |
+
else {'pubchem_name': None, 'pubchem_id': None}
|
68 |
+
for inchikey in inchikeys]
|
69 |
+
|
70 |
+
return result_list
|
71 |
+
|
72 |
+
elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
|
73 |
+
|
74 |
+
sleep(1.)
|
75 |
+
|
76 |
+
return _inchikey2pubchem_name_id(inchikeys,
|
77 |
+
session=session,
|
78 |
+
counter=counter + 1,
|
79 |
+
max_tries=max_tries,
|
80 |
+
namespace=namespace)
|
81 |
+
|
82 |
+
else:
|
83 |
+
|
84 |
+
print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
|
85 |
+
|
86 |
+
return [{'pubchem_name': None, 'pubchem_id': None}
|
87 |
+
for _ in range(len(inchikeys))]
|
88 |
+
|
89 |
+
|
90 |
+
@vectorize
|
91 |
+
def _inchikey2cactus_name(inchikeys: str,
|
92 |
+
session: Optional[Session] = None,
|
93 |
+
counter: int = 0,
|
94 |
+
max_tries: int = 10):
|
95 |
+
|
96 |
+
r = _url_request(inchikeys, url=_CACTUS_URL,
|
97 |
+
session=session,
|
98 |
+
get="names")
|
99 |
+
|
100 |
+
if r.status_code == 200:
|
101 |
+
|
102 |
+
return r.text.split('\n')[0].casefold()
|
103 |
+
|
104 |
+
elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
|
105 |
+
|
106 |
+
sleep(1.)
|
107 |
+
|
108 |
+
return _inchikey2cactus_name(inchikeys,
|
109 |
+
session=session,
|
110 |
+
counter=counter + 1,
|
111 |
+
max_tries=max_tries)
|
112 |
+
|
113 |
+
else:
|
114 |
+
|
115 |
+
print_err(f'Cactus: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
|
116 |
+
|
117 |
+
return None
|
118 |
+
|
build/lib/schemist/splitting.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools for splitting tabular datasets, optionally based on chemical features."""
|
2 |
+
|
3 |
+
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
4 |
+
from collections import defaultdict
|
5 |
+
from math import ceil
|
6 |
+
from random import random, seed
|
7 |
+
|
8 |
+
try:
|
9 |
+
from itertools import batched
|
10 |
+
except ImportError:
|
11 |
+
from carabiner.itertools import batched
|
12 |
+
|
13 |
+
from tqdm.auto import tqdm
|
14 |
+
|
15 |
+
from .converting import convert_string_representation, _convert_input_to_smiles
|
16 |
+
from .typing import DataSplits
|
17 |
+
|
18 |
+
# def _train_test_splits
|
19 |
+
|
20 |
+
def _train_test_val_sizes(total: int,
|
21 |
+
train: float = 1.,
|
22 |
+
test: float = 0.) -> Tuple[int]:
|
23 |
+
|
24 |
+
n_train = int(ceil(train * total))
|
25 |
+
n_test = int(ceil(test * total))
|
26 |
+
n_val = total - n_train - n_test
|
27 |
+
|
28 |
+
return n_train, n_test, n_val
|
29 |
+
|
30 |
+
|
31 |
+
def _random_chunk(strings: str,
|
32 |
+
train: float = 1.,
|
33 |
+
test: float = 0.,
|
34 |
+
carry: Optional[Dict[str, List[int]]] = None,
|
35 |
+
start_from: int = 0) -> Dict[str, List[int]]:
|
36 |
+
|
37 |
+
carry = carry or defaultdict(list)
|
38 |
+
|
39 |
+
train_test: float = train + test
|
40 |
+
|
41 |
+
for i, _ in enumerate(strings):
|
42 |
+
|
43 |
+
random_number: float = random()
|
44 |
+
|
45 |
+
if random_number < train:
|
46 |
+
|
47 |
+
key = 'train'
|
48 |
+
|
49 |
+
elif random_number < train_test:
|
50 |
+
|
51 |
+
key = 'test'
|
52 |
+
|
53 |
+
else:
|
54 |
+
|
55 |
+
key = 'validation'
|
56 |
+
|
57 |
+
carry[key].append(start_from + i)
|
58 |
+
|
59 |
+
return carry
|
60 |
+
|
61 |
+
|
62 |
+
def split_random(strings: Union[str, Iterable[str]],
|
63 |
+
train: float = 1.,
|
64 |
+
test: float = 0.,
|
65 |
+
chunksize: Optional[int] = None,
|
66 |
+
set_seed: Optional[int] = None,
|
67 |
+
*args, **kwargs) -> DataSplits:
|
68 |
+
|
69 |
+
"""
|
70 |
+
|
71 |
+
"""
|
72 |
+
|
73 |
+
if set_seed is not None:
|
74 |
+
|
75 |
+
seed(set_seed)
|
76 |
+
|
77 |
+
|
78 |
+
if chunksize is None:
|
79 |
+
|
80 |
+
idx = _random_chunk(strings=strings,
|
81 |
+
train=train,
|
82 |
+
test=test)
|
83 |
+
|
84 |
+
else:
|
85 |
+
|
86 |
+
idx = defaultdict(list)
|
87 |
+
|
88 |
+
for i, chunk in enumerate(batched(strings, chunksize)):
|
89 |
+
|
90 |
+
idx = _random_chunk(strings=chunk,
|
91 |
+
train=train,
|
92 |
+
test=test,
|
93 |
+
carry=idx,
|
94 |
+
start_from=i * chunksize)
|
95 |
+
|
96 |
+
seed(None)
|
97 |
+
|
98 |
+
return DataSplits(**idx)
|
99 |
+
|
100 |
+
|
101 |
+
@_convert_input_to_smiles
|
102 |
+
def _scaffold_chunk(strings: str,
|
103 |
+
carry: Optional[Dict[str, List[int]]] = None,
|
104 |
+
start_from: int = 0) -> Dict[str, List[int]]:
|
105 |
+
|
106 |
+
carry = carry or defaultdict(list)
|
107 |
+
|
108 |
+
these_scaffolds = convert_string_representation(strings=strings,
|
109 |
+
output_representation='scaffold')
|
110 |
+
|
111 |
+
for j, scaff in enumerate(these_scaffolds):
|
112 |
+
carry[scaff].append(start_from + j)
|
113 |
+
|
114 |
+
return carry
|
115 |
+
|
116 |
+
|
117 |
+
def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
|
118 |
+
train: float = 1.,
|
119 |
+
test: float = 0.,
|
120 |
+
progress: bool = False) -> DataSplits:
|
121 |
+
|
122 |
+
scaffold_sets = {key: sorted(value)
|
123 |
+
for key, value in scaffold_sets.items()}
|
124 |
+
scaffold_sets = sorted(scaffold_sets.items(),
|
125 |
+
key=lambda x: (len(x[1]), x[1][0]),
|
126 |
+
reverse=True)
|
127 |
+
nrows = sum(len(idx) for _, idx in scaffold_sets)
|
128 |
+
n_train, n_test, n_val = _train_test_val_sizes(nrows,
|
129 |
+
train,
|
130 |
+
test)
|
131 |
+
idx = defaultdict(list)
|
132 |
+
|
133 |
+
iterator = tqdm(scaffold_sets) if progress else scaffold_sets
|
134 |
+
for _, scaffold_idx in iterator:
|
135 |
+
|
136 |
+
if (len(idx['train']) + len(scaffold_idx)) > n_train:
|
137 |
+
|
138 |
+
if (len(idx['test']) + len(scaffold_idx)) > n_test:
|
139 |
+
|
140 |
+
key = 'validation'
|
141 |
+
|
142 |
+
else:
|
143 |
+
|
144 |
+
key = 'test'
|
145 |
+
else:
|
146 |
+
|
147 |
+
key = 'train'
|
148 |
+
|
149 |
+
idx[key] += scaffold_idx
|
150 |
+
|
151 |
+
return DataSplits(**idx)
|
152 |
+
|
153 |
+
|
154 |
+
def split_scaffold(strings: Union[str, Iterable[str]],
|
155 |
+
train: float = 1.,
|
156 |
+
test: float = 0.,
|
157 |
+
chunksize: Optional[int] = None,
|
158 |
+
progress: bool = True) -> DataSplits:
|
159 |
+
|
160 |
+
"""
|
161 |
+
|
162 |
+
"""
|
163 |
+
|
164 |
+
if chunksize is None:
|
165 |
+
|
166 |
+
scaffold_sets = _scaffold_chunk(strings)
|
167 |
+
|
168 |
+
else:
|
169 |
+
|
170 |
+
scaffold_sets = defaultdict(list)
|
171 |
+
|
172 |
+
for i, chunk in enumerate(batched(strings, chunksize)):
|
173 |
+
|
174 |
+
scaffold_sets = _scaffold_chunk(chunk,
|
175 |
+
carry=scaffold_sets,
|
176 |
+
start_from=i * chunksize)
|
177 |
+
|
178 |
+
return _scaffold_aggregator(scaffold_sets,
|
179 |
+
train=train, test=test,
|
180 |
+
progress=progress)
|
181 |
+
|
182 |
+
|
183 |
+
_SPLITTERS = {#'simpd': split_simpd,
|
184 |
+
'scaffold': split_scaffold,
|
185 |
+
'random': split_random}
|
186 |
+
|
187 |
+
# _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
|
188 |
+
# 'random': 'independent'}
|
189 |
+
|
190 |
+
_GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}
|
191 |
+
|
192 |
+
assert all(_type in _SPLITTERS
|
193 |
+
for _type in _GROUPED_SPLITTERS) ## Should never fail!
|
194 |
+
|
195 |
+
def split(split_type: str,
|
196 |
+
*args, **kwargs) -> DataSplits:
|
197 |
+
|
198 |
+
"""
|
199 |
+
|
200 |
+
"""
|
201 |
+
|
202 |
+
splitter = _SPLITTERS[split_type]
|
203 |
+
|
204 |
+
return splitter(*args, **kwargs)
|
build/lib/schemist/tables.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Tools for processing tabular data."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
|
4 |
+
from functools import partial
|
5 |
+
|
6 |
+
try:
|
7 |
+
from itertools import batched
|
8 |
+
except ImportError:
|
9 |
+
from carabiner.itertools import batched
|
10 |
+
|
11 |
+
from carabiner.cast import cast, clist
|
12 |
+
from carabiner import print_err
|
13 |
+
from pandas import DataFrame, concat
|
14 |
+
|
15 |
+
from .cleaning import clean_smiles, clean_selfies
|
16 |
+
from .converting import convert_string_representation
|
17 |
+
from .features import calculate_feature
|
18 |
+
from .generating import sample_peptides_in_length_range, react
|
19 |
+
from .splitting import split
|
20 |
+
from .typing import DataSplits
|
21 |
+
|
22 |
+
def _get_error_tally(df: DataFrame,
|
23 |
+
cols: Union[str, List[str]]) -> Dict[str, int]:
|
24 |
+
|
25 |
+
cols = cast(cols, to=list)
|
26 |
+
|
27 |
+
try:
|
28 |
+
tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
|
29 |
+
except TypeError:
|
30 |
+
tally = {col: df[col].isna().sum() for col in cols}
|
31 |
+
|
32 |
+
return tally
|
33 |
+
|
34 |
+
|
35 |
+
def converter(df: DataFrame,
|
36 |
+
column: str = 'smiles',
|
37 |
+
input_representation: str = 'smiles',
|
38 |
+
output_representation: Union[str, List[str]] = 'smiles',
|
39 |
+
prefix: Optional[str] = None,
|
40 |
+
options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:
|
41 |
+
|
42 |
+
"""
|
43 |
+
|
44 |
+
"""
|
45 |
+
|
46 |
+
prefix = prefix or ''
|
47 |
+
|
48 |
+
converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
|
49 |
+
output_representation=rep_out,
|
50 |
+
input_representation=input_representation,
|
51 |
+
**options)
|
52 |
+
for rep_out in cast(output_representation, to=list)}
|
53 |
+
|
54 |
+
column_values = df[column]
|
55 |
+
|
56 |
+
converted = {col: cast(f(column_values), to=list)
|
57 |
+
for col, f in converters.items()}
|
58 |
+
|
59 |
+
df = df.assign(**converted)
|
60 |
+
|
61 |
+
return _get_error_tally(df, list(converters)), df
|
62 |
+
|
63 |
+
|
64 |
+
def cleaner(df: DataFrame,
|
65 |
+
column: str = 'smiles',
|
66 |
+
input_representation: str = 'smiles',
|
67 |
+
prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
|
68 |
+
|
69 |
+
"""
|
70 |
+
|
71 |
+
"""
|
72 |
+
|
73 |
+
if input_representation.casefold() == 'smiles':
|
74 |
+
cleaner = clean_smiles
|
75 |
+
elif input_representation.casefold() == 'selfies':
|
76 |
+
cleaner = clean_selfies
|
77 |
+
else:
|
78 |
+
raise ValueError(f"Representation {input_representation} is not supported for cleaning.")
|
79 |
+
|
80 |
+
prefix = prefix or ''
|
81 |
+
new_column = f"{prefix}{column}"
|
82 |
+
|
83 |
+
df = df.assign(**{new_column: lambda x: cleaner(x[column])})
|
84 |
+
|
85 |
+
return _get_error_tally(df, new_column), df
|
86 |
+
|
87 |
+
|
88 |
+
def featurizer(df: DataFrame,
|
89 |
+
feature_type: str,
|
90 |
+
column: str = 'smiles',
|
91 |
+
ids: Optional[Union[str, List[str]]] = None,
|
92 |
+
input_representation: str = 'smiles',
|
93 |
+
prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
|
94 |
+
|
95 |
+
"""
|
96 |
+
|
97 |
+
"""
|
98 |
+
|
99 |
+
if ids is None:
|
100 |
+
ids = df.columns.tolist()
|
101 |
+
else:
|
102 |
+
ids = cast(ids, to=list)
|
103 |
+
|
104 |
+
feature_df = calculate_feature(feature_type=feature_type,
|
105 |
+
strings=df[column],
|
106 |
+
prefix=prefix,
|
107 |
+
input_representation=input_representation)
|
108 |
+
|
109 |
+
if len(ids) > 0:
|
110 |
+
df = concat([df[ids], feature_df], axis=1)
|
111 |
+
|
112 |
+
return _get_error_tally(feature_df, 'meta_feature_valid'), df
|
113 |
+
|
114 |
+
|
115 |
+
def assign_groups(df: DataFrame,
|
116 |
+
grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
|
117 |
+
group_name: str = 'group',
|
118 |
+
column: str = 'smiles',
|
119 |
+
input_representation: str = 'smiles',
|
120 |
+
*args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:
|
121 |
+
|
122 |
+
group_idx = grouper(strings=df[column],
|
123 |
+
input_representation=input_representation,
|
124 |
+
*args, **kwargs)
|
125 |
+
|
126 |
+
inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
|
127 |
+
groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]
|
128 |
+
|
129 |
+
return group_idx, df.assign(**{group_name: groups})
|
130 |
+
|
131 |
+
|
132 |
+
def _assign_splits(df: DataFrame,
|
133 |
+
split_idx: DataSplits,
|
134 |
+
use_df_index: bool = False) -> DataFrame:
|
135 |
+
|
136 |
+
row_index = df.index if use_df_index else tuple(range(df.shape[0]))
|
137 |
+
|
138 |
+
df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
|
139 |
+
for key in split_idx._fields})
|
140 |
+
split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}
|
141 |
+
|
142 |
+
return split_counts, df
|
143 |
+
|
144 |
+
|
145 |
+
def splitter(df: DataFrame,
|
146 |
+
split_type: str = 'random',
|
147 |
+
column: str = 'smiles',
|
148 |
+
input_representation: str = 'smiles',
|
149 |
+
*args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
|
150 |
+
|
151 |
+
"""
|
152 |
+
|
153 |
+
"""
|
154 |
+
|
155 |
+
split_idx = split(split_type=split_type,
|
156 |
+
strings=df[column],
|
157 |
+
input_representation=input_representation,
|
158 |
+
*args, **kwargs)
|
159 |
+
|
160 |
+
return _assign_splits(df, split_idx=split_idx)
|
161 |
+
|
162 |
+
|
163 |
+
def reactor(df: DataFrame,
|
164 |
+
column: str = 'smiles',
|
165 |
+
reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
|
166 |
+
prefix: Optional[str] = None,
|
167 |
+
*args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
|
168 |
+
|
169 |
+
"""
|
170 |
+
|
171 |
+
"""
|
172 |
+
|
173 |
+
prefix = prefix or ''
|
174 |
+
|
175 |
+
reactors = {col: partial(react, reaction=col)
|
176 |
+
for col in cast(reaction, to=list)}
|
177 |
+
|
178 |
+
column_values = df[column]
|
179 |
+
|
180 |
+
new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
|
181 |
+
for col, _reactor in reactors.items()}
|
182 |
+
|
183 |
+
df = df.assign(**new_columns)
|
184 |
+
|
185 |
+
return _get_error_tally(df, reaction), df
|
186 |
+
|
187 |
+
|
188 |
+
def _peptide_table(max_length: int,
|
189 |
+
min_length: Optional[int] = None,
|
190 |
+
by: int = 1,
|
191 |
+
n: Optional[Union[float, int]] = None,
|
192 |
+
prefix: str = '',
|
193 |
+
suffix: str = '',
|
194 |
+
generator: bool = False,
|
195 |
+
batch_size: int = 1000,
|
196 |
+
*args, **kwargs) -> Union[DataFrame, Generator]:
|
197 |
+
|
198 |
+
min_length = min_length or max_length
|
199 |
+
|
200 |
+
peptides = sample_peptides_in_length_range(max_length=max_length,
|
201 |
+
min_length=min_length,
|
202 |
+
by=by,
|
203 |
+
n=n,
|
204 |
+
*args, **kwargs)
|
205 |
+
|
206 |
+
if generator:
|
207 |
+
|
208 |
+
for peps in batched(peptides, batch_size):
|
209 |
+
|
210 |
+
peps = [f"{prefix}{pep}{suffix}"
|
211 |
+
for pep in peps]
|
212 |
+
|
213 |
+
yield DataFrame(dict(peptide_sequence=peps))
|
214 |
+
|
215 |
+
else:
|
216 |
+
|
217 |
+
peps = [f"{prefix}{pep}{suffix}"
|
218 |
+
for pep in peptides]
|
219 |
+
|
220 |
+
return DataFrame(dict(peptide_sequence=peps))
|
build/lib/schemist/typing.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Types used in schemist."""
|
2 |
+
|
3 |
+
from collections import namedtuple
|
4 |
+
|
5 |
+
DataSplits = namedtuple('DataSplits',
|
6 |
+
['train', 'test', 'validation'],
|
7 |
+
defaults=[tuple(), tuple(), tuple()])
|
build/lib/schemist/utils.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Miscellaneous utilities for schemist."""
|
docs/requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
myst_parser
|
2 |
+
matplotlib
|
3 |
+
numpy
|
4 |
+
openpyxl==3.1.0
|
5 |
+
pandas
|
6 |
+
scipy
|
7 |
+
sphinx_rtd_theme
|
8 |
+
./
|
docs/source/conf.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Configuration file for the Sphinx documentation builder.
|
2 |
+
#
|
3 |
+
# For the full list of built-in configuration values, see the documentation:
|
4 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
5 |
+
|
6 |
+
# -- Project information -----------------------------------------------------
|
7 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
8 |
+
|
9 |
+
project = 'schemist'
|
10 |
+
copyright = '2024, Eachan Johnson'
|
11 |
+
author = 'Eachan Johnson'
|
12 |
+
release = '0.0.1'
|
13 |
+
|
14 |
+
# -- General configuration ---------------------------------------------------
|
15 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
16 |
+
|
17 |
+
extensions = ['sphinx.ext.doctest',
|
18 |
+
'sphinx.ext.autodoc',
|
19 |
+
'sphinx.ext.autosummary',
|
20 |
+
'sphinx.ext.napoleon',
|
21 |
+
'sphinx.ext.viewcode',
|
22 |
+
'myst_parser']
|
23 |
+
|
24 |
+
myst_enable_extensions = [
|
25 |
+
"amsmath",
|
26 |
+
"dollarmath",
|
27 |
+
]
|
28 |
+
|
29 |
+
source_suffix = {
|
30 |
+
'.rst': 'restructuredtext',
|
31 |
+
'.txt': 'markdown',
|
32 |
+
'.md': 'markdown',
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
templates_path = ['_templates']
|
37 |
+
exclude_patterns = []
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
# -- Options for HTML output -------------------------------------------------
|
42 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
43 |
+
|
44 |
+
html_theme = 'sphinx_rtd_theme'
|
45 |
+
html_static_path = []
|
docs/source/index.md
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ⬢⬢⬢ schemist
|
2 |
+
|
3 |
+

|
4 |
+

|
5 |
+

|
6 |
+
|
7 |
+
Cleaning, collating, and augmenting chemical datasets.
|
8 |
+
|
9 |
+
```{toctree}
|
10 |
+
:maxdepth: 2
|
11 |
+
:caption: Contents:
|
12 |
+
|
13 |
+
installation
|
14 |
+
usage
|
15 |
+
python
|
16 |
+
modules
|
17 |
+
```
|
18 |
+
|
19 |
+
## Source
|
20 |
+
|
21 |
+
`GitHub <https://github.com/scbirlab/schemist>`_
|
pyproject.toml
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "schemist"
|
3 |
+
version = "0.0.1"
|
4 |
+
authors = [
|
5 |
+
{ name="Eachan Johnson", email="[email protected]" },
|
6 |
+
]
|
7 |
+
description = "Organizing and processing tables of chemical structures."
|
8 |
+
readme = "README.md"
|
9 |
+
requires-python = ">=3.8"
|
10 |
+
license = {file = "LICENSE"}
|
11 |
+
keywords = ["science", "chemistry", "SMILES", "SELFIES", "cheminformatics"]
|
12 |
+
|
13 |
+
classifiers = [
|
14 |
+
|
15 |
+
"Development Status :: 3 - Alpha",
|
16 |
+
|
17 |
+
# Indicate who your project is intended for
|
18 |
+
"Intended Audience :: Science/Research",
|
19 |
+
"Topic :: Scientific/Engineering :: Chemistry",
|
20 |
+
|
21 |
+
"License :: OSI Approved :: MIT License",
|
22 |
+
|
23 |
+
"Programming Language :: Python :: 3.8",
|
24 |
+
"Programming Language :: Python :: 3.9",
|
25 |
+
"Programming Language :: Python :: 3.10",
|
26 |
+
"Programming Language :: Python :: 3.11",
|
27 |
+
"Programming Language :: Python :: 3 :: Only",
|
28 |
+
]
|
29 |
+
|
30 |
+
dependencies = [
|
31 |
+
"git+https://github.com/scbirlab/carabiner.git",
|
32 |
+
"datamol",
|
33 |
+
"descriptastorus",
|
34 |
+
"nemony",
|
35 |
+
"openpyxl==3.1.0",
|
36 |
+
"pandas",
|
37 |
+
"rdkit",
|
38 |
+
"requests",
|
39 |
+
"selfies"
|
40 |
+
]
|
41 |
+
|
42 |
+
[project.urls]
|
43 |
+
"Homepage" = "https://github.com/scbirlab/schemist"
|
44 |
+
"Repository" = "https://github.com/scbirlab/schemist.git"
|
45 |
+
"Bug Tracker" = "https://github.com/scbirlab/schemist/issues"
|
46 |
+
"Documentation" = "https://readthedocs.org/schemist"
|
47 |
+
|
48 |
+
[project.scripts] # Optional
|
49 |
+
schemist = "schemist.cli:main"
|
50 |
+
|
51 |
+
[tool.setuptools]
|
52 |
+
# If there are data files included in your packages that need to be
|
53 |
+
# installed, specify them here.
|
54 |
+
# package-data = {"" = ["*.yml"]}
|
55 |
+
|
56 |
+
[build-system]
|
57 |
+
# These are the assumed default build requirements from pip:
|
58 |
+
# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
|
59 |
+
requires = ["setuptools>=43.0.0", "wheel"]
|
60 |
+
build-backend = "setuptools.build_meta"
|
schemist/__init__.py
ADDED
File without changes
|
schemist/cleaning.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Chemical structure cleaning routines."""
|
2 |
+
|
3 |
+
from carabiner.decorators import vectorize
|
4 |
+
|
5 |
+
from datamol import sanitize_smiles
|
6 |
+
import selfies as sf
|
7 |
+
|
8 |
+
@vectorize
|
9 |
+
def clean_smiles(smiles: str,
|
10 |
+
*args, **kwargs) -> str:
|
11 |
+
|
12 |
+
"""Sanitize a SMILES string or list of SMILES strings.
|
13 |
+
|
14 |
+
"""
|
15 |
+
|
16 |
+
return sanitize_smiles(smiles, *args, **kwargs)
|
17 |
+
|
18 |
+
|
19 |
+
@vectorize
|
20 |
+
def clean_selfies(selfies: str,
|
21 |
+
*args, **kwargs) -> str:
|
22 |
+
|
23 |
+
"""Sanitize a SELFIES string or list of SELFIES strings.
|
24 |
+
|
25 |
+
"""
|
26 |
+
|
27 |
+
return sf.encode(sanitize_smiles(sf.decode(selfies), *args, **kwargs))
|
schemist/cli.py
ADDED
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Command-line interface for schemist."""
|
2 |
+
|
3 |
+
from typing import Any, Dict, List, Optional
|
4 |
+
|
5 |
+
from argparse import FileType, Namespace
|
6 |
+
from collections import Counter, defaultdict
|
7 |
+
from functools import partial
|
8 |
+
import os
|
9 |
+
import sys
|
10 |
+
from tempfile import NamedTemporaryFile, TemporaryDirectory
|
11 |
+
|
12 |
+
from carabiner import pprint_dict, upper_and_lower, print_err
|
13 |
+
from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
|
14 |
+
from carabiner.itertools import tenumerate
|
15 |
+
from carabiner.pd import get_formats, write_stream
|
16 |
+
|
17 |
+
from .collating import collate_inventory, deduplicate_file
|
18 |
+
from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
|
19 |
+
from .generating import AA, REACTIONS
|
20 |
+
from .io import _mutate_df_stream
|
21 |
+
from .tables import (converter, cleaner, featurizer, assign_groups,
|
22 |
+
_assign_splits, splitter, _peptide_table, reactor)
|
23 |
+
from .splitting import _SPLITTERS, _GROUPED_SPLITTERS
|
24 |
+
|
25 |
+
__version__ = '0.0.1'
|
26 |
+
|
27 |
+
def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:
|
28 |
+
|
29 |
+
options = {}
|
30 |
+
|
31 |
+
try:
|
32 |
+
for opt in x:
|
33 |
+
|
34 |
+
try:
|
35 |
+
key, value = opt.split('=')
|
36 |
+
except ValueError:
|
37 |
+
raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")
|
38 |
+
|
39 |
+
try:
|
40 |
+
value = int(value)
|
41 |
+
except ValueError:
|
42 |
+
try:
|
43 |
+
value = float(value)
|
44 |
+
except ValueError:
|
45 |
+
pass
|
46 |
+
|
47 |
+
options[key] = value
|
48 |
+
|
49 |
+
except TypeError:
|
50 |
+
|
51 |
+
pass
|
52 |
+
|
53 |
+
return options
|
54 |
+
|
55 |
+
|
56 |
+
def _sum_tally(tallies: Counter,
|
57 |
+
message: str = "Error counts",
|
58 |
+
use_length: bool = False):
|
59 |
+
|
60 |
+
total_tally = Counter()
|
61 |
+
|
62 |
+
for tally in tallies:
|
63 |
+
|
64 |
+
if use_length:
|
65 |
+
total_tally.update({key: len(value) for key, value in tally.items()})
|
66 |
+
else:
|
67 |
+
total_tally.update(tally)
|
68 |
+
|
69 |
+
if len(tallies) == 0:
|
70 |
+
raise ValueError(f"Nothing generated!")
|
71 |
+
|
72 |
+
pprint_dict(total_tally, message=message)
|
73 |
+
|
74 |
+
return total_tally
|
75 |
+
|
76 |
+
|
77 |
+
@clicommand(message="Cleaning file with the following parameters")
|
78 |
+
def _clean(args: Namespace) -> None:
|
79 |
+
|
80 |
+
error_tallies = _mutate_df_stream(input_file=args.input,
|
81 |
+
output_file=args.output,
|
82 |
+
function=partial(cleaner,
|
83 |
+
column=args.column,
|
84 |
+
input_representation=args.representation,
|
85 |
+
prefix=args.prefix),
|
86 |
+
file_format=args.format)
|
87 |
+
|
88 |
+
_sum_tally(error_tallies)
|
89 |
+
|
90 |
+
return None
|
91 |
+
|
92 |
+
|
93 |
+
@clicommand(message="Converting between string representations with the following parameters")
|
94 |
+
def _convert(args: Namespace) -> None:
|
95 |
+
|
96 |
+
options = _option_parser(args.options)
|
97 |
+
|
98 |
+
error_tallies = _mutate_df_stream(input_file=args.input,
|
99 |
+
output_file=args.output,
|
100 |
+
function=partial(converter,
|
101 |
+
column=args.column,
|
102 |
+
input_representation=args.representation,
|
103 |
+
output_representation=args.to,
|
104 |
+
prefix=args.prefix,
|
105 |
+
options=options),
|
106 |
+
file_format=args.format)
|
107 |
+
|
108 |
+
_sum_tally(error_tallies)
|
109 |
+
|
110 |
+
return None
|
111 |
+
|
112 |
+
|
113 |
+
@clicommand(message="Adding features to files with the following parameters")
|
114 |
+
def _featurize(args: Namespace) -> None:
|
115 |
+
|
116 |
+
error_tallies = _mutate_df_stream(input_file=args.input,
|
117 |
+
output_file=args.output,
|
118 |
+
function=partial(featurizer,
|
119 |
+
feature_type=args.feature,
|
120 |
+
column=args.column,
|
121 |
+
ids=args.id,
|
122 |
+
input_representation=args.representation,
|
123 |
+
prefix=args.prefix),
|
124 |
+
file_format=args.format)
|
125 |
+
|
126 |
+
_sum_tally(error_tallies)
|
127 |
+
|
128 |
+
return None
|
129 |
+
|
130 |
+
|
131 |
+
@clicommand(message="Splitting table with the following parameters")
|
132 |
+
def _split(args: Namespace) -> None:
|
133 |
+
|
134 |
+
split_type = args.type.casefold()
|
135 |
+
|
136 |
+
if split_type in _GROUPED_SPLITTERS:
|
137 |
+
|
138 |
+
chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]
|
139 |
+
|
140 |
+
with TemporaryDirectory() as dir:
|
141 |
+
|
142 |
+
with NamedTemporaryFile("w", dir=dir, delete=False) as f:
|
143 |
+
|
144 |
+
group_idxs = _mutate_df_stream(input_file=args.input,
|
145 |
+
output_file=f,
|
146 |
+
function=partial(assign_groups,
|
147 |
+
grouper=chunk_processor,
|
148 |
+
group_name=split_type,
|
149 |
+
column=args.column,
|
150 |
+
input_representation=args.representation),
|
151 |
+
file_format=args.format)
|
152 |
+
f.close()
|
153 |
+
new_group_idx = defaultdict(list)
|
154 |
+
|
155 |
+
totals = 0
|
156 |
+
for group_idx in group_idxs:
|
157 |
+
these_totals = 0
|
158 |
+
for key, value in group_idx.items():
|
159 |
+
these_totals += len(value)
|
160 |
+
new_group_idx[key] += [idx + totals for idx in value]
|
161 |
+
totals += these_totals
|
162 |
+
|
163 |
+
group_idx = aggregator(new_group_idx,
|
164 |
+
train=args.train,
|
165 |
+
test=args.test)
|
166 |
+
|
167 |
+
split_tallies = _mutate_df_stream(input_file=f.name,
|
168 |
+
output_file=args.output,
|
169 |
+
function=partial(_assign_splits,
|
170 |
+
split_idx=group_idx,
|
171 |
+
use_df_index=True),
|
172 |
+
file_format=args.format)
|
173 |
+
if os.path.exists(f.name):
|
174 |
+
os.remove(f.name)
|
175 |
+
|
176 |
+
else:
|
177 |
+
|
178 |
+
split_tallies = _mutate_df_stream(input_file=args.input,
|
179 |
+
output_file=args.output,
|
180 |
+
function=partial(splitter,
|
181 |
+
split_type=args.type,
|
182 |
+
column=args.column,
|
183 |
+
input_representation=args.representation,
|
184 |
+
train=args.train,
|
185 |
+
test=args.test,
|
186 |
+
set_seed=args.seed),
|
187 |
+
file_format=args.format)
|
188 |
+
|
189 |
+
_sum_tally(split_tallies,
|
190 |
+
message="Split counts")
|
191 |
+
|
192 |
+
return None
|
193 |
+
|
194 |
+
|
195 |
+
@clicommand(message="Collating files with the following parameters")
|
196 |
+
def _collate(args: Namespace) -> None:
|
197 |
+
|
198 |
+
root_dir = args.data_dir or '.'
|
199 |
+
|
200 |
+
error_tallies = _mutate_df_stream(input_file=args.input,
|
201 |
+
output_file=args.output,
|
202 |
+
function=partial(collate_inventory,
|
203 |
+
root_dir=root_dir,
|
204 |
+
drop_unmapped=not args.keep_extra_columns,
|
205 |
+
catalog_smiles_column=args.column,
|
206 |
+
id_column_name=args.id_column,
|
207 |
+
id_n_digits=args.digits,
|
208 |
+
id_prefix=args.prefix),
|
209 |
+
file_format=args.format)
|
210 |
+
|
211 |
+
_sum_tally(error_tallies,
|
212 |
+
message="Collated chemicals:")
|
213 |
+
|
214 |
+
return None
|
215 |
+
|
216 |
+
|
217 |
+
@clicommand(message="Deduplicating chemical structures with the following parameters")
|
218 |
+
def _dedup(args: Namespace) -> None:
|
219 |
+
|
220 |
+
report, deduped_df = deduplicate_file(args.input,
|
221 |
+
format=args.format,
|
222 |
+
column=args.column,
|
223 |
+
input_representation=args.representation,
|
224 |
+
index_columns=args.indexes)
|
225 |
+
|
226 |
+
if args.prefix is not None and 'inchikey' in deduped_df:
|
227 |
+
deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})
|
228 |
+
|
229 |
+
write_stream(deduped_df,
|
230 |
+
output=args.output,
|
231 |
+
format=args.format)
|
232 |
+
|
233 |
+
pprint_dict(report, message="Finished deduplicating:")
|
234 |
+
|
235 |
+
return None
|
236 |
+
|
237 |
+
|
238 |
+
@clicommand(message="Enumerating peptides with the following parameters")
|
239 |
+
def _enum(args: Namespace) -> None:
|
240 |
+
|
241 |
+
tables = _peptide_table(max_length=args.max_length,
|
242 |
+
min_length=args.min_length,
|
243 |
+
n=args.number,
|
244 |
+
indexes=args.slice,
|
245 |
+
set_seed=args.seed,
|
246 |
+
prefix=args.prefix,
|
247 |
+
suffix=args.suffix,
|
248 |
+
d_aa_only=args.d_aa_only,
|
249 |
+
include_d_aa=args.include_d_aa,
|
250 |
+
generator=True)
|
251 |
+
|
252 |
+
dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
|
253 |
+
dAA_use = dAA_use or args.include_d_aa or args.d_aa_only
|
254 |
+
|
255 |
+
tallies, error_tallies = [], []
|
256 |
+
options = _option_parser(args.options)
|
257 |
+
_converter = partial(converter,
|
258 |
+
column='peptide_sequence',
|
259 |
+
input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
|
260 |
+
output_representation=args.to,
|
261 |
+
options=options)
|
262 |
+
|
263 |
+
for i, table in tenumerate(tables):
|
264 |
+
|
265 |
+
_err_tally, df = _converter(table)
|
266 |
+
|
267 |
+
tallies.append({"Number of peptides": df.shape[0]})
|
268 |
+
error_tallies.append(_err_tally)
|
269 |
+
|
270 |
+
write_stream(df,
|
271 |
+
output=args.output,
|
272 |
+
format=args.format,
|
273 |
+
mode='w' if i == 0 else 'a',
|
274 |
+
header=i == 0)
|
275 |
+
|
276 |
+
_sum_tally(tallies,
|
277 |
+
message="Enumerated peptides")
|
278 |
+
_sum_tally(error_tallies,
|
279 |
+
message="Conversion errors")
|
280 |
+
|
281 |
+
return None
|
282 |
+
|
283 |
+
|
284 |
+
@clicommand(message="Reacting peptides with the following parameters")
|
285 |
+
def _react(args: Namespace) -> None:
|
286 |
+
|
287 |
+
error_tallies = _mutate_df_stream(input_file=args.input,
|
288 |
+
output_file=args.output,
|
289 |
+
function=partial(reactor,
|
290 |
+
column=args.column,
|
291 |
+
input_representation=args.representation,
|
292 |
+
reaction=args.reaction,
|
293 |
+
product_name=args.name),
|
294 |
+
file_format=args.format)
|
295 |
+
|
296 |
+
_sum_tally(error_tallies)
|
297 |
+
|
298 |
+
return None
|
299 |
+
|
300 |
+
|
301 |
+
def main() -> None:
|
302 |
+
|
303 |
+
inputs = CLIOption('input',
|
304 |
+
default=sys.stdin,
|
305 |
+
type=FileType('r'),
|
306 |
+
nargs='?',
|
307 |
+
help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
|
308 |
+
representation = CLIOption('--representation', '-r',
|
309 |
+
type=str,
|
310 |
+
default='SMILES',
|
311 |
+
choices=upper_and_lower(_FROM_FUNCTIONS),
|
312 |
+
help='Chemical representation to use for input. ')
|
313 |
+
column = CLIOption('--column', '-c',
|
314 |
+
default='smiles',
|
315 |
+
type=str,
|
316 |
+
help='Column to use as input string representation. ')
|
317 |
+
prefix = CLIOption('--prefix', '-p',
|
318 |
+
default=None,
|
319 |
+
type=str,
|
320 |
+
help='Prefix to add to new column name. Default: no prefix')
|
321 |
+
to = CLIOption('--to', '-2',
|
322 |
+
type=str,
|
323 |
+
default='SMILES',
|
324 |
+
nargs='*',
|
325 |
+
choices=upper_and_lower(_TO_FUNCTIONS),
|
326 |
+
help='Format to convert to.')
|
327 |
+
options = CLIOption('--options', '-x',
|
328 |
+
type=str,
|
329 |
+
default=None,
|
330 |
+
nargs='*',
|
331 |
+
help='Options to pass to converter, in the format '
|
332 |
+
'"keyword1=value1 keyword2=value2"')
|
333 |
+
output = CLIOption('--output', '-o',
|
334 |
+
type=FileType('w'),
|
335 |
+
default=sys.stdout,
|
336 |
+
help='Output file. Default: STDOUT')
|
337 |
+
formatting = CLIOption('--format', '-f',
|
338 |
+
type=str,
|
339 |
+
default=None,
|
340 |
+
choices=upper_and_lower(get_formats()),
|
341 |
+
help='Override file extensions for input and output. '
|
342 |
+
'Default: infer from file extension.')
|
343 |
+
|
344 |
+
## featurize
|
345 |
+
id_feat = CLIOption('--id', '-i',
|
346 |
+
type=str,
|
347 |
+
default=None,
|
348 |
+
nargs='*',
|
349 |
+
help='Columns to retain in output table. Default: use all')
|
350 |
+
feature = CLIOption('--feature', '-t',
|
351 |
+
type=str,
|
352 |
+
default='2d',
|
353 |
+
choices=['2d', 'fp'], ## TODO: implement 3d
|
354 |
+
help='Which feature type to generate.')
|
355 |
+
|
356 |
+
## split
|
357 |
+
type_ = CLIOption('--type', '-t',
|
358 |
+
type=str,
|
359 |
+
default='random',
|
360 |
+
choices=upper_and_lower(_SPLITTERS),
|
361 |
+
help='Which split type to use.')
|
362 |
+
train = CLIOption('--train', '-a',
|
363 |
+
type=float,
|
364 |
+
default=1.,
|
365 |
+
help='Proportion of data to use for training. ')
|
366 |
+
test = CLIOption('--test', '-b',
|
367 |
+
type=float,
|
368 |
+
default=0.,
|
369 |
+
help='Proportion of data to use for testing. ')
|
370 |
+
|
371 |
+
## collate
|
372 |
+
data_dir = CLIOption('--data-dir', '-d',
|
373 |
+
type=str,
|
374 |
+
default=None,
|
375 |
+
help='Directory containing data files. '
|
376 |
+
'Default: current directory')
|
377 |
+
id_column = CLIOption('--id-column', '-s',
|
378 |
+
default=None,
|
379 |
+
type=str,
|
380 |
+
help='If provided, add a structure ID column with this name. '
|
381 |
+
'Default: don\'t add structure IDs')
|
382 |
+
prefix_collate = CLIOption('--prefix', '-p',
|
383 |
+
default='ID-',
|
384 |
+
type=str,
|
385 |
+
help='Prefix to add to structure IDs. '
|
386 |
+
'Default: %(default)s.')
|
387 |
+
digits = CLIOption('--digits', '-n',
|
388 |
+
default=8,
|
389 |
+
type=int,
|
390 |
+
help='Number of digits in structure IDs. ')
|
391 |
+
keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
|
392 |
+
action='store_true',
|
393 |
+
help='Whether to keep columns not mentioned in the catalog. '
|
394 |
+
'Default: drop extra columns.')
|
395 |
+
keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
|
396 |
+
action='store_true',
|
397 |
+
help='Whether to keep rows with invalid SMILES. '
|
398 |
+
'Default: drop invalid rows.')
|
399 |
+
|
400 |
+
## dedup
|
401 |
+
indexes = CLIOption('--indexes', '-x',
|
402 |
+
type=str,
|
403 |
+
default=None,
|
404 |
+
nargs='*',
|
405 |
+
help='Columns to retain and collapse (if multiple values per unique structure). '
|
406 |
+
'Default: retain no other columns than structure and InchiKey.')
|
407 |
+
drop_inchikey = CLIOption('--drop-inchikey', '-d',
|
408 |
+
action='store_true',
|
409 |
+
help='Whether to drop the calculated InchiKey column. '
|
410 |
+
'Default: keep InchiKey.')
|
411 |
+
|
412 |
+
### enum
|
413 |
+
max_length = CLIOption('--max-length', '-l',
|
414 |
+
type=int,
|
415 |
+
help='Maximum length of enumerated peptide. '
|
416 |
+
'Required.')
|
417 |
+
min_length = CLIOption('--min-length', '-m',
|
418 |
+
type=int,
|
419 |
+
default=None,
|
420 |
+
help='Minimum length of enumerated peptide. '
|
421 |
+
'Default: same as maximum, i.e. all peptides same length.')
|
422 |
+
number_to_gen = CLIOption('--number', '-n',
|
423 |
+
type=float,
|
424 |
+
default=None,
|
425 |
+
help='Number of peptides to sample from all possible '
|
426 |
+
'within the constraints. If less than 1, sample '
|
427 |
+
'that fraction of all possible. If greater than 1, '
|
428 |
+
'sample that number. '
|
429 |
+
'Default: return all peptides.')
|
430 |
+
slicer = CLIOption('--slice', '-z',
|
431 |
+
type=str,
|
432 |
+
default=None,
|
433 |
+
nargs='*',
|
434 |
+
help='Subset of (possibly sampled) population to return, in the format <stop> '
|
435 |
+
'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
|
436 |
+
'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
|
437 |
+
'3 500 2 gives every other from 3 to 500. Default: return all.')
|
438 |
+
alphabet = CLIOption('--alphabet', '-b',
|
439 |
+
type=str,
|
440 |
+
default=''.join(AA),
|
441 |
+
help='Alphabet to use in sampling.')
|
442 |
+
suffix = CLIOption('--suffix', '-s',
|
443 |
+
type=str,
|
444 |
+
default='',
|
445 |
+
help='Sequence to add to end. Lowercase for D-amino acids. '
|
446 |
+
'Default: no suffix.')
|
447 |
+
set_seed = CLIOption('--seed', '-e',
|
448 |
+
type=int,
|
449 |
+
default=None,
|
450 |
+
help='Seed to use for reproducible randomness. '
|
451 |
+
'Default: don\'t enable reproducibility.')
|
452 |
+
d_aa_only = CLIOption('--d-aa-only', '-a',
|
453 |
+
action='store_true',
|
454 |
+
help='Whether to only use D-amino acids. '
|
455 |
+
'Default: don\'t include.')
|
456 |
+
include_d_aa = CLIOption('--include-d-aa', '-y',
|
457 |
+
action='store_true',
|
458 |
+
help='Whether to include D-amino acids in enumeration. '
|
459 |
+
'Default: don\'t include.')
|
460 |
+
|
461 |
+
## reaction
|
462 |
+
name = CLIOption('--name', '-n',
|
463 |
+
type=str,
|
464 |
+
default=None,
|
465 |
+
help='Name of column for product. '
|
466 |
+
'Default: same as reaction name.')
|
467 |
+
reaction_opt = CLIOption('--reaction', '-x',
|
468 |
+
type=str,
|
469 |
+
nargs='*',
|
470 |
+
choices=list(REACTIONS),
|
471 |
+
default='N_to_C_cyclization',
|
472 |
+
help='Reaction(s) to apply.')
|
473 |
+
|
474 |
+
clean = CLICommand('clean',
|
475 |
+
description='Clean and normalize SMILES column of a table.',
|
476 |
+
main=_clean,
|
477 |
+
options=[output, formatting, inputs, representation, column, prefix])
|
478 |
+
convert = CLICommand('convert',
|
479 |
+
description='Convert between string representations of chemical structures.',
|
480 |
+
main=_convert,
|
481 |
+
options=[output, formatting, inputs, representation, column, prefix, to, options])
|
482 |
+
featurize = CLICommand('featurize',
|
483 |
+
description='Calculate chemical features from string representations of chemical structures.',
|
484 |
+
main=_featurize,
|
485 |
+
options=[output, formatting, inputs, representation, column, prefix,
|
486 |
+
id_feat, feature])
|
487 |
+
collate = CLICommand('collate',
|
488 |
+
description='Collect disparate tables or SDF files of libraries into a single table.',
|
489 |
+
main=_collate,
|
490 |
+
options=[output, formatting, inputs, representation,
|
491 |
+
data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
|
492 |
+
digits, keep_extra_columns, keep_invalid_smiles])
|
493 |
+
dedup = CLICommand('dedup',
|
494 |
+
description='Deduplicate chemical structures and retain references.',
|
495 |
+
main=_dedup,
|
496 |
+
options=[output, formatting, inputs, representation, column, prefix,
|
497 |
+
indexes, drop_inchikey])
|
498 |
+
enum = CLICommand('enumerate',
|
499 |
+
description='Enumerate bio-chemical structures within length and sequence constraints.',
|
500 |
+
main=_enum,
|
501 |
+
options=[output, formatting, to, options,
|
502 |
+
alphabet, max_length, min_length, number_to_gen,
|
503 |
+
slicer, set_seed,
|
504 |
+
prefix.replace(default='',
|
505 |
+
help='Sequence to prepend. Lowercase for D-amino acids. '
|
506 |
+
'Default: no prefix.'),
|
507 |
+
suffix,
|
508 |
+
type_.replace(default='aa',
|
509 |
+
choices=['aa'],
|
510 |
+
help='Type of bio sequence to enumerate. '
|
511 |
+
'Default: %(default)s.'),
|
512 |
+
d_aa_only, include_d_aa])
|
513 |
+
reaction = CLICommand('react',
|
514 |
+
description='React compounds in silico in indicated columns using a named reaction.',
|
515 |
+
main=_react,
|
516 |
+
options=[output, formatting, inputs, representation, column, name,
|
517 |
+
reaction_opt])
|
518 |
+
split = CLICommand('split',
|
519 |
+
description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
|
520 |
+
main=_split,
|
521 |
+
options=[output, formatting, inputs, representation, column, prefix,
|
522 |
+
type_, train, test, set_seed])
|
523 |
+
|
524 |
+
app = CLIApp("schemist",
|
525 |
+
version=__version__,
|
526 |
+
description="Tools for cleaning, collating, and augmenting chemical datasets.",
|
527 |
+
commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])
|
528 |
+
|
529 |
+
app.run()
|
530 |
+
|
531 |
+
return None
|
532 |
+
|
533 |
+
|
534 |
+
if __name__ == "__main__":
|
535 |
+
|
536 |
+
main()
|
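Assuming the project's pyproject.toml (not shown in this excerpt) registers main() as a "schemist" console script and that CLIApp.run() parses sys.argv, a conversion run could be exercised like this; the file names are placeholders and the flags come from the CLIOption definitions above:

import sys
from unittest import mock

from schemist.cli import main

# Equivalent to: schemist convert compounds.csv --column smiles --to inchikey --output out.csv
argv = [
    "schemist", "convert", "compounds.csv",
    "--column", "smiles",
    "--to", "inchikey",
    "--output", "out.csv",
]

with mock.patch.object(sys, "argv", argv):
    main()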
schemist/collating.py
ADDED
@@ -0,0 +1,315 @@
1 |
+
"""Tools to collate chemical data files."""
|
2 |
+
|
3 |
+
from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union
|
4 |
+
|
5 |
+
from collections import Counter
|
6 |
+
from functools import partial
|
7 |
+
from glob import glob
|
8 |
+
import os
|
9 |
+
|
10 |
+
from carabiner.pd import read_table, resolve_delim
|
11 |
+
from carabiner import print_err
|
12 |
+
import numpy as np
|
13 |
+
from pandas import DataFrame, concat
|
14 |
+
|
15 |
+
from .converting import convert_string_representation, _FROM_FUNCTIONS
|
16 |
+
from .io import FILE_READERS
|
17 |
+
|
18 |
+
GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
|
19 |
+
ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")
|
20 |
+
|
21 |
+
def _column_mapper(df: DataFrame,
|
22 |
+
cols: Iterable[str]) -> Tuple[Callable, Dict]:
|
23 |
+
|
24 |
+
basic_map = {column: df[column].tolist()[0] for column in cols}
|
25 |
+
inv_basic_map = {value: key for key, value in basic_map.items()}
|
26 |
+
|
27 |
+
def column_mapper(x: DataFrame) -> DataFrame:
|
28 |
+
|
29 |
+
new_df = DataFrame()
|
30 |
+
|
31 |
+
for new_col, old_col in basic_map.items():
|
32 |
+
|
33 |
+
# old_col = str(old_col)
|
34 |
+
|
35 |
+
if old_col is None or str(old_col) in ('None', 'nan', 'NA'):
|
36 |
+
|
37 |
+
new_df[new_col] = None
|
38 |
+
|
39 |
+
elif '+' in old_col:
|
40 |
+
|
41 |
+
splits = old_col.split('+')
|
42 |
+
new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
|
43 |
+
for s in splits[1:]])
|
44 |
+
|
45 |
+
elif ';' in old_col:
|
46 |
+
|
47 |
+
col, char, index = old_col.split(';')
|
48 |
+
index = [int(i) for i in index.split(':')]
|
49 |
+
|
50 |
+
if len(index) == 1:
|
51 |
+
index = slice(index[0], index[0] + 1)
|
52 |
+
else:
|
53 |
+
index = slice(*index)
|
54 |
+
|
55 |
+
try:
|
56 |
+
|
57 |
+
new_df[new_col] = (x[col]
|
58 |
+
.str.split(char)
|
59 |
+
.map(lambda y: char.join(y[index] if y is not np.nan else []))
|
60 |
+
.str.strip())
|
61 |
+
|
62 |
+
except TypeError as e:
|
63 |
+
|
64 |
+
print_err(x[col].str.split(char))
|
65 |
+
|
66 |
+
raise e
|
67 |
+
|
68 |
+
else:
|
69 |
+
|
70 |
+
new_df[new_col] = x[old_col].copy()
|
71 |
+
|
72 |
+
return new_df
|
73 |
+
|
74 |
+
return column_mapper, inv_basic_map
|
75 |
+
|
76 |
+
|
77 |
+
def _check_catalog(catalog: DataFrame,
|
78 |
+
catalog_smiles_column: str = 'input_smiles') -> None:
|
79 |
+
|
80 |
+
essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
|
81 |
+
missing_essential_cols = [col for col in essential_columns
|
82 |
+
if col not in catalog]
|
83 |
+
|
84 |
+
if len(missing_essential_cols) > 0:
|
85 |
+
|
86 |
+
print_err(catalog.columns.tolist())
|
87 |
+
|
88 |
+
raise KeyError("Missing required columns from catalog: " +
|
89 |
+
", ".join(missing_essential_cols))
|
90 |
+
|
91 |
+
return None
|
92 |
+
|
93 |
+
|
94 |
+
def collate_inventory(catalog: DataFrame,
|
95 |
+
root_dir: Optional[str] = None,
|
96 |
+
drop_invalid: bool = True,
|
97 |
+
drop_unmapped: bool = False,
|
98 |
+
catalog_smiles_column: str = 'input_smiles',
|
99 |
+
id_column_name: Optional[str] = None,
|
100 |
+
id_n_digits: int = 8,
|
101 |
+
id_prefix: str = '') -> DataFrame:
|
102 |
+
|
103 |
+
f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
|
104 |
+
|
105 |
+
The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
|
106 |
+
|
107 |
+
- filename is a glob pattern of files to collate
|
108 |
+
- file_format is one of {', '.join(FILE_READERS.keys())}
|
109 |
+
- smiles_column contains smiles strings
|
110 |
+
|
111 |
+
Other columns are optional and can have any name, but must contain the name or a pattern
|
112 |
+
matching a column (for tabular data) or field (for SDF data) in the files
|
113 |
+
of the `filename` column. In the output DataFrame, the named column data will be mapped.
|
114 |
+
|
115 |
+
Optional column contents can be either concatenated or split using the following
|
116 |
+
pattern:
|
117 |
+
|
118 |
+
- col1+col2: concatenates the contents of `col1` and `col2`
|
119 |
+
- col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
|
120 |
+
|
121 |
+
Parameters
|
122 |
+
----------
|
123 |
+
catalog : pd.DataFrame
|
124 |
+
Table cataloging locations and format of data. Requires
|
125 |
+
columns {', '.join(ESSENTIAL_COLUMNS)}.
|
126 |
+
root_dir : str, optional
|
127 |
+
Path to look for data files. Default: current directory.
|
128 |
+
drop_invalid : bool, optional
|
129 |
+
Whether to drop rows containing invalid SMILES.
|
130 |
+
|
131 |
+
|
132 |
+
Returns
|
133 |
+
-------
|
134 |
+
pd.DataFrame
|
135 |
+
Collated chemical data.
|
136 |
+
|
137 |
+
"""
|
138 |
+
|
139 |
+
root_dir = root_dir or '.'
|
140 |
+
|
141 |
+
_check_catalog(catalog, catalog_smiles_column)
|
142 |
+
|
143 |
+
nongroup_columns = [col for col in catalog
|
144 |
+
if col not in GROUPING_COLUMNS]
|
145 |
+
loaded_dataframes = []
|
146 |
+
report = Counter({"invalid SMILES": 0,
|
147 |
+
"rows processed": 0})
|
148 |
+
|
149 |
+
grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
|
150 |
+
for (this_glob, this_filetype,
|
151 |
+
this_library_name, this_representation), filename_df in grouped_catalog:
|
152 |
+
|
153 |
+
print_err(f'\nProcessing {this_glob}:')
|
154 |
+
|
155 |
+
this_glob = glob(os.path.join(root_dir, this_glob))
|
156 |
+
|
157 |
+
these_filenames = sorted(f for f in this_glob
|
158 |
+
if not os.path.basename(f).startswith('~$'))
|
159 |
+
print_err('\t- ' + '\n\t- '.join(these_filenames))
|
160 |
+
|
161 |
+
column_mapper, mapped_cols = _column_mapper(filename_df,
|
162 |
+
nongroup_columns)
|
163 |
+
|
164 |
+
reader = FILE_READERS[this_filetype]
|
165 |
+
|
166 |
+
for filename in these_filenames:
|
167 |
+
|
168 |
+
this_data0 = reader(filename)
|
169 |
+
|
170 |
+
if not drop_unmapped:
|
171 |
+
unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
|
172 |
+
for col in this_data0 if col not in mapped_cols}
|
173 |
+
this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
|
174 |
+
this_data = concat([column_mapper(this_data0), this_data],
|
175 |
+
axis=1)
|
176 |
+
else:
|
177 |
+
this_data = column_mapper(this_data0)
|
178 |
+
|
179 |
+
if this_representation.casefold() not in _FROM_FUNCTIONS:
|
180 |
+
|
181 |
+
raise TypeError(' or '.join({this_representation, this_representation.casefold()}) +
|
182 |
+
"not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))
|
183 |
+
|
184 |
+
this_converter = partial(convert_string_representation,
|
185 |
+
input_representation=this_representation.casefold())
|
186 |
+
|
187 |
+
this_data = (this_data
|
188 |
+
.query('compound_collection != "NA"')
|
189 |
+
.assign(library_name=this_library_name,
|
190 |
+
input_file_format=this_filetype,
|
191 |
+
input_string_representation=this_representation,
|
192 |
+
plate_id=lambda x: x['plate_id'].astype(str),
|
193 |
+
plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
|
194 |
+
canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
|
195 |
+
is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))
|
196 |
+
|
197 |
+
report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
|
198 |
+
"rows processed": this_data.shape[0]})
|
199 |
+
|
200 |
+
if drop_invalid:
|
201 |
+
|
202 |
+
this_data = this_data.query('is_valid_smiles')
|
203 |
+
|
204 |
+
if id_column_name is not None:
|
205 |
+
|
206 |
+
this_converter = partial(convert_string_representation,
|
207 |
+
output_representation='id',
|
208 |
+
options=dict(n=id_n_digits,
|
209 |
+
prefix=id_prefix))
|
210 |
+
this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})
|
211 |
+
|
212 |
+
loaded_dataframes.append(this_data)
|
213 |
+
|
214 |
+
collated_df = concat(loaded_dataframes, axis=0)
|
215 |
+
|
216 |
+
return report, collated_df
|
217 |
+
|
218 |
+
|
219 |
+
def collate_inventory_from_file(catalog_path: Union[str, TextIO],
|
220 |
+
root_dir: Optional[str] = None,
|
221 |
+
format: Optional[str] = None,
|
222 |
+
*args, **kwargs) -> DataFrame:
|
223 |
+
|
224 |
+
f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
|
225 |
+
|
226 |
+
The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
|
227 |
+
|
228 |
+
- filename is a glob pattern of files to collate
|
229 |
+
- file_format is one of {', '.join(FILE_READERS.keys())}
|
230 |
+
- smiles_column contains smiles strings
|
231 |
+
|
232 |
+
Other columns are optional and can have any name, but must contain the name or a pattern
|
233 |
+
matching a column (for tabular data) or field (for SDF data) in the files
|
234 |
+
of the `filename` column. In the output DataFrame, the named column data will be mapped.
|
235 |
+
|
236 |
+
Optional column contents can be either concatenated or split using the following
|
237 |
+
pattern:
|
238 |
+
|
239 |
+
- col1+col2: concatenates the contents of `col1` and `col2`
|
240 |
+
- col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
|
241 |
+
|
242 |
+
Parameters
|
243 |
+
----------
|
244 |
+
catalog_path : str
|
245 |
+
Path to catalog file in XLSX, TSV or CSV format. Requires
|
246 |
+
columns {', '.join(ESSENTIAL_COLUMNS)}.
|
247 |
+
format : str, optional
|
248 |
+
Format of catalog file. Default: infer from file extension.
|
249 |
+
root_dir : str, optional
|
250 |
+
Path to look for data files. Default: use directory containing
|
251 |
+
the catalog.
|
252 |
+
|
253 |
+
Returns
|
254 |
+
-------
|
255 |
+
pd.DataFrame
|
256 |
+
Collated chemical data.
|
257 |
+
|
258 |
+
"""
|
259 |
+
|
260 |
+
root_dir = root_dir or os.path.dirname(catalog_path)
|
261 |
+
|
262 |
+
data_catalog = read_table(catalog_path, format=format)
|
263 |
+
|
264 |
+
return collate_inventory(catalog=data_catalog,
|
265 |
+
root_dir=root_dir,
|
266 |
+
*args, **kwargs)
|
267 |
+
|
268 |
+
|
269 |
+
def deduplicate(df: DataFrame,
|
270 |
+
column: str = 'smiles',
|
271 |
+
input_representation: str = 'smiles',
|
272 |
+
index_columns: Optional[List[str]] = None,
|
273 |
+
drop_inchikey: bool = False) -> DataFrame:
|
274 |
+
|
275 |
+
index_columns = index_columns or []
|
276 |
+
|
277 |
+
inchikey_converter = partial(convert_string_representation,
|
278 |
+
input_representation=input_representation,
|
279 |
+
output_representation='inchikey')
|
280 |
+
|
281 |
+
df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))
|
282 |
+
|
283 |
+
structure_columns = [column, 'inchikey']
|
284 |
+
df_unique = []
|
285 |
+
|
286 |
+
for (string_rep, inchikey), structure_df in df.groupby(structure_columns):
|
287 |
+
|
288 |
+
collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
|
289 |
+
for col in structure_df if col in index_columns}
|
290 |
+
collapsed_indexes.update({column: [string_rep],
|
291 |
+
'inchikey': [inchikey],
|
292 |
+
'instance_count': [structure_df.shape[0]]})
|
293 |
+
|
294 |
+
df_unique.append(DataFrame(collapsed_indexes))
|
295 |
+
|
296 |
+
df_unique = concat(df_unique, axis=0)
|
297 |
+
|
298 |
+
if drop_inchikey:
|
299 |
+
|
300 |
+
df_unique = df_unique.drop(columns=['inchikey'])
|
301 |
+
|
302 |
+
report = {'starting rows': df.shape[0],
|
303 |
+
'ending rows': df_unique.shape[0]}
|
304 |
+
|
305 |
+
return report, df_unique
|
306 |
+
|
307 |
+
|
308 |
+
def deduplicate_file(filename: Union[str, TextIO],
|
309 |
+
format: Optional[str] = None,
|
310 |
+
*args, **kwargs) -> DataFrame:
|
311 |
+
|
312 |
+
table = read_table(filename, format=format)
|
313 |
+
|
314 |
+
return deduplicate(table, *args, **kwargs)
|
315 |
+
|
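The col1+col2 (concatenate) and col;char;start:stop (split and slice) patterns that _column_mapper interprets in the catalog can be reproduced with plain pandas, which may help when writing a catalog. A sketch with invented column names and values:

import pandas as pd

raw = pd.DataFrame({
    "Plate": ["P1", "P2"],
    "Well": ["A01", "B02"],
    "Compound": ["LIB-0001-batch1", "LIB-0002-batch3"],
})

# "Plate+Well": concatenate the two source columns into one mapped column.
plate_well = raw["Plate"].str.cat(raw["Well"].astype(str))

# "Compound;-;1:2": split on '-' and keep the slice 1:2 (0-indexed), then strip whitespace.
compound_number = (raw["Compound"]
                   .str.split("-")
                   .map(lambda parts: "-".join(parts[slice(1, 2)]))
                   .str.strip())

print(plate_well.tolist())        # ['P1A01', 'P2B02']
print(compound_number.tolist())   # ['0001', '0002']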
schemist/converting.py
ADDED
@@ -0,0 +1,308 @@
1 |
+
"""Converting between chemical representation formats."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
4 |
+
|
5 |
+
from functools import wraps
|
6 |
+
|
7 |
+
from carabiner import print_err
|
8 |
+
from carabiner.cast import cast, flatten
|
9 |
+
from carabiner.decorators import return_none_on_error, vectorize
|
10 |
+
from carabiner.itertools import batched
|
11 |
+
|
12 |
+
from datamol import sanitize_smiles
|
13 |
+
import nemony as nm
|
14 |
+
from pandas import DataFrame
|
15 |
+
from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
|
16 |
+
MolFromSmiles, MolToInchi, MolToInchiKey,
|
17 |
+
MolToSmiles)
|
18 |
+
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
|
19 |
+
from requests import Session
|
20 |
+
import selfies as sf
|
21 |
+
|
22 |
+
from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name
|
23 |
+
|
24 |
+
@vectorize
|
25 |
+
@return_none_on_error
|
26 |
+
def _seq2mol(s: str) -> Union[Mol, None]:
|
27 |
+
|
28 |
+
return MolFromSequence(s, sanitize=True)
|
29 |
+
|
30 |
+
|
31 |
+
@vectorize
|
32 |
+
@return_none_on_error
|
33 |
+
def _helm2mol(s: str) -> Union[Mol, None]:
|
34 |
+
|
35 |
+
return MolFromHELM(s, sanitize=True)
|
36 |
+
|
37 |
+
|
38 |
+
def mini_helm2helm(s: str) -> str:
|
39 |
+
|
40 |
+
new_s = []
|
41 |
+
token = ''
|
42 |
+
between_sq_brackets = False
|
43 |
+
|
44 |
+
for letter in s:
|
45 |
+
|
46 |
+
if letter.islower() and not between_sq_brackets:
|
47 |
+
|
48 |
+
letter = f"[d{letter.upper()}]"
|
49 |
+
|
50 |
+
token += letter
|
51 |
+
|
52 |
+
if letter == '[':
|
53 |
+
between_sq_brackets = True
|
54 |
+
elif letter == ']':
|
55 |
+
between_sq_brackets = False
|
56 |
+
|
57 |
+
if not between_sq_brackets:
|
58 |
+
new_s.append(token)
|
59 |
+
token = ''
|
60 |
+
|
61 |
+
return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))
|
62 |
+
|
63 |
+
|
64 |
+
@vectorize
|
65 |
+
@return_none_on_error
|
66 |
+
def _mini_helm2mol(s: str) -> Mol:
|
67 |
+
|
68 |
+
s = mini_helm2helm(s)
|
69 |
+
|
70 |
+
return MolFromHELM(s, sanitize=True)
|
71 |
+
|
72 |
+
|
73 |
+
@vectorize
|
74 |
+
@return_none_on_error
|
75 |
+
def _inchi2mol(s: str) -> Mol:
|
76 |
+
|
77 |
+
return MolFromInchi(s,
|
78 |
+
sanitize=True,
|
79 |
+
removeHs=True)
|
80 |
+
|
81 |
+
@vectorize
|
82 |
+
# @return_none_on_error
|
83 |
+
def _smiles2mol(s: str) -> Mol:
|
84 |
+
|
85 |
+
return MolFromSmiles(sanitize_smiles(s))
|
86 |
+
|
87 |
+
|
88 |
+
@vectorize
|
89 |
+
@return_none_on_error
|
90 |
+
def _selfies2mol(s: str) -> Mol:
|
91 |
+
|
92 |
+
return MolFromSmiles(sf.decoder(s))
|
93 |
+
|
94 |
+
|
95 |
+
@vectorize
|
96 |
+
@return_none_on_error
|
97 |
+
def _mol2nonstandard_inchikey(m: Mol,
|
98 |
+
**kwargs) -> str:
|
99 |
+
|
100 |
+
return MolToInchiKey(m,
|
101 |
+
options="/FixedH /SUU /RecMet /KET /15T")
|
102 |
+
|
103 |
+
|
104 |
+
@vectorize
|
105 |
+
@return_none_on_error
|
106 |
+
def _mol2hash(m: Mol,
|
107 |
+
**kwargs) -> str:
|
108 |
+
|
109 |
+
nonstandard_inchikey = _mol2nonstandard_inchikey(m)
|
110 |
+
|
111 |
+
return nm.hash(nonstandard_inchikey)
|
112 |
+
|
113 |
+
|
114 |
+
@vectorize
|
115 |
+
@return_none_on_error
|
116 |
+
def _mol2id(m: Mol,
|
117 |
+
n: int = 8,
|
118 |
+
prefix: str = '',
|
119 |
+
**kwargs) -> str:
|
120 |
+
|
121 |
+
return prefix + str(int(_mol2hash(m), 16))[:n]
|
122 |
+
|
123 |
+
|
124 |
+
@vectorize
|
125 |
+
@return_none_on_error
|
126 |
+
def _mol2isomeric_canonical_smiles(m: Mol,
|
127 |
+
**kwargs) -> str:
|
128 |
+
|
129 |
+
return MolToSmiles(m,
|
130 |
+
isomericSmiles=True,
|
131 |
+
canonical=True)
|
132 |
+
|
133 |
+
|
134 |
+
@vectorize
|
135 |
+
@return_none_on_error
|
136 |
+
def _mol2inchi(m: Mol,
|
137 |
+
**kwargs) -> str:
|
138 |
+
|
139 |
+
return MolToInchi(m)
|
140 |
+
|
141 |
+
|
142 |
+
@vectorize
|
143 |
+
@return_none_on_error
|
144 |
+
def _mol2inchikey(m: Mol,
|
145 |
+
**kwargs) -> str:
|
146 |
+
|
147 |
+
return MolToInchiKey(m)
|
148 |
+
|
149 |
+
|
150 |
+
@vectorize
|
151 |
+
@return_none_on_error
|
152 |
+
def _mol2random_smiles(m: Mol,
|
153 |
+
**kwargs) -> str:
|
154 |
+
|
155 |
+
return MolToSmiles(m,
|
156 |
+
isomericSmiles=True,
|
157 |
+
doRandom=True)
|
158 |
+
|
159 |
+
|
160 |
+
@vectorize
|
161 |
+
@return_none_on_error
|
162 |
+
def _mol2mnemonic(m: Mol,
|
163 |
+
**kwargs) -> str:
|
164 |
+
|
165 |
+
nonstandard_inchikey = _mol2nonstandard_inchikey(m)
|
166 |
+
|
167 |
+
return nm.encode(nonstandard_inchikey)
|
168 |
+
|
169 |
+
|
170 |
+
def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
|
171 |
+
session: Optional[Session] = None,
|
172 |
+
chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:
|
173 |
+
|
174 |
+
inchikeys = cast(_mol2inchikey(m), to=list)
|
175 |
+
pubchem_ids = []
|
176 |
+
|
177 |
+
for _inchikeys in batched(inchikeys, chunksize):
|
178 |
+
|
179 |
+
these_ids = _inchikey2pubchem_name_id(_inchikeys,
|
180 |
+
session=session)
|
181 |
+
pubchem_ids += these_ids
|
182 |
+
|
183 |
+
return pubchem_ids
|
184 |
+
|
185 |
+
|
186 |
+
@return_none_on_error
|
187 |
+
def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
|
188 |
+
session: Optional[Session] = None,
|
189 |
+
chunksize: int = 32,
|
190 |
+
**kwargs) -> Union[str, List[str]]:
|
191 |
+
|
192 |
+
return flatten([val['pubchem_id']
|
193 |
+
for val in _mol2pubchem(m,
|
194 |
+
session=session,
|
195 |
+
chunksize=chunksize)])
|
196 |
+
|
197 |
+
|
198 |
+
@return_none_on_error
|
199 |
+
def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
|
200 |
+
session: Optional[Session] = None,
|
201 |
+
chunksize: int = 32,
|
202 |
+
**kwargs) -> Union[str, List[str]]:
|
203 |
+
|
204 |
+
return flatten([val['pubchem_name']
|
205 |
+
for val in _mol2pubchem(m,
|
206 |
+
session=session,
|
207 |
+
chunksize=chunksize)])
|
208 |
+
|
209 |
+
@return_none_on_error
|
210 |
+
def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
|
211 |
+
session: Optional[Session] = None,
|
212 |
+
**kwargs) -> Union[str, List[str]]:
|
213 |
+
|
214 |
+
return _inchikey2cactus_name(_mol2inchikey(m),
|
215 |
+
session=session)
|
216 |
+
|
217 |
+
|
218 |
+
@vectorize
|
219 |
+
@return_none_on_error
|
220 |
+
def _mol2scaffold(m: Mol,
|
221 |
+
chiral: bool = True,
|
222 |
+
**kwargs) -> str:
|
223 |
+
|
224 |
+
return MurckoScaffoldSmiles(mol=m,
|
225 |
+
includeChirality=chiral)
|
226 |
+
|
227 |
+
|
228 |
+
@vectorize
|
229 |
+
@return_none_on_error
|
230 |
+
def _mol2selfies(m: Mol,
|
231 |
+
**kwargs) -> str:
|
232 |
+
|
233 |
+
s = sf.encoder(_mol2isomeric_canonical_smiles(m))
|
234 |
+
|
235 |
+
return s if s != -1 else None
|
236 |
+
|
237 |
+
|
238 |
+
_TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
|
239 |
+
"selfies": _mol2selfies,
|
240 |
+
"inchi": _mol2inchi,
|
241 |
+
"inchikey": _mol2inchikey,
|
242 |
+
"nonstandard_inchikey": _mol2nonstandard_inchikey,
|
243 |
+
"hash": _mol2hash,
|
244 |
+
"mnemonic": _mol2mnemonic,
|
245 |
+
"id": _mol2id,
|
246 |
+
"scaffold": _mol2scaffold,
|
247 |
+
"permuted_smiles": _mol2random_smiles,
|
248 |
+
"pubchem_id": _mol2pubchem_id,
|
249 |
+
"pubchem_name": _mol2pubchem_name,
|
250 |
+
"cactus_name": _mol2cactus_name}
|
251 |
+
|
252 |
+
_FROM_FUNCTIONS = {"smiles": _smiles2mol,
|
253 |
+
"selfies": _selfies2mol,
|
254 |
+
"inchi": _inchi2mol,
|
255 |
+
"aa_seq": _seq2mol,
|
256 |
+
"helm": _helm2mol,
|
257 |
+
"minihelm": _mini_helm2mol}
|
258 |
+
|
259 |
+
|
260 |
+
def _x2mol(strings: Union[Iterable[str], str],
|
261 |
+
input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
|
262 |
+
|
263 |
+
from_function = _FROM_FUNCTIONS[input_representation.casefold()]
|
264 |
+
|
265 |
+
return from_function(strings)
|
266 |
+
|
267 |
+
|
268 |
+
def _mol2x(mols: Union[Iterable[Mol], Mol],
|
269 |
+
output_representation: str = 'smiles',
|
270 |
+
**kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
|
271 |
+
|
272 |
+
to_function = _TO_FUNCTIONS[output_representation.casefold()]
|
273 |
+
|
274 |
+
return to_function(mols, **kwargs)
|
275 |
+
|
276 |
+
|
277 |
+
def convert_string_representation(strings: Union[Iterable[str], str],
|
278 |
+
input_representation: str = 'smiles',
|
279 |
+
output_representation: str = 'smiles',
|
280 |
+
**kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
|
281 |
+
|
282 |
+
"""Convert between string representations of chemical structures.
|
283 |
+
|
284 |
+
"""
|
285 |
+
|
286 |
+
mols = _x2mol(strings, input_representation)
|
287 |
+
# print_err(mols)
|
288 |
+
outstrings = _mol2x(mols, output_representation, **kwargs)
|
289 |
+
# print_err(outstrings)
|
290 |
+
|
291 |
+
return outstrings
|
292 |
+
|
293 |
+
|
294 |
+
def _convert_input_to_smiles(f: Callable) -> Callable:
|
295 |
+
|
296 |
+
@wraps(f)
|
297 |
+
def _f(strings: Union[Iterable[str], str],
|
298 |
+
input_representation: str = 'smiles',
|
299 |
+
*args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
|
300 |
+
|
301 |
+
smiles = convert_string_representation(strings,
|
302 |
+
output_representation='smiles',
|
303 |
+
input_representation=input_representation)
|
304 |
+
|
305 |
+
return f(strings=smiles,
|
306 |
+
*args, **kwargs)
|
307 |
+
|
308 |
+
return _f
|
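The _FROM_FUNCTIONS and _TO_FUNCTIONS registries above back convert_string_representation, so converting, say, SMILES to InChIKeys or Murcko scaffolds is a single call. A short sketch (RDKit and the other imports must be installed; the molecules are arbitrary examples):

from schemist.converting import convert_string_representation

smiles = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1O"]

inchikeys = convert_string_representation(smiles,
                                           input_representation="smiles",
                                           output_representation="inchikey")
scaffolds = convert_string_representation(smiles,
                                           input_representation="smiles",
                                           output_representation="scaffold")

print(list(inchikeys))
print(list(scaffolds))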
schemist/features.py
ADDED
@@ -0,0 +1,149 @@
1 |
+
"""Tools for generating chemical features."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, Iterable, Optional, Union
|
4 |
+
|
5 |
+
from descriptastorus.descriptors import MakeGenerator
|
6 |
+
from pandas import DataFrame, Series
|
7 |
+
import numpy as np
|
8 |
+
from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
|
9 |
+
|
10 |
+
from .converting import _smiles2mol, _convert_input_to_smiles
|
11 |
+
|
12 |
+
def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
|
13 |
+
|
14 |
+
def _f(prefix: Optional[str] = None,
|
15 |
+
*args, **kwargs) -> DataFrame:
|
16 |
+
|
17 |
+
feature_matrix = f(*args, **kwargs)
|
18 |
+
|
19 |
+
if prefix is not None:
|
20 |
+
|
21 |
+
new_cols = {col: f"{prefix}_{col}"
|
22 |
+
for col in feature_matrix.columns
|
23 |
+
if not col.startswith('meta_')}
|
24 |
+
feature_matrix = feature_matrix.rename(columns=new_cols)
|
25 |
+
|
26 |
+
return feature_matrix
|
27 |
+
|
28 |
+
return _f
|
29 |
+
|
30 |
+
|
31 |
+
def _get_descriptastorus_features(smiles: Iterable[str],
|
32 |
+
generator: str) -> DataFrame:
|
33 |
+
|
34 |
+
generator = MakeGenerator((generator, ))
|
35 |
+
smiles = Series(smiles)
|
36 |
+
|
37 |
+
features = smiles.apply(lambda z: np.array(generator.process(z)))
|
38 |
+
matrix = np.stack(features.values, axis=0)
|
39 |
+
|
40 |
+
return DataFrame(matrix,
|
41 |
+
index=smiles.index,
|
42 |
+
columns=[col for col, _ in generator.GetColumns()])
|
43 |
+
|
44 |
+
|
45 |
+
@_feature_matrix
|
46 |
+
@_convert_input_to_smiles
|
47 |
+
def calculate_2d_features(strings: Union[Iterable[str], str],
|
48 |
+
normalized: bool = True,
|
49 |
+
histogram_normalized: bool = True) -> DataFrame:
|
50 |
+
|
51 |
+
"""Calculate 2d features from string representation.
|
52 |
+
|
53 |
+
"""
|
54 |
+
|
55 |
+
if normalized:
|
56 |
+
if histogram_normalized:
|
57 |
+
generator_name = "RDKit2DHistogramNormalized"
|
58 |
+
else:
|
59 |
+
generator_name = "RDKit2DNormalized"
|
60 |
+
else:
|
61 |
+
generator_name = "RDKit2D"
|
62 |
+
|
63 |
+
feature_matrix = _get_descriptastorus_features(strings,
|
64 |
+
generator=generator_name)
|
65 |
+
|
66 |
+
feature_matrix = (feature_matrix
|
67 |
+
.rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
|
68 |
+
.assign(meta_feature_type=generator_name,
|
69 |
+
meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
|
70 |
+
.drop(columns=['meta_feature_valid0']))
|
71 |
+
|
72 |
+
return feature_matrix
|
73 |
+
|
74 |
+
|
75 |
+
def _fast_fingerprint(generator: FingeprintGenerator64,
|
76 |
+
mol: Mol,
|
77 |
+
to_np: bool = True) -> Union[str, np.ndarray]:
|
78 |
+
|
79 |
+
try:
|
80 |
+
fp_string = generator.GetFingerprint(mol).ToBitString()
|
81 |
+
except:
|
82 |
+
return None
|
83 |
+
else:
|
84 |
+
if to_np:
|
85 |
+
return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
|
86 |
+
else:
|
87 |
+
return fp_string
|
88 |
+
|
89 |
+
|
90 |
+
@_feature_matrix
|
91 |
+
@_convert_input_to_smiles
|
92 |
+
def calculate_fingerprints(strings: Union[Iterable[str], str],
|
93 |
+
fp_type: str = 'morgan',
|
94 |
+
radius: int = 2,
|
95 |
+
chiral: bool = True,
|
96 |
+
on_bits: bool = True) -> DataFrame:
|
97 |
+
|
98 |
+
"""
|
99 |
+
|
100 |
+
"""
|
101 |
+
|
102 |
+
if fp_type.casefold() == 'morgan':
|
103 |
+
generator_class = GetMorganGenerator
|
104 |
+
else:
|
105 |
+
raise AttributeError(f"Fingerprint type {fp_type} not supported!")
|
106 |
+
|
107 |
+
fp_generator = generator_class(radius=radius,
|
108 |
+
includeChirality=chiral)
|
109 |
+
mols = (_smiles2mol(s) for s in strings)
|
110 |
+
fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
|
111 |
+
for mol in mols)
|
112 |
+
|
113 |
+
if on_bits:
|
114 |
+
|
115 |
+
fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
|
116 |
+
for fp_string in fp_strings)
|
117 |
+
fingerprints = [';'.join(fp) for fp in fingerprints]
|
118 |
+
validity = [len(fp) > 0 for fp in fingerprints]
|
119 |
+
|
120 |
+
feature_matrix = DataFrame(fingerprints,
|
121 |
+
columns=['fp_bits'])
|
122 |
+
|
123 |
+
else:
|
124 |
+
|
125 |
+
fingerprints = [np.array([int(digit) for digit in fp_string])
|
126 |
+
if fp_string is not None
|
127 |
+
else (-np.ones((fp_generator.GetOptions().fpSize, )))
|
128 |
+
for fp_string in fp_strings]
|
129 |
+
validity = [np.all(fp >= 0) for fp in fingerprints]
|
130 |
+
|
131 |
+
feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
|
132 |
+
columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
|
133 |
+
|
134 |
+
return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
|
135 |
+
meta_feature_valid=validity)
|
136 |
+
|
137 |
+
|
138 |
+
_FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
|
139 |
+
|
140 |
+
def calculate_feature(feature_type: str,
|
141 |
+
*args, **kwargs):
|
142 |
+
|
143 |
+
"""
|
144 |
+
|
145 |
+
"""
|
146 |
+
|
147 |
+
featurizer = _FEATURE_CALCULATORS[feature_type]
|
148 |
+
|
149 |
+
return featurizer(*args, **kwargs)
|
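A short usage sketch of calculate_feature for Morgan fingerprint bits (the prefix and example SMILES are arbitrary; output column names follow the fp_bits/meta_* scheme defined in calculate_fingerprints above):

from schemist.features import calculate_feature

fp_table = calculate_feature("fp",
                             strings=["CCO", "c1ccccc1O"],
                             fp_type="morgan",
                             radius=2,
                             on_bits=True,
                             prefix="morgan")

print(fp_table.columns.tolist())
print(fp_table.head())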
schemist/generating.py
ADDED
@@ -0,0 +1,262 @@
1 |
+
"""Tools for enumerating compounds. Currently only works with peptides."""
|
2 |
+
|
3 |
+
from typing import Callable, Iterable, Optional, Tuple, Union
|
4 |
+
|
5 |
+
from functools import partial
|
6 |
+
from itertools import chain, islice, product, repeat
|
7 |
+
from math import ceil, expm1, floor
|
8 |
+
from random import choice, choices, random, seed
|
9 |
+
|
10 |
+
from carabiner import print_err
|
11 |
+
from carabiner.decorators import vectorize, return_none_on_error
|
12 |
+
from carabiner.random import sample_iter
|
13 |
+
from rdkit.Chem import Mol, rdChemReactions
|
14 |
+
import numpy as np
|
15 |
+
|
16 |
+
from .converting import (_x2mol, _mol2x,
|
17 |
+
_convert_input_to_smiles)
|
18 |
+
|
19 |
+
AA = tuple('GALVITSMCPFYWHKRDENQ')
|
20 |
+
dAA = tuple(aa.casefold() for aa in AA)
|
21 |
+
|
22 |
+
REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
|
23 |
+
'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
|
24 |
+
'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
|
25 |
+
|
26 |
+
def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
|
27 |
+
d_aa_only: bool = False,
|
28 |
+
include_d_aa: bool = False) -> Tuple[str]:
|
29 |
+
|
30 |
+
alphabet = alphabet or AA
|
31 |
+
alphabet_lower = tuple(set(aa.casefold() for aa in alphabet))
|
32 |
+
|
33 |
+
if d_aa_only:
|
34 |
+
alphabet = alphabet_lower
|
35 |
+
elif include_d_aa:
|
36 |
+
alphabet = tuple(set(chain(alphabet, alphabet_lower)))
|
37 |
+
|
38 |
+
return alphabet
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
def all_peptides_of_one_length(length: int,
|
43 |
+
alphabet: Optional[Iterable[str]] = None,
|
44 |
+
d_aa_only: bool = False,
|
45 |
+
include_d_aa: bool = False) -> Iterable[str]:
|
46 |
+
|
47 |
+
"""
|
48 |
+
|
49 |
+
"""
|
50 |
+
|
51 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
52 |
+
d_aa_only=d_aa_only,
|
53 |
+
include_d_aa=include_d_aa)
|
54 |
+
|
55 |
+
return (''.join(peptide)
|
56 |
+
for peptide in product(alphabet, repeat=length))
|
57 |
+
|
58 |
+
|
59 |
+
def all_peptides_in_length_range(max_length: int,
|
60 |
+
min_length: int = 1,
|
61 |
+
by: int = 1,
|
62 |
+
alphabet: Optional[Iterable[str]] = None,
|
63 |
+
d_aa_only: bool = False,
|
64 |
+
include_d_aa: bool = False,
|
65 |
+
*args, **kwargs) -> Iterable[str]:
|
66 |
+
|
67 |
+
"""
|
68 |
+
|
69 |
+
"""
|
70 |
+
|
71 |
+
length_range = range(*sorted([min_length, max_length + 1]), by)
|
72 |
+
peptide_maker = partial(all_peptides_of_one_length,
|
73 |
+
alphabet=alphabet,
|
74 |
+
d_aa_only=d_aa_only,
|
75 |
+
include_d_aa=include_d_aa,
|
76 |
+
*args, **kwargs)
|
77 |
+
|
78 |
+
return chain.from_iterable(peptide_maker(length=length)
|
79 |
+
for length in length_range)
|
80 |
+
|
81 |
+
|
82 |
+
def _number_of_peptides(max_length: int,
|
83 |
+
min_length: int = 1,
|
84 |
+
by: int = 1,
|
85 |
+
alphabet: Optional[Iterable[str]] = None,
|
86 |
+
d_aa_only: bool = False,
|
87 |
+
include_d_aa: bool = False):
|
88 |
+
|
89 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
90 |
+
d_aa_only=d_aa_only,
|
91 |
+
include_d_aa=include_d_aa)
|
92 |
+
n_peptides = [len(alphabet) ** length
|
93 |
+
for length in range(*sorted([min_length, max_length + 1]), by)]
|
94 |
+
|
95 |
+
return n_peptides
|
96 |
+
|
97 |
+
|
98 |
+
def _naive_sample_peptides_in_length_range(max_length: int,
|
99 |
+
min_length: int = 1,
|
100 |
+
by: int = 1,
|
101 |
+
n: Optional[Union[float, int]] = None,
|
102 |
+
alphabet: Optional[Iterable[str]] = None,
|
103 |
+
d_aa_only: bool = False,
|
104 |
+
include_d_aa: bool = False,
|
105 |
+
set_seed: Optional[int] = None):
|
106 |
+
|
107 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
108 |
+
d_aa_only=d_aa_only,
|
109 |
+
include_d_aa=include_d_aa)
|
110 |
+
n_peptides = _number_of_peptides(max_length=max_length,
|
111 |
+
min_length=min_length,
|
112 |
+
by=by,
|
113 |
+
alphabet=alphabet,
|
114 |
+
d_aa_only=d_aa_only,
|
115 |
+
include_d_aa=include_d_aa)
|
116 |
+
lengths = list(range(*sorted([min_length, max_length + 1]), by))
|
117 |
+
weight_per_length = [n / min(n_peptides) for n in n_peptides]
|
118 |
+
weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
|
119 |
+
|
120 |
+
lengths_sample = (choice(weighted_lengths) for _ in range(n))
|
121 |
+
return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
|
122 |
+
|
123 |
+
|
124 |
+
def sample_peptides_in_length_range(max_length: int,
|
125 |
+
min_length: int = 1,
|
126 |
+
by: int = 1,
|
127 |
+
n: Optional[Union[float, int]] = None,
|
128 |
+
alphabet: Optional[Iterable[str]] = None,
|
129 |
+
d_aa_only: bool = False,
|
130 |
+
include_d_aa: bool = False,
|
131 |
+
naive_sampling_cutoff: float = 5e-3,
|
132 |
+
reservoir_sampling: bool = True,
|
133 |
+
indexes: Optional[Iterable[int]] = None,
|
134 |
+
set_seed: Optional[int] = None,
|
135 |
+
*args, **kwargs) -> Iterable[str]:
|
136 |
+
|
137 |
+
"""
|
138 |
+
|
139 |
+
"""
|
140 |
+
|
141 |
+
seed(set_seed)
|
142 |
+
|
143 |
+
alphabet = _get_alphabet(alphabet=alphabet,
|
144 |
+
d_aa_only=d_aa_only,
|
145 |
+
include_d_aa=include_d_aa)
|
146 |
+
|
147 |
+
n_peptides = sum(len(alphabet) ** length
|
148 |
+
for length in range(*sorted([min_length, max_length + 1]), by))
|
149 |
+
if n is None:
|
150 |
+
n_requested = n_peptides
|
151 |
+
elif n >= 1.:
|
152 |
+
n_requested = min(floor(n), n_peptides)
|
153 |
+
elif n < 1.:
|
154 |
+
n_requested = floor(n * n_peptides)
|
155 |
+
|
156 |
+
frac_requested = n_requested / n_peptides
|
157 |
+
|
158 |
+
# approximation of birthday problem
|
159 |
+
p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
|
160 |
+
n_collisons = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
|
161 |
+
frac_collisions = n_collisons / n_requested
|
162 |
+
|
163 |
+
print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
|
164 |
+
f"length {min_length} to {max_length} ({n_peptides} combinations). "
|
165 |
+
f"Probability of collision if drawing randomly is {p_any_collision}, "
|
166 |
+
f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
|
167 |
+
|
168 |
+
if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
|
169 |
+
|
170 |
+
print_err("> Executing naive sampling. ")
|
171 |
+
|
172 |
+
peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
|
173 |
+
n=n_requested,
|
174 |
+
alphabet=alphabet,
|
175 |
+
d_aa_only=d_aa_only,
|
176 |
+
include_d_aa=include_d_aa)
|
177 |
+
|
178 |
+
else:
|
179 |
+
|
180 |
+
print_err("> Executing exhaustive sampling.")
|
181 |
+
|
182 |
+
all_peptides = all_peptides_in_length_range(max_length, min_length, by,
|
183 |
+
alphabet=alphabet,
|
184 |
+
d_aa_only=d_aa_only,
|
185 |
+
include_d_aa=include_d_aa,
|
186 |
+
*args, **kwargs)
|
187 |
+
|
188 |
+
if n is None:
|
189 |
+
|
190 |
+
peptides = all_peptides
|
191 |
+
|
192 |
+
elif n >= 1.:
|
193 |
+
|
194 |
+
if reservoir_sampling:
|
195 |
+
peptides = sample_iter(all_peptides, k=n_requested,
|
196 |
+
shuffle_output=False)
|
197 |
+
else:
|
198 |
+
peptides = (pep for pep in all_peptides
|
199 |
+
if random() <= frac_requested)
|
200 |
+
|
201 |
+
elif n < 1.:
|
202 |
+
|
203 |
+
peptides = (pep for pep in all_peptides
|
204 |
+
if random() <= n)
|
205 |
+
|
206 |
+
if indexes is not None:
|
207 |
+
|
208 |
+
indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
|
209 |
+
else None
|
210 |
+
for ix in islice(indexes, 3))
|
211 |
+
indexes = [ix if (ix is None or ix >= 0) else None
|
212 |
+
for ix in indexes]
|
213 |
+
|
214 |
+
if len(indexes) > 1:
|
215 |
+
if n is not None and n >=1. and indexes[0] > n:
|
216 |
+
raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
|
217 |
+
|
218 |
+
peptides = islice(peptides, *indexes)
|
219 |
+
|
220 |
+
return peptides
|
221 |
+
|
222 |
+
|
223 |
+
def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
|
224 |
+
|
225 |
+
rxn = rdChemReactions.ReactionFromSmarts(smarts)
|
226 |
+
reaction_function = rxn.RunReactants
|
227 |
+
|
228 |
+
@vectorize
|
229 |
+
@return_none_on_error
|
230 |
+
def reactor(s: Mol) -> Mol:
|
231 |
+
|
232 |
+
return reaction_function([s])[0][0]
|
233 |
+
|
234 |
+
return reactor
|
235 |
+
|
236 |
+
|
237 |
+
@_convert_input_to_smiles
|
238 |
+
def react(strings: Union[str, Iterable[str]],
|
239 |
+
reaction: str = 'N_to_C_cyclization',
|
240 |
+
output_representation: str = 'smiles',
|
241 |
+
**kwargs) -> Union[str, Iterable[str]]:
|
242 |
+
|
243 |
+
"""
|
244 |
+
|
245 |
+
"""
|
246 |
+
|
247 |
+
try:
|
248 |
+
_this_reaction = REACTIONS[reaction]
|
249 |
+
except KeyError:
|
250 |
+
raise KeyError(f"Reaction {reaction} is not available. Try: " +
|
251 |
+
", ".join(list(REACTIONS)))
|
252 |
+
|
253 |
+
# strings = cast(strings, to=list)
|
254 |
+
# print_err((strings))
|
255 |
+
|
256 |
+
reactor = _reactor(_this_reaction)
|
257 |
+
mols = _x2mol(strings)
|
258 |
+
mols = reactor(mols)
|
259 |
+
|
260 |
+
return _mol2x(mols,
|
261 |
+
output_representation=output_representation,
|
262 |
+
**kwargs)
|
schemist/io.py
ADDED
@@ -0,0 +1,149 @@
|
1 |
+
"""Tools to facilitate input and output."""
|
2 |
+
|
3 |
+
from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
|
4 |
+
|
5 |
+
from collections import defaultdict
|
6 |
+
from functools import partial
|
7 |
+
from io import TextIOWrapper
from string import printable
|
8 |
+
from tempfile import NamedTemporaryFile
|
9 |
+
from xml.etree import ElementTree
|
10 |
+
|
11 |
+
from carabiner import print_err
|
12 |
+
from carabiner.cast import cast
|
13 |
+
from carabiner.itertools import tenumerate
|
14 |
+
from carabiner.pd import read_table, write_stream
|
15 |
+
|
16 |
+
from pandas import DataFrame, read_excel
|
17 |
+
from rdkit.Chem import SDMolSupplier
|
18 |
+
|
19 |
+
from .converting import _mol2isomeric_canonical_smiles
|
20 |
+
|
21 |
+
def _mutate_df_stream(input_file: Union[str, TextIO],
|
22 |
+
output_file: Union[str, TextIO],
|
23 |
+
function: Callable[[DataFrame], Tuple[Any, DataFrame]],
|
24 |
+
file_format: Optional[str] = None,
|
25 |
+
chunksize: int = 1000) -> List[Any]:
|
26 |
+
|
27 |
+
carries = []
|
28 |
+
|
29 |
+
for i, chunk in tenumerate(read_table(input_file,
|
30 |
+
format=file_format,
|
31 |
+
progress=False,
|
32 |
+
chunksize=chunksize)):
|
33 |
+
|
34 |
+
result = function(chunk)
|
35 |
+
|
36 |
+
try:
|
37 |
+
carry, df = result
|
38 |
+
except ValueError:
|
39 |
+
df = result
|
40 |
+
carry = 0
|
41 |
+
|
42 |
+
write_stream(df,
|
43 |
+
output=output_file,
|
44 |
+
format=file_format,
|
45 |
+
header=i == 0,
|
46 |
+
mode='w' if i == 0 else 'a')
|
47 |
+
|
48 |
+
carries.append(carry)
|
49 |
+
|
50 |
+
return carries
|
51 |
+
|
52 |
+
|
53 |
+
def read_weird_xml(filename: Union[str, TextIO],
|
54 |
+
header: bool = True,
|
55 |
+
namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
|
56 |
+
|
57 |
+
"""
|
58 |
+
|
59 |
+
"""
|
60 |
+
|
61 |
+
with cast(filename, TextIOWrapper, mode='r') as f:
|
62 |
+
|
63 |
+
xml_string = ''.join(filter(printable.__contains__, f.read()))
|
64 |
+
|
65 |
+
try:
|
66 |
+
|
67 |
+
root = ElementTree.fromstring(xml_string)
|
68 |
+
|
69 |
+
except Exception as e:
|
70 |
+
|
71 |
+
print_err('\n!!! Could not parse the cleaned XML input as a spreadsheet')
|
72 |
+
|
73 |
+
raise e
|
74 |
+
|
75 |
+
for i, row in enumerate(root.iter(f'{namespace}Row') ):
|
76 |
+
|
77 |
+
this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
|
78 |
+
|
79 |
+
if i == 0:
|
80 |
+
|
81 |
+
if header:
|
82 |
+
|
83 |
+
heading = this_row
|
84 |
+
df = {colname: [] for colname in heading}
|
85 |
+
|
86 |
+
else:
|
87 |
+
|
88 |
+
heading = [f'X{j}' for j, _ in enumerate(this_row)]
|
89 |
+
df = {colname: [datum] for colname, datum in zip(heading, this_row)}
|
90 |
+
|
91 |
+
else:
|
92 |
+
|
93 |
+
for colname, datum in zip(heading, this_row):
|
94 |
+
|
95 |
+
df[colname].append(datum)
|
96 |
+
|
97 |
+
return DataFrame(df)
|
98 |
+
|
99 |
+
|
100 |
+
def read_sdf(filename: Union[str, TextIO]):
|
101 |
+
|
102 |
+
"""
|
103 |
+
|
104 |
+
"""
|
105 |
+
|
106 |
+
filename = cast(filename, str)
|
107 |
+
|
108 |
+
with open(filename, 'r', errors='replace') as f:
|
109 |
+
with NamedTemporaryFile("w") as o:
|
110 |
+
|
111 |
+
o.write(f.read())
|
112 |
+
o.seek(0)
|
113 |
+
|
114 |
+
df = defaultdict(list)
|
115 |
+
|
116 |
+
for i, mol in enumerate(SDMolSupplier(o.name)):
|
117 |
+
|
118 |
+
if mol is None:
|
119 |
+
|
120 |
+
continue
|
121 |
+
|
122 |
+
propdict = mol.GetPropsAsDict()
|
123 |
+
propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
|
124 |
+
|
125 |
+
for colname in propdict:
|
126 |
+
|
127 |
+
df[colname].append(propdict[colname])
|
128 |
+
|
129 |
+
for colname in df:
|
130 |
+
|
131 |
+
if colname not in propdict:
|
132 |
+
|
133 |
+
df[colname].append(None)
|
134 |
+
|
135 |
+
col_lengths = {col: len(val) for col, val in df.items()}
|
136 |
+
|
137 |
+
if len(set(col_lengths.values())) > 1:
|
138 |
+
|
139 |
+
raise ValueError(f"Column lengths not all the same:\n\t" +
|
140 |
+
'\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
|
141 |
+
|
142 |
+
return DataFrame(df)
|
143 |
+
|
144 |
+
|
145 |
+
FILE_READERS = {
|
146 |
+
'bad_xml': read_weird_xml,
|
147 |
+
'xlsx': partial(read_excel, engine='openpyxl'),
|
148 |
+
'sdf': read_sdf
|
149 |
+
}
|
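_mutate_df_stream is the chunked read-transform-write backbone used by most CLI commands above: each chunk is passed to function, which returns (carry, DataFrame), the chunk is appended to the output, and the carries are collected. A minimal sketch with a trivial transform (the CSV paths and the "csv" format string are placeholders, assuming carabiner's read_table and write_stream accept them):

from pandas import DataFrame
from schemist.io import _mutate_df_stream

def add_length_column(chunk: DataFrame):
    # Return (carry, transformed chunk); here the carry is just a per-chunk row count.
    chunk = chunk.assign(smiles_length=chunk["smiles"].str.len())
    return {"rows": chunk.shape[0]}, chunk

tallies = _mutate_df_stream(input_file="compounds.csv",
                            output_file="compounds_with_length.csv",
                            function=add_length_column,
                            file_format="csv",
                            chunksize=1000)
print(tallies)  # one carry per chunk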
schemist/rest_lookup.py
ADDED
@@ -0,0 +1,118 @@
1 |
+
"""Tools for querying PubChem."""
|
2 |
+
|
3 |
+
from typing import Dict, Iterable, List, Optional, Union
|
4 |
+
from time import sleep
|
5 |
+
from xml.etree import ElementTree
|
6 |
+
|
7 |
+
from carabiner import print_err
|
8 |
+
from carabiner.cast import cast
|
9 |
+
from carabiner.decorators import vectorize
|
10 |
+
from requests import Response, Session
|
11 |
+
|
12 |
+
_PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
|
13 |
+
_CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
|
14 |
+
|
15 |
+
_OVERLOAD_CODES = {500, 501, 503, 504}
|
16 |
+
|
17 |
+
|
18 |
+
def _url_request(inchikeys: Union[str, Iterable[str]],
|
19 |
+
url: str,
|
20 |
+
session: Optional[Session] = None,
|
21 |
+
**kwargs) -> Response:
|
22 |
+
|
23 |
+
if session is None:
|
24 |
+
session = Session()
|
25 |
+
|
26 |
+
inchikeys = cast(inchikeys, to=list)
|
27 |
+
|
28 |
+
return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
|
29 |
+
|
30 |
+
|
31 |
+
def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
|
32 |
+
session: Optional[Session] = None,
|
33 |
+
counter: int = 0,
|
34 |
+
max_tries: int = 10,
|
35 |
+
namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
|
36 |
+
|
37 |
+
r = _url_request(inchikeys, url=_PUBCHEM_URL,
|
38 |
+
session=session,
|
39 |
+
get="Title,InchiKey", format="XML")
|
40 |
+
|
41 |
+
if r.status_code == 200:
|
42 |
+
|
43 |
+
root = ElementTree.fromstring(r.text)
|
44 |
+
compounds = root.iter(f'{namespace}Properties')
|
45 |
+
|
46 |
+
result_dict = dict()
|
47 |
+
|
48 |
+
for cmpd in compounds:
|
49 |
+
|
50 |
+
cmpd_dict = dict()
|
51 |
+
|
52 |
+
for child in cmpd:
|
53 |
+
cmpd_dict[child.tag.split(namespace)[1]] = child.text
|
54 |
+
|
55 |
+
try:
|
56 |
+
inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
|
57 |
+
except KeyError:
|
58 |
+
print(cmpd_dict)
|
59 |
+
else:
|
60 |
+
result_dict[inchikey] = {'pubchem_name': name.casefold(),
|
61 |
+
'pubchem_id': pcid}
|
62 |
+
|
63 |
+
print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
|
64 |
+
|
65 |
+
result_list = [result_dict[inchikey]
|
66 |
+
if inchikey in result_dict
|
67 |
+
else {'pubchem_name': None, 'pubchem_id': None}
|
68 |
+
for inchikey in inchikeys]
|
69 |
+
|
70 |
+
return result_list
|
71 |
+
|
72 |
+
elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
|
73 |
+
|
74 |
+
sleep(1.)
|
75 |
+
|
76 |
+
return _inchikey2pubchem_name_id(inchikeys,
|
77 |
+
session=session,
|
78 |
+
counter=counter + 1,
|
79 |
+
max_tries=max_tries,
|
80 |
+
namespace=namespace)
|
81 |
+
|
82 |
+
else:
|
83 |
+
|
84 |
+
print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
|
85 |
+
|
86 |
+
return [{'pubchem_name': None, 'pubchem_id': None}
|
87 |
+
for _ in range(len(inchikeys))]
|
88 |
+
|
89 |
+
|
90 |
+
@vectorize
|
91 |
+
def _inchikey2cactus_name(inchikeys: str,
|
92 |
+
session: Optional[Session] = None,
|
93 |
+
counter: int = 0,
|
94 |
+
max_tries: int = 10):
|
95 |
+
|
96 |
+
r = _url_request(inchikeys, url=_CACTUS_URL,
|
97 |
+
session=session,
|
98 |
+
get="names")
|
99 |
+
|
100 |
+
if r.status_code == 200:
|
101 |
+
|
102 |
+
return r.text.split('\n')[0].casefold()
|
103 |
+
|
104 |
+
elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
|
105 |
+
|
106 |
+
sleep(1.)
|
107 |
+
|
108 |
+
return _inchikey2cactus_name(inchikeys,
|
109 |
+
session=session,
|
110 |
+
counter=counter + 1,
|
111 |
+
max_tries=max_tries)
|
112 |
+
|
113 |
+
else:
|
114 |
+
|
115 |
+
print_err(f'Cactus: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
|
116 |
+
|
117 |
+
return None
|
118 |
+
|
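A hedged usage sketch for the PubChem helper above, not part of the commit. It assumes network access and a shared requests Session; the InChIKey is aspirin's, and the exact name and CID returned depend on the live PubChem service.

# Sketch only: look up one InChIKey against PubChem using the helper above.
from requests import Session
from schemist.rest_lookup import _inchikey2pubchem_name_id

with Session() as session:
    records = _inchikey2pubchem_name_id(["BSYNRYMUTXBXSQ-UHFFFAOYSA-N"], session=session)

print(records)  # e.g. [{'pubchem_name': 'aspirin', 'pubchem_id': '2244'}], subject to the live service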
schemist/splitting.py
ADDED
@@ -0,0 +1,204 @@
"""Tools for splitting tabular datasets, optionally based on chemical features."""

from typing import Dict, Iterable, List, Optional, Tuple, Union
from collections import defaultdict
from math import ceil
from random import random, seed

try:
    from itertools import batched
except ImportError:
    from carabiner.itertools import batched

from tqdm.auto import tqdm

from .converting import convert_string_representation, _convert_input_to_smiles
from .typing import DataSplits

# def _train_test_splits

def _train_test_val_sizes(total: int,
                          train: float = 1.,
                          test: float = 0.) -> Tuple[int]:

    n_train = int(ceil(train * total))
    n_test = int(ceil(test * total))
    n_val = total - n_train - n_test

    return n_train, n_test, n_val


def _random_chunk(strings: str,
                  train: float = 1.,
                  test: float = 0.,
                  carry: Optional[Dict[str, List[int]]] = None,
                  start_from: int = 0) -> Dict[str, List[int]]:

    carry = carry or defaultdict(list)

    train_test: float = train + test

    for i, _ in enumerate(strings):

        random_number: float = random()

        if random_number < train:
            key = 'train'
        elif random_number < train_test:
            key = 'test'
        else:
            key = 'validation'

        carry[key].append(start_from + i)

    return carry


def split_random(strings: Union[str, Iterable[str]],
                 train: float = 1.,
                 test: float = 0.,
                 chunksize: Optional[int] = None,
                 set_seed: Optional[int] = None,
                 *args, **kwargs) -> DataSplits:

    """Randomly assign train, test, and validation indices for a set of strings.

    """

    if set_seed is not None:
        seed(set_seed)

    if chunksize is None:

        idx = _random_chunk(strings=strings,
                            train=train,
                            test=test)

    else:

        idx = defaultdict(list)

        for i, chunk in enumerate(batched(strings, chunksize)):

            idx = _random_chunk(strings=chunk,
                                train=train,
                                test=test,
                                carry=idx,
                                start_from=i * chunksize)

    seed(None)

    return DataSplits(**idx)


@_convert_input_to_smiles
def _scaffold_chunk(strings: str,
                    carry: Optional[Dict[str, List[int]]] = None,
                    start_from: int = 0) -> Dict[str, List[int]]:

    carry = carry or defaultdict(list)

    these_scaffolds = convert_string_representation(strings=strings,
                                                    output_representation='scaffold')

    for j, scaff in enumerate(these_scaffolds):
        carry[scaff].append(start_from + j)

    return carry


def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
                         train: float = 1.,
                         test: float = 0.,
                         progress: bool = False) -> DataSplits:

    scaffold_sets = {key: sorted(value)
                     for key, value in scaffold_sets.items()}
    scaffold_sets = sorted(scaffold_sets.items(),
                           key=lambda x: (len(x[1]), x[1][0]),
                           reverse=True)
    nrows = sum(len(idx) for _, idx in scaffold_sets)
    n_train, n_test, n_val = _train_test_val_sizes(nrows,
                                                   train,
                                                   test)
    idx = defaultdict(list)

    iterator = tqdm(scaffold_sets) if progress else scaffold_sets
    for _, scaffold_idx in iterator:

        if (len(idx['train']) + len(scaffold_idx)) > n_train:
            if (len(idx['test']) + len(scaffold_idx)) > n_test:
                key = 'validation'
            else:
                key = 'test'
        else:
            key = 'train'

        idx[key] += scaffold_idx

    return DataSplits(**idx)


def split_scaffold(strings: Union[str, Iterable[str]],
                   train: float = 1.,
                   test: float = 0.,
                   chunksize: Optional[int] = None,
                   progress: bool = True) -> DataSplits:

    """Assign train, test, and validation indices so that structures sharing a scaffold stay in the same split.

    """

    if chunksize is None:

        scaffold_sets = _scaffold_chunk(strings)

    else:

        scaffold_sets = defaultdict(list)

        for i, chunk in enumerate(batched(strings, chunksize)):

            scaffold_sets = _scaffold_chunk(chunk,
                                            carry=scaffold_sets,
                                            start_from=i * chunksize)

    return _scaffold_aggregator(scaffold_sets,
                                train=train, test=test,
                                progress=progress)


_SPLITTERS = {#'simpd': split_simpd,
              'scaffold': split_scaffold,
              'random': split_random}

# _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
#                      'random': 'independent'}

_GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}

assert all(_type in _SPLITTERS
           for _type in _GROUPED_SPLITTERS)  ## Should never fail!

def split(split_type: str,
          *args, **kwargs) -> DataSplits:

    """Dispatch to the named splitting function and return its DataSplits.

    """

    splitter = _SPLITTERS[split_type]

    return splitter(*args, **kwargs)
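A small usage sketch for the splitters above, not part of the commit. The SMILES strings are toy examples, and split_scaffold additionally assumes that scaffold conversion via schemist.converting (and hence RDKit) is available.

# Sketch only: 80/10/10 random split vs. scaffold-grouped split over a toy SMILES list.
from schemist.splitting import split_random, split_scaffold

smiles = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O", "CCN(CC)CC", "c1ccccc1O"]

random_splits = split_random(smiles, train=0.8, test=0.1, set_seed=1)
scaffold_splits = split_scaffold(smiles, train=0.8, test=0.1, progress=False)

print(random_splits.train, random_splits.test, random_splits.validation)
print(scaffold_splits.train)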
schemist/tables.py
ADDED
@@ -0,0 +1,220 @@
"""Tools for processing tabular data."""

from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
from functools import partial

try:
    from itertools import batched
except ImportError:
    from carabiner.itertools import batched

from carabiner.cast import cast, clist
from carabiner import print_err
from pandas import DataFrame, concat

from .cleaning import clean_smiles, clean_selfies
from .converting import convert_string_representation
from .features import calculate_feature
from .generating import sample_peptides_in_length_range, react
from .splitting import split
from .typing import DataSplits

def _get_error_tally(df: DataFrame,
                     cols: Union[str, List[str]]) -> Dict[str, int]:

    cols = cast(cols, to=list)

    try:
        tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
    except TypeError:
        tally = {col: df[col].isna().sum() for col in cols}

    return tally


def converter(df: DataFrame,
              column: str = 'smiles',
              input_representation: str = 'smiles',
              output_representation: Union[str, List[str]] = 'smiles',
              prefix: Optional[str] = None,
              options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:

    """Convert a column of structure representations, adding one new column per output representation.

    """

    prefix = prefix or ''

    converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
                                                output_representation=rep_out,
                                                input_representation=input_representation,
                                                **options)
                  for rep_out in cast(output_representation, to=list)}

    column_values = df[column]

    converted = {col: cast(f(column_values), to=list)
                 for col, f in converters.items()}

    df = df.assign(**converted)

    return _get_error_tally(df, list(converters)), df


def cleaner(df: DataFrame,
            column: str = 'smiles',
            input_representation: str = 'smiles',
            prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:

    """Clean a column of SMILES or SELFIES strings, adding the result as a new column.

    """

    if input_representation.casefold() == 'smiles':
        cleaner = clean_smiles
    elif input_representation.casefold() == 'selfies':
        cleaner = clean_selfies
    else:
        raise ValueError(f"Representation {input_representation} is not supported for cleaning.")

    prefix = prefix or ''
    new_column = f"{prefix}{column}"

    df = df.assign(**{new_column: lambda x: cleaner(x[column])})

    return _get_error_tally(df, new_column), df


def featurizer(df: DataFrame,
               feature_type: str,
               column: str = 'smiles',
               ids: Optional[Union[str, List[str]]] = None,
               input_representation: str = 'smiles',
               prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:

    """Calculate chemical features for a column of structures and append them to the table.

    """

    if ids is None:
        ids = df.columns.tolist()
    else:
        ids = cast(ids, to=list)

    feature_df = calculate_feature(feature_type=feature_type,
                                   strings=df[column],
                                   prefix=prefix,
                                   input_representation=input_representation)

    if len(ids) > 0:
        df = concat([df[ids], feature_df], axis=1)

    return _get_error_tally(feature_df, 'meta_feature_valid'), df


def assign_groups(df: DataFrame,
                  grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
                  group_name: str = 'group',
                  column: str = 'smiles',
                  input_representation: str = 'smiles',
                  *args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:

    group_idx = grouper(strings=df[column],
                        input_representation=input_representation,
                        *args, **kwargs)

    inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
    groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]

    return group_idx, df.assign(**{group_name: groups})


def _assign_splits(df: DataFrame,
                   split_idx: DataSplits,
                   use_df_index: bool = False) -> DataFrame:

    row_index = df.index if use_df_index else tuple(range(df.shape[0]))

    df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
                      for key in split_idx._fields})
    split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}

    return split_counts, df


def splitter(df: DataFrame,
             split_type: str = 'random',
             column: str = 'smiles',
             input_representation: str = 'smiles',
             *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:

    """Assign each row of a table to a train, test, or validation split.

    """

    split_idx = split(split_type=split_type,
                      strings=df[column],
                      input_representation=input_representation,
                      *args, **kwargs)

    return _assign_splits(df, split_idx=split_idx)


def reactor(df: DataFrame,
            column: str = 'smiles',
            reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
            prefix: Optional[str] = None,
            *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:

    """Apply one or more reactions to a column of structures, adding one product column per reaction.

    """

    prefix = prefix or ''

    reactors = {col: partial(react, reaction=col)
                for col in cast(reaction, to=list)}

    column_values = df[column]

    new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
                   for col, _reactor in reactors.items()}

    df = df.assign(**new_columns)

    return _get_error_tally(df, reaction), df


def _peptide_table(max_length: int,
                   min_length: Optional[int] = None,
                   by: int = 1,
                   n: Optional[Union[float, int]] = None,
                   prefix: str = '',
                   suffix: str = '',
                   generator: bool = False,
                   batch_size: int = 1000,
                   *args, **kwargs) -> Union[DataFrame, Generator]:

    min_length = min_length or max_length

    peptides = sample_peptides_in_length_range(max_length=max_length,
                                               min_length=min_length,
                                               by=by,
                                               n=n,
                                               *args, **kwargs)

    if generator:

        for peps in batched(peptides, batch_size):

            peps = [f"{prefix}{pep}{suffix}"
                    for pep in peps]

            yield DataFrame(dict(peptide_sequence=peps))

    else:

        peps = [f"{prefix}{pep}{suffix}"
                for pep in peptides]

        return DataFrame(dict(peptide_sequence=peps))
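A hedged sketch of how the table helpers above compose, not part of the commit. It assumes that canonical SMILES output is supported by convert_string_representation, and passes an explicit empty options dict because converter() unpacks it.

# Sketch only: add a converted column and random split flags to a small table.
from pandas import DataFrame
from schemist.tables import converter, splitter

df = DataFrame({"smiles": ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]})

errors, df = converter(df, column="smiles", output_representation="smiles",
                       prefix="canonical_", options={})
split_counts, df = splitter(df, split_type="random", column="smiles",
                            train=0.7, test=0.2)

print(errors, split_counts)
print(df.columns.tolist())  # expected to include canonical_smiles, is_train, is_test, is_validation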
schemist/typing.py
ADDED
@@ -0,0 +1,7 @@
"""Types used in schemist."""

from collections import namedtuple

DataSplits = namedtuple('DataSplits',
                        ['train', 'test', 'validation'],
                        defaults=[tuple(), tuple(), tuple()])
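DataSplits is a plain namedtuple with empty-tuple defaults; the short check below, not part of the commit, illustrates the behaviour that splitting.py and tables.py rely on.

# Sketch only: fields default to empty tuples and are addressable by name.
from schemist.typing import DataSplits

splits = DataSplits(train=(0, 1, 2), test=(3,))
print(splits.train)        # (0, 1, 2)
print(splits.validation)   # () by default
print(splits._fields)      # ('train', 'test', 'validation')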
schemist/utils.py
ADDED
@@ -0,0 +1 @@
"""Miscellaneous utilities for schemist."""
test/data/AmpC_screen_table_10k.csv.gz
ADDED
Binary file (171 kB)
test/tests.py
ADDED
@@ -0,0 +1,6 @@
import doctest
import schemist as sch

if __name__ == '__main__':

    doctest.testmod(sch)
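As written, doctest.testmod(sch) only exercises docstrings reachable from the top-level schemist package; the sketch below, not part of the commit, shows one way to also run each submodule's doctests, assuming the listed submodules import cleanly.

# Sketch only: run doctests for each schemist submodule as well as the package itself.
import doctest
import schemist as sch
from schemist import cleaning, converting, io, splitting, tables

if __name__ == '__main__':
    for module in (sch, cleaning, converting, io, splitting, tables):
        doctest.testmod(module, verbose=False)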