Eachan Johnson committed on
Commit 2dceef8 · 0 Parent(s)

Initial commit

.github/workflows/python-package.yml ADDED
@@ -0,0 +1,48 @@
1
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Python package
5
+
6
+ on: [push]
7
+
8
+ jobs:
9
+ build:
10
+
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v3
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v3
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ - name: Install dependencies
24
+ run: |
25
+ python -m pip install --upgrade pip
26
+ python -m pip install flake8 pytest pytest-cov
27
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28
+ pip install -e .
29
+ - name: Lint with flake8
30
+ run: |
31
+ # stop the build if there are Python syntax errors or undefined names
32
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
33
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
34
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
35
+ - name: Test with pytest
36
+ run: |
37
+ pytest schemist --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov=schemist --cov-report=xml --cov-report=html
38
+ - name: Test with scripts
39
+ run: |
40
+ bash test/scripts/test-plate-tab.sh
41
+ bash test/scripts/test-row-xlsx.sh
42
+ - name: Upload pytest test results
43
+ uses: actions/upload-artifact@v3
44
+ with:
45
+ name: pytest-results-${{ matrix.python-version }}
46
+ path: junit/test-results-${{ matrix.python-version }}.xml
47
+ # Use always() to always run this step to publish test results when there are test failures
48
+ if: ${{ always() }}
.github/workflows/python-publish.yml ADDED
@@ -0,0 +1,42 @@
1
+ # This workflow will upload a Python Package using Twine when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ deploy:
20
+
21
+ runs-on: ubuntu-latest
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.11"]
25
+
26
+ steps:
27
+ - uses: actions/checkout@v3
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v3
30
+ with:
31
+ python-version: ${{ matrix.python-version }}
32
+ - name: Install dependencies
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install build
36
+ - name: Build package
37
+ run: python -m build --sdist --wheel --outdir dist
38
+ - name: Publish package
39
+ uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
40
+ with:
41
+ user: __token__
42
+ password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ *.pyc
2
+ *.so
3
+ *.egg-info
4
+ *.whl
5
+ .DS_Store
6
+ .mypy_cache/
7
+ .pytype/
8
+ .idea
9
+ .vscode
10
+ .envrc
11
+ __pycache__
12
+ .pytest_cache
13
+
14
+ # Sphinx
15
+ /docs/build/
16
+ /docs/_autosummary/
17
+ /docs/make.bat
18
+ /docs/Makefile
19
+
20
+ /test/outputs/
21
+ /test/data/collate/
22
+
23
+ # virtualenv/venv directories
24
+ /venv/
25
+ /bin/
26
+ /include/
27
+ /lib/
28
+ /share/
.readthedocs.yml ADDED
@@ -0,0 +1,25 @@
1
+ # .readthedocs.yml
2
+ # Read the Docs configuration file
3
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
+
5
+ # Required
6
+ version: 2
7
+
8
+ build:
9
+ os: "ubuntu-20.04"
10
+ tools:
11
+ python: "3.10"
12
+
13
+ # Build documentation in the docs/ directory with Sphinx
14
+ sphinx:
15
+ configuration: docs/source/conf.py
16
+ fail_on_warning: false
17
+
18
+ # Optionally build your docs in additional formats such as PDF and ePub
19
+ formats:
20
+ - htmlzip
21
+
22
+ # Optionally set the version of Python and requirements required to build your docs
23
+ python:
24
+ install:
25
+ - requirements: docs/requirements.txt
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [year] [fullname]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,73 @@
1
+ # ⬢⬢⬢ schemist
2
+
3
+ ![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/schemist/python-publish.yml)
4
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist)
5
+ ![PyPI](https://img.shields.io/pypi/v/schemist)
6
+
7
+ Cleaning, collating, and augmenting chemical datasets.
8
+
9
+ - [Installation](#installation)
10
+ - [Command-line usage](#command-line-usage)
11
+ - [Example](#example)
12
+ - [Other commands](#other-commands)
13
+ - [Python API](#python-api)
14
+ - [Documentation](#documentation)
15
+
16
+ ## Installation
17
+
18
+ ### The easy way
19
+
20
+ Install the pre-compiled version from PyPI:
21
+
22
+ ```bash
23
+ pip install schemist
24
+ ```
25
+
26
+ ### From source
27
+
28
+ Clone the repository and `cd` into it, then run:
29
+
30
+ ```bash
31
+ pip install -e .
32
+ ```
33
+
34
+ ## Command-line usage
35
+
36
+ **schemist** provides command-line utilities for cleaning, collating, and augmenting chemical datasets. The tools
+ complete specific tasks and can easily be composed into analysis pipelines: the TSV table output goes to
+ `stdout` by default, so one tool can be piped into the next.
39
+
40
+ To get a list of commands (tools), do
41
+
42
+ ```bash
43
+ schemist --help
44
+ ```
45
+
46
+ And to get help for a specific command, do
47
+
48
+ ```bash
49
+ schemist <command> --help
50
+ ```
51
+
52
+ For the Python API, [see below](#python-api).
53
+
54
+ ## Example
55
+
56
+
57
+ ## Other commands
58
+
59
+
60
+
61
+ ## Python API
62
+
63
+ **schemist** can be imported into Python to help make custom analyses.
64
+
65
+ ```python
66
+ >>> import schemist as sch
67
+ ```
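As a minimal sketch of a custom analysis (an editorial illustration, not part of the commit), assuming the installed package exposes the same `converting` module that appears later in this commit and that RDKit is available:

```python
# Sketch only: names follow schemist/converting.py from this commit; the exact
# return container (list vs. generator) may differ.
from schemist.converting import convert_string_representation

inchikeys = convert_string_representation(
    ["CCO", "c1ccccc1"],                 # input SMILES strings
    input_representation="smiles",
    output_representation="inchikey",    # any key of _TO_FUNCTIONS, e.g. "selfies", "scaffold"
)
print(list(inchikeys))
```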
68
+
69
+
70
+
71
+ ## Documentation
72
+
73
+ Full API documentation is at [ReadTheDocs](https://schemist.readthedocs.org).
build/lib/schemist/__init__.py ADDED
File without changes
build/lib/schemist/cleaning.py ADDED
@@ -0,0 +1,27 @@
1
+ """Chemical structure cleaning routines."""
2
+
3
+ from carabiner.decorators import vectorize
4
+
5
+ from datamol import sanitize_smiles
6
+ import selfies as sf
7
+
8
+ @vectorize
9
+ def clean_smiles(smiles: str,
10
+ *args, **kwargs) -> str:
11
+
12
+ """Sanitize a SMILES string or list of SMILES strings.
13
+
14
+ """
15
+
16
+ return sanitize_smiles(smiles, *args, **kwargs)
17
+
18
+
19
+ @vectorize
20
+ def clean_selfies(selfies: str,
21
+ *args, **kwargs) -> str:
22
+
23
+ """Sanitize a SELFIES string or list of SELFIES strings.
24
+
25
+ """
26
+
27
+ return sf.encoder(sanitize_smiles(sf.decoder(selfies), *args, **kwargs))
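A short usage sketch (editorial illustration, not part of the commit), assuming `carabiner`'s `@vectorize` lets these cleaners accept either a single string or a list of strings:

```python
# Sketch of clean_smiles usage; assumes datamol and carabiner are installed.
from schemist.cleaning import clean_smiles

print(clean_smiles("C1=CC=CC=C1"))           # a single SMILES string
print(clean_smiles(["CCO", "c1ccccc1O"]))    # or a list, via the @vectorize decorator
```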
build/lib/schemist/cli.py ADDED
@@ -0,0 +1,536 @@
1
+ """Command-line interface for schemist."""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from argparse import FileType, Namespace
6
+ from collections import Counter, defaultdict
7
+ from functools import partial
8
+ import os
9
+ import sys
10
+ from tempfile import NamedTemporaryFile, TemporaryDirectory
11
+
12
+ from carabiner import pprint_dict, upper_and_lower, print_err
13
+ from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
14
+ from carabiner.itertools import tenumerate
15
+ from carabiner.pd import get_formats, write_stream
16
+
17
+ from .collating import collate_inventory, deduplicate_file
18
+ from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
19
+ from .generating import AA, REACTIONS
20
+ from .io import _mutate_df_stream
21
+ from .tables import (converter, cleaner, featurizer, assign_groups,
22
+ _assign_splits, splitter, _peptide_table, reactor)
23
+ from .splitting import _SPLITTERS, _GROUPED_SPLITTERS
24
+
25
+ __version__ = '0.0.1'
26
+
27
+ def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:
28
+
29
+ options = {}
30
+
31
+ try:
32
+ for opt in x:
33
+
34
+ try:
35
+ key, value = opt.split('=')
36
+ except ValueError:
37
+ raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")
38
+
39
+ try:
40
+ value = int(value)
41
+ except ValueError:
42
+ try:
43
+ value = float(value)
44
+ except ValueError:
45
+ pass
46
+
47
+ options[key] = value
48
+
49
+ except TypeError:
50
+
51
+ pass
52
+
53
+ return options
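For illustration, the parser above turns `keyword=value` tokens into a dict, coercing integers and floats where possible; a sketch of its expected behaviour (assumes `_option_parser` as defined above is in scope):

```python
# Expected behaviour of _option_parser as written above (illustration only).
print(_option_parser(["n=8", "prefix=ID-", "scale=0.5"]))
# {'n': 8, 'prefix': 'ID-', 'scale': 0.5}

print(_option_parser(None))   # no options given: the TypeError branch returns {}
# {}
```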
54
+
55
+
56
+ def _sum_tally(tallies: Counter,
57
+ message: str = "Error counts",
58
+ use_length: bool = False):
59
+
60
+ total_tally = Counter()
61
+
62
+ for tally in tallies:
63
+
64
+ if use_length:
65
+ total_tally.update({key: len(value) for key, value in tally.items()})
66
+ else:
67
+ total_tally.update(tally)
68
+
69
+ if len(tallies) == 0:
70
+ raise ValueError(f"Nothing generated!")
71
+
72
+ pprint_dict(total_tally, message=message)
73
+
74
+ return total_tally
75
+
76
+
77
+ @clicommand(message="Cleaning file with the following parameters")
78
+ def _clean(args: Namespace) -> None:
79
+
80
+ error_tallies = _mutate_df_stream(input_file=args.input,
81
+ output_file=args.output,
82
+ function=partial(cleaner,
83
+ column=args.column,
84
+ input_representation=args.representation,
85
+ prefix=args.prefix),
86
+ file_format=args.format)
87
+
88
+ _sum_tally(error_tallies)
89
+
90
+ return None
91
+
92
+
93
+ @clicommand(message="Converting between string representations with the following parameters")
94
+ def _convert(args: Namespace) -> None:
95
+
96
+ options = _option_parser(args.options)
97
+
98
+ error_tallies = _mutate_df_stream(input_file=args.input,
99
+ output_file=args.output,
100
+ function=partial(converter,
101
+ column=args.column,
102
+ input_representation=args.representation,
103
+ output_representation=args.to,
104
+ prefix=args.prefix,
105
+ options=options),
106
+ file_format=args.format)
107
+
108
+ _sum_tally(error_tallies)
109
+
110
+ return None
111
+
112
+
113
+ @clicommand(message="Adding features to files with the following parameters")
114
+ def _featurize(args: Namespace) -> None:
115
+
116
+ error_tallies = _mutate_df_stream(input_file=args.input,
117
+ output_file=args.output,
118
+ function=partial(featurizer,
119
+ feature_type=args.feature,
120
+ column=args.column,
121
+ ids=args.id,
122
+ input_representation=args.representation,
123
+ prefix=args.prefix),
124
+ file_format=args.format)
125
+
126
+ _sum_tally(error_tallies)
127
+
128
+ return None
129
+
130
+
131
+ @clicommand(message="Splitting table with the following parameters")
132
+ def _split(args: Namespace) -> None:
133
+
134
+ split_type = args.type.casefold()
135
+
136
+ if split_type in _GROUPED_SPLITTERS:
137
+
138
+ chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]
139
+
140
+ with TemporaryDirectory() as dir:
141
+
142
+ with NamedTemporaryFile("w", dir=dir, delete=False) as f:
143
+
144
+ group_idxs = _mutate_df_stream(input_file=args.input,
145
+ output_file=f,
146
+ function=partial(assign_groups,
147
+ grouper=chunk_processor,
148
+ group_name=split_type,
149
+ column=args.column,
150
+ input_representation=args.representation),
151
+ file_format=args.format)
152
+ f.close()
153
+ new_group_idx = defaultdict(list)
154
+
155
+ totals = 0
156
+ for group_idx in group_idxs:
157
+ these_totals = 0
158
+ for key, value in group_idx.items():
159
+ these_totals += len(value)
160
+ new_group_idx[key] += [idx + totals for idx in value]
161
+ totals += these_totals
162
+
163
+ group_idx = aggregator(new_group_idx,
164
+ train=args.train,
165
+ test=args.test)
166
+
167
+ split_tallies = _mutate_df_stream(input_file=f.name,
168
+ output_file=args.output,
169
+ function=partial(_assign_splits,
170
+ split_idx=group_idx,
171
+ use_df_index=True),
172
+ file_format=args.format)
173
+ if os.path.exists(f.name):
174
+ os.remove(f.name)
175
+
176
+ else:
177
+
178
+ split_tallies = _mutate_df_stream(input_file=args.input,
179
+ output_file=args.output,
180
+ function=partial(splitter,
181
+ split_type=args.type,
182
+ column=args.column,
183
+ input_representation=args.representation,
184
+ train=args.train,
185
+ test=args.test,
186
+ set_seed=args.seed),
187
+ file_format=args.format)
188
+
189
+ _sum_tally(split_tallies,
190
+ message="Split counts")
191
+
192
+ return None
193
+
194
+
195
+ @clicommand(message="Collating files with the following parameters")
196
+ def _collate(args: Namespace) -> None:
197
+
198
+ root_dir = args.data_dir or '.'
199
+
200
+ error_tallies = _mutate_df_stream(input_file=args.input,
201
+ output_file=args.output,
202
+ function=partial(collate_inventory,
203
+ root_dir=root_dir,
204
+ drop_unmapped=not args.keep_extra_columns,
205
+ catalog_smiles_column=args.column,
206
+ id_column_name=args.id_column,
207
+ id_n_digits=args.digits,
208
+ id_prefix=args.prefix),
209
+ file_format=args.format)
210
+
211
+ _sum_tally(error_tallies,
212
+ message="Collated chemicals:")
213
+
214
+ return None
215
+
216
+
217
+ @clicommand(message="Deduplicating chemical structures with the following parameters")
218
+ def _dedup(args: Namespace) -> None:
219
+
220
+ report, deduped_df = deduplicate_file(args.input,
221
+ format=args.format,
222
+ column=args.column,
223
+ input_representation=args.representation,
224
+ index_columns=args.indexes)
225
+
226
+ if args.prefix is not None and 'inchikey' in deduped_df:
227
+ deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})
228
+
229
+ write_stream(deduped_df,
230
+ output=args.output,
231
+ format=args.format)
232
+
233
+ pprint_dict(report, message="Finished deduplicating:")
234
+
235
+ return None
236
+
237
+
238
+ @clicommand(message="Enumerating peptides with the following parameters")
239
+ def _enum(args: Namespace) -> None:
240
+
241
+ tables = _peptide_table(max_length=args.max_length,
242
+ min_length=args.min_length,
243
+ n=args.number,
244
+ indexes=args.slice,
245
+ set_seed=args.seed,
246
+ prefix=args.prefix,
247
+ suffix=args.suffix,
248
+ d_aa_only=args.d_aa_only,
249
+ include_d_aa=args.include_d_aa,
250
+ generator=True)
251
+
252
+ dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
253
+ dAA_use = dAA_use or args.include_d_aa or args.d_aa_only
254
+
255
+ tallies, error_tallies = [], []
256
+ options = _option_parser(args.options)
257
+ _converter = partial(converter,
258
+ column='peptide_sequence',
259
+ input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
260
+ output_representation=args.to,
261
+ options=options)
262
+
263
+ for i, table in tenumerate(tables):
264
+
265
+ _err_tally, df = _converter(table)
266
+
267
+ tallies.append({"Number of peptides": df.shape[0]})
268
+ error_tallies.append(_err_tally)
269
+
270
+ write_stream(df,
271
+ output=args.output,
272
+ format=args.format,
273
+ mode='w' if i == 0 else 'a',
274
+ header=i == 0)
275
+
276
+ _sum_tally(tallies,
277
+ message="Enumerated peptides")
278
+ _sum_tally(error_tallies,
279
+ message="Conversion errors")
280
+
281
+ return None
282
+
283
+
284
+ @clicommand(message="Reacting peptides with the following parameters")
285
+ def _react(args: Namespace) -> None:
286
+
287
+ error_tallies = _mutate_df_stream(input_file=args.input,
288
+ output_file=args.output,
289
+ function=partial(reactor,
290
+ column=args.column,
291
+ input_representation=args.representation,
292
+ reaction=args.reaction,
293
+ product_name=args.name),
294
+ file_format=args.format)
295
+
296
+ _sum_tally(error_tallies)
297
+
298
+ return None
299
+
300
+
301
+ def main() -> None:
302
+
303
+ inputs = CLIOption('input',
304
+ default=sys.stdin,
305
+ type=FileType('r'),
306
+ nargs='?',
307
+ help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
308
+ representation = CLIOption('--representation', '-r',
309
+ type=str,
310
+ default='SMILES',
311
+ choices=upper_and_lower(_FROM_FUNCTIONS),
312
+ help='Chemical representation to use for input. ')
313
+ column = CLIOption('--column', '-c',
314
+ default='smiles',
315
+ type=str,
316
+ help='Column to use as input string representation. ')
317
+ prefix = CLIOption('--prefix', '-p',
318
+ default=None,
319
+ type=str,
320
+ help='Prefix to add to new column name. Default: no prefix')
321
+ to = CLIOption('--to', '-2',
322
+ type=str,
323
+ default='SMILES',
324
+ nargs='*',
325
+ choices=upper_and_lower(_TO_FUNCTIONS),
326
+ help='Format to convert to.')
327
+ options = CLIOption('--options', '-x',
328
+ type=str,
329
+ default=None,
330
+ nargs='*',
331
+ help='Options to pass to converter, in the format '
332
+ '"keyword1=value1 keyword2=value2"')
333
+ output = CLIOption('--output', '-o',
334
+ type=FileType('w'),
335
+ default=sys.stdout,
336
+ help='Output file. Default: STDOUT')
337
+ formatting = CLIOption('--format', '-f',
338
+ type=str,
339
+ default=None,
340
+ choices=upper_and_lower(get_formats()),
341
+ help='Override file extensions for input and output. '
342
+ 'Default: infer from file extension.')
343
+
344
+ ## featurize
345
+ id_feat = CLIOption('--id', '-i',
346
+ type=str,
347
+ default=None,
348
+ nargs='*',
349
+ help='Columns to retain in output table. Default: use all')
350
+ feature = CLIOption('--feature', '-t',
351
+ type=str,
352
+ default='2d',
353
+ choices=['2d', 'fp'], ## TODO: implement 3d
354
+ help='Which feature type to generate.')
355
+
356
+ ## split
357
+ type_ = CLIOption('--type', '-t',
358
+ type=str,
359
+ default='random',
360
+ choices=upper_and_lower(_SPLITTERS),
361
+ help='Which split type to use.')
362
+ train = CLIOption('--train', '-a',
363
+ type=float,
364
+ default=1.,
365
+ help='Proportion of data to use for training. ')
366
+ test = CLIOption('--test', '-b',
367
+ type=float,
368
+ default=0.,
369
+ help='Proportion of data to use for testing. ')
370
+
371
+ ## collate
372
+ data_dir = CLIOption('--data-dir', '-d',
373
+ type=str,
374
+ default=None,
375
+ help='Directory containing data files. '
376
+ 'Default: current directory')
377
+ id_column = CLIOption('--id-column', '-s',
378
+ default=None,
379
+ type=str,
380
+ help='If provided, add a structure ID column with this name. '
381
+ 'Default: don\'t add structure IDs')
382
+ prefix_collate = CLIOption('--prefix', '-p',
383
+ default='ID-',
384
+ type=str,
385
+ help='Prefix to add to structure IDs. '
386
+ 'Default: no prefix')
387
+ digits = CLIOption('--digits', '-n',
388
+ default=8,
389
+ type=int,
390
+ help='Number of digits in structure IDs. ')
391
+ keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
392
+ action='store_true',
393
+ help='Whether to keep columns not mentioned in the catalog. '
394
+ 'Default: drop extra columns.')
395
+ keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
396
+ action='store_true',
397
+ help='Whether to keep rows with invalid SMILES. '
398
+ 'Default: drop invalid rows.')
399
+
400
+ ## dedup
401
+ indexes = CLIOption('--indexes', '-x',
402
+ type=str,
403
+ default=None,
404
+ nargs='*',
405
+ help='Columns to retain and collapse (if multiple values per unique structure). '
406
+ 'Default: retain no other columns than structure and InchiKey.')
407
+ drop_inchikey = CLIOption('--drop-inchikey', '-d',
408
+ action='store_true',
409
+ help='Whether to drop the calculated InchiKey column. '
410
+ 'Default: keep InchiKey.')
411
+
412
+ ### enum
413
+ max_length = CLIOption('--max-length', '-l',
414
+ type=int,
415
+ help='Maximum length of enumerated peptide. '
416
+ 'Required.')
417
+ min_length = CLIOption('--min-length', '-m',
418
+ type=int,
419
+ default=None,
420
+ help='Minimum length of enumerated peptide. '
421
+ 'Default: same as maximum, i.e. all peptides same length.')
422
+ number_to_gen = CLIOption('--number', '-n',
423
+ type=float,
424
+ default=None,
425
+ help='Number of peptides to sample from all possible '
426
+ 'within the constraints. If less than 1, sample '
427
+ 'that fraction of all possible. If greater than 1, '
428
+ 'sample that number. '
429
+ 'Default: return all peptides.')
430
+ slicer = CLIOption('--slice', '-z',
431
+ type=str,
432
+ default=None,
433
+ nargs='*',
434
+ help='Subset of (possibly sampled) population to return, in the format <stop> '
435
+ 'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
436
+ 'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
437
+ '3 500 2 gives every other from 3 to 500. Default: return all.')
438
+ alphabet = CLIOption('--alphabet', '-b',
439
+ type=str,
440
+ default=''.join(AA),
441
+ help='Alphabet to use in sampling.')
442
+ suffix = CLIOption('--suffix', '-s',
443
+ type=str,
444
+ default='',
445
+ help='Sequence to add to end. Lowercase for D-amino acids. '
446
+ 'Default: no suffix.')
447
+ set_seed = CLIOption('--seed', '-e',
448
+ type=int,
449
+ default=None,
450
+ help='Seed to use for reproducible randomness. '
451
+ 'Default: don\'t enable reproducibility.')
452
+ d_aa_only = CLIOption('--d-aa-only', '-a',
453
+ action='store_true',
454
+ help='Whether to only use D-amino acids. '
455
+ 'Default: don\'t include.')
456
+ include_d_aa = CLIOption('--include-d-aa', '-y',
457
+ action='store_true',
458
+ help='Whether to include D-amino acids in enumeration. '
459
+ 'Default: don\'t include.')
460
+
461
+ ## reaction
462
+ name = CLIOption('--name', '-n',
463
+ type=str,
464
+ default=None,
465
+ help='Name of column for product. '
466
+ 'Default: same as reaction name.')
467
+ reaction_opt = CLIOption('--reaction', '-x',
468
+ type=str,
469
+ nargs='*',
470
+ choices=list(REACTIONS),
471
+ default='N_to_C_cyclization',
472
+ help='Reaction(s) to apply.')
473
+
474
+ clean = CLICommand('clean',
475
+ description='Clean and normalize SMILES column of a table.',
476
+ main=_clean,
477
+ options=[output, formatting, inputs, representation, column, prefix])
478
+ convert = CLICommand('convert',
479
+ description='Convert between string representations of chemical structures.',
480
+ main=_convert,
481
+ options=[output, formatting, inputs, representation, column, prefix, to, options])
482
+ featurize = CLICommand('featurize',
+ description='Calculate chemical features (2D descriptors or fingerprints) for chemical structures.',
484
+ main=_featurize,
485
+ options=[output, formatting, inputs, representation, column, prefix,
486
+ id_feat, feature])
487
+ collate = CLICommand('collate',
488
+ description='Collect disparate tables or SDF files of libraries into a single table.',
489
+ main=_collate,
490
+ options=[output, formatting, inputs, representation,
491
+ data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
492
+ digits, keep_extra_columns, keep_invalid_smiles])
493
+ dedup = CLICommand('dedup',
494
+ description='Deduplicate chemical structures and retain references.',
495
+ main=_dedup,
496
+ options=[output, formatting, inputs, representation, column, prefix,
497
+ indexes, drop_inchikey])
498
+ enum = CLICommand('enumerate',
499
+ description='Enumerate bio-chemical structures within length and sequence constraints.',
500
+ main=_enum,
501
+ options=[output, formatting, to, options,
502
+ alphabet, max_length, min_length, number_to_gen,
503
+ slicer, set_seed,
504
+ prefix.replace(default='',
505
+ help='Sequence to prepend. Lowercase for D-amino acids. '
506
+ 'Default: no prefix.'),
507
+ suffix,
508
+ type_.replace(default='aa',
509
+ choices=['aa'],
510
+ help='Type of bio sequence to enumerate. '
511
+ 'Default: %(default)s.'),
512
+ d_aa_only, include_d_aa])
513
+ reaction = CLICommand('react',
514
+ description='React compounds in silico in indicated columns using a named reaction.',
515
+ main=_react,
516
+ options=[output, formatting, inputs, representation, column, name,
517
+ reaction_opt])
518
+ split = CLICommand('split',
519
+ description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
520
+ main=_split,
521
+ options=[output, formatting, inputs, representation, column, prefix,
522
+ type_, train, test, set_seed])
523
+
524
+ app = CLIApp("schemist",
525
+ version=__version__,
526
+ description="Tools for cleaning, collating, and augmenting chemical datasets.",
527
+ commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])
528
+
529
+ app.run()
530
+
531
+ return None
532
+
533
+
534
+ if __name__ == "__main__":
535
+
536
+ main()
build/lib/schemist/collating.py ADDED
@@ -0,0 +1,315 @@
1
+ """Tools to collate chemical data files."""
2
+
3
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union
4
+
5
+ from collections import Counter
6
+ from functools import partial
7
+ from glob import glob
8
+ import os
9
+
10
+ from carabiner.pd import read_table, resolve_delim
11
+ from carabiner import print_err
12
+ import numpy as np
13
+ from pandas import DataFrame, concat
14
+
15
+ from .converting import convert_string_representation, _FROM_FUNCTIONS
16
+ from .io import FILE_READERS
17
+
18
+ GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
19
+ ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")
20
+
21
+ def _column_mapper(df: DataFrame,
22
+ cols: Iterable[str]) -> Tuple[Callable, Dict]:
23
+
24
+ basic_map = {column: df[column].tolist()[0] for column in cols}
25
+ inv_basic_map = {value: key for key, value in basic_map.items()}
26
+
27
+ def column_mapper(x: DataFrame) -> DataFrame:
28
+
29
+ new_df = DataFrame()
30
+
31
+ for new_col, old_col in basic_map.items():
32
+
33
+ # old_col = str(old_col)
34
+
35
+ if old_col is None or str(old_col) in ('None', 'nan', 'NA'):
36
+
37
+ new_df[new_col] = None
38
+
39
+ elif '+' in old_col:
40
+
41
+ splits = old_col.split('+')
42
+ new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
43
+ for s in splits[1:]])
44
+
45
+ elif ';' in old_col:
46
+
47
+ col, char, index = old_col.split(';')
48
+ index = [int(i) for i in index.split(':')]
49
+
50
+ if len(index) == 1:
51
+ index = slice(index[0], index[0] + 1)
52
+ else:
53
+ index = slice(*index)
54
+
55
+ try:
56
+
57
+ new_df[new_col] = (x[col]
58
+ .str.split(char)
59
+ .map(lambda y: char.join(y[index] if y is not np.nan else []))
60
+ .str.strip())
61
+
62
+ except TypeError as e:
63
+
64
+ print_err(x[col].str.split(char))
65
+
66
+ raise e
67
+
68
+ else:
69
+
70
+ new_df[new_col] = x[old_col].copy()
71
+
72
+ return new_df
73
+
74
+ return column_mapper, inv_basic_map
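To make the mapping patterns concrete, an illustrative sketch (invented column names and data, with `_column_mapper` as defined above in scope) of the `+` (concatenate) and `;` (split) specifications described in the `collate_inventory` docstring below:

```python
# Illustration only: invented catalog row and data table.
from pandas import DataFrame

catalog_row = DataFrame({
    "library_id": ["Barcode+Well"],   # '+' -> concatenate the Barcode and Well columns
    "supplier":   ["Vendor;-;0"],     # ';' -> split Vendor on '-' and keep field 0
})
mapper, _ = _column_mapper(catalog_row, ["library_id", "supplier"])

data = DataFrame({"Barcode": ["PL01"], "Well": ["A01"], "Vendor": ["Acme-UK"]})
print(mapper(data))
#   library_id supplier
# 0    PL01A01     Acme
```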
75
+
76
+
77
+ def _check_catalog(catalog: DataFrame,
78
+ catalog_smiles_column: str = 'input_smiles') -> None:
79
+
80
+ essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
81
+ missing_essential_cols = [col for col in essential_columns
82
+ if col not in catalog]
83
+
84
+ if len(missing_essential_cols) > 0:
85
+
86
+ print_err(catalog.columns.tolist())
87
+
88
+ raise KeyError("Missing required columns from catalog: " +
89
+ ", ".join(missing_essential_cols))
90
+
91
+ return None
92
+
93
+
94
+ def collate_inventory(catalog: DataFrame,
95
+ root_dir: Optional[str] = None,
96
+ drop_invalid: bool = True,
97
+ drop_unmapped: bool = False,
98
+ catalog_smiles_column: str = 'input_smiles',
99
+ id_column_name: Optional[str] = None,
100
+ id_n_digits: int = 8,
101
+ id_prefix: str = '') -> DataFrame:
102
+
103
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
104
+
105
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
106
+
107
+ - filename is a glob pattern of files to collate
108
+ - file_format is one of {', '.join(FILE_READERS.keys())}
109
+ - smiles_column contains smiles strings
110
+
111
+ Other columns are optional and can have any name, but must contain the name or a pattern
112
+ matching a column (for tabular data) or field (for SDF data) in the files
113
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
114
+
115
+ Optional column contents can be either concatenated or split using the following
116
+ pattern:
117
+
118
+ - col1+col2: concatenates the contents of `col1` and `col2`
119
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
120
+
121
+ Parameters
122
+ ----------
123
+ catalog : pd.DataFrame
124
+ Table cataloging locations and format of data. Requires
125
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
126
+ root_dir : str, optional
127
+ Path to look for data files. Default: current directory.
128
+ drop_invalid : bool, optional
129
+ Whether to drop rows containing invalid SMILES.
130
+
131
+
132
+ Returns
133
+ -------
134
+ pd.DataFrame
135
+ Collated chemical data.
136
+
137
+ """
138
+
139
+ root_dir = root_dir or '.'
140
+
141
+ _check_catalog(catalog, catalog_smiles_column)
142
+
143
+ nongroup_columns = [col for col in catalog
144
+ if col not in GROUPING_COLUMNS]
145
+ loaded_dataframes = []
146
+ report = Counter({"invalid SMILES": 0,
147
+ "rows processed": 0})
148
+
149
+ grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
150
+ for (this_glob, this_filetype,
151
+ this_library_name, this_representation), filename_df in grouped_catalog:
152
+
153
+ print_err(f'\nProcessing {this_glob}:')
154
+
155
+ this_glob = glob(os.path.join(root_dir, this_glob))
156
+
157
+ these_filenames = sorted(f for f in this_glob
158
+ if not os.path.basename(f).startswith('~$'))
159
+ print_err('\t- ' + '\n\t- '.join(these_filenames))
160
+
161
+ column_mapper, mapped_cols = _column_mapper(filename_df,
162
+ nongroup_columns)
163
+
164
+ reader = FILE_READERS[this_filetype]
165
+
166
+ for filename in these_filenames:
167
+
168
+ this_data0 = reader(filename)
169
+
170
+ if not drop_unmapped:
171
+ unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
172
+ for col in this_data0 if col not in mapped_cols}
173
+ this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
174
+ this_data = concat([column_mapper(this_data0), this_data],
175
+ axis=1)
176
+ else:
177
+ this_data = column_mapper(this_data0)
178
+
179
+ if this_representation.casefold() not in _FROM_FUNCTIONS:
180
+
181
+ raise TypeError(' or '.join(sorted({this_representation, this_representation.casefold()})) +
+ " is not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))
183
+
184
+ this_converter = partial(convert_string_representation,
185
+ input_representation=this_representation.casefold())
186
+
187
+ this_data = (this_data
188
+ .query('compound_collection != "NA"')
189
+ .assign(library_name=this_library_name,
190
+ input_file_format=this_filetype,
191
+ input_string_representation=this_representation,
192
+ plate_id=lambda x: x['plate_id'].astype(str),
193
+ plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
194
+ canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
195
+ is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))
196
+
197
+ report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
198
+ "rows processed": this_data.shape[0]})
199
+
200
+ if drop_invalid:
201
+
202
+ this_data = this_data.query('is_valid_smiles')
203
+
204
+ if id_column_name is not None:
205
+
206
+ this_converter = partial(convert_string_representation,
207
+ output_representation='id',
208
+ options=dict(n=id_n_digits,
209
+ prefix=id_prefix))
210
+ this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})
211
+
212
+ loaded_dataframes.append(this_data)
213
+
214
+ collated_df = concat(loaded_dataframes, axis=0)
215
+
216
+ return report, collated_df
217
+
218
+
219
+ def collate_inventory_from_file(catalog_path: Union[str, TextIO],
220
+ root_dir: Optional[str] = None,
221
+ format: Optional[str] = None,
222
+ *args, **kwargs) -> DataFrame:
223
+
224
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
225
+
226
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
227
+
228
+ - filename is a glob pattern of files to collate
229
+ - file_format is one of {', '.join(FILE_READERS.keys())}
230
+ - smiles_column contains smiles strings
231
+
232
+ Other columns are optional and can have any name, but must contain the name or a pattern
233
+ matching a column (for tabular data) or field (for SDF data) in the files
234
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
235
+
236
+ Optional column contents can be either concatenated or split using the following
237
+ pattern:
238
+
239
+ - col1+col2: concatenates the contents of `col1` and `col2`
240
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
241
+
242
+ Parameters
243
+ ----------
244
+ catalog_path : str
245
+ Path to catalog file in XLSX, TSV or CSV format. Requires
246
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
247
+ format : str, optional
248
+ Format of catalog file. Default: infer from file extension.
249
+ root_dir : str, optional
250
+ Path to look for data files. Default: use directory containing
251
+ the catalog.
252
+
253
+ Returns
254
+ -------
255
+ pd.DataFrame
256
+ Collated chemical data.
257
+
258
+ """
259
+
260
+ root_dir = root_dir or os.path.dirname(catalog_path)
261
+
262
+ data_catalog = read_table(catalog_path, format=format)
263
+
264
+ return collate_inventory(catalog=data_catalog,
265
+ root_dir=root_dir,
266
+ *args, **kwargs)
267
+
268
+
269
+ def deduplicate(df: DataFrame,
270
+ column: str = 'smiles',
271
+ input_representation: str = 'smiles',
272
+ index_columns: Optional[List[str]] = None,
273
+ drop_inchikey: bool = False) -> DataFrame:
274
+
275
+ index_columns = index_columns or []
276
+
277
+ inchikey_converter = partial(convert_string_representation,
278
+ input_representation=input_representation,
279
+ output_representation='inchikey')
280
+
281
+ df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))
282
+
283
+ structure_columns = [column, 'inchikey']
284
+ df_unique = []
285
+
286
+ for (string_rep, inchikey), structure_df in df.groupby(structure_columns):
287
+
288
+ collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
289
+ for col in structure_df if col in index_columns}
290
+ collapsed_indexes.update({column: [string_rep],
291
+ 'inchikey': [inchikey],
292
+ 'instance_count': [structure_df.shape[0]]})
293
+
294
+ df_unique.append(DataFrame(collapsed_indexes))
295
+
296
+ df_unique = concat(df_unique, axis=0)
297
+
298
+ if drop_inchikey:
299
+
300
+ df_unique = df_unique.drop(columns=['inchikey'])
301
+
302
+ report = {'starting rows:': df.shape[0],
303
+ 'ending_rows': df_unique.shape[0]}
304
+
305
+ return report, df_unique
306
+
307
+
308
+ def deduplicate_file(filename: Union[str, TextIO],
309
+ format: Optional[str] = None,
310
+ *args, **kwargs) -> DataFrame:
311
+
312
+ table = read_table(filename, format=format)
313
+
314
+ return deduplicate(table, *args, **kwargs)
315
+
build/lib/schemist/converting.py ADDED
@@ -0,0 +1,308 @@
1
+ """Converting between chemical representation formats."""
2
+
3
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
4
+
5
+ from functools import wraps
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast, flatten
9
+ from carabiner.decorators import return_none_on_error, vectorize
10
+ from carabiner.itertools import batched
11
+
12
+ from datamol import sanitize_smiles
13
+ import nemony as nm
14
+ from pandas import DataFrame
15
+ from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
16
+ MolFromSmiles, MolToInchi, MolToInchiKey,
17
+ MolToSmiles)
18
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
19
+ from requests import Session
20
+ import selfies as sf
21
+
22
+ from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name
23
+
24
+ @vectorize
25
+ @return_none_on_error
26
+ def _seq2mol(s: str) -> Union[Mol, None]:
27
+
28
+ return MolFromSequence(s, sanitize=True)
29
+
30
+
31
+ @vectorize
32
+ @return_none_on_error
33
+ def _helm2mol(s: str) -> Union[Mol, None]:
34
+
35
+ return MolFromHELM(s, sanitize=True)
36
+
37
+
38
+ def mini_helm2helm(s: str) -> str:
39
+
40
+ new_s = []
41
+ token = ''
42
+ between_sq_brackets = False
43
+
44
+ for letter in s:
45
+
46
+ if letter.islower() and not between_sq_brackets:
47
+
48
+ letter = f"[d{letter.upper()}]"
49
+
50
+ token += letter
51
+
52
+ if letter == '[':
53
+ between_sq_brackets = True
54
+ elif letter == ']':
55
+ between_sq_brackets = False
56
+
57
+ if not between_sq_brackets:
58
+ new_s.append(token)
59
+ token = ''
60
+
61
+ return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))
62
+
63
+
64
+ @vectorize
65
+ @return_none_on_error
66
+ def _mini_helm2mol(s: str) -> Mol:
67
+
68
+ s = mini_helm2helm(s)
69
+
70
+ return MolFromHELM(s, sanitize=True)
71
+
72
+
73
+ @vectorize
74
+ @return_none_on_error
75
+ def _inchi2mol(s: str) -> Mol:
76
+
77
+ return MolFromInchi(s,
78
+ sanitize=True,
79
+ removeHs=True)
80
+
81
+ @vectorize
82
+ # @return_none_on_error
83
+ def _smiles2mol(s: str) -> Mol:
84
+
85
+ return MolFromSmiles(sanitize_smiles(s))
86
+
87
+
88
+ @vectorize
89
+ @return_none_on_error
90
+ def _selfies2mol(s: str) -> Mol:
91
+
92
+ return MolFromSmiles(sf.decoder(s))
93
+
94
+
95
+ @vectorize
96
+ @return_none_on_error
97
+ def _mol2nonstandard_inchikey(m: Mol,
98
+ **kwargs) -> str:
99
+
100
+ return MolToInchiKey(m,
101
+ options="/FixedH /SUU /RecMet /KET /15T")
102
+
103
+
104
+ @vectorize
105
+ @return_none_on_error
106
+ def _mol2hash(m: Mol,
107
+ **kwargs) -> str:
108
+
109
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
110
+
111
+ return nm.hash(nonstandard_inchikey)
112
+
113
+
114
+ @vectorize
115
+ @return_none_on_error
116
+ def _mol2id(m: Mol,
117
+ n: int = 8,
118
+ prefix: str = '',
119
+ **kwargs) -> str:
120
+
121
+ return prefix + str(int(_mol2hash(m), 16))[:n]
122
+
123
+
124
+ @vectorize
125
+ @return_none_on_error
126
+ def _mol2isomeric_canonical_smiles(m: Mol,
127
+ **kwargs) -> str:
128
+
129
+ return MolToSmiles(m,
130
+ isomericSmiles=True,
131
+ canonical=True)
132
+
133
+
134
+ @vectorize
135
+ @return_none_on_error
136
+ def _mol2inchi(m: Mol,
137
+ **kwargs) -> str:
138
+
139
+ return MolToInchi(m)
140
+
141
+
142
+ @vectorize
143
+ @return_none_on_error
144
+ def _mol2inchikey(m: Mol,
145
+ **kwargs) -> str:
146
+
147
+ return MolToInchiKey(m)
148
+
149
+
150
+ @vectorize
151
+ @return_none_on_error
152
+ def _mol2random_smiles(m: Mol,
153
+ **kwargs) -> str:
154
+
155
+ return MolToSmiles(m,
156
+ isomericSmiles=True,
157
+ doRandom=True)
158
+
159
+
160
+ @vectorize
161
+ @return_none_on_error
162
+ def _mol2mnemonic(m: Mol,
163
+ **kwargs) -> str:
164
+
165
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
166
+
167
+ return nm.encode(nonstandard_inchikey)
168
+
169
+
170
+ def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
171
+ session: Optional[Session] = None,
172
+ chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:
173
+
174
+ inchikeys = cast(_mol2inchikey(m), to=list)
175
+ pubchem_ids = []
176
+
177
+ for _inchikeys in batched(inchikeys, chunksize):
178
+
179
+ these_ids = _inchikey2pubchem_name_id(_inchikeys,
180
+ session=session)
181
+ pubchem_ids += these_ids
182
+
183
+ return pubchem_ids
184
+
185
+
186
+ @return_none_on_error
187
+ def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
188
+ session: Optional[Session] = None,
189
+ chunksize: int = 32,
190
+ **kwargs) -> Union[str, List[str]]:
191
+
192
+ return flatten([val['pubchem_id']
193
+ for val in _mol2pubchem(m,
194
+ session=session,
195
+ chunksize=chunksize)])
196
+
197
+
198
+ @return_none_on_error
199
+ def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
200
+ session: Optional[Session] = None,
201
+ chunksize: int = 32,
202
+ **kwargs) -> Union[str, List[str]]:
203
+
204
+ return flatten([val['pubchem_name']
205
+ for val in _mol2pubchem(m,
206
+ session=session,
207
+ chunksize=chunksize)])
208
+
209
+ @return_none_on_error
210
+ def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
211
+ session: Optional[Session] = None,
212
+ **kwargs) -> Union[str, List[str]]:
213
+
214
+ return _inchikey2cactus_name(_mol2inchikey(m),
215
+ session=session)
216
+
217
+
218
+ @vectorize
219
+ @return_none_on_error
220
+ def _mol2scaffold(m: Mol,
221
+ chiral: bool = True,
222
+ **kwargs) -> str:
223
+
224
+ return MurckoScaffoldSmiles(mol=m,
225
+ includeChirality=chiral)
226
+
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def _mol2selfies(m: Mol,
231
+ **kwargs) -> str:
232
+
233
+ s = sf.encoder(_mol2isomeric_canonical_smiles(m))
234
+
235
+ return s if s != -1 else None
236
+
237
+
238
+ _TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
239
+ "selfies": _mol2selfies,
240
+ "inchi": _mol2inchi,
241
+ "inchikey": _mol2inchikey,
242
+ "nonstandard_inchikey": _mol2nonstandard_inchikey,
243
+ "hash": _mol2hash,
244
+ "mnemonic": _mol2mnemonic,
245
+ "id": _mol2id,
246
+ "scaffold": _mol2scaffold,
247
+ "permuted_smiles": _mol2random_smiles,
248
+ "pubchem_id": _mol2pubchem_id,
249
+ "pubchem_name": _mol2pubchem_name,
250
+ "cactus_name": _mol2cactus_name}
251
+
252
+ _FROM_FUNCTIONS = {"smiles": _smiles2mol,
253
+ "selfies": _selfies2mol,
254
+ "inchi": _inchi2mol,
255
+ "aa_seq": _seq2mol,
256
+ "helm": _helm2mol,
257
+ "minihelm": _mini_helm2mol}
258
+
259
+
260
+ def _x2mol(strings: Union[Iterable[str], str],
261
+ input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
262
+
263
+ from_function = _FROM_FUNCTIONS[input_representation.casefold()]
264
+
265
+ return from_function(strings)
266
+
267
+
268
+ def _mol2x(mols: Union[Iterable[Mol], Mol],
269
+ output_representation: str = 'smiles',
270
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
271
+
272
+ to_function = _TO_FUNCTIONS[output_representation.casefold()]
273
+
274
+ return to_function(mols, **kwargs)
275
+
276
+
277
+ def convert_string_representation(strings: Union[Iterable[str], str],
278
+ input_representation: str = 'smiles',
279
+ output_representation: str = 'smiles',
280
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
281
+
282
+ """Convert between string representations of chemical structures.
283
+
284
+ """
285
+
286
+ mols = _x2mol(strings, input_representation)
287
+ # print_err(mols)
288
+ outstrings = _mol2x(mols, output_representation, **kwargs)
289
+ # print_err(outstrings)
290
+
291
+ return outstrings
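A brief usage sketch (assuming RDKit is installed); the function accepts a single string or an iterable of strings:

```python
# Sketch only; the exact output string depends on the installed RDKit version.
scaffold = convert_string_representation(
    "Cc1ccc(cc1)C(=O)O",
    input_representation="smiles",
    output_representation="scaffold",   # Murcko scaffold SMILES via _mol2scaffold above
)
print(scaffold)
```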
292
+
293
+
294
+ def _convert_input_to_smiles(f: Callable) -> Callable:
295
+
296
+ @wraps(f)
297
+ def _f(strings: Union[Iterable[str], str],
298
+ input_representation: str = 'smiles',
299
+ *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
300
+
301
+ smiles = convert_string_representation(strings,
302
+ output_representation='smiles',
303
+ input_representation=input_representation)
304
+
305
+ return f(strings=smiles,
306
+ *args, **kwargs)
307
+
308
+ return _f
build/lib/schemist/features.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools for generating chemical features."""
2
+
3
+ from typing import Any, Callable, Iterable, Optional, Union
4
+
5
+ from descriptastorus.descriptors import MakeGenerator
6
+ from pandas import DataFrame, Series
7
+ import numpy as np
8
+ from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
9
+
10
+ from .converting import _smiles2mol, _convert_input_to_smiles
11
+
12
+ def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
13
+
14
+ def _f(prefix: Optional[str] = None,
15
+ *args, **kwargs) -> DataFrame:
16
+
17
+ feature_matrix = f(*args, **kwargs)
18
+
19
+ if prefix is not None:
20
+
21
+ new_cols = {col: f"{prefix}_{col}"
22
+ for col in feature_matrix.columns
23
+ if not col.startswith('_meta')}
24
+ feature_matrix = feature_matrix.rename(columns=new_cols)
25
+
26
+ return feature_matrix
27
+
28
+ return _f
29
+
30
+
31
+ def _get_descriptastorus_features(smiles: Iterable[str],
32
+ generator: str) -> DataFrame:
33
+
34
+ generator = MakeGenerator((generator, ))
35
+ smiles = Series(smiles)
36
+
37
+ features = smiles.apply(lambda z: np.array(generator.process(z)))
38
+ matrix = np.stack(features.values, axis=0)
39
+
40
+ return DataFrame(matrix,
41
+ index=smiles.index,
42
+ columns=[col for col, _ in generator.GetColumns()])
43
+
44
+
45
+ @_feature_matrix
46
+ @_convert_input_to_smiles
47
+ def calculate_2d_features(strings: Union[Iterable[str], str],
48
+ normalized: bool = True,
49
+ histogram_normalized: bool = True) -> DataFrame:
50
+
51
+ """Calculate 2d features from string representation.
52
+
53
+ """
54
+
55
+ if normalized:
56
+ if histogram_normalized:
57
+ generator_name = "RDKit2DHistogramNormalized"
58
+ else:
59
+ generator_name = "RDKit2DNormalized"
60
+ else:
61
+ generator_name = "RDKit2D"
62
+
63
+ feature_matrix = _get_descriptastorus_features(strings,
64
+ generator=generator_name)
65
+
66
+ feature_matrix = (feature_matrix
67
+ .rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
68
+ .assign(meta_feature_type=generator_name,
69
+ meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
70
+ .drop(columns=['meta_feature_valid0']))
71
+
72
+ return feature_matrix
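A usage sketch (assuming descriptastorus is installed); note that because of the `_feature_matrix` wrapper, the first positional argument is `prefix`, so `strings` is passed by keyword here:

```python
# Illustration only; descriptor columns come from descriptastorus's RDKit2D generators.
features = calculate_2d_features(
    strings=["CCO", "c1ccccc1O"],
    prefix="rd2d",                 # optional prefix prepended to output column names
)
print(features.filter(like="meta_feature").head())
```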
73
+
74
+
75
+ def _fast_fingerprint(generator: FingeprintGenerator64,
76
+ mol: Mol,
77
+ to_np: bool = True) -> Union[str, np.ndarray]:
78
+
79
+ try:
80
+ fp_string = generator.GetFingerprint(mol).ToBitString()
81
+ except:
82
+ return None
83
+ else:
84
+ if to_np:
85
+ return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
86
+ else:
87
+ return fp_string
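A sketch of what the helper above returns in each mode (arbitrary molecule; the import path follows this module's own imports):

```python
# Illustration only; assumes RDKit is installed and _fast_fingerprint above is in scope.
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganGenerator

gen = GetMorganGenerator(radius=2)
mol = MolFromSmiles("CCO")

bits = _fast_fingerprint(gen, mol, to_np=True)        # numpy array of 0/1 values
print(bits.shape, int(bits.sum()))                    # e.g. (2048,) and the number of set bits

print(_fast_fingerprint(gen, mol, to_np=False)[:32])  # start of the raw '0'/'1' bit string
```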
88
+
89
+
90
+ @_feature_matrix
91
+ @_convert_input_to_smiles
92
+ def calculate_fingerprints(strings: Union[Iterable[str], str],
93
+ fp_type: str = 'morgan',
94
+ radius: int = 2,
95
+ chiral: bool = True,
96
+ on_bits: bool = True) -> DataFrame:
97
+
98
+ """
99
+
100
+ """
101
+
102
+ if fp_type.casefold() == 'morgan':
103
+ generator_class = GetMorganGenerator
104
+ else:
105
+ raise AttributeError(f"Fingerprint type {fp_type} not supported!")
106
+
107
+ fp_generator = generator_class(radius=radius,
108
+ includeChirality=chiral)
109
+ mols = (_smiles2mol(s) for s in strings)
110
+ fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
111
+ for mol in mols)
112
+
113
+ if on_bits:
114
+
115
+ fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
116
+ for fp_string in fp_strings)
117
+ fingerprints = [';'.join(fp) for fp in fingerprints]
118
+ validity = [len(fp) > 0 for fp in fingerprints]
119
+
120
+ feature_matrix = DataFrame(fingerprints,
121
+ columns=['fp_bits'])
122
+
123
+ else:
124
+
125
+ fingerprints = [np.array([int(digit) for digit in fp_string])
126
+ if fp_string is not None
127
+ else (-np.ones((fp_generator.GetOptions().fpSize, )))
128
+ for fp_string in fp_strings]
129
+ validity = [np.all(fp >= 0) for fp in fingerprints]
130
+
131
+ feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
132
+ columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
133
+
134
+ return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
135
+ meta_feature_valid=validity)
136
+
137
+
138
+ _FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
139
+
140
+ def calculate_feature(feature_type: str,
141
+ *args, **kwargs):
142
+
143
+ """
144
+
145
+ """
146
+
147
+ featurizer = _FEATURE_CALCULATORS[feature_type]
148
+
149
+ return featurizer(*args, **kwargs)
build/lib/schemist/generating.py ADDED
@@ -0,0 +1,262 @@
1
+ """Tools for enumerating compounds. Currently only works with peptides."""
2
+
3
+ from typing import Callable, Iterable, Optional, Tuple, Union
4
+
5
+ from functools import partial
6
+ from itertools import chain, islice, product, repeat
7
+ from math import ceil, expm1, floor
8
+ from random import choice, choices, random, seed
9
+
10
+ from carabiner import print_err
11
+ from carabiner.decorators import vectorize, return_none_on_error
12
+ from carabiner.random import sample_iter
13
+ from rdkit.Chem import Mol, rdChemReactions
14
+ import numpy as np
15
+
16
+ from .converting import (_x2mol, _mol2x,
17
+ _convert_input_to_smiles)
18
+
19
+ AA = tuple('GALVITSMCPFYWHKRDENQ')
20
+ dAA = tuple(aa.casefold() for aa in AA)
21
+
22
+ REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
23
+ 'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
24
+ 'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
25
+
26
+ def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
27
+ d_aa_only: bool = False,
28
+ include_d_aa: bool = False) -> Tuple[str]:
29
+
30
+ alphabet = alphabet or AA
31
+ alphabet_lower = tuple(set(aa.casefold() for aa in AA))
32
+
33
+ if d_aa_only:
34
+ alphabet = alphabet_lower
35
+ elif include_d_aa:
36
+ alphabet = tuple(set(chain(alphabet, alphabet_lower)))
37
+
38
+ return alphabet
39
+
40
+
41
+
42
+ def all_peptides_of_one_length(length: int,
43
+ alphabet: Optional[Iterable[str]] = None,
44
+ d_aa_only: bool = False,
45
+ include_d_aa: bool = False) -> Iterable[str]:
46
+
47
+ """
48
+
49
+ """
50
+
51
+ alphabet = _get_alphabet(alphabet=alphabet,
52
+ d_aa_only=d_aa_only,
53
+ include_d_aa=include_d_aa)
54
+
55
+ return (''.join(peptide)
56
+ for peptide in product(alphabet, repeat=length))
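For a sense of scale, a small sketch of the exhaustive generator above with the default 20-letter alphabet defined at the top of this module:

```python
# Illustration only: 20**3 = 8000 possible tripeptides with the default alphabet.
from itertools import islice

tripeptides = all_peptides_of_one_length(3)
print(list(islice(tripeptides, 5)))   # first five, in itertools.product order
```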
57
+
58
+
59
+ def all_peptides_in_length_range(max_length: int,
60
+ min_length: int = 1,
61
+ by: int = 1,
62
+ alphabet: Optional[Iterable[str]] = None,
63
+ d_aa_only: bool = False,
64
+ include_d_aa: bool = False,
65
+ *args, **kwargs) -> Iterable[str]:
66
+
67
+ """
68
+
69
+ """
70
+
71
+ length_range = range(*sorted([min_length, max_length + 1]), by)
72
+ peptide_maker = partial(all_peptides_of_one_length,
73
+ alphabet=alphabet,
74
+ d_aa_only=d_aa_only,
75
+ include_d_aa=include_d_aa,
76
+ *args, **kwargs)
77
+
78
+ return chain.from_iterable(peptide_maker(length=length)
79
+ for length in length_range)
80
+
81
+
82
+ def _number_of_peptides(max_length: int,
83
+ min_length: int = 1,
84
+ by: int = 1,
85
+ alphabet: Optional[Iterable[str]] = None,
86
+ d_aa_only: bool = False,
87
+ include_d_aa: bool = False):
88
+
89
+ alphabet = _get_alphabet(alphabet=alphabet,
90
+ d_aa_only=d_aa_only,
91
+ include_d_aa=include_d_aa)
92
+ n_peptides = [len(alphabet) ** length
93
+ for length in range(*sorted([min_length, max_length + 1]), by)]
94
+
95
+ return n_peptides
96
+
97
+
98
+ def _naive_sample_peptides_in_length_range(max_length: int,
99
+ min_length: int = 1,
100
+ by: int = 1,
101
+ n: Optional[Union[float, int]] = None,
102
+ alphabet: Optional[Iterable[str]] = None,
103
+ d_aa_only: bool = False,
104
+ include_d_aa: bool = False,
105
+ set_seed: Optional[int] = None):
106
+
107
+ alphabet = _get_alphabet(alphabet=alphabet,
108
+ d_aa_only=d_aa_only,
109
+ include_d_aa=include_d_aa)
110
+ n_peptides = _number_of_peptides(max_length=max_length,
111
+ min_length=min_length,
112
+ by=by,
113
+ alphabet=alphabet,
114
+ d_aa_only=d_aa_only,
115
+ include_d_aa=include_d_aa)
116
+ lengths = list(range(*sorted([min_length, max_length + 1]), by))
117
+ weight_per_length = [n / min(n_peptides) for n in n_peptides]
118
+ weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
119
+
120
+ lengths_sample = (choice(weighted_lengths) for _ in range(n))
121
+ return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
122
+
123
+
124
+ def sample_peptides_in_length_range(max_length: int,
125
+ min_length: int = 1,
126
+ by: int = 1,
127
+ n: Optional[Union[float, int]] = None,
128
+ alphabet: Optional[Iterable[str]] = None,
129
+ d_aa_only: bool = False,
130
+ include_d_aa: bool = False,
131
+ naive_sampling_cutoff: float = 5e-3,
132
+ reservoir_sampling: bool = True,
133
+ indexes: Optional[Iterable[int]] = None,
134
+ set_seed: Optional[int] = None,
135
+ *args, **kwargs) -> Iterable[str]:
136
+
137
+ """
138
+
139
+ """
140
+
141
+ seed(set_seed)
142
+
143
+ alphabet = _get_alphabet(alphabet=alphabet,
144
+ d_aa_only=d_aa_only,
145
+ include_d_aa=include_d_aa)
146
+
147
+ n_peptides = sum(len(alphabet) ** length
148
+ for length in range(*sorted([min_length, max_length + 1]), by))
149
+ if n is None:
150
+ n_requested = n_peptides
151
+ elif n >= 1.:
152
+ n_requested = min(floor(n), n_peptides)
153
+ elif n < 1.:
154
+ n_requested = floor(n * n_peptides)
155
+
156
+ frac_requested = n_requested / n_peptides
157
+
158
+ # approximation of birthday problem
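+ # P(any collision) ~ 1 - exp(-k*(k-1)/(2*N)) for k uniform draws from N possibilities; expm1 keeps precision when the exponent is tiny.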
159
+ p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
160
+ n_collisions = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
161
+ frac_collisions = n_collisions / n_requested
162
+
163
+ print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
164
+ f"length {min_length} to {max_length} ({n_peptides} combinations). "
165
+ f"Probability of collision if drawing randomly is {p_any_collision}, "
166
+ f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
167
+
168
+ if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
169
+
170
+ print_err("> Executing naive sampling. ")
171
+
172
+ peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
173
+ n=n_requested,
174
+ alphabet=alphabet,
175
+ d_aa_only=d_aa_only,
176
+ include_d_aa=include_d_aa)
177
+
178
+ else:
179
+
180
+ print_err("> Executing exhaustive sampling.")
181
+
182
+ all_peptides = all_peptides_in_length_range(max_length, min_length, by,
183
+ alphabet=alphabet,
184
+ d_aa_only=d_aa_only,
185
+ include_d_aa=include_d_aa,
186
+ *args, **kwargs)
187
+
188
+ if n is None:
189
+
190
+ peptides = all_peptides
191
+
192
+ elif n >= 1.:
193
+
194
+ if reservoir_sampling:
195
+ peptides = sample_iter(all_peptides, k=n_requested,
196
+ shuffle_output=False)
197
+ else:
198
+ peptides = (pep for pep in all_peptides
199
+ if random() <= frac_requested)
200
+
201
+ elif n < 1.:
202
+
203
+ peptides = (pep for pep in all_peptides
204
+ if random() <= n)
205
+
206
+ if indexes is not None:
207
+
208
+ indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
209
+ else None
210
+ for ix in islice(indexes, 3))
211
+ indexes = [ix if (ix is None or ix >= 0) else None
212
+ for ix in indexes]
213
+
214
+ if len(indexes) > 1:
215
+ if n is not None and n >=1. and indexes[0] > n:
216
+ raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
217
+
218
+ peptides = islice(peptides, *indexes)
219
+
220
+ return peptides
221
+
222
+
223
+ def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
224
+
225
+ rxn = rdChemReactions.ReactionFromSmarts(smarts)
226
+ reaction_function = rxn.RunReactants
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def reactor(s: Mol) -> Mol:
231
+
232
+ return reaction_function([s])[0][0]
233
+
234
+ return reactor
235
+
236
+
237
+ @_convert_input_to_smiles
238
+ def react(strings: Union[str, Iterable[str]],
239
+ reaction: str = 'N_to_C_cyclization',
240
+ output_representation: str = 'smiles',
241
+ **kwargs) -> Union[str, Iterable[str]]:
242
+
243
+ """
244
+
245
+ """
246
+
247
+ try:
248
+ _this_reaction = REACTIONS[reaction]
249
+ except KeyError:
250
+ raise KeyError(f"Reaction {reaction} is not available. Try: " +
251
+ ", ".join(list(REACTIONS)))
252
+
253
+ # strings = cast(strings, to=list)
254
+ # print_err((strings))
255
+
256
+ reactor = _reactor(_this_reaction)
257
+ mols = _x2mol(strings)
258
+ mols = reactor(mols)
259
+
260
+ return _mol2x(mols,
261
+ output_representation=output_representation,
262
+ **kwargs)
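A minimal usage sketch of the enumeration and reaction helpers above (illustrative only; assumes the installed package exposes them as `schemist.generating`, that RDKit is available, and that the `_convert_input_to_smiles` decorator accepts SMILES input by default):

    from schemist.generating import all_peptides_of_one_length, sample_peptides_in_length_range, react

    # Exhaustive enumeration: 20**2 = 400 dipeptides over the canonical L-amino acid alphabet.
    dipeptides = list(all_peptides_of_one_length(2))

    # Reproducibly sample about 100 peptides of length 3-5; naive vs exhaustive sampling is chosen internally.
    sampled = list(sample_peptides_in_length_range(max_length=5, min_length=3, n=100, set_seed=42))

    # Head-to-tail cyclization of triglycine via the 'N_to_C_cyclization' SMARTS; output defaults to SMILES.
    cyclic = react("NCC(=O)NCC(=O)NCC(=O)O")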
build/lib/schemist/io.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools to facilitate input and output."""
2
+
3
+ from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
4
+
5
+ from collections import defaultdict
+ from io import TextIOWrapper  # used by read_weird_xml when casting file-like inputs
6
+ from functools import partial
7
+ from string import printable
8
+ from tempfile import NamedTemporaryFile
9
+ from xml.etree import ElementTree
10
+
11
+ from carabiner import print_err
12
+ from carabiner.cast import cast
13
+ from carabiner.itertools import tenumerate
14
+ from carabiner.pd import read_table, write_stream
15
+
16
+ from pandas import DataFrame, read_excel
17
+ from rdkit.Chem import SDMolSupplier
18
+
19
+ from .converting import _mol2isomeric_canonical_smiles
20
+
21
+ def _mutate_df_stream(input_file: Union[str, TextIO],
22
+ output_file: Union[str, TextIO],
23
+ function: Callable[[DataFrame], Tuple[Any, DataFrame]],
24
+ file_format: Optional[str] = None,
25
+ chunksize: int = 1000) -> List[Any]:
26
+
27
+ carries = []
28
+
29
+ for i, chunk in tenumerate(read_table(input_file,
30
+ format=file_format,
31
+ progress=False,
32
+ chunksize=chunksize)):
33
+
34
+ result = function(chunk)
35
+
36
+ try:
37
+ carry, df = result
38
+ except ValueError:
39
+ df = result
40
+ carry = 0
41
+
42
+ write_stream(df,
43
+ output=output_file,
44
+ format=file_format,
45
+ header=i == 0,
46
+ mode='w' if i == 0 else 'a')
47
+
48
+ carries.append(carry)
49
+
50
+ return carries
51
+
52
+
53
+ def read_weird_xml(filename: Union[str, TextIO],
54
+ header: bool = True,
55
+ namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
56
+
57
+ """
58
+
59
+ """
60
+
61
+ with cast(filename, TextIOWrapper, mode='r') as f:
62
+
63
+ xml_string = ''.join(filter(printable.__contains__, f.read()))
64
+
65
+ try:
66
+
67
+ root = ElementTree.fromstring(xml_string)
68
+
69
+ except Exception as e:
70
+
71
+ print_err('\n!!! ' + xml_string.split('\n')[1184][377:380])
72
+
73
+ raise e
74
+
75
+ for i, row in enumerate(root.iter(f'{namespace}Row') ):
76
+
77
+ this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
78
+
79
+ if i == 0:
80
+
81
+ if header:
82
+
83
+ heading = this_row
84
+ df = {colname: [] for colname in heading}
85
+
86
+ else:
87
+
88
+ heading = [f'X{j}' for j, _ in enumerate(this_row)]
89
+ df = {colname: [datum] for colname, datum in zip(heading, this_row)}
90
+
91
+ else:
92
+
93
+ for colname, datum in zip(heading, this_row):
94
+
95
+ df[colname].append(datum)
96
+
97
+ return DataFrame(df)
98
+
99
+
100
+ def read_sdf(filename: Union[str, TextIO]):
101
+
102
+ """
103
+
104
+ """
105
+
106
+ filename = cast(filename, str)
107
+
108
+ with open(filename, 'r', errors='replace') as f:
109
+ with NamedTemporaryFile("w") as o:
110
+
111
+ o.write(f.read())
112
+ o.seek(0)
113
+
114
+ df = defaultdict(list)
115
+
116
+ for i, mol in enumerate(SDMolSupplier(o.name)):
117
+
118
+ if mol is None:
119
+
120
+ continue
121
+
122
+ propdict = mol.GetPropsAsDict()
123
+ propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
124
+
125
+ for colname in propdict:
126
+
127
+ df[colname].append(propdict[colname])
128
+
129
+ for colname in df:
130
+
131
+ if colname not in propdict:
132
+
133
+ df[colname].append(None)
134
+
135
+ col_lengths = {col: len(val) for col, val in df.items()}
136
+
137
+ if len(set(col_lengths.values())) > 1:
138
+
139
+ raise ValueError(f"Column lengths not all the same:\n\t" +
140
+ '\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
141
+
142
+ return DataFrame(df)
143
+
144
+
145
+ FILE_READERS = {
146
+ 'bad_xml': read_weird_xml,
147
+ 'xlsx': partial(read_excel, engine='openpyxl'),
148
+ 'sdf': read_sdf
149
+ }
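A sketch of driving the chunked reader/writer defined above (file names and the `smiles` column are hypothetical; assumes carabiner's `read_table` infers the CSV format from the extension). The mapped function may return either a DataFrame or a `(carry, DataFrame)` tuple:

    from pandas import DataFrame
    from schemist.io import _mutate_df_stream

    def add_smiles_length(df: DataFrame):
        # Carry a per-chunk tally alongside the transformed chunk.
        return {"rows": df.shape[0]}, df.assign(smiles_length=df["smiles"].str.len())

    tallies = _mutate_df_stream("compounds.csv", "with_lengths.csv", add_smiles_length, chunksize=500)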
build/lib/schemist/rest_lookup.py ADDED
@@ -0,0 +1,118 @@
1
+ """Tools for querying PubChem."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Union
4
+ from time import sleep
5
+ from xml.etree import ElementTree
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast
9
+ from carabiner.decorators import vectorize
10
+ from requests import Response, Session
11
+
12
+ _PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
13
+ _CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
14
+
15
+ _OVERLOAD_CODES = {500, 501, 503, 504}
16
+
17
+
18
+ def _url_request(inchikeys: Union[str, Iterable[str]],
19
+ url: str,
20
+ session: Optional[Session] = None,
21
+ **kwargs) -> Response:
22
+
23
+ if session is None:
24
+ session = Session()
25
+
26
+ inchikeys = cast(inchikeys, to=list)
27
+
28
+ return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
29
+
30
+
31
+ def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
32
+ session: Optional[Session] = None,
33
+ counter: int = 0,
34
+ max_tries: int = 10,
35
+ namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
36
+
37
+ r = _url_request(inchikeys, url=_PUBCHEM_URL,
38
+ session=session,
39
+ get="Title,InchiKey", format="XML")
40
+
41
+ if r.status_code == 200:
42
+
43
+ root = ElementTree.fromstring(r.text)
44
+ compounds = root.iter(f'{namespace}Properties')
45
+
46
+ result_dict = dict()
47
+
48
+ for cmpd in compounds:
49
+
50
+ cmpd_dict = dict()
51
+
52
+ for child in cmpd:
53
+ cmpd_dict[child.tag.split(namespace)[1]] = child.text
54
+
55
+ try:
56
+ inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
57
+ except KeyError:
58
+ print(cmpd_dict)
59
+ else:
60
+ result_dict[inchikey] = {'pubchem_name': name.casefold(),
61
+ 'pubchem_id': pcid}
62
+
63
+ print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
64
+
65
+ result_list = [result_dict[inchikey]
66
+ if inchikey in result_dict
67
+ else {'pubchem_name': None, 'pubchem_id': None}
68
+ for inchikey in inchikeys]
69
+
70
+ return result_list
71
+
72
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
73
+
74
+ sleep(1.)
75
+
76
+ return _inchikey2pubchem_name_id(inchikeys,
77
+ session=session,
78
+ counter=counter + 1,
79
+ max_tries=max_tries,
80
+ namespace=namespace)
81
+
82
+ else:
83
+
84
+ print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
85
+
86
+ return [{'pubchem_name': None, 'pubchem_id': None}
87
+ for _ in range(len(inchikeys))]
88
+
89
+
90
+ @vectorize
91
+ def _inchikey2cactus_name(inchikeys: str,
92
+ session: Optional[Session] = None,
93
+ counter: int = 0,
94
+ max_tries: int = 10):
95
+
96
+ r = _url_request(inchikeys, url=_CACTUS_URL,
97
+ session=session,
98
+ get="names")
99
+
100
+ if r.status_code == 200:
101
+
102
+ return r.text.split('\n')[0].casefold()
103
+
104
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
105
+
106
+ sleep(1.)
107
+
108
+ return _inchikey2cactus_name(inchikeys,
109
+ session=session,
110
+ counter=counter + 1,
111
+ max_tries=max_tries)
112
+
113
+ else:
114
+
115
+ print_err(f'Cactus: InchiKey {inchikeys} gave status {r.status_code}')
116
+
117
+ return None
118
+
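An illustrative call to the PubChem helper above (assumes the module is importable as `schemist.rest_lookup`; requires network access, and the exact returned strings depend on PubChem). The InChIKey shown is caffeine's:

    from requests import Session
    from schemist.rest_lookup import _inchikey2pubchem_name_id

    records = _inchikey2pubchem_name_id(["RYYVLZVUVIJVGH-UHFFFAOYSA-N"], session=Session())
    # On success: [{'pubchem_name': 'caffeine', 'pubchem_id': '2519'}]; on failure the fields are None.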
build/lib/schemist/splitting.py ADDED
@@ -0,0 +1,204 @@
1
+ """Tools for splitting tabular datasets, optionally based on chemical features."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+ from collections import defaultdict
5
+ from math import ceil
6
+ from random import random, seed
7
+
8
+ try:
9
+ from itertools import batched
10
+ except ImportError:
11
+ from carabiner.itertools import batched
12
+
13
+ from tqdm.auto import tqdm
14
+
15
+ from .converting import convert_string_representation, _convert_input_to_smiles
16
+ from .typing import DataSplits
17
+
18
+ # def _train_test_splits
19
+
20
+ def _train_test_val_sizes(total: int,
21
+ train: float = 1.,
22
+ test: float = 0.) -> Tuple[int]:
23
+
24
+ n_train = int(ceil(train * total))
25
+ n_test = int(ceil(test * total))
26
+ n_val = total - n_train - n_test
27
+
28
+ return n_train, n_test, n_val
29
+
30
+
31
+ def _random_chunk(strings: str,
32
+ train: float = 1.,
33
+ test: float = 0.,
34
+ carry: Optional[Dict[str, List[int]]] = None,
35
+ start_from: int = 0) -> Dict[str, List[int]]:
36
+
37
+ carry = carry or defaultdict(list)
38
+
39
+ train_test: float = train + test
40
+
41
+ for i, _ in enumerate(strings):
42
+
43
+ random_number: float = random()
44
+
45
+ if random_number < train:
46
+
47
+ key = 'train'
48
+
49
+ elif random_number < train_test:
50
+
51
+ key = 'test'
52
+
53
+ else:
54
+
55
+ key = 'validation'
56
+
57
+ carry[key].append(start_from + i)
58
+
59
+ return carry
60
+
61
+
62
+ def split_random(strings: Union[str, Iterable[str]],
63
+ train: float = 1.,
64
+ test: float = 0.,
65
+ chunksize: Optional[int] = None,
66
+ set_seed: Optional[int] = None,
67
+ *args, **kwargs) -> DataSplits:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if set_seed is not None:
74
+
75
+ seed(set_seed)
76
+
77
+
78
+ if chunksize is None:
79
+
80
+ idx = _random_chunk(strings=strings,
81
+ train=train,
82
+ test=test)
83
+
84
+ else:
85
+
86
+ idx = defaultdict(list)
87
+
88
+ for i, chunk in enumerate(batched(strings, chunksize)):
89
+
90
+ idx = _random_chunk(strings=chunk,
91
+ train=train,
92
+ test=test,
93
+ carry=idx,
94
+ start_from=i * chunksize)
95
+
96
+ seed(None)
97
+
98
+ return DataSplits(**idx)
99
+
100
+
101
+ @_convert_input_to_smiles
102
+ def _scaffold_chunk(strings: str,
103
+ carry: Optional[Dict[str, List[int]]] = None,
104
+ start_from: int = 0) -> Dict[str, List[int]]:
105
+
106
+ carry = carry or defaultdict(list)
107
+
108
+ these_scaffolds = convert_string_representation(strings=strings,
109
+ output_representation='scaffold')
110
+
111
+ for j, scaff in enumerate(these_scaffolds):
112
+ carry[scaff].append(start_from + j)
113
+
114
+ return carry
115
+
116
+
117
+ def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
118
+ train: float = 1.,
119
+ test: float = 0.,
120
+ progress: bool = False) -> DataSplits:
121
+
122
+ scaffold_sets = {key: sorted(value)
123
+ for key, value in scaffold_sets.items()}
124
+ scaffold_sets = sorted(scaffold_sets.items(),
125
+ key=lambda x: (len(x[1]), x[1][0]),
126
+ reverse=True)
127
+ nrows = sum(len(idx) for _, idx in scaffold_sets)
128
+ n_train, n_test, n_val = _train_test_val_sizes(nrows,
129
+ train,
130
+ test)
131
+ idx = defaultdict(list)
132
+
133
+ iterator = tqdm(scaffold_sets) if progress else scaffold_sets
134
+ for _, scaffold_idx in iterator:
135
+
136
+ if (len(idx['train']) + len(scaffold_idx)) > n_train:
137
+
138
+ if (len(idx['test']) + len(scaffold_idx)) > n_test:
139
+
140
+ key = 'validation'
141
+
142
+ else:
143
+
144
+ key = 'test'
145
+ else:
146
+
147
+ key = 'train'
148
+
149
+ idx[key] += scaffold_idx
150
+
151
+ return DataSplits(**idx)
152
+
153
+
154
+ def split_scaffold(strings: Union[str, Iterable[str]],
155
+ train: float = 1.,
156
+ test: float = 0.,
157
+ chunksize: Optional[int] = None,
158
+ progress: bool = True) -> DataSplits:
159
+
160
+ """
161
+
162
+ """
163
+
164
+ if chunksize is None:
165
+
166
+ scaffold_sets = _scaffold_chunk(strings)
167
+
168
+ else:
169
+
170
+ scaffold_sets = defaultdict(list)
171
+
172
+ for i, chunk in enumerate(batched(strings, chunksize)):
173
+
174
+ scaffold_sets = _scaffold_chunk(chunk,
175
+ carry=scaffold_sets,
176
+ start_from=i * chunksize)
177
+
178
+ return _scaffold_aggregator(scaffold_sets,
179
+ train=train, test=test,
180
+ progress=progress)
181
+
182
+
183
+ _SPLITTERS = {#'simpd': split_simpd,
184
+ 'scaffold': split_scaffold,
185
+ 'random': split_random}
186
+
187
+ # _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
188
+ # 'random': 'independent'}
189
+
190
+ _GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}
191
+
192
+ assert all(_type in _SPLITTERS
193
+ for _type in _GROUPED_SPLITTERS) ## Should never fail!
194
+
195
+ def split(split_type: str,
196
+ *args, **kwargs) -> DataSplits:
197
+
198
+ """
199
+
200
+ """
201
+
202
+ splitter = _SPLITTERS[split_type]
203
+
204
+ return splitter(*args, **kwargs)
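A small sketch of the splitting API defined above (SMILES chosen arbitrarily; the scaffold split additionally relies on the structure conversion machinery in `schemist.converting`):

    from schemist.splitting import split

    smiles = ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "CCN"]

    # Random 60/20/20 split of row indices, reproducible via set_seed.
    splits = split("random", strings=smiles, train=0.6, test=0.2, set_seed=1)
    print(splits.train, splits.test, splits.validation)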
build/lib/schemist/tables.py ADDED
@@ -0,0 +1,220 @@
1
+ """Tools for processing tabular data."""
2
+
3
+ from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
4
+ from functools import partial
5
+
6
+ try:
7
+ from itertools import batched
8
+ except ImportError:
9
+ from carabiner.itertools import batched
10
+
11
+ from carabiner.cast import cast, clist
12
+ from carabiner import print_err
13
+ from pandas import DataFrame, concat
14
+
15
+ from .cleaning import clean_smiles, clean_selfies
16
+ from .converting import convert_string_representation
17
+ from .features import calculate_feature
18
+ from .generating import sample_peptides_in_length_range, react
19
+ from .splitting import split
20
+ from .typing import DataSplits
21
+
22
+ def _get_error_tally(df: DataFrame,
23
+ cols: Union[str, List[str]]) -> Dict[str, int]:
24
+
25
+ cols = cast(cols, to=list)
26
+
27
+ try:
28
+ tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
29
+ except TypeError:
30
+ tally = {col: df[col].isna().sum() for col in cols}
31
+
32
+ return tally
33
+
34
+
35
+ def converter(df: DataFrame,
36
+ column: str = 'smiles',
37
+ input_representation: str = 'smiles',
38
+ output_representation: Union[str, List[str]] = 'smiles',
39
+ prefix: Optional[str] = None,
40
+ options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:
41
+
42
+ """
43
+
44
+ """
45
+
46
+ prefix = prefix or ''
+ options = options or {}
47
+
48
+ converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
49
+ output_representation=rep_out,
50
+ input_representation=input_representation,
51
+ **options)
52
+ for rep_out in cast(output_representation, to=list)}
53
+
54
+ column_values = df[column]
55
+
56
+ converted = {col: cast(f(column_values), to=list)
57
+ for col, f in converters.items()}
58
+
59
+ df = df.assign(**converted)
60
+
61
+ return _get_error_tally(df, list(converters)), df
62
+
63
+
64
+ def cleaner(df: DataFrame,
65
+ column: str = 'smiles',
66
+ input_representation: str = 'smiles',
67
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if input_representation.casefold() == 'smiles':
74
+ cleaner = clean_smiles
75
+ elif input_representation.casefold() == 'selfies':
76
+ cleaner = clean_selfies
77
+ else:
78
+ raise ValueError(f"Representation {input_representation} is not supported for cleaning.")
79
+
80
+ prefix = prefix or ''
81
+ new_column = f"{prefix}{column}"
82
+
83
+ df = df.assign(**{new_column: lambda x: cleaner(x[column])})
84
+
85
+ return _get_error_tally(df, new_column), df
86
+
87
+
88
+ def featurizer(df: DataFrame,
89
+ feature_type: str,
90
+ column: str = 'smiles',
91
+ ids: Optional[Union[str, List[str]]] = None,
92
+ input_representation: str = 'smiles',
93
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
94
+
95
+ """
96
+
97
+ """
98
+
99
+ if ids is None:
100
+ ids = df.columns.tolist()
101
+ else:
102
+ ids = cast(ids, to=list)
103
+
104
+ feature_df = calculate_feature(feature_type=feature_type,
105
+ strings=df[column],
106
+ prefix=prefix,
107
+ input_representation=input_representation)
108
+
109
+ if len(ids) > 0:
110
+ df = concat([df[ids], feature_df], axis=1)
111
+
112
+ return _get_error_tally(feature_df, 'meta_feature_valid'), df
113
+
114
+
115
+ def assign_groups(df: DataFrame,
116
+ grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
117
+ group_name: str = 'group',
118
+ column: str = 'smiles',
119
+ input_representation: str = 'smiles',
120
+ *args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:
121
+
122
+ group_idx = grouper(strings=df[column],
123
+ input_representation=input_representation,
124
+ *args, **kwargs)
125
+
126
+ inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
127
+ groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]
128
+
129
+ return group_idx, df.assign(**{group_name: groups})
130
+
131
+
132
+ def _assign_splits(df: DataFrame,
133
+ split_idx: DataSplits,
134
+ use_df_index: bool = False) -> DataFrame:
135
+
136
+ row_index = df.index if use_df_index else tuple(range(df.shape[0]))
137
+
138
+ df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
139
+ for key in split_idx._fields})
140
+ split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}
141
+
142
+ return split_counts, df
143
+
144
+
145
+ def splitter(df: DataFrame,
146
+ split_type: str = 'random',
147
+ column: str = 'smiles',
148
+ input_representation: str = 'smiles',
149
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
150
+
151
+ """
152
+
153
+ """
154
+
155
+ split_idx = split(split_type=split_type,
156
+ strings=df[column],
157
+ input_representation=input_representation,
158
+ *args, **kwargs)
159
+
160
+ return _assign_splits(df, split_idx=split_idx)
161
+
162
+
163
+ def reactor(df: DataFrame,
164
+ column: str = 'smiles',
165
+ reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
166
+ prefix: Optional[str] = None,
167
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
168
+
169
+ """
170
+
171
+ """
172
+
173
+ prefix = prefix or ''
174
+
175
+ reactors = {col: partial(react, reaction=col)
176
+ for col in cast(reaction, to=list)}
177
+
178
+ column_values = df[column]
179
+
180
+ new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
181
+ for col, _reactor in reactors.items()}
182
+
183
+ df = df.assign(**new_columns)
184
+
185
+ return _get_error_tally(df, list(new_columns)), df
186
+
187
+
188
+ def _peptide_table(max_length: int,
189
+ min_length: Optional[int] = None,
190
+ by: int = 1,
191
+ n: Optional[Union[float, int]] = None,
192
+ prefix: str = '',
193
+ suffix: str = '',
194
+ generator: bool = False,
195
+ batch_size: int = 1000,
196
+ *args, **kwargs) -> Union[DataFrame, Generator]:
197
+
198
+ min_length = min_length or max_length
199
+
200
+ peptides = sample_peptides_in_length_range(max_length=max_length,
201
+ min_length=min_length,
202
+ by=by,
203
+ n=n,
204
+ *args, **kwargs)
205
+
206
+ if generator:
207
+
208
+ for peps in batched(peptides, batch_size):
209
+
210
+ peps = [f"{prefix}{pep}{suffix}"
211
+ for pep in peps]
212
+
213
+ yield DataFrame(dict(peptide_sequence=peps))
214
+
215
+ else:
216
+
217
+ peps = [f"{prefix}{pep}{suffix}"
218
+ for pep in peptides]
219
+
220
+ return DataFrame(dict(peptide_sequence=peps))
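A hedged sketch of the table-level wrappers above (exact output values depend on `convert_string_representation` in `schemist.converting`):

    from pandas import DataFrame
    from schemist.tables import converter

    df = DataFrame({"smiles": ["CCO", "c1ccccc1"]})
    tally, out = converter(df, column="smiles", output_representation="inchikey", options={})
    # `tally` counts failed conversions per new column; `out` gains an 'inchikey' column.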
build/lib/schemist/typing.py ADDED
@@ -0,0 +1,7 @@
1
+ """Types used in schemist."""
2
+
3
+ from collections import namedtuple
4
+
5
+ DataSplits = namedtuple('DataSplits',
6
+ ['train', 'test', 'validation'],
7
+ defaults=[tuple(), tuple(), tuple()])
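For reference, the namedtuple above defaults every field to an empty tuple:

    from schemist.typing import DataSplits

    splits = DataSplits(train=(0, 1, 2), test=(3,))
    splits.validation  # -> ()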
build/lib/schemist/utils.py ADDED
@@ -0,0 +1 @@
1
+ """Miscellaneous utilities for schemist."""
docs/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ myst_parser
2
+ matplotlib
3
+ numpy
4
+ openpyxl==3.1.0
5
+ pandas
6
+ scipy
7
+ sphinx_rtd_theme
8
+ ./
docs/source/conf.py ADDED
@@ -0,0 +1,45 @@
1
+ # Configuration file for the Sphinx documentation builder.
2
+ #
3
+ # For the full list of built-in configuration values, see the documentation:
4
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
5
+
6
+ # -- Project information -----------------------------------------------------
7
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8
+
9
+ project = 'schemist'
10
+ copyright = '2024, Eachan Johnson'
11
+ author = 'Eachan Johnson'
12
+ release = '0.0.1'
13
+
14
+ # -- General configuration ---------------------------------------------------
15
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
16
+
17
+ extensions = ['sphinx.ext.doctest',
18
+ 'sphinx.ext.autodoc',
19
+ 'sphinx.ext.autosummary',
20
+ 'sphinx.ext.napoleon',
21
+ 'sphinx.ext.viewcode',
22
+ 'myst_parser']
23
+
24
+ myst_enable_extensions = [
25
+ "amsmath",
26
+ "dollarmath",
27
+ ]
28
+
29
+ source_suffix = {
30
+ '.rst': 'restructuredtext',
31
+ '.txt': 'markdown',
32
+ '.md': 'markdown',
33
+ }
34
+
35
+
36
+ templates_path = ['_templates']
37
+ exclude_patterns = []
38
+
39
+
40
+
41
+ # -- Options for HTML output -------------------------------------------------
42
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
43
+
44
+ html_theme = 'sphinx_rtd_theme'
45
+ html_static_path = []
docs/source/index.md ADDED
@@ -0,0 +1,21 @@
1
+ # ⬢⬢⬢ schemist
2
+
3
+ ![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/schemist/python-publish.yml)
4
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist)
5
+ ![PyPI](https://img.shields.io/pypi/v/schemist)
6
+
7
+ Cleaning, collating, and augmenting chemical datasets.
8
+
9
+ ```{toctree}
10
+ :maxdepth: 2
11
+ :caption: Contents:
12
+
13
+ installation
14
+ usage
15
+ python
16
+ modules
17
+ ```
18
+
19
+ ## Source
20
+
21
+ [GitHub](https://github.com/scbirlab/schemist)
pyproject.toml ADDED
@@ -0,0 +1,60 @@
1
+ [project]
2
+ name = "schemist"
3
+ version = "0.0.1"
4
+ authors = [
5
+ { name="Eachan Johnson", email="[email protected]" },
6
+ ]
7
+ description = "Organizing and processing tables of chemical structures."
8
+ readme = "README.md"
9
+ requires-python = ">=3.8"
10
+ license = {file = "LICENSE"}
11
+ keywords = ["science", "chemistry", "SMILES", "SELFIES", "cheminformatics"]
12
+
13
+ classifiers = [
14
+
15
+ "Development Status :: 3 - Alpha",
16
+
17
+ # Indicate who your project is intended for
18
+ "Intended Audience :: Science/Research",
19
+ "Topic :: Scientific/Engineering :: Chemistry",
20
+
21
+ "License :: OSI Approved :: MIT License",
22
+
23
+ "Programming Language :: Python :: 3.8",
24
+ "Programming Language :: Python :: 3.9",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
27
+ "Programming Language :: Python :: 3 :: Only",
28
+ ]
29
+
30
+ dependencies = [
31
+ "git+https://github.com/scbirlab/carabiner.git",
32
+ "datamol",
33
+ "descriptastorus",
34
+ "nemony",
35
+ "openpyxl==3.1.0",
36
+ "pandas",
37
+ "rdkit",
38
+ "requests",
39
+ "selfies"
40
+ ]
41
+
42
+ [project.urls]
43
+ "Homepage" = "https://github.com/scbirlab/schemist"
44
+ "Repository" = "https://github.com/scbirlab/schemist.git"
45
+ "Bug Tracker" = "https://github.com/scbirlab/schemist/issues"
46
+ "Documentation" = "https://readthedocs.org/schemist"
47
+
48
+ [project.scripts] # Optional
49
+ schemist = "schemist.cli:main"
50
+
51
+ [tool.setuptools]
52
+ # If there are data files included in your packages that need to be
53
+ # installed, specify them here.
54
+ # package-data = {"" = ["*.yml"]}
55
+
56
+ [build-system]
57
+ # These are the assumed default build requirements from pip:
58
+ # https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
59
+ requires = ["setuptools>=43.0.0", "wheel"]
60
+ build-backend = "setuptools.build_meta"
schemist/__init__.py ADDED
File without changes
schemist/cleaning.py ADDED
@@ -0,0 +1,27 @@
1
+ """Chemical structure cleaning routines."""
2
+
3
+ from carabiner.decorators import vectorize
4
+
5
+ from datamol import sanitize_smiles
6
+ import selfies as sf
7
+
8
+ @vectorize
9
+ def clean_smiles(smiles: str,
10
+ *args, **kwargs) -> str:
11
+
12
+ """Sanitize a SMILES string or list of SMILES strings.
13
+
14
+ """
15
+
16
+ return sanitize_smiles(smiles, *args, **kwargs)
17
+
18
+
19
+ @vectorize
20
+ def clean_selfies(selfies: str,
21
+ *args, **kwargs) -> str:
22
+
23
+ """Sanitize a SELFIES string or list of SELFIES strings.
24
+
25
+ """
26
+
27
+ return sf.encode(sanitize_smiles(sf.decode(selfies), *args, **kwargs))
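A quick sketch of the vectorized cleaners above (results come from `datamol.sanitize_smiles`, which typically returns None for structures it cannot sanitize):

    from schemist.cleaning import clean_smiles

    clean_smiles("C1=CC=CC=C1")          # single string in, sanitized SMILES out (e.g. 'c1ccccc1')
    clean_smiles(["CCO", "c1ccccc1O"])   # lists are handled by the @vectorize decorator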
schemist/cli.py ADDED
@@ -0,0 +1,536 @@
1
+ """Command-line interface for schemist."""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from argparse import FileType, Namespace
6
+ from collections import Counter, defaultdict
7
+ from functools import partial
8
+ import os
9
+ import sys
10
+ from tempfile import NamedTemporaryFile, TemporaryDirectory
11
+
12
+ from carabiner import pprint_dict, upper_and_lower, print_err
13
+ from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
14
+ from carabiner.itertools import tenumerate
15
+ from carabiner.pd import get_formats, write_stream
16
+
17
+ from .collating import collate_inventory, deduplicate_file
18
+ from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
19
+ from .generating import AA, REACTIONS
20
+ from .io import _mutate_df_stream
21
+ from .tables import (converter, cleaner, featurizer, assign_groups,
22
+ _assign_splits, splitter, _peptide_table, reactor)
23
+ from .splitting import _SPLITTERS, _GROUPED_SPLITTERS
24
+
25
+ __version__ = '0.0.1'
26
+
27
+ def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:
28
+
29
+ options = {}
30
+
31
+ try:
32
+ for opt in x:
33
+
34
+ try:
35
+ key, value = opt.split('=')
36
+ except ValueError:
37
+ raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")
38
+
39
+ try:
40
+ value = int(value)
41
+ except ValueError:
42
+ try:
43
+ value = float(value)
44
+ except ValueError:
45
+ pass
46
+
47
+ options[key] = value
48
+
49
+ except TypeError:
50
+
51
+ pass
52
+
53
+ return options
54
+
55
+
56
+ def _sum_tally(tallies: List[Dict[str, Any]],
57
+ message: str = "Error counts",
58
+ use_length: bool = False):
59
+
60
+ total_tally = Counter()
61
+
62
+ for tally in tallies:
63
+
64
+ if use_length:
65
+ total_tally.update({key: len(value) for key, value in tally.items()})
66
+ else:
67
+ total_tally.update(tally)
68
+
69
+ if len(tallies) == 0:
70
+ raise ValueError(f"Nothing generated!")
71
+
72
+ pprint_dict(total_tally, message=message)
73
+
74
+ return total_tally
75
+
76
+
77
+ @clicommand(message="Cleaning file with the following parameters")
78
+ def _clean(args: Namespace) -> None:
79
+
80
+ error_tallies = _mutate_df_stream(input_file=args.input,
81
+ output_file=args.output,
82
+ function=partial(cleaner,
83
+ column=args.column,
84
+ input_representation=args.representation,
85
+ prefix=args.prefix),
86
+ file_format=args.format)
87
+
88
+ _sum_tally(error_tallies)
89
+
90
+ return None
91
+
92
+
93
+ @clicommand(message="Converting between string representations with the following parameters")
94
+ def _convert(args: Namespace) -> None:
95
+
96
+ options = _option_parser(args.options)
97
+
98
+ error_tallies = _mutate_df_stream(input_file=args.input,
99
+ output_file=args.output,
100
+ function=partial(converter,
101
+ column=args.column,
102
+ input_representation=args.representation,
103
+ output_representation=args.to,
104
+ prefix=args.prefix,
105
+ options=options),
106
+ file_format=args.format)
107
+
108
+ _sum_tally(error_tallies)
109
+
110
+ return None
111
+
112
+
113
+ @clicommand(message="Adding features to files with the following parameters")
114
+ def _featurize(args: Namespace) -> None:
115
+
116
+ error_tallies = _mutate_df_stream(input_file=args.input,
117
+ output_file=args.output,
118
+ function=partial(featurizer,
119
+ feature_type=args.feature,
120
+ column=args.column,
121
+ ids=args.id,
122
+ input_representation=args.representation,
123
+ prefix=args.prefix),
124
+ file_format=args.format)
125
+
126
+ _sum_tally(error_tallies)
127
+
128
+ return None
129
+
130
+
131
+ @clicommand(message="Splitting table with the following parameters")
132
+ def _split(args: Namespace) -> None:
133
+
134
+ split_type = args.type.casefold()
135
+
136
+ if split_type in _GROUPED_SPLITTERS:
137
+
138
+ chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]
139
+
140
+ with TemporaryDirectory() as dir:
141
+
142
+ with NamedTemporaryFile("w", dir=dir, delete=False) as f:
143
+
144
+ group_idxs = _mutate_df_stream(input_file=args.input,
145
+ output_file=f,
146
+ function=partial(assign_groups,
147
+ grouper=chunk_processor,
148
+ group_name=split_type,
149
+ column=args.column,
150
+ input_representation=args.representation),
151
+ file_format=args.format)
152
+ f.close()
153
+ new_group_idx = defaultdict(list)
154
+
155
+ totals = 0
156
+ for group_idx in group_idxs:
157
+ these_totals = 0
158
+ for key, value in group_idx.items():
159
+ these_totals += len(value)
160
+ new_group_idx[key] += [idx + totals for idx in value]
161
+ totals += these_totals
162
+
163
+ group_idx = aggregator(new_group_idx,
164
+ train=args.train,
165
+ test=args.test)
166
+
167
+ split_tallies = _mutate_df_stream(input_file=f.name,
168
+ output_file=args.output,
169
+ function=partial(_assign_splits,
170
+ split_idx=group_idx,
171
+ use_df_index=True),
172
+ file_format=args.format)
173
+ if os.path.exists(f.name):
174
+ os.remove(f.name)
175
+
176
+ else:
177
+
178
+ split_tallies = _mutate_df_stream(input_file=args.input,
179
+ output_file=args.output,
180
+ function=partial(splitter,
181
+ split_type=args.type,
182
+ column=args.column,
183
+ input_representation=args.representation,
184
+ train=args.train,
185
+ test=args.test,
186
+ set_seed=args.seed),
187
+ file_format=args.format)
188
+
189
+ _sum_tally(split_tallies,
190
+ message="Split counts")
191
+
192
+ return None
193
+
194
+
195
+ @clicommand(message="Collating files with the following parameters")
196
+ def _collate(args: Namespace) -> None:
197
+
198
+ root_dir = args.data_dir or '.'
199
+
200
+ error_tallies = _mutate_df_stream(input_file=args.input,
201
+ output_file=args.output,
202
+ function=partial(collate_inventory,
203
+ root_dir=root_dir,
204
+ drop_unmapped=not args.keep_extra_columns,
205
+ catalog_smiles_column=args.column,
206
+ id_column_name=args.id_column,
207
+ id_n_digits=args.digits,
208
+ id_prefix=args.prefix),
209
+ file_format=args.format)
210
+
211
+ _sum_tally(error_tallies,
212
+ message="Collated chemicals:")
213
+
214
+ return None
215
+
216
+
217
+ @clicommand(message="Deduplicating chemical structures with the following parameters")
218
+ def _dedup(args: Namespace) -> None:
219
+
220
+ report, deduped_df = deduplicate_file(args.input,
221
+ format=args.format,
222
+ column=args.column,
223
+ input_representation=args.representation,
224
+ index_columns=args.indexes,
+ drop_inchikey=args.drop_inchikey)
225
+
226
+ if args.prefix is not None and 'inchikey' in deduped_df:
227
+ deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})
228
+
229
+ write_stream(deduped_df,
230
+ output=args.output,
231
+ format=args.format)
232
+
233
+ pprint_dict(report, message="Finished deduplicating:")
234
+
235
+ return None
236
+
237
+
238
+ @clicommand(message="Enumerating peptides with the following parameters")
239
+ def _enum(args: Namespace) -> None:
240
+
241
+ tables = _peptide_table(max_length=args.max_length,
242
+ min_length=args.min_length,
243
+ n=args.number,
244
+ indexes=args.slice,
245
+ set_seed=args.seed,
246
+ prefix=args.prefix,
247
+ suffix=args.suffix,
248
+ d_aa_only=args.d_aa_only,
249
+ include_d_aa=args.include_d_aa,
250
+ alphabet=args.alphabet,
+ generator=True)
251
+
252
+ dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
253
+ dAA_use = dAA_use or args.include_d_aa or args.d_aa_only
254
+
255
+ tallies, error_tallies = [], []
256
+ options = _option_parser(args.options)
257
+ _converter = partial(converter,
258
+ column='peptide_sequence',
259
+ input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
260
+ output_representation=args.to,
261
+ options=options)
262
+
263
+ for i, table in tenumerate(tables):
264
+
265
+ _err_tally, df = _converter(table)
266
+
267
+ tallies.append({"Number of peptides": df.shape[0]})
268
+ error_tallies.append(_err_tally)
269
+
270
+ write_stream(df,
271
+ output=args.output,
272
+ format=args.format,
273
+ mode='w' if i == 0 else 'a',
274
+ header=i == 0)
275
+
276
+ _sum_tally(tallies,
277
+ message="Enumerated peptides")
278
+ _sum_tally(error_tallies,
279
+ message="Conversion errors")
280
+
281
+ return None
282
+
283
+
284
+ @clicommand(message="Reacting peptides with the following parameters")
285
+ def _react(args: Namespace) -> None:
286
+
287
+ error_tallies = _mutate_df_stream(input_file=args.input,
288
+ output_file=args.output,
289
+ function=partial(reactor,
290
+ column=args.column,
291
+ input_representation=args.representation,
292
+ reaction=args.reaction,
293
+ product_name=args.name),
294
+ file_format=args.format)
295
+
296
+ _sum_tally(error_tallies)
297
+
298
+ return None
299
+
300
+
301
+ def main() -> None:
302
+
303
+ inputs = CLIOption('input',
304
+ default=sys.stdin,
305
+ type=FileType('r'),
306
+ nargs='?',
307
+ help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
308
+ representation = CLIOption('--representation', '-r',
309
+ type=str,
310
+ default='SMILES',
311
+ choices=upper_and_lower(_FROM_FUNCTIONS),
312
+ help='Chemical representation to use for input. ')
313
+ column = CLIOption('--column', '-c',
314
+ default='smiles',
315
+ type=str,
316
+ help='Column to use as input string representation. ')
317
+ prefix = CLIOption('--prefix', '-p',
318
+ default=None,
319
+ type=str,
320
+ help='Prefix to add to new column name. Default: no prefix')
321
+ to = CLIOption('--to', '-2',
322
+ type=str,
323
+ default='SMILES',
324
+ nargs='*',
325
+ choices=upper_and_lower(_TO_FUNCTIONS),
326
+ help='Format to convert to.')
327
+ options = CLIOption('--options', '-x',
328
+ type=str,
329
+ default=None,
330
+ nargs='*',
331
+ help='Options to pass to converter, in the format '
332
+ '"keyword1=value1 keyword2=value2"')
333
+ output = CLIOption('--output', '-o',
334
+ type=FileType('w'),
335
+ default=sys.stdout,
336
+ help='Output file. Default: STDOUT')
337
+ formatting = CLIOption('--format', '-f',
338
+ type=str,
339
+ default=None,
340
+ choices=upper_and_lower(get_formats()),
341
+ help='Override file extensions for input and output. '
342
+ 'Default: infer from file extension.')
343
+
344
+ ## featurize
345
+ id_feat = CLIOption('--id', '-i',
346
+ type=str,
347
+ default=None,
348
+ nargs='*',
349
+ help='Columns to retain in output table. Default: use all')
350
+ feature = CLIOption('--feature', '-t',
351
+ type=str,
352
+ default='2d',
353
+ choices=['2d', 'fp'], ## TODO: implement 3d
354
+ help='Which feature type to generate.')
355
+
356
+ ## split
357
+ type_ = CLIOption('--type', '-t',
358
+ type=str,
359
+ default='random',
360
+ choices=upper_and_lower(_SPLITTERS),
361
+ help='Which split type to use.')
362
+ train = CLIOption('--train', '-a',
363
+ type=float,
364
+ default=1.,
365
+ help='Proportion of data to use for training. ')
366
+ test = CLIOption('--test', '-b',
367
+ type=float,
368
+ default=0.,
369
+ help='Proportion of data to use for testing. ')
370
+
371
+ ## collate
372
+ data_dir = CLIOption('--data-dir', '-d',
373
+ type=str,
374
+ default=None,
375
+ help='Directory containing data files. '
376
+ 'Default: current directory')
377
+ id_column = CLIOption('--id-column', '-s',
378
+ default=None,
379
+ type=str,
380
+ help='If provided, add a structure ID column with this name. '
381
+ 'Default: don\'t add structure IDs')
382
+ prefix_collate = CLIOption('--prefix', '-p',
383
+ default='ID-',
384
+ type=str,
385
+ help='Prefix to add to structure IDs. '
386
+ 'Default: no prefix')
387
+ digits = CLIOption('--digits', '-n',
388
+ default=8,
389
+ type=int,
390
+ help='Number of digits in structure IDs. ')
391
+ keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
392
+ action='store_true',
393
+ help='Whether to keep columns not mentioned in the catalog. '
394
+ 'Default: drop extra columns.')
395
+ keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
396
+ action='store_true',
397
+ help='Whether to keep rows with invalid SMILES. '
398
+ 'Default: drop invalid rows.')
399
+
400
+ ## dedup
401
+ indexes = CLIOption('--indexes', '-x',
402
+ type=str,
403
+ default=None,
404
+ nargs='*',
405
+ help='Columns to retain and collapse (if multiple values per unique structure). '
406
+ 'Default: retain no other columns than structure and InchiKey.')
407
+ drop_inchikey = CLIOption('--drop-inchikey', '-d',
408
+ action='store_true',
409
+ help='Whether to drop the calculated InchiKey column. '
410
+ 'Default: keep InchiKey.')
411
+
412
+ ### enum
413
+ max_length = CLIOption('--max-length', '-l',
414
+ type=int,
415
+ help='Maximum length of enumerated peptide. '
416
+ 'Required.')
417
+ min_length = CLIOption('--min-length', '-m',
418
+ type=int,
419
+ default=None,
420
+ help='Minimum length of enumerated peptide. '
421
+ 'Default: same as maximum, i.e. all peptides same length.')
422
+ number_to_gen = CLIOption('--number', '-n',
423
+ type=float,
424
+ default=None,
425
+ help='Number of peptides to sample from all possible '
426
+ 'within the constraints. If less than 1, sample '
427
+ 'that fraction of all possible. If greater than 1, '
428
+ 'sample that number. '
429
+ 'Default: return all peptides.')
430
+ slicer = CLIOption('--slice', '-z',
431
+ type=str,
432
+ default=None,
433
+ nargs='*',
434
+ help='Subset of (possibly sampled) population to return, in the format <stop> '
435
+ 'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
436
+ 'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
437
+ '3 500 2 gives every other from 3 to 500. Default: return all.')
438
+ alphabet = CLIOption('--alphabet', '-b',
439
+ type=str,
440
+ default=''.join(AA),
441
+ help='Alphabet to use in sampling.')
442
+ suffix = CLIOption('--suffix', '-s',
443
+ type=str,
444
+ default='',
445
+ help='Sequence to add to end. Lowercase for D-amino acids. '
446
+ 'Default: no suffix.')
447
+ set_seed = CLIOption('--seed', '-e',
448
+ type=int,
449
+ default=None,
450
+ help='Seed to use for reproducible randomness. '
451
+ 'Default: don\'t enable reproducibility.')
452
+ d_aa_only = CLIOption('--d-aa-only', '-a',
453
+ action='store_true',
454
+ help='Whether to only use D-amino acids. '
455
+ 'Default: don\'t include.')
456
+ include_d_aa = CLIOption('--include-d-aa', '-y',
457
+ action='store_true',
458
+ help='Whether to include D-amino acids in enumeration. '
459
+ 'Default: don\'t include.')
460
+
461
+ ## reaction
462
+ name = CLIOption('--name', '-n',
463
+ type=str,
464
+ default=None,
465
+ help='Name of column for product. '
466
+ 'Default: same as reaction name.')
467
+ reaction_opt = CLIOption('--reaction', '-x',
468
+ type=str,
469
+ nargs='*',
470
+ choices=list(REACTIONS),
471
+ default='N_to_C_cyclization',
472
+ help='Reaction(s) to apply.')
473
+
474
+ clean = CLICommand('clean',
475
+ description='Clean and normalize SMILES column of a table.',
476
+ main=_clean,
477
+ options=[output, formatting, inputs, representation, column, prefix])
478
+ convert = CLICommand('convert',
479
+ description='Convert between string representations of chemical structures.',
480
+ main=_convert,
481
+ options=[output, formatting, inputs, representation, column, prefix, to, options])
482
+ featurize = CLICommand('featurize',
483
+ description='Calculate chemical features (2D descriptors or fingerprints) from string representations of chemical structures.',
484
+ main=_featurize,
485
+ options=[output, formatting, inputs, representation, column, prefix,
486
+ id_feat, feature])
487
+ collate = CLICommand('collate',
488
+ description='Collect disparate tables or SDF files of libraries into a single table.',
489
+ main=_collate,
490
+ options=[output, formatting, inputs, representation,
491
+ data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
492
+ digits, keep_extra_columns, keep_invalid_smiles])
493
+ dedup = CLICommand('dedup',
494
+ description='Deduplicate chemical structures and retain references.',
495
+ main=_dedup,
496
+ options=[output, formatting, inputs, representation, column, prefix,
497
+ indexes, drop_inchikey])
498
+ enum = CLICommand('enumerate',
499
+ description='Enumerate bio-chemical structures within length and sequence constraints.',
500
+ main=_enum,
501
+ options=[output, formatting, to, options,
502
+ alphabet, max_length, min_length, number_to_gen,
503
+ slicer, set_seed,
504
+ prefix.replace(default='',
505
+ help='Sequence to prepend. Lowercase for D-amino acids. '
506
+ 'Default: no prefix.'),
507
+ suffix,
508
+ type_.replace(default='aa',
509
+ choices=['aa'],
510
+ help='Type of bio sequence to enumerate. '
511
+ 'Default: %(default)s.'),
512
+ d_aa_only, include_d_aa])
513
+ reaction = CLICommand('react',
514
+ description='React compounds in silico in indicated columns using a named reaction.',
515
+ main=_react,
516
+ options=[output, formatting, inputs, representation, column, name,
517
+ reaction_opt])
518
+ split = CLICommand('split',
519
+ description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
520
+ main=_split,
521
+ options=[output, formatting, inputs, representation, column, prefix,
522
+ type_, train, test, set_seed])
523
+
524
+ app = CLIApp("schemist",
525
+ version=__version__,
526
+ description="Tools for cleaning, collating, and augmenting chemical datasets.",
527
+ commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])
528
+
529
+ app.run()
530
+
531
+ return None
532
+
533
+
534
+ if __name__ == "__main__":
535
+
536
+ main()
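The `--options`/`-x` flag handled above is parsed by `_option_parser`, which splits each `keyword=value` token and coerces values to int or float where possible; for example:

    from schemist.cli import _option_parser

    _option_parser(["n=8", "prefix=ID-", "cutoff=0.5"])
    # -> {'n': 8, 'prefix': 'ID-', 'cutoff': 0.5}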
schemist/collating.py ADDED
@@ -0,0 +1,315 @@
1
+ """Tools to collate chemical data files."""
2
+
3
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union
4
+
5
+ from collections import Counter
6
+ from functools import partial
7
+ from glob import glob
8
+ import os
9
+
10
+ from carabiner.pd import read_table, resolve_delim
11
+ from carabiner import print_err
12
+ import numpy as np
13
+ from pandas import DataFrame, concat
14
+
15
+ from .converting import convert_string_representation, _FROM_FUNCTIONS
16
+ from .io import FILE_READERS
17
+
18
+ GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
19
+ ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")
20
+
21
+ def _column_mapper(df: DataFrame,
22
+ cols: Iterable[str]) -> Tuple[Callable, Dict]:
23
+
24
+ basic_map = {column: df[column].tolist()[0] for column in cols}
25
+ inv_basic_map = {value: key for key, value in basic_map.items()}
26
+
27
+ def column_mapper(x: DataFrame) -> DataFrame:
28
+
29
+ new_df = DataFrame()
30
+
31
+ for new_col, old_col in basic_map.items():
32
+
33
+ # old_col = str(old_col)
34
+
35
+ if old_col is None or str(old_col) in ('None', 'nan', 'NA'):
36
+
37
+ new_df[new_col] = None
38
+
39
+ elif '+' in old_col:
40
+
41
+ splits = old_col.split('+')
42
+ new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
43
+ for s in splits[1:]])
44
+
45
+ elif ';' in old_col:
46
+
47
+ col, char, index = old_col.split(';')
48
+ index = [int(i) for i in index.split(':')]
49
+
50
+ if len(index) == 1:
51
+ index = slice(index[0], index[0] + 1)
52
+ else:
53
+ index = slice(*index)
54
+
55
+ try:
56
+
57
+ new_df[new_col] = (x[col]
58
+ .str.split(char)
59
+ .map(lambda y: char.join(y[index] if y is not np.nan else []))
60
+ .str.strip())
61
+
62
+ except TypeError as e:
63
+
64
+ print_err(x[col].str.split(char))
65
+
66
+ raise e
67
+
68
+ else:
69
+
70
+ new_df[new_col] = x[old_col].copy()
71
+
72
+ return new_df
73
+
74
+ return column_mapper, inv_basic_map
75
+
76
+
77
+ def _check_catalog(catalog: DataFrame,
78
+ catalog_smiles_column: str = 'input_smiles') -> None:
79
+
80
+ essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
81
+ missing_essential_cols = [col for col in essential_columns
82
+ if col not in catalog]
83
+
84
+ if len(missing_essential_cols) > 0:
85
+
86
+ print_err(catalog.columns.tolist())
87
+
88
+ raise KeyError("Missing required columns from catalog: " +
89
+ ", ".join(missing_essential_cols))
90
+
91
+ return None
92
+
93
+
94
+ def collate_inventory(catalog: DataFrame,
95
+ root_dir: Optional[str] = None,
96
+ drop_invalid: bool = True,
97
+ drop_unmapped: bool = False,
98
+ catalog_smiles_column: str = 'input_smiles',
99
+ id_column_name: Optional[str] = None,
100
+ id_n_digits: int = 8,
101
+ id_prefix: str = '') -> DataFrame:
102
+
103
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
104
+
105
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
106
+
107
+ - filename is a glob pattern of files to collate
108
+ - file_format is one of {', '.join(FILE_READERS.keys())}
109
+ - smiles_column contains smiles strings
110
+
111
+ Other columns are optional and can have any name, but must contain the name or a pattern
112
+ matching a column (for tabular data) or field (for SDF data) in the files
113
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
114
+
115
+ Optional column contents can be either concatenated or split using the following
116
+ pattern:
117
+
118
+ - col1+col2: concatenates the contents of `col1` and `col2`
119
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
120
+
121
+ Parameters
122
+ ----------
123
+ catalog : pd.DataFrame
124
+ Table cataloging locations and format of data. Requires
125
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
126
+ root_dir : str, optional
127
+ Path to look for data files. Default: current directory.
128
+ drop_invalid : bool, optional
129
+ Whether to drop rows containing invalid SMILES.
130
+
131
+
132
+ Returns
133
+ -------
134
+ pd.DataFrame
135
+ Collated chemical data.
136
+
137
+ """
138
+
139
+ root_dir = root_dir or '.'
140
+
141
+ _check_catalog(catalog, catalog_smiles_column)
142
+
143
+ nongroup_columns = [col for col in catalog
144
+ if col not in GROUPING_COLUMNS]
145
+ loaded_dataframes = []
146
+ report = Counter({"invalid SMILES": 0,
147
+ "rows processed": 0})
148
+
149
+ grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
150
+ for (this_glob, this_filetype,
151
+ this_library_name, this_representation), filename_df in grouped_catalog:
152
+
153
+ print_err(f'\nProcessing {this_glob}:')
154
+
155
+ this_glob = glob(os.path.join(root_dir, this_glob))
156
+
157
+ these_filenames = sorted(f for f in this_glob
158
+ if not os.path.basename(f).startswith('~$'))
159
+ print_err('\t- ' + '\n\t- '.join(these_filenames))
160
+
161
+ column_mapper, mapped_cols = _column_mapper(filename_df,
162
+ nongroup_columns)
163
+
164
+ reader = FILE_READERS[this_filetype]
165
+
166
+ for filename in these_filenames:
167
+
168
+ this_data0 = reader(filename)
169
+
170
+ if not drop_unmapped:
171
+ unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
172
+ for col in this_data0 if col not in mapped_cols}
173
+ this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
174
+ this_data = concat([column_mapper(this_data0), this_data],
175
+ axis=1)
176
+ else:
177
+ this_data = column_mapper(this_data0)
178
+
179
+ if this_representation.casefold() not in _FROM_FUNCTIONS:
180
+
181
+ raise TypeError(' or '.join({this_representation, this_representation.casefold()}) +
182
+ " is not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))
183
+
184
+ this_converter = partial(convert_string_representation,
185
+ input_representation=this_representation.casefold())
186
+
187
+ this_data = (this_data
188
+ .query('compound_collection != "NA"')
189
+ .assign(library_name=this_library_name,
190
+ input_file_format=this_filetype,
191
+ input_string_representation=this_representation,
192
+ plate_id=lambda x: x['plate_id'].astype(str),
193
+ plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
194
+ canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
195
+ is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))
196
+
197
+ report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
198
+ "rows processed": this_data.shape[0]})
199
+
200
+ if drop_invalid:
201
+
202
+ this_data = this_data.query('is_valid_smiles')
203
+
204
+ if id_column_name is not None:
205
+
206
+ this_converter = partial(convert_string_representation,
207
+ output_representation='id',
208
+ options=dict(n=id_n_digits,
209
+ prefix=id_prefix))
210
+ this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})
211
+
212
+ loaded_dataframes.append(this_data)
213
+
214
+ collated_df = concat(loaded_dataframes, axis=0)
215
+
216
+ return report, collated_df
217
+
218
+
219
+ def collate_inventory_from_file(catalog_path: Union[str, TextIO],
220
+ root_dir: Optional[str] = None,
221
+ format: Optional[str] = None,
222
+ *args, **kwargs) -> DataFrame:
223
+
224
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
225
+
226
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
227
+
228
+ - filename is a glob pattern of files to collate
229
+ - file_format is one of {', '.join(FILE_READERS.keys())}
230
+ - smiles_column contains smiles strings
231
+
232
+ Other columns are optional and can have any name, but must contain the name or a pattern
233
+ matching a column (for tabular data) or field (for SDF data) in the files
234
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
235
+
236
+ Optional column contents can be either concatenated or split using the following
237
+ pattern:
238
+
239
+ - col1+col2: concatenates the contents of `col1` and `col2`
240
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
241
+
242
+ Parameters
243
+ ----------
244
+ catalog_path : str
245
+ Path to catalog file in XLSX, TSV or CSV format. Requires
246
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
247
+ format : str, optional
248
+ Format of catalog file. Default: infer from file extension.
249
+ root_dir : str, optional
250
+ Path to look for data files. Default: use directory containing
251
+ the catalog.
252
+
253
+ Returns
254
+ -------
255
+ Counter, pd.DataFrame
256
+ Tally of processed rows and invalid SMILES, and the collated chemical data.
257
+
258
+ """
259
+
260
+ root_dir = root_dir or os.path.dirname(catalog_path)
261
+
262
+ data_catalog = read_table(catalog_path, format=format)
263
+
264
+ return collate_inventory(catalog=data_catalog,
265
+ root_dir=root_dir,
266
+ *args, **kwargs)
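+
+ # Usage sketch (not part of the original module; the paths are hypothetical).
+ # The report keys are the ones tallied by collate_inventory above:
+ #
+ #     report, collated = collate_inventory_from_file("catalog.xlsx", root_dir="data/")
+ #     print(report["rows processed"], report["invalid SMILES"])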
267
+
268
+
269
+ def deduplicate(df: DataFrame,
270
+ column: str = 'smiles',
271
+ input_representation: str = 'smiles',
272
+ index_columns: Optional[List[str]] = None,
273
+ drop_inchikey: bool = False) -> DataFrame:
274
+
275
+ index_columns = index_columns or []
276
+
277
+ inchikey_converter = partial(convert_string_representation,
278
+ input_representation=input_representation,
279
+ output_representation='inchikey')
280
+
281
+ df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))
282
+
283
+ structure_columns = [column, 'inchikey']
284
+ df_unique = []
285
+
286
+ for (string_rep, inchikey), structure_df in df.groupby(structure_columns):
287
+
288
+ collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
289
+ for col in structure_df if col in index_columns}
290
+ collapsed_indexes.update({column: [string_rep],
291
+ 'inchikey': [inchikey],
292
+ 'instance_count': [structure_df.shape[0]]})
293
+
294
+ df_unique.append(DataFrame(collapsed_indexes))
295
+
296
+ df_unique = concat(df_unique, axis=0)
297
+
298
+ if drop_inchikey:
299
+
300
+ df_unique = df_unique.drop(columns=['inchikey'])
301
+
302
+ report = {'starting_rows': df.shape[0],
303
+ 'ending_rows': df_unique.shape[0]}
304
+
305
+ return report, df_unique
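+
+ # Usage sketch (the "plate_id" index column is illustrative):
+ #
+ #     report, unique_df = deduplicate(df, column="smiles", index_columns=["plate_id"])
+ #
+ # Each output row is one unique (smiles, inchikey) pair; the plate_id values of its
+ # duplicates are joined with ";" and their number is recorded in "instance_count".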
306
+
307
+
308
+ def deduplicate_file(filename: Union[str, TextIO],
309
+ format: Optional[str] = None,
310
+ *args, **kwargs) -> DataFrame:
311
+
312
+ table = read_table(filename, format=format)
313
+
314
+ return deduplicate(table, *args, **kwargs)
315
+
schemist/converting.py ADDED
@@ -0,0 +1,308 @@
1
+ """Converting between chemical representation formats."""
2
+
3
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
4
+
5
+ from functools import wraps
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast, flatten
9
+ from carabiner.decorators import return_none_on_error, vectorize
10
+ from carabiner.itertools import batched
11
+
12
+ from datamol import sanitize_smiles
13
+ import nemony as nm
14
+ from pandas import DataFrame
15
+ from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
16
+ MolFromSmiles, MolToInchi, MolToInchiKey,
17
+ MolToSmiles)
18
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
19
+ from requests import Session
20
+ import selfies as sf
21
+
22
+ from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name
23
+
24
+ @vectorize
25
+ @return_none_on_error
26
+ def _seq2mol(s: str) -> Union[Mol, None]:
27
+
28
+ return MolFromSequence(s, sanitize=True)
29
+
30
+
31
+ @vectorize
32
+ @return_none_on_error
33
+ def _helm2mol(s: str) -> Union[Mol, None]:
34
+
35
+ return MolFromHELM(s, sanitize=True)
36
+
37
+
38
+ def mini_helm2helm(s: str) -> str:
39
+
40
+ new_s = []
41
+ token = ''
42
+ between_sq_brackets = False
43
+
44
+ for letter in s:
45
+
46
+ if letter.islower() and not between_sq_brackets:
47
+
48
+ letter = f"[d{letter.upper()}]"
49
+
50
+ token += letter
51
+
52
+ if letter == '[':
53
+ between_sq_brackets = True
54
+ elif letter == ']':
55
+ between_sq_brackets = False
56
+
57
+ if not between_sq_brackets:
58
+ new_s.append(token)
59
+ token = ''
60
+
61
+ return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))
62
+
63
+
64
+ @vectorize
65
+ @return_none_on_error
66
+ def _mini_helm2mol(s: str) -> Mol:
67
+
68
+ s = mini_helm2helm(s)
69
+
70
+ return MolFromHELM(s, sanitize=True)
71
+
72
+
73
+ @vectorize
74
+ @return_none_on_error
75
+ def _inchi2mol(s: str) -> Mol:
76
+
77
+ return MolFromInchi(s,
78
+ sanitize=True,
79
+ removeHs=True)
80
+
81
+ @vectorize
82
+ # @return_none_on_error
83
+ def _smiles2mol(s: str) -> Mol:
84
+
85
+ return MolFromSmiles(sanitize_smiles(s))
86
+
87
+
88
+ @vectorize
89
+ @return_none_on_error
90
+ def _selfies2mol(s: str) -> Mol:
91
+
92
+ return MolFromSmiles(sf.decoder(s))
93
+
94
+
95
+ @vectorize
96
+ @return_none_on_error
97
+ def _mol2nonstandard_inchikey(m: Mol,
98
+ **kwargs) -> str:
99
+
100
+ return MolToInchiKey(m,
101
+ options="/FixedH /SUU /RecMet /KET /15T")
102
+
103
+
104
+ @vectorize
105
+ @return_none_on_error
106
+ def _mol2hash(m: Mol,
107
+ **kwargs) -> str:
108
+
109
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
110
+
111
+ return nm.hash(nonstandard_inchikey)
112
+
113
+
114
+ @vectorize
115
+ @return_none_on_error
116
+ def _mol2id(m: Mol,
117
+ n: int = 8,
118
+ prefix: str = '',
119
+ **kwargs) -> str:
120
+
121
+ return prefix + str(int(_mol2hash(m), 16))[:n]
122
+
123
+
124
+ @vectorize
125
+ @return_none_on_error
126
+ def _mol2isomeric_canonical_smiles(m: Mol,
127
+ **kwargs) -> str:
128
+
129
+ return MolToSmiles(m,
130
+ isomericSmiles=True,
131
+ canonical=True)
132
+
133
+
134
+ @vectorize
135
+ @return_none_on_error
136
+ def _mol2inchi(m: Mol,
137
+ **kwargs) -> str:
138
+
139
+ return MolToInchi(m)
140
+
141
+
142
+ @vectorize
143
+ @return_none_on_error
144
+ def _mol2inchikey(m: Mol,
145
+ **kwargs) -> str:
146
+
147
+ return MolToInchiKey(m)
148
+
149
+
150
+ @vectorize
151
+ @return_none_on_error
152
+ def _mol2random_smiles(m: Mol,
153
+ **kwargs) -> str:
154
+
155
+ return MolToSmiles(m,
156
+ isomericSmiles=True,
157
+ doRandom=True)
158
+
159
+
160
+ @vectorize
161
+ @return_none_on_error
162
+ def _mol2mnemonic(m: Mol,
163
+ **kwargs) -> str:
164
+
165
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
166
+
167
+ return nm.encode(nonstandard_inchikey)
168
+
169
+
170
+ def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
171
+ session: Optional[Session] = None,
172
+ chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:
173
+
174
+ inchikeys = cast(_mol2inchikey(m), to=list)
175
+ pubchem_ids = []
176
+
177
+ for _inchikeys in batched(inchikeys, chunksize):
178
+
179
+ these_ids = _inchikey2pubchem_name_id(_inchikeys,
180
+ session=session)
181
+ pubchem_ids += these_ids
182
+
183
+ return pubchem_ids
184
+
185
+
186
+ @return_none_on_error
187
+ def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
188
+ session: Optional[Session] = None,
189
+ chunksize: int = 32,
190
+ **kwargs) -> Union[str, List[str]]:
191
+
192
+ return flatten([val['pubchem_id']
193
+ for val in _mol2pubchem(m,
194
+ session=session,
195
+ chunksize=chunksize)])
196
+
197
+
198
+ @return_none_on_error
199
+ def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
200
+ session: Optional[Session] = None,
201
+ chunksize: int = 32,
202
+ **kwargs) -> Union[str, List[str]]:
203
+
204
+ return flatten([val['pubchem_name']
205
+ for val in _mol2pubchem(m,
206
+ session=session,
207
+ chunksize=chunksize)])
208
+
209
+ @return_none_on_error
210
+ def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
211
+ session: Optional[Session] = None,
212
+ **kwargs) -> Union[str, List[str]]:
213
+
214
+ return _inchikey2cactus_name(_mol2inchikey(m),
215
+ session=session)
216
+
217
+
218
+ @vectorize
219
+ @return_none_on_error
220
+ def _mol2scaffold(m: Mol,
221
+ chiral: bool = True,
222
+ **kwargs) -> str:
223
+
224
+ return MurckoScaffoldSmiles(mol=m,
225
+ includeChirality=chiral)
226
+
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def _mol2selfies(m: Mol,
231
+ **kwargs) -> str:
232
+
233
+ s = sf.encoder(_mol2isomeric_canonical_smiles(m))
234
+
235
+ return s if s != -1 else None
236
+
237
+
238
+ _TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
239
+ "selfies": _mol2selfies,
240
+ "inchi": _mol2inchi,
241
+ "inchikey": _mol2inchikey,
242
+ "nonstandard_inchikey": _mol2nonstandard_inchikey,
243
+ "hash": _mol2hash,
244
+ "mnemonic": _mol2mnemonic,
245
+ "id": _mol2id,
246
+ "scaffold": _mol2scaffold,
247
+ "permuted_smiles": _mol2random_smiles,
248
+ "pubchem_id": _mol2pubchem_id,
249
+ "pubchem_name": _mol2pubchem_name,
250
+ "cactus_name": _mol2cactus_name}
251
+
252
+ _FROM_FUNCTIONS = {"smiles": _smiles2mol,
253
+ "selfies": _selfies2mol,
254
+ "inchi": _inchi2mol,
255
+ "aa_seq": _seq2mol,
256
+ "helm": _helm2mol,
257
+ "minihelm": _mini_helm2mol}
258
+
259
+
260
+ def _x2mol(strings: Union[Iterable[str], str],
261
+ input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
262
+
263
+ from_function = _FROM_FUNCTIONS[input_representation.casefold()]
264
+
265
+ return from_function(strings)
266
+
267
+
268
+ def _mol2x(mols: Union[Iterable[Mol], Mol],
269
+ output_representation: str = 'smiles',
270
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
271
+
272
+ to_function = _TO_FUNCTIONS[output_representation.casefold()]
273
+
274
+ return to_function(mols, **kwargs)
275
+
276
+
277
+ def convert_string_representation(strings: Union[Iterable[str], str],
278
+ input_representation: str = 'smiles',
279
+ output_representation: str = 'smiles',
280
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
281
+
282
+ """Convert between string representations of chemical structures.
283
+
284
+ """
285
+
286
+ mols = _x2mol(strings, input_representation)
287
+ # print_err(mols)
288
+ outstrings = _mol2x(mols, output_representation, **kwargs)
289
+ # print_err(outstrings)
290
+
291
+ return outstrings
292
+
293
+
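+
+ # Usage sketch (inputs are illustrative; the supported representation names are the
+ # keys of _FROM_FUNCTIONS and _TO_FUNCTIONS above):
+ #
+ #     convert_string_representation("CCO", output_representation="inchikey")
+ #     convert_string_representation(["CCO", "c1ccccc1"],
+ #                                   input_representation="smiles",
+ #                                   output_representation="selfies")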
294
+ def _convert_input_to_smiles(f: Callable) -> Callable:
295
+
296
+ @wraps(f)
297
+ def _f(strings: Union[Iterable[str], str],
298
+ input_representation: str = 'smiles',
299
+ *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
300
+
301
+ smiles = convert_string_representation(strings,
302
+ output_representation='smiles',
303
+ input_representation=input_representation)
304
+
305
+ return f(strings=smiles,
306
+ *args, **kwargs)
307
+
308
+ return _f
schemist/features.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools for generating chemical features."""
2
+
3
+ from typing import Any, Callable, Iterable, Optional, Union
4
+
5
+ from descriptastorus.descriptors import MakeGenerator
6
+ from pandas import DataFrame, Series
7
+ import numpy as np
8
+ from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
9
+
10
+ from .converting import _smiles2mol, _convert_input_to_smiles
11
+
12
+ def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
13
+
14
+ def _f(prefix: Optional[str] = None,
15
+ *args, **kwargs) -> DataFrame:
16
+
17
+ feature_matrix = f(*args, **kwargs)
18
+
19
+ if prefix is not None:
20
+
21
+ new_cols = {col: f"{prefix}_{col}"
22
+ for col in feature_matrix.columns
23
+ if not col.startswith('meta_')}
24
+ feature_matrix = feature_matrix.rename(columns=new_cols)
25
+
26
+ return feature_matrix
27
+
28
+ return _f
29
+
30
+
31
+ def _get_descriptastorus_features(smiles: Iterable[str],
32
+ generator: str) -> DataFrame:
33
+
34
+ generator = MakeGenerator((generator, ))
35
+ smiles = Series(smiles)
36
+
37
+ features = smiles.apply(lambda z: np.array(generator.process(z)))
38
+ matrix = np.stack(features.values, axis=0)
39
+
40
+ return DataFrame(matrix,
41
+ index=smiles.index,
42
+ columns=[col for col, _ in generator.GetColumns()])
43
+
44
+
45
+ @_feature_matrix
46
+ @_convert_input_to_smiles
47
+ def calculate_2d_features(strings: Union[Iterable[str], str],
48
+ normalized: bool = True,
49
+ histogram_normalized: bool = True) -> DataFrame:
50
+
51
+ """Calculate 2d features from string representation.
52
+
53
+ """
54
+
55
+ if normalized:
56
+ if histogram_normalized:
57
+ generator_name = "RDKit2DHistogramNormalized"
58
+ else:
59
+ generator_name = "RDKit2DNormalized"
60
+ else:
61
+ generator_name = "RDKit2D"
62
+
63
+ feature_matrix = _get_descriptastorus_features(strings,
64
+ generator=generator_name)
65
+
66
+ feature_matrix = (feature_matrix
67
+ .rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
68
+ .assign(meta_feature_type=generator_name,
69
+ meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
70
+ .drop(columns=['meta_feature_valid0']))
71
+
72
+ return feature_matrix
73
+
74
+
75
+ def _fast_fingerprint(generator: FingeprintGenerator64,
76
+ mol: Mol,
77
+ to_np: bool = True) -> Union[str, np.ndarray]:
78
+
79
+ try:
80
+ fp_string = generator.GetFingerprint(mol).ToBitString()
81
+ except Exception:
82
+ return None
83
+ else:
84
+ if to_np:
85
+ return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
86
+ else:
87
+ return fp_string
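+
+ # Note on the to_np branch above: ToBitString() gives text such as "0110", and
+ # np.frombuffer(fp_string.encode(), 'u1') - ord('0') turns it into a uint8 vector,
+ # e.g. "0110" -> array([0, 1, 1, 0], dtype=uint8).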
88
+
89
+
90
+ @_feature_matrix
91
+ @_convert_input_to_smiles
92
+ def calculate_fingerprints(strings: Union[Iterable[str], str],
93
+ fp_type: str = 'morgan',
94
+ radius: int = 2,
95
+ chiral: bool = True,
96
+ on_bits: bool = True) -> DataFrame:
97
+
98
+ """
99
+
100
+ """
101
+
102
+ if fp_type.casefold() == 'morgan':
103
+ generator_class = GetMorganGenerator
104
+ else:
105
+ raise AttributeError(f"Fingerprint type {fp_type} not supported!")
106
+
107
+ fp_generator = generator_class(radius=radius,
108
+ includeChirality=chiral)
109
+ mols = (_smiles2mol(s) for s in strings)
110
+ fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
111
+ for mol in mols)
112
+
113
+ if on_bits:
114
+
115
+ fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
116
+ for fp_string in fp_strings)
117
+ fingerprints = [';'.join(fp) for fp in fingerprints]
118
+ validity = [len(fp) > 0 for fp in fingerprints]
119
+
120
+ feature_matrix = DataFrame(fingerprints,
121
+ columns=['fp_bits'])
122
+
123
+ else:
124
+
125
+ fingerprints = [np.array([int(digit) for digit in fp_string])
126
+ if fp_string is not None
127
+ else (-np.ones((fp_generator.GetOptions().fpSize, )))
128
+ for fp_string in fp_strings]
129
+ validity = [np.all(fp >= 0) for fp in fingerprints]
130
+
131
+ feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
132
+ columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
133
+
134
+ return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
135
+ meta_feature_valid=validity)
136
+
137
+
138
+ _FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
139
+
140
+ def calculate_feature(feature_type: str,
141
+ *args, **kwargs):
142
+
143
+ """
144
+
145
+ """
146
+
147
+ featurizer = _FEATURE_CALCULATORS[feature_type]
148
+
149
+ return featurizer(*args, **kwargs)
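+
+ # Usage sketch (the SMILES inputs are illustrative):
+ #
+ #     fp_df = calculate_feature("fp", strings=["CCO", "c1ccccc1"], on_bits=True)
+ #     desc_df = calculate_feature("2d", strings=["CCO"], prefix="rdkit")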
schemist/generating.py ADDED
@@ -0,0 +1,262 @@
1
+ """Tools for enumerating compounds. Currently only works with peptides."""
2
+
3
+ from typing import Callable, Iterable, Optional, Tuple, Union
4
+
5
+ from functools import partial
6
+ from itertools import chain, islice, product, repeat
7
+ from math import ceil, expm1, floor
8
+ from random import choice, choices, random, seed
9
+
10
+ from carabiner import print_err
11
+ from carabiner.decorators import vectorize, return_none_on_error
12
+ from carabiner.random import sample_iter
13
+ from rdkit.Chem import Mol, rdChemReactions
14
+ import numpy as np
15
+
16
+ from .converting import (_x2mol, _mol2x,
17
+ _convert_input_to_smiles)
18
+
19
+ AA = tuple('GALVITSMCPFYWHKRDENQ')
20
+ dAA = tuple(aa.casefold() for aa in AA)
21
+
22
+ REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
23
+ 'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
24
+ 'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
25
+
26
+ def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
27
+ d_aa_only: bool = False,
28
+ include_d_aa: bool = False) -> Tuple[str]:
29
+
30
+ alphabet = alphabet or AA
31
+ alphabet_lower = tuple(set(aa.casefold() for aa in alphabet))
32
+
33
+ if d_aa_only:
34
+ alphabet = alphabet_lower
35
+ elif include_d_aa:
36
+ alphabet = tuple(set(chain(alphabet, alphabet_lower)))
37
+
38
+ return alphabet
39
+
40
+
41
+
42
+ def all_peptides_of_one_length(length: int,
43
+ alphabet: Optional[Iterable[str]] = None,
44
+ d_aa_only: bool = False,
45
+ include_d_aa: bool = False) -> Iterable[str]:
46
+
47
+ """
48
+
49
+ """
50
+
51
+ alphabet = _get_alphabet(alphabet=alphabet,
52
+ d_aa_only=d_aa_only,
53
+ include_d_aa=include_d_aa)
54
+
55
+ return (''.join(peptide)
56
+ for peptide in product(alphabet, repeat=length))
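+
+ # With the default 20-letter alphabet, all_peptides_of_one_length(2) lazily yields
+ # 20 ** 2 = 400 sequences ("GG", "GA", "GL", ...), following the order of AA above.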
57
+
58
+
59
+ def all_peptides_in_length_range(max_length: int,
60
+ min_length: int = 1,
61
+ by: int = 1,
62
+ alphabet: Optional[Iterable[str]] = None,
63
+ d_aa_only: bool = False,
64
+ include_d_aa: bool = False,
65
+ *args, **kwargs) -> Iterable[str]:
66
+
67
+ """
68
+
69
+ """
70
+
71
+ length_range = range(*sorted([min_length, max_length + 1]), by)
72
+ peptide_maker = partial(all_peptides_of_one_length,
73
+ alphabet=alphabet,
74
+ d_aa_only=d_aa_only,
75
+ include_d_aa=include_d_aa,
76
+ *args, **kwargs)
77
+
78
+ return chain.from_iterable(peptide_maker(length=length)
79
+ for length in length_range)
80
+
81
+
82
+ def _number_of_peptides(max_length: int,
83
+ min_length: int = 1,
84
+ by: int = 1,
85
+ alphabet: Optional[Iterable[str]] = None,
86
+ d_aa_only: bool = False,
87
+ include_d_aa: bool = False):
88
+
89
+ alphabet = _get_alphabet(alphabet=alphabet,
90
+ d_aa_only=d_aa_only,
91
+ include_d_aa=include_d_aa)
92
+ n_peptides = [len(alphabet) ** length
93
+ for length in range(*sorted([min_length, max_length + 1]), by)]
94
+
95
+ return n_peptides
96
+
97
+
98
+ def _naive_sample_peptides_in_length_range(max_length: int,
99
+ min_length: int = 1,
100
+ by: int = 1,
101
+ n: Optional[Union[float, int]] = None,
102
+ alphabet: Optional[Iterable[str]] = None,
103
+ d_aa_only: bool = False,
104
+ include_d_aa: bool = False,
105
+ set_seed: Optional[int] = None):
106
+
107
+ alphabet = _get_alphabet(alphabet=alphabet,
108
+ d_aa_only=d_aa_only,
109
+ include_d_aa=include_d_aa)
110
+ n_peptides = _number_of_peptides(max_length=max_length,
111
+ min_length=min_length,
112
+ by=by,
113
+ alphabet=alphabet,
114
+ d_aa_only=d_aa_only,
115
+ include_d_aa=include_d_aa)
116
+ lengths = list(range(*sorted([min_length, max_length + 1]), by))
117
+ weight_per_length = [n / min(n_peptides) for n in n_peptides]
118
+ weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
119
+
120
+ lengths_sample = (choice(weighted_lengths) for _ in range(n))
121
+ return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
122
+
123
+
124
+ def sample_peptides_in_length_range(max_length: int,
125
+ min_length: int = 1,
126
+ by: int = 1,
127
+ n: Optional[Union[float, int]] = None,
128
+ alphabet: Optional[Iterable[str]] = None,
129
+ d_aa_only: bool = False,
130
+ include_d_aa: bool = False,
131
+ naive_sampling_cutoff: float = 5e-3,
132
+ reservoir_sampling: bool = True,
133
+ indexes: Optional[Iterable[int]] = None,
134
+ set_seed: Optional[int] = None,
135
+ *args, **kwargs) -> Iterable[str]:
136
+
137
+ """
138
+
139
+ """
140
+
141
+ seed(set_seed)
142
+
143
+ alphabet = _get_alphabet(alphabet=alphabet,
144
+ d_aa_only=d_aa_only,
145
+ include_d_aa=include_d_aa)
146
+
147
+ n_peptides = sum(len(alphabet) ** length
148
+ for length in range(*sorted([min_length, max_length + 1]), by))
149
+ if n is None:
150
+ n_requested = n_peptides
151
+ elif n >= 1.:
152
+ n_requested = min(floor(n), n_peptides)
153
+ elif n < 1.:
154
+ n_requested = floor(n * n_peptides)
155
+
156
+ frac_requested = n_requested / n_peptides
157
+
158
+ # approximation of birthday problem
159
+ p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
160
+ n_collisons = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
161
+ frac_collisions = n_collisons / n_requested
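+ # That is, drawing k = n_requested samples uniformly from N = n_peptides possibilities,
+ # P(any collision) ~ 1 - exp(-k * (k - 1) / (2 * N)), and the expected number of
+ # duplicated draws is k * (1 - ((N - 1) / N) ** (k - 1)).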
162
+
163
+ print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
164
+ f"length {min_length} to {max_length} ({n_peptides} combinations). "
165
+ f"Probability of collision if drawing randomly is {p_any_collision}, "
166
+ f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
167
+
168
+ if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
169
+
170
+ print_err("> Executing naive sampling. ")
171
+
172
+ peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
173
+ n=n_requested,
174
+ alphabet=alphabet,
175
+ d_aa_only=d_aa_only,
176
+ include_d_aa=include_d_aa)
177
+
178
+ else:
179
+
180
+ print_err("> Executing exhaustive sampling.")
181
+
182
+ all_peptides = all_peptides_in_length_range(max_length, min_length, by,
183
+ alphabet=alphabet,
184
+ d_aa_only=d_aa_only,
185
+ include_d_aa=include_d_aa,
186
+ *args, **kwargs)
187
+
188
+ if n is None:
189
+
190
+ peptides = all_peptides
191
+
192
+ elif n >= 1.:
193
+
194
+ if reservoir_sampling:
195
+ peptides = sample_iter(all_peptides, k=n_requested,
196
+ shuffle_output=False)
197
+ else:
198
+ peptides = (pep for pep in all_peptides
199
+ if random() <= frac_requested)
200
+
201
+ elif n < 1.:
202
+
203
+ peptides = (pep for pep in all_peptides
204
+ if random() <= n)
205
+
206
+ if indexes is not None:
207
+
208
+ indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
209
+ else None
210
+ for ix in islice(indexes, 3))
211
+ indexes = [ix if (ix is None or ix >= 0) else None
212
+ for ix in indexes]
213
+
214
+ if len(indexes) > 1:
215
+ if n is not None and n >=1. and indexes[0] > n:
216
+ raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
217
+
218
+ peptides = islice(peptides, *indexes)
219
+
220
+ return peptides
221
+
222
+
223
+ def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
224
+
225
+ rxn = rdChemReactions.ReactionFromSmarts(smarts)
226
+ reaction_function = rxn.RunReactants
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def reactor(s: Mol) -> Mol:
231
+
232
+ return reaction_function([s])[0][0]
233
+
234
+ return reactor
235
+
236
+
237
+ @_convert_input_to_smiles
238
+ def react(strings: Union[str, Iterable[str]],
239
+ reaction: str = 'N_to_C_cyclization',
240
+ output_representation: str = 'smiles',
241
+ **kwargs) -> Union[str, Iterable[str]]:
242
+
243
+ """
244
+
245
+ """
246
+
247
+ try:
248
+ _this_reaction = REACTIONS[reaction]
249
+ except KeyError:
250
+ raise KeyError(f"Reaction {reaction} is not available. Try: " +
251
+ ", ".join(list(REACTIONS)))
252
+
253
+ # strings = cast(strings, to=list)
254
+ # print_err((strings))
255
+
256
+ reactor = _reactor(_this_reaction)
257
+ mols = _x2mol(strings)
258
+ mols = reactor(mols)
259
+
260
+ return _mol2x(mols,
261
+ output_representation=output_representation,
262
+ **kwargs)
schemist/io.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools to facilitate input and output."""
2
+
3
+ from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
4
+
5
+ from collections import defaultdict
6
+ from functools import partial
7
+ from io import TextIOWrapper
+ from string import printable
8
+ from tempfile import NamedTemporaryFile
9
+ from xml.etree import ElementTree
10
+
11
+ from carabiner import print_err
12
+ from carabiner.cast import cast
13
+ from carabiner.itertools import tenumerate
14
+ from carabiner.pd import read_table, write_stream
15
+
16
+ from pandas import DataFrame, read_excel
17
+ from rdkit.Chem import SDMolSupplier
18
+
19
+ from .converting import _mol2isomeric_canonical_smiles
20
+
21
+ def _mutate_df_stream(input_file: Union[str, TextIO],
22
+ output_file: Union[str, TextIO],
23
+ function: Callable[[DataFrame], Tuple[Any, DataFrame]],
24
+ file_format: Optional[str] = None,
25
+ chunksize: int = 1000) -> List[Any]:
26
+
27
+ carries = []
28
+
29
+ for i, chunk in tenumerate(read_table(input_file,
30
+ format=file_format,
31
+ progress=False,
32
+ chunksize=chunksize)):
33
+
34
+ result = function(chunk)
35
+
36
+ try:
37
+ carry, df = result
38
+ except ValueError:
39
+ df = result
40
+ carry = 0
41
+
42
+ write_stream(df,
43
+ output=output_file,
44
+ format=file_format,
45
+ header=i == 0,
46
+ mode='w' if i == 0 else 'a')
47
+
48
+ carries.append(carry)
49
+
50
+ return carries
51
+
52
+
53
+ def read_weird_xml(filename: Union[str, TextIO],
54
+ header: bool = True,
55
+ namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
56
+
57
+ """
58
+
59
+ """
60
+
61
+ with cast(filename, TextIOWrapper, mode='r') as f:
62
+
63
+ xml_string = ''.join(filter(printable.__contains__, f.read()))
64
+
65
+ try:
66
+
67
+ root = ElementTree.fromstring(xml_string)
68
+
69
+ except Exception as e:
70
+
71
+ print_err('\n!!! Could not parse XML input: ' + str(e))
72
+
73
+ raise e
74
+
75
+ for i, row in enumerate(root.iter(f'{namespace}Row') ):
76
+
77
+ this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
78
+
79
+ if i == 0:
80
+
81
+ if header:
82
+
83
+ heading = this_row
84
+ df = {colname: [] for colname in heading}
85
+
86
+ else:
87
+
88
+ heading = [f'X{j}' for j, _ in enumerate(this_row)]
89
+ df = {colname: [datum] for colname, datum in zip(heading, this_row)}
90
+
91
+ else:
92
+
93
+ for colname, datum in zip(heading, this_row):
94
+
95
+ df[colname].append(datum)
96
+
97
+ return DataFrame(df)
98
+
99
+
100
+ def read_sdf(filename: Union[str, TextIO]):
101
+
102
+ """
103
+
104
+ """
105
+
106
+ filename = cast(filename, str)
107
+
108
+ with open(filename, 'r', errors='replace') as f:
109
+ with NamedTemporaryFile("w") as o:
110
+
111
+ o.write(f.read())
112
+ o.seek(0)
113
+
114
+ df = defaultdict(list)
115
+
116
+ for i, mol in enumerate(SDMolSupplier(o.name)):
117
+
118
+ if mol is None:
119
+
120
+ continue
121
+
122
+ propdict = mol.GetPropsAsDict()
123
+ propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
124
+
125
+ for colname in propdict:
126
+
127
+ df[colname].append(propdict[colname])
128
+
129
+ for colname in df:
130
+
131
+ if colname not in propdict:
132
+
133
+ df[colname].append(None)
134
+
135
+ col_lengths = {col: len(val) for col, val in df.items()}
136
+
137
+ if len(set(col_lengths.values())) > 1:
138
+
139
+ raise ValueError(f"Column lengths not all the same:\n\t" +
140
+ '\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
141
+
142
+ return DataFrame(df)
143
+
144
+
145
+ FILE_READERS = {
146
+ 'bad_xml': read_weird_xml,
147
+ 'xlsx': partial(read_excel, engine='openpyxl'),
148
+ 'sdf': read_sdf
149
+ }
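+
+ # Usage sketch (the file path is hypothetical):
+ #
+ #     df = FILE_READERS["sdf"]("plates/library_01.sdf")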
schemist/rest_lookup.py ADDED
@@ -0,0 +1,118 @@
1
+ """Tools for querying PubChem."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Union
4
+ from time import sleep
5
+ from xml.etree import ElementTree
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast
9
+ from carabiner.decorators import vectorize
10
+ from requests import Response, Session
11
+
12
+ _PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
13
+ _CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
14
+
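+ # For example, _PUBCHEM_URL.format(inchikey="<InChIKey>", get="Title,InchiKey", format="XML")
+ # expands to .../compound/inchikey/<InChIKey>/property/Title,InchiKey/XML, which is how
+ # _url_request below builds its queries.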
15
+ _OVERLOAD_CODES = {500, 501, 503, 504}
16
+
17
+
18
+ def _url_request(inchikeys: Union[str, Iterable[str]],
19
+ url: str,
20
+ session: Optional[Session] = None,
21
+ **kwargs) -> Response:
22
+
23
+ if session is None:
24
+ session = Session()
25
+
26
+ inchikeys = cast(inchikeys, to=list)
27
+
28
+ return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
29
+
30
+
31
+ def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
32
+ session: Optional[Session] = None,
33
+ counter: int = 0,
34
+ max_tries: int = 10,
35
+ namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
36
+
37
+ r = _url_request(inchikeys, url=_PUBCHEM_URL,
38
+ session=session,
39
+ get="Title,InchiKey", format="XML")
40
+
41
+ if r.status_code == 200:
42
+
43
+ root = ElementTree.fromstring(r.text)
44
+ compounds = root.iter(f'{namespace}Properties')
45
+
46
+ result_dict = dict()
47
+
48
+ for cmpd in compounds:
49
+
50
+ cmpd_dict = dict()
51
+
52
+ for child in cmpd:
53
+ cmpd_dict[child.tag.split(namespace)[1]] = child.text
54
+
55
+ try:
56
+ inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
57
+ except KeyError:
58
+ print_err(cmpd_dict)
59
+ else:
60
+ result_dict[inchikey] = {'pubchem_name': name.casefold(),
61
+ 'pubchem_id': pcid}
62
+
63
+ print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
64
+
65
+ result_list = [result_dict[inchikey]
66
+ if inchikey in result_dict
67
+ else {'pubchem_name': None, 'pubchem_id': None}
68
+ for inchikey in inchikeys]
69
+
70
+ return result_list
71
+
72
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
73
+
74
+ sleep(1.)
75
+
76
+ return _inchikey2pubchem_name_id(inchikeys,
77
+ session=session,
78
+ counter=counter + 1,
79
+ max_tries=max_tries,
80
+ namespace=namespace)
81
+
82
+ else:
83
+
84
+ print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
85
+
86
+ return [{'pubchem_name': None, 'pubchem_id': None}
87
+ for _ in range(len(inchikeys))]
88
+
89
+
90
+ @vectorize
91
+ def _inchikey2cactus_name(inchikeys: str,
92
+ session: Optional[Session] = None,
93
+ counter: int = 0,
94
+ max_tries: int = 10):
95
+
96
+ r = _url_request(inchikeys, url=_CACTUS_URL,
97
+ session=session,
98
+ get="names")
99
+
100
+ if r.status_code == 200:
101
+
102
+ return r.text.split('\n')[0].casefold()
103
+
104
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
105
+
106
+ sleep(1.)
107
+
108
+ return _inchikey2cactus_name(inchikeys,
109
+ session=session,
110
+ counter=counter + 1,
111
+ max_tries=max_tries)
112
+
113
+ else:
114
+
115
+ print_err(f'Cactus: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
116
+
117
+ return None
118
+
schemist/splitting.py ADDED
@@ -0,0 +1,204 @@
1
+ """Tools for splitting tabular datasets, optionally based on chemical features."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+ from collections import defaultdict
5
+ from math import ceil
6
+ from random import random, seed
7
+
8
+ try:
9
+ from itertools import batched
10
+ except ImportError:
11
+ from carabiner.itertools import batched
12
+
13
+ from tqdm.auto import tqdm
14
+
15
+ from .converting import convert_string_representation, _convert_input_to_smiles
16
+ from .typing import DataSplits
17
+
18
+ # def _train_test_splits
19
+
20
+ def _train_test_val_sizes(total: int,
21
+ train: float = 1.,
22
+ test: float = 0.) -> Tuple[int]:
23
+
24
+ n_train = int(ceil(train * total))
25
+ n_test = int(ceil(test * total))
26
+ n_val = total - n_train - n_test
27
+
28
+ return n_train, n_test, n_val
29
+
30
+
31
+ def _random_chunk(strings: str,
32
+ train: float = 1.,
33
+ test: float = 0.,
34
+ carry: Optional[Dict[str, List[int]]] = None,
35
+ start_from: int = 0) -> Dict[str, List[int]]:
36
+
37
+ carry = carry or defaultdict(list)
38
+
39
+ train_test: float = train + test
40
+
41
+ for i, _ in enumerate(strings):
42
+
43
+ random_number: float = random()
44
+
45
+ if random_number < train:
46
+
47
+ key = 'train'
48
+
49
+ elif random_number < train_test:
50
+
51
+ key = 'test'
52
+
53
+ else:
54
+
55
+ key = 'validation'
56
+
57
+ carry[key].append(start_from + i)
58
+
59
+ return carry
60
+
61
+
62
+ def split_random(strings: Union[str, Iterable[str]],
63
+ train: float = 1.,
64
+ test: float = 0.,
65
+ chunksize: Optional[int] = None,
66
+ set_seed: Optional[int] = None,
67
+ *args, **kwargs) -> DataSplits:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if set_seed is not None:
74
+
75
+ seed(set_seed)
76
+
77
+
78
+ if chunksize is None:
79
+
80
+ idx = _random_chunk(strings=strings,
81
+ train=train,
82
+ test=test)
83
+
84
+ else:
85
+
86
+ idx = defaultdict(list)
87
+
88
+ for i, chunk in enumerate(batched(strings, chunksize)):
89
+
90
+ idx = _random_chunk(strings=chunk,
91
+ train=train,
92
+ test=test,
93
+ carry=idx,
94
+ start_from=i * chunksize)
95
+
96
+ seed(None)
97
+
98
+ return DataSplits(**idx)
99
+
100
+
101
+ @_convert_input_to_smiles
102
+ def _scaffold_chunk(strings: str,
103
+ carry: Optional[Dict[str, List[int]]] = None,
104
+ start_from: int = 0) -> Dict[str, List[int]]:
105
+
106
+ carry = carry or defaultdict(list)
107
+
108
+ these_scaffolds = convert_string_representation(strings=strings,
109
+ output_representation='scaffold')
110
+
111
+ for j, scaff in enumerate(these_scaffolds):
112
+ carry[scaff].append(start_from + j)
113
+
114
+ return carry
115
+
116
+
117
+ def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
118
+ train: float = 1.,
119
+ test: float = 0.,
120
+ progress: bool = False) -> DataSplits:
121
+
122
+ scaffold_sets = {key: sorted(value)
123
+ for key, value in scaffold_sets.items()}
124
+ scaffold_sets = sorted(scaffold_sets.items(),
125
+ key=lambda x: (len(x[1]), x[1][0]),
126
+ reverse=True)
127
+ nrows = sum(len(idx) for _, idx in scaffold_sets)
128
+ n_train, n_test, n_val = _train_test_val_sizes(nrows,
129
+ train,
130
+ test)
131
+ idx = defaultdict(list)
132
+
133
+ iterator = tqdm(scaffold_sets) if progress else scaffold_sets
134
+ for _, scaffold_idx in iterator:
135
+
136
+ if (len(idx['train']) + len(scaffold_idx)) > n_train:
137
+
138
+ if (len(idx['test']) + len(scaffold_idx)) > n_test:
139
+
140
+ key = 'validation'
141
+
142
+ else:
143
+
144
+ key = 'test'
145
+ else:
146
+
147
+ key = 'train'
148
+
149
+ idx[key] += scaffold_idx
150
+
151
+ return DataSplits(**idx)
152
+
153
+
154
+ def split_scaffold(strings: Union[str, Iterable[str]],
155
+ train: float = 1.,
156
+ test: float = 0.,
157
+ chunksize: Optional[int] = None,
158
+ progress: bool = True) -> DataSplits:
159
+
160
+ """
161
+
162
+ """
163
+
164
+ if chunksize is None:
165
+
166
+ scaffold_sets = _scaffold_chunk(strings)
167
+
168
+ else:
169
+
170
+ scaffold_sets = defaultdict(list)
171
+
172
+ for i, chunk in enumerate(batched(strings, chunksize)):
173
+
174
+ scaffold_sets = _scaffold_chunk(chunk,
175
+ carry=scaffold_sets,
176
+ start_from=i * chunksize)
177
+
178
+ return _scaffold_aggregator(scaffold_sets,
179
+ train=train, test=test,
180
+ progress=progress)
181
+
182
+
183
+ _SPLITTERS = {#'simpd': split_simpd,
184
+ 'scaffold': split_scaffold,
185
+ 'random': split_random}
186
+
187
+ # _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
188
+ # 'random': 'independent'}
189
+
190
+ _GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}
191
+
192
+ assert all(_type in _SPLITTERS
193
+ for _type in _GROUPED_SPLITTERS) ## Should never fail!
194
+
195
+ def split(split_type: str,
196
+ *args, **kwargs) -> DataSplits:
197
+
198
+ """
199
+
200
+ """
201
+
202
+ splitter = _SPLITTERS[split_type]
203
+
204
+ return splitter(*args, **kwargs)
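+
+ # Usage sketch (the SMILES list is illustrative):
+ #
+ #     splits = split("random", strings=["CCO", "CCN", "c1ccccc1"], train=0.8, test=0.1)
+ #     splits.train, splits.test, splits.validation  # lists of row positions (DataSplits)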
schemist/tables.py ADDED
@@ -0,0 +1,220 @@
1
+ """Tools for processing tabular data."""
2
+
3
+ from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
4
+ from functools import partial
5
+
6
+ try:
7
+ from itertools import batched
8
+ except ImportError:
9
+ from carabiner.itertools import batched
10
+
11
+ from carabiner.cast import cast, clist
12
+ from carabiner import print_err
13
+ from pandas import DataFrame, concat
14
+
15
+ from .cleaning import clean_smiles, clean_selfies
16
+ from .converting import convert_string_representation
17
+ from .features import calculate_feature
18
+ from .generating import sample_peptides_in_length_range, react
19
+ from .splitting import split
20
+ from .typing import DataSplits
21
+
22
+ def _get_error_tally(df: DataFrame,
23
+ cols: Union[str, List[str]]) -> Dict[str, int]:
24
+
25
+ cols = cast(cols, to=list)
26
+
27
+ try:
28
+ tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
29
+ except TypeError:
30
+ tally = {col: df[col].isna().sum() for col in cols}
31
+
32
+ return tally
33
+
34
+
35
+ def converter(df: DataFrame,
36
+ column: str = 'smiles',
37
+ input_representation: str = 'smiles',
38
+ output_representation: Union[str, List[str]] = 'smiles',
39
+ prefix: Optional[str] = None,
40
+ options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:
41
+
42
+ """
43
+
44
+ """
45
+
46
+ prefix = prefix or ''
+ options = options or {}
47
+
48
+ converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
49
+ output_representation=rep_out,
50
+ input_representation=input_representation,
51
+ **options)
52
+ for rep_out in cast(output_representation, to=list)}
53
+
54
+ column_values = df[column]
55
+
56
+ converted = {col: cast(f(column_values), to=list)
57
+ for col, f in converters.items()}
58
+
59
+ df = df.assign(**converted)
60
+
61
+ return _get_error_tally(df, list(converters)), df
62
+
63
+
64
+ def cleaner(df: DataFrame,
65
+ column: str = 'smiles',
66
+ input_representation: str = 'smiles',
67
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if input_representation.casefold() == 'smiles':
74
+ cleaner = clean_smiles
75
+ elif input_representation.casefold() == 'selfies':
76
+ cleaner = clean_selfies
77
+ else:
78
+ raise ValueError(f"Representation {input_representation} is not supported for cleaning.")
79
+
80
+ prefix = prefix or ''
81
+ new_column = f"{prefix}{column}"
82
+
83
+ df = df.assign(**{new_column: lambda x: cleaner(x[column])})
84
+
85
+ return _get_error_tally(df, new_column), df
86
+
87
+
88
+ def featurizer(df: DataFrame,
89
+ feature_type: str,
90
+ column: str = 'smiles',
91
+ ids: Optional[Union[str, List[str]]] = None,
92
+ input_representation: str = 'smiles',
93
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
94
+
95
+ """
96
+
97
+ """
98
+
99
+ if ids is None:
100
+ ids = df.columns.tolist()
101
+ else:
102
+ ids = cast(ids, to=list)
103
+
104
+ feature_df = calculate_feature(feature_type=feature_type,
105
+ strings=df[column],
106
+ prefix=prefix,
107
+ input_representation=input_representation)
108
+
109
+ if len(ids) > 0:
110
+ df = concat([df[ids], feature_df], axis=1)
111
+
112
+ return _get_error_tally(feature_df, 'meta_feature_valid'), df
113
+
114
+
115
+ def assign_groups(df: DataFrame,
116
+ grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
117
+ group_name: str = 'group',
118
+ column: str = 'smiles',
119
+ input_representation: str = 'smiles',
120
+ *args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:
121
+
122
+ group_idx = grouper(strings=df[column],
123
+ input_representation=input_representation,
124
+ *args, **kwargs)
125
+
126
+ inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
127
+ groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]
128
+
129
+ return group_idx, df.assign(**{group_name: groups})
130
+
131
+
132
+ def _assign_splits(df: DataFrame,
133
+ split_idx: DataSplits,
134
+ use_df_index: bool = False) -> DataFrame:
135
+
136
+ row_index = df.index if use_df_index else tuple(range(df.shape[0]))
137
+
138
+ df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
139
+ for key in split_idx._fields})
140
+ split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}
141
+
142
+ return split_counts, df
143
+
144
+
145
+ def splitter(df: DataFrame,
146
+ split_type: str = 'random',
147
+ column: str = 'smiles',
148
+ input_representation: str = 'smiles',
149
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
150
+
151
+ """
152
+
153
+ """
154
+
155
+ split_idx = split(split_type=split_type,
156
+ strings=df[column],
157
+ input_representation=input_representation,
158
+ *args, **kwargs)
159
+
160
+ return _assign_splits(df, split_idx=split_idx)
161
+
162
+
163
+ def reactor(df: DataFrame,
164
+ column: str = 'smiles',
165
+ reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
166
+ prefix: Optional[str] = None,
167
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
168
+
169
+ """
170
+
171
+ """
172
+
173
+ prefix = prefix or ''
174
+
175
+ reactors = {col: partial(react, reaction=col)
176
+ for col in cast(reaction, to=list)}
177
+
178
+ column_values = df[column]
179
+
180
+ new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
181
+ for col, _reactor in reactors.items()}
182
+
183
+ df = df.assign(**new_columns)
184
+
185
+ return _get_error_tally(df, list(new_columns)), df
186
+
187
+
188
+ def _peptide_table(max_length: int,
189
+ min_length: Optional[int] = None,
190
+ by: int = 1,
191
+ n: Optional[Union[float, int]] = None,
192
+ prefix: str = '',
193
+ suffix: str = '',
194
+ generator: bool = False,
195
+ batch_size: int = 1000,
196
+ *args, **kwargs) -> Union[DataFrame, Generator]:
197
+
198
+ min_length = min_length or max_length
199
+
200
+ peptides = sample_peptides_in_length_range(max_length=max_length,
201
+ min_length=min_length,
202
+ by=by,
203
+ n=n,
204
+ *args, **kwargs)
205
+
206
+ if generator:
+
+ # Return a generator expression rather than using `yield` here: a `yield`
+ # anywhere in the body would turn the whole function into a generator, so the
+ # generator=False branch could never hand back a plain DataFrame.
+ return (DataFrame(dict(peptide_sequence=[f"{prefix}{pep}{suffix}"
+ for pep in peps]))
+ for peps in batched(peptides, batch_size))
+
+ else:
+
+ peps = [f"{prefix}{pep}{suffix}"
+ for pep in peptides]
+
+ return DataFrame(dict(peptide_sequence=peps))
schemist/typing.py ADDED
@@ -0,0 +1,7 @@
1
+ """Types used in schemist."""
2
+
3
+ from collections import namedtuple
4
+
5
+ DataSplits = namedtuple('DataSplits',
6
+ ['train', 'test', 'validation'],
7
+ defaults=[tuple(), tuple(), tuple()])
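+
+ # For example, DataSplits(train=(0, 2), test=(1,)) leaves validation at its default ().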
schemist/utils.py ADDED
@@ -0,0 +1 @@
1
+ """Miscellaneous utilities for schemist."""
test/data/AmpC_screen_table_10k.csv.gz ADDED
Binary file (171 kB). View file
 
test/tests.py ADDED
@@ -0,0 +1,6 @@
1
+ import doctest
2
+ import schemist as sch
3
+
4
+ if __name__ == '__main__':
5
+
6
+ doctest.testmod(sch)