Eachan Johnson committed on
Commit 2dceef8 · 0 Parent(s)

Initial commit

.github/workflows/python-package.yml ADDED
@@ -0,0 +1,48 @@
1
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Python package
5
+
6
+ on: [push]
7
+
8
+ jobs:
9
+ build:
10
+
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v3
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v3
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ - name: Install dependencies
24
+ run: |
25
+ python -m pip install --upgrade pip
26
+ python -m pip install flake8 pytest pytest-cov
27
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28
+ pip install -e .
29
+ - name: Lint with flake8
30
+ run: |
31
+ # stop the build if there are Python syntax errors or undefined names
32
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
33
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
34
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
35
+ - name: Test with pytest
36
+ run: |
37
+ pytest schemist --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov=schemist --cov-report=xml --cov-report=html
38
+ - name: Test with scripts
39
+ run: |
40
+ bash test/scripts/test-plate-tab.sh
41
+ bash test/scripts/test-row-xlsx.sh
42
+ - name: Upload pytest test results
43
+ uses: actions/upload-artifact@v3
44
+ with:
45
+ name: pytest-results-${{ matrix.python-version }}
46
+ path: junit/test-results-${{ matrix.python-version }}.xml
47
+ # Use always() to always run this step to publish test results when there are test failures
48
+ if: ${{ always() }}
.github/workflows/python-publish.yml ADDED
@@ -0,0 +1,42 @@
1
+ # This workflow will upload a Python Package using Twine when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ deploy:
20
+
21
+ runs-on: ubuntu-latest
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.11"]
25
+
26
+ steps:
27
+ - uses: actions/checkout@v3
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v3
30
+ with:
31
+ python-version: ${{ matrix.python-version }}
32
+ - name: Install dependencies
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install build
36
+ - name: Build package
37
+ run: python -m build --sdist --wheel --outdir dist
38
+ - name: Publish package
39
+ uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
40
+ with:
41
+ user: __token__
42
+ password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ *.pyc
2
+ *.so
3
+ *.egg-info
4
+ *.whl
5
+ .DS_Store
6
+ .mypy_cache/
7
+ .pytype/
8
+ .idea
9
+ .vscode
10
+ .envrc
11
+ __pycache__
12
+ .pytest_cache
13
+
14
+ # Sphinx
15
+ /docs/build/
16
+ /docs/_autosummary/
17
+ /docs/make.bat
18
+ /docs/Makefile
19
+
20
+ /test/outputs/
21
+ /test/data/collate/
22
+
23
+ # virtualenv/venv directories
24
+ /venv/
25
+ /bin/
26
+ /include/
27
+ /lib/
28
+ /share/
.readthedocs.yml ADDED
@@ -0,0 +1,25 @@
1
+ # .readthedocs.yml
2
+ # Read the Docs configuration file
3
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
+
5
+ # Required
6
+ version: 2
7
+
8
+ build:
9
+ os: "ubuntu-20.04"
10
+ tools:
11
+ python: "3.10"
12
+
13
+ # Build documentation in the docs/ directory with Sphinx
14
+ sphinx:
15
+ configuration: docs/source/conf.py
16
+ fail_on_warning: false
17
+
18
+ # Optionally build your docs in additional formats such as PDF and ePub
19
+ formats:
20
+ - htmlzip
21
+
22
+ # Optionally set the version of Python and requirements required to build your docs
23
+ python:
24
+ install:
25
+ - requirements: docs/requirements.txt
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [year] [fullname]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,73 @@
1
+ # ⬢⬢⬢ schemist
2
+
3
+ ![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/schemist/python-publish.yml)
4
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist)
5
+ ![PyPI](https://img.shields.io/pypi/v/schemist)
6
+
7
+ Cleaning, collating, and augmenting chemical datasets.
8
+
9
+ - [Installation](#installation)
10
+ - [Command-line usage](#command-line-usage)
11
+ - [Example](#example)
12
+ - [Other commands](#other-commands)
13
+ - [Python API](#python-api)
14
+ - [Documentation](#documentation)
15
+
16
+ ## Installation
17
+
18
+ ### The easy way
19
+
20
+ Install the pre-compiled version from PyPI:
21
+
22
+ ```bash
23
+ pip install schemist
24
+ ```
25
+
26
+ ### From source
27
+
28
+ Clone the repository and `cd` into it, then run:
29
+
30
+ ```bash
31
+ pip install -e .
32
+ ```
33
+
34
+ ## Command-line usage
35
+
36
+ **schemist** provides command-line utilities for cleaning, collating, and augmenting chemical datasets. The tools
+ complete specific tasks and can easily be composed into analysis pipelines: the TSV table output goes to
+ `stdout` by default, so one tool can be piped into the next.
39
+
40
+ To get a list of commands (tools), do
41
+
42
+ ```bash
43
+ schemist --help
44
+ ```
45
+
46
+ And to get help for a specific command, do
47
+
48
+ ```bash
49
+ schemist <command> --help
50
+ ```
51
+
52
+ For the Python API, [see below](#python-api).
53
+
54
+ ## Example
55
+
56
+
57
+ ## Other commands
58
+
59
+
60
+
61
+ ## Python API
62
+
63
+ **schemist** can be imported into Python to help make custom analyses.
64
+
65
+ ```python
66
+ >>> import schemist as sch
67
+ ```
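As a minimal sketch of a custom analysis (an editorial illustration, not part of the commit), assuming the installed package exposes the same `converting` module that appears later in this commit and that RDKit is available:

```python
# Sketch only: names follow schemist/converting.py from this commit; the exact
# return container (list vs. generator) may differ.
from schemist.converting import convert_string_representation

inchikeys = convert_string_representation(
    ["CCO", "c1ccccc1"],                 # input SMILES strings
    input_representation="smiles",
    output_representation="inchikey",    # any key of _TO_FUNCTIONS, e.g. "selfies", "scaffold"
)
print(list(inchikeys))
```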
68
+
69
+
70
+
71
+ ## Documentation
72
+
73
+ Full API documentation is at [ReadTheDocs](https://schemist.readthedocs.org).
build/lib/schemist/__init__.py ADDED
File without changes
build/lib/schemist/cleaning.py ADDED
@@ -0,0 +1,27 @@
1
+ """Chemical structure cleaning routines."""
2
+
3
+ from carabiner.decorators import vectorize
4
+
5
+ from datamol import sanitize_smiles
6
+ import selfies as sf
7
+
8
+ @vectorize
9
+ def clean_smiles(smiles: str,
10
+ *args, **kwargs) -> str:
11
+
12
+ """Sanitize a SMILES string or list of SMILES strings.
13
+
14
+ """
15
+
16
+ return sanitize_smiles(smiles, *args, **kwargs)
17
+
18
+
19
+ @vectorize
20
+ def clean_selfies(selfies: str,
21
+ *args, **kwargs) -> str:
22
+
23
+ """Sanitize a SELFIES string or list of SELFIES strings.
24
+
25
+ """
26
+
27
+ return sf.encoder(sanitize_smiles(sf.decoder(selfies), *args, **kwargs))
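A short usage sketch (editorial illustration, not part of the commit), assuming `carabiner`'s `@vectorize` lets these cleaners accept either a single string or a list of strings:

```python
# Sketch of clean_smiles usage; assumes datamol and carabiner are installed.
from schemist.cleaning import clean_smiles

print(clean_smiles("C1=CC=CC=C1"))           # a single SMILES string
print(clean_smiles(["CCO", "c1ccccc1O"]))    # or a list, via the @vectorize decorator
```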
build/lib/schemist/cli.py ADDED
@@ -0,0 +1,536 @@
1
+ """Command-line interface for schemist."""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from argparse import FileType, Namespace
6
+ from collections import Counter, defaultdict
7
+ from functools import partial
8
+ import os
9
+ import sys
10
+ from tempfile import NamedTemporaryFile, TemporaryDirectory
11
+
12
+ from carabiner import pprint_dict, upper_and_lower, print_err
13
+ from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
14
+ from carabiner.itertools import tenumerate
15
+ from carabiner.pd import get_formats, write_stream
16
+
17
+ from .collating import collate_inventory, deduplicate_file
18
+ from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
19
+ from .generating import AA, REACTIONS
20
+ from .io import _mutate_df_stream
21
+ from .tables import (converter, cleaner, featurizer, assign_groups,
22
+ _assign_splits, splitter, _peptide_table, reactor)
23
+ from .splitting import _SPLITTERS, _GROUPED_SPLITTERS
24
+
25
+ __version__ = '0.0.1'
26
+
27
+ def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:
28
+
29
+ options = {}
30
+
31
+ try:
32
+ for opt in x:
33
+
34
+ try:
35
+ key, value = opt.split('=')
36
+ except ValueError:
37
+ raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")
38
+
39
+ try:
40
+ value = int(value)
41
+ except ValueError:
42
+ try:
43
+ value = float(value)
44
+ except ValueError:
45
+ pass
46
+
47
+ options[key] = value
48
+
49
+ except TypeError:
50
+
51
+ pass
52
+
53
+ return options
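For illustration, the parser above turns `keyword=value` tokens into a dict, coercing integers and floats where possible; a sketch of its expected behaviour (assumes `_option_parser` as defined above is in scope):

```python
# Expected behaviour of _option_parser as written above (illustration only).
print(_option_parser(["n=8", "prefix=ID-", "scale=0.5"]))
# {'n': 8, 'prefix': 'ID-', 'scale': 0.5}

print(_option_parser(None))   # no options given: the TypeError branch returns {}
# {}
```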
54
+
55
+
56
+ def _sum_tally(tallies: Counter,
57
+ message: str = "Error counts",
58
+ use_length: bool = False):
59
+
60
+ total_tally = Counter()
61
+
62
+ for tally in tallies:
63
+
64
+ if use_length:
65
+ total_tally.update({key: len(value) for key, value in tally.items()})
66
+ else:
67
+ total_tally.update(tally)
68
+
69
+ if len(tallies) == 0:
70
+ raise ValueError(f"Nothing generated!")
71
+
72
+ pprint_dict(total_tally, message=message)
73
+
74
+ return total_tally
75
+
76
+
77
+ @clicommand(message="Cleaning file with the following parameters")
78
+ def _clean(args: Namespace) -> None:
79
+
80
+ error_tallies = _mutate_df_stream(input_file=args.input,
81
+ output_file=args.output,
82
+ function=partial(cleaner,
83
+ column=args.column,
84
+ input_representation=args.representation,
85
+ prefix=args.prefix),
86
+ file_format=args.format)
87
+
88
+ _sum_tally(error_tallies)
89
+
90
+ return None
91
+
92
+
93
+ @clicommand(message="Converting between string representations with the following parameters")
94
+ def _convert(args: Namespace) -> None:
95
+
96
+ options = _option_parser(args.options)
97
+
98
+ error_tallies = _mutate_df_stream(input_file=args.input,
99
+ output_file=args.output,
100
+ function=partial(converter,
101
+ column=args.column,
102
+ input_representation=args.representation,
103
+ output_representation=args.to,
104
+ prefix=args.prefix,
105
+ options=options),
106
+ file_format=args.format)
107
+
108
+ _sum_tally(error_tallies)
109
+
110
+ return None
111
+
112
+
113
+ @clicommand(message="Adding features to files with the following parameters")
114
+ def _featurize(args: Namespace) -> None:
115
+
116
+ error_tallies = _mutate_df_stream(input_file=args.input,
117
+ output_file=args.output,
118
+ function=partial(featurizer,
119
+ feature_type=args.feature,
120
+ column=args.column,
121
+ ids=args.id,
122
+ input_representation=args.representation,
123
+ prefix=args.prefix),
124
+ file_format=args.format)
125
+
126
+ _sum_tally(error_tallies)
127
+
128
+ return None
129
+
130
+
131
+ @clicommand(message="Splitting table with the following parameters")
132
+ def _split(args: Namespace) -> None:
133
+
134
+ split_type = args.type.casefold()
135
+
136
+ if split_type in _GROUPED_SPLITTERS:
137
+
138
+ chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]
139
+
140
+ with TemporaryDirectory() as dir:
141
+
142
+ with NamedTemporaryFile("w", dir=dir, delete=False) as f:
143
+
144
+ group_idxs = _mutate_df_stream(input_file=args.input,
145
+ output_file=f,
146
+ function=partial(assign_groups,
147
+ grouper=chunk_processor,
148
+ group_name=split_type,
149
+ column=args.column,
150
+ input_representation=args.representation),
151
+ file_format=args.format)
152
+ f.close()
153
+ new_group_idx = defaultdict(list)
154
+
155
+ totals = 0
156
+ for group_idx in group_idxs:
157
+ these_totals = 0
158
+ for key, value in group_idx.items():
159
+ these_totals += len(value)
160
+ new_group_idx[key] += [idx + totals for idx in value]
161
+ totals += these_totals
162
+
163
+ group_idx = aggregator(new_group_idx,
164
+ train=args.train,
165
+ test=args.test)
166
+
167
+ split_tallies = _mutate_df_stream(input_file=f.name,
168
+ output_file=args.output,
169
+ function=partial(_assign_splits,
170
+ split_idx=group_idx,
171
+ use_df_index=True),
172
+ file_format=args.format)
173
+ if os.path.exists(f.name):
174
+ os.remove(f.name)
175
+
176
+ else:
177
+
178
+ split_tallies = _mutate_df_stream(input_file=args.input,
179
+ output_file=args.output,
180
+ function=partial(splitter,
181
+ split_type=args.type,
182
+ column=args.column,
183
+ input_representation=args.representation,
184
+ train=args.train,
185
+ test=args.test,
186
+ set_seed=args.seed),
187
+ file_format=args.format)
188
+
189
+ _sum_tally(split_tallies,
190
+ message="Split counts")
191
+
192
+ return None
193
+
194
+
195
+ @clicommand(message="Collating files with the following parameters")
196
+ def _collate(args: Namespace) -> None:
197
+
198
+ root_dir = args.data_dir or '.'
199
+
200
+ error_tallies = _mutate_df_stream(input_file=args.input,
201
+ output_file=args.output,
202
+ function=partial(collate_inventory,
203
+ root_dir=root_dir,
204
+ drop_unmapped=not args.keep_extra_columns,
205
+ catalog_smiles_column=args.column,
206
+ id_column_name=args.id_column,
207
+ id_n_digits=args.digits,
208
+ id_prefix=args.prefix),
209
+ file_format=args.format)
210
+
211
+ _sum_tally(error_tallies,
212
+ message="Collated chemicals:")
213
+
214
+ return None
215
+
216
+
217
+ @clicommand(message="Deduplicating chemical structures with the following parameters")
218
+ def _dedup(args: Namespace) -> None:
219
+
220
+ report, deduped_df = deduplicate_file(args.input,
221
+ format=args.format,
222
+ column=args.column,
223
+ input_representation=args.representation,
224
+ index_columns=args.indexes)
225
+
226
+ if args.prefix is not None and 'inchikey' in deduped_df:
227
+ deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})
228
+
229
+ write_stream(deduped_df,
230
+ output=args.output,
231
+ format=args.format)
232
+
233
+ pprint_dict(report, message="Finished deduplicating:")
234
+
235
+ return None
236
+
237
+
238
+ @clicommand(message="Enumerating peptides with the following parameters")
239
+ def _enum(args: Namespace) -> None:
240
+
241
+ tables = _peptide_table(max_length=args.max_length,
242
+ min_length=args.min_length,
243
+ n=args.number,
244
+ indexes=args.slice,
245
+ set_seed=args.seed,
246
+ prefix=args.prefix,
247
+ suffix=args.suffix,
248
+ d_aa_only=args.d_aa_only,
249
+ include_d_aa=args.include_d_aa,
250
+ generator=True)
251
+
252
+ dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
253
+ dAA_use = dAA_use or args.include_d_aa or args.d_aa_only
254
+
255
+ tallies, error_tallies = [], []
256
+ options = _option_parser(args.options)
257
+ _converter = partial(converter,
258
+ column='peptide_sequence',
259
+ input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
260
+ output_representation=args.to,
261
+ options=options)
262
+
263
+ for i, table in tenumerate(tables):
264
+
265
+ _err_tally, df = _converter(table)
266
+
267
+ tallies.append({"Number of peptides": df.shape[0]})
268
+ error_tallies.append(_err_tally)
269
+
270
+ write_stream(df,
271
+ output=args.output,
272
+ format=args.format,
273
+ mode='w' if i == 0 else 'a',
274
+ header=i == 0)
275
+
276
+ _sum_tally(tallies,
277
+ message="Enumerated peptides")
278
+ _sum_tally(error_tallies,
279
+ message="Conversion errors")
280
+
281
+ return None
282
+
283
+
284
+ @clicommand(message="Reacting peptides with the following parameters")
285
+ def _react(args: Namespace) -> None:
286
+
287
+ error_tallies = _mutate_df_stream(input_file=args.input,
288
+ output_file=args.output,
289
+ function=partial(reactor,
290
+ column=args.column,
291
+ input_representation=args.representation,
292
+ reaction=args.reaction,
293
+ product_name=args.name),
294
+ file_format=args.format)
295
+
296
+ _sum_tally(error_tallies)
297
+
298
+ return None
299
+
300
+
301
+ def main() -> None:
302
+
303
+ inputs = CLIOption('input',
304
+ default=sys.stdin,
305
+ type=FileType('r'),
306
+ nargs='?',
307
+ help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
308
+ representation = CLIOption('--representation', '-r',
309
+ type=str,
310
+ default='SMILES',
311
+ choices=upper_and_lower(_FROM_FUNCTIONS),
312
+ help='Chemical representation to use for input. ')
313
+ column = CLIOption('--column', '-c',
314
+ default='smiles',
315
+ type=str,
316
+ help='Column to use as input string representation. ')
317
+ prefix = CLIOption('--prefix', '-p',
318
+ default=None,
319
+ type=str,
320
+ help='Prefix to add to new column name. Default: no prefix')
321
+ to = CLIOption('--to', '-2',
322
+ type=str,
323
+ default='SMILES',
324
+ nargs='*',
325
+ choices=upper_and_lower(_TO_FUNCTIONS),
326
+ help='Format to convert to.')
327
+ options = CLIOption('--options', '-x',
328
+ type=str,
329
+ default=None,
330
+ nargs='*',
331
+ help='Options to pass to converter, in the format '
332
+ '"keyword1=value1 keyword2=value2"')
333
+ output = CLIOption('--output', '-o',
334
+ type=FileType('w'),
335
+ default=sys.stdout,
336
+ help='Output file. Default: STDOUT')
337
+ formatting = CLIOption('--format', '-f',
338
+ type=str,
339
+ default=None,
340
+ choices=upper_and_lower(get_formats()),
341
+ help='Override file extensions for input and output. '
342
+ 'Default: infer from file extension.')
343
+
344
+ ## featurize
345
+ id_feat = CLIOption('--id', '-i',
346
+ type=str,
347
+ default=None,
348
+ nargs='*',
349
+ help='Columns to retain in output table. Default: use all')
350
+ feature = CLIOption('--feature', '-t',
351
+ type=str,
352
+ default='2d',
353
+ choices=['2d', 'fp'], ## TODO: implement 3d
354
+ help='Which feature type to generate.')
355
+
356
+ ## split
357
+ type_ = CLIOption('--type', '-t',
358
+ type=str,
359
+ default='random',
360
+ choices=upper_and_lower(_SPLITTERS),
361
+ help='Which split type to use.')
362
+ train = CLIOption('--train', '-a',
363
+ type=float,
364
+ default=1.,
365
+ help='Proportion of data to use for training. ')
366
+ test = CLIOption('--test', '-b',
367
+ type=float,
368
+ default=0.,
369
+ help='Proportion of data to use for testing. ')
370
+
371
+ ## collate
372
+ data_dir = CLIOption('--data-dir', '-d',
373
+ type=str,
374
+ default=None,
375
+ help='Directory containing data files. '
376
+ 'Default: current directory')
377
+ id_column = CLIOption('--id-column', '-s',
378
+ default=None,
379
+ type=str,
380
+ help='If provided, add a structure ID column with this name. '
381
+ 'Default: don\'t add structure IDs')
382
+ prefix_collate = CLIOption('--prefix', '-p',
383
+ default='ID-',
384
+ type=str,
385
+ help='Prefix to add to structure IDs. '
386
+ 'Default: no prefix')
387
+ digits = CLIOption('--digits', '-n',
388
+ default=8,
389
+ type=int,
390
+ help='Number of digits in structure IDs. ')
391
+ keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
392
+ action='store_true',
393
+ help='Whether to keep columns not mentioned in the catalog. '
394
+ 'Default: drop extra columns.')
395
+ keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
396
+ action='store_true',
397
+ help='Whether to keep rows with invalid SMILES. '
398
+ 'Default: drop invalid rows.')
399
+
400
+ ## dedup
401
+ indexes = CLIOption('--indexes', '-x',
402
+ type=str,
403
+ default=None,
404
+ nargs='*',
405
+ help='Columns to retain and collapse (if multiple values per unique structure). '
406
+ 'Default: retain no other columns than structure and InchiKey.')
407
+ drop_inchikey = CLIOption('--drop-inchikey', '-d',
408
+ action='store_true',
409
+ help='Whether to drop the calculated InchiKey column. '
410
+ 'Default: keep InchiKey.')
411
+
412
+ ### enum
413
+ max_length = CLIOption('--max-length', '-l',
414
+ type=int,
415
+ help='Maximum length of enumerated peptide. '
416
+ 'Required.')
417
+ min_length = CLIOption('--min-length', '-m',
418
+ type=int,
419
+ default=None,
420
+ help='Minimum length of enumerated peptide. '
421
+ 'Default: same as maximum, i.e. all peptides same length.')
422
+ number_to_gen = CLIOption('--number', '-n',
423
+ type=float,
424
+ default=None,
425
+ help='Number of peptides to sample from all possible '
426
+ 'within the constraints. If less than 1, sample '
427
+ 'that fraction of all possible. If greater than 1, '
428
+ 'sample that number. '
429
+ 'Default: return all peptides.')
430
+ slicer = CLIOption('--slice', '-z',
431
+ type=str,
432
+ default=None,
433
+ nargs='*',
434
+ help='Subset of (possibly sampled) population to return, in the format <stop> '
435
+ 'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
436
+ 'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
437
+ '3 500 2 gives every other from 3 to 500. Default: return all.')
438
+ alphabet = CLIOption('--alphabet', '-b',
439
+ type=str,
440
+ default=''.join(AA),
441
+ help='Alphabet to use in sampling.')
442
+ suffix = CLIOption('--suffix', '-s',
443
+ type=str,
444
+ default='',
445
+ help='Sequence to add to end. Lowercase for D-amino acids. '
446
+ 'Default: no suffix.')
447
+ set_seed = CLIOption('--seed', '-e',
448
+ type=int,
449
+ default=None,
450
+ help='Seed to use for reproducible randomness. '
451
+ 'Default: don\'t enable reproducibility.')
452
+ d_aa_only = CLIOption('--d-aa-only', '-a',
453
+ action='store_true',
454
+ help='Whether to only use D-amino acids. '
455
+ 'Default: don\'t include.')
456
+ include_d_aa = CLIOption('--include-d-aa', '-y',
457
+ action='store_true',
458
+ help='Whether to include D-amino acids in enumeration. '
459
+ 'Default: don\'t include.')
460
+
461
+ ## reaction
462
+ name = CLIOption('--name', '-n',
463
+ type=str,
464
+ default=None,
465
+ help='Name of column for product. '
466
+ 'Default: same as reaction name.')
467
+ reaction_opt = CLIOption('--reaction', '-x',
468
+ type=str,
469
+ nargs='*',
470
+ choices=list(REACTIONS),
471
+ default='N_to_C_cyclization',
472
+ help='Reaction(s) to apply.')
473
+
474
+ clean = CLICommand('clean',
475
+ description='Clean and normalize SMILES column of a table.',
476
+ main=_clean,
477
+ options=[output, formatting, inputs, representation, column, prefix])
478
+ convert = CLICommand('convert',
479
+ description='Convert between string representations of chemical structures.',
480
+ main=_convert,
481
+ options=[output, formatting, inputs, representation, column, prefix, to, options])
482
+ featurize = CLICommand('featurize',
+ description='Calculate chemical features (2D descriptors or fingerprints) for chemical structures.',
484
+ main=_featurize,
485
+ options=[output, formatting, inputs, representation, column, prefix,
486
+ id_feat, feature])
487
+ collate = CLICommand('collate',
488
+ description='Collect disparate tables or SDF files of libraries into a single table.',
489
+ main=_collate,
490
+ options=[output, formatting, inputs, representation,
491
+ data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
492
+ digits, keep_extra_columns, keep_invalid_smiles])
493
+ dedup = CLICommand('dedup',
494
+ description='Deduplicate chemical structures and retain references.',
495
+ main=_dedup,
496
+ options=[output, formatting, inputs, representation, column, prefix,
497
+ indexes, drop_inchikey])
498
+ enum = CLICommand('enumerate',
499
+ description='Enumerate bio-chemical structures within length and sequence constraints.',
500
+ main=_enum,
501
+ options=[output, formatting, to, options,
502
+ alphabet, max_length, min_length, number_to_gen,
503
+ slicer, set_seed,
504
+ prefix.replace(default='',
505
+ help='Sequence to prepend. Lowercase for D-amino acids. '
506
+ 'Default: no prefix.'),
507
+ suffix,
508
+ type_.replace(default='aa',
509
+ choices=['aa'],
510
+ help='Type of bio sequence to enumerate. '
511
+ 'Default: %(default)s.'),
512
+ d_aa_only, include_d_aa])
513
+ reaction = CLICommand('react',
514
+ description='React compounds in silico in indicated columns using a named reaction.',
515
+ main=_react,
516
+ options=[output, formatting, inputs, representation, column, name,
517
+ reaction_opt])
518
+ split = CLICommand('split',
519
+ description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
520
+ main=_split,
521
+ options=[output, formatting, inputs, representation, column, prefix,
522
+ type_, train, test, set_seed])
523
+
524
+ app = CLIApp("schemist",
525
+ version=__version__,
526
+ description="Tools for cleaning, collating, and augmenting chemical datasets.",
527
+ commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])
528
+
529
+ app.run()
530
+
531
+ return None
532
+
533
+
534
+ if __name__ == "__main__":
535
+
536
+ main()
build/lib/schemist/collating.py ADDED
@@ -0,0 +1,315 @@
1
+ """Tools to collate chemical data files."""
2
+
3
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union
4
+
5
+ from collections import Counter
6
+ from functools import partial
7
+ from glob import glob
8
+ import os
9
+
10
+ from carabiner.pd import read_table, resolve_delim
11
+ from carabiner import print_err
12
+ import numpy as np
13
+ from pandas import DataFrame, concat
14
+
15
+ from .converting import convert_string_representation, _FROM_FUNCTIONS
16
+ from .io import FILE_READERS
17
+
18
+ GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
19
+ ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")
20
+
21
+ def _column_mapper(df: DataFrame,
22
+ cols: Iterable[str]) -> Tuple[Callable, Dict]:
23
+
24
+ basic_map = {column: df[column].tolist()[0] for column in cols}
25
+ inv_basic_map = {value: key for key, value in basic_map.items()}
26
+
27
+ def column_mapper(x: DataFrame) -> DataFrame:
28
+
29
+ new_df = DataFrame()
30
+
31
+ for new_col, old_col in basic_map.items():
32
+
33
+ # old_col = str(old_col)
34
+
35
+ if old_col is None or str(old_col) in ('None', 'nan', 'NA'):
36
+
37
+ new_df[new_col] = None
38
+
39
+ elif '+' in old_col:
40
+
41
+ splits = old_col.split('+')
42
+ new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
43
+ for s in splits[1:]])
44
+
45
+ elif ';' in old_col:
46
+
47
+ col, char, index = old_col.split(';')
48
+ index = [int(i) for i in index.split(':')]
49
+
50
+ if len(index) == 1:
51
+ index = slice(index[0], index[0] + 1)
52
+ else:
53
+ index = slice(*index)
54
+
55
+ try:
56
+
57
+ new_df[new_col] = (x[col]
58
+ .str.split(char)
59
+ .map(lambda y: char.join(y[index] if y is not np.nan else []))
60
+ .str.strip())
61
+
62
+ except TypeError as e:
63
+
64
+ print_err(x[col].str.split(char))
65
+
66
+ raise e
67
+
68
+ else:
69
+
70
+ new_df[new_col] = x[old_col].copy()
71
+
72
+ return new_df
73
+
74
+ return column_mapper, inv_basic_map
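To make the mapping patterns concrete, an illustrative sketch (invented column names and data, with `_column_mapper` as defined above in scope) of the `+` (concatenate) and `;` (split) specifications described in the `collate_inventory` docstring below:

```python
# Illustration only: invented catalog row and data table.
from pandas import DataFrame

catalog_row = DataFrame({
    "library_id": ["Barcode+Well"],   # '+' -> concatenate the Barcode and Well columns
    "supplier":   ["Vendor;-;0"],     # ';' -> split Vendor on '-' and keep field 0
})
mapper, _ = _column_mapper(catalog_row, ["library_id", "supplier"])

data = DataFrame({"Barcode": ["PL01"], "Well": ["A01"], "Vendor": ["Acme-UK"]})
print(mapper(data))
#   library_id supplier
# 0    PL01A01     Acme
```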
75
+
76
+
77
+ def _check_catalog(catalog: DataFrame,
78
+ catalog_smiles_column: str = 'input_smiles') -> None:
79
+
80
+ essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
81
+ missing_essential_cols = [col for col in essential_columns
82
+ if col not in catalog]
83
+
84
+ if len(missing_essential_cols) > 0:
85
+
86
+ print_err(catalog.columns.tolist())
87
+
88
+ raise KeyError("Missing required columns from catalog: " +
89
+ ", ".join(missing_essential_cols))
90
+
91
+ return None
92
+
93
+
94
+ def collate_inventory(catalog: DataFrame,
95
+ root_dir: Optional[str] = None,
96
+ drop_invalid: bool = True,
97
+ drop_unmapped: bool = False,
98
+ catalog_smiles_column: str = 'input_smiles',
99
+ id_column_name: Optional[str] = None,
100
+ id_n_digits: int = 8,
101
+ id_prefix: str = '') -> DataFrame:
102
+
103
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
104
+
105
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
106
+
107
+ - filename is a glob pattern of files to collate
108
+ - file_format is one of {', '.join(FILE_READERS.keys())}
109
+ - smiles_column contains smiles strings
110
+
111
+ Other columns are optional and can have any name, but must contain the name or a pattern
112
+ matching a column (for tabular data) or field (for SDF data) in the files
113
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
114
+
115
+ Optional column contents can be either concatenated or split using the following
116
+ pattern:
117
+
118
+ - col1+col2: concatenates the contents of `col1` and `col2`
119
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
120
+
121
+ Parameters
122
+ ----------
123
+ catalog : pd.DataFrame
124
+ Table cataloging locations and format of data. Requires
125
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
126
+ root_dir : str, optional
127
+ Path to look for data files. Default: current directory.
128
+ drop_invalid : bool, optional
129
+ Whether to drop rows containing invalid SMILES.
130
+
131
+
132
+ Returns
133
+ -------
134
+ pd.DataFrame
135
+ Collated chemical data.
136
+
137
+ """
138
+
139
+ root_dir = root_dir or '.'
140
+
141
+ _check_catalog(catalog, catalog_smiles_column)
142
+
143
+ nongroup_columns = [col for col in catalog
144
+ if col not in GROUPING_COLUMNS]
145
+ loaded_dataframes = []
146
+ report = Counter({"invalid SMILES": 0,
147
+ "rows processed": 0})
148
+
149
+ grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
150
+ for (this_glob, this_filetype,
151
+ this_library_name, this_representation), filename_df in grouped_catalog:
152
+
153
+ print_err(f'\nProcessing {this_glob}:')
154
+
155
+ this_glob = glob(os.path.join(root_dir, this_glob))
156
+
157
+ these_filenames = sorted(f for f in this_glob
158
+ if not os.path.basename(f).startswith('~$'))
159
+ print_err('\t- ' + '\n\t- '.join(these_filenames))
160
+
161
+ column_mapper, mapped_cols = _column_mapper(filename_df,
162
+ nongroup_columns)
163
+
164
+ reader = FILE_READERS[this_filetype]
165
+
166
+ for filename in these_filenames:
167
+
168
+ this_data0 = reader(filename)
169
+
170
+ if not drop_unmapped:
171
+ unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
172
+ for col in this_data0 if col not in mapped_cols}
173
+ this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
174
+ this_data = concat([column_mapper(this_data0), this_data],
175
+ axis=1)
176
+ else:
177
+ this_data = column_mapper(this_data0)
178
+
179
+ if this_representation.casefold() not in _FROM_FUNCTIONS:
180
+
181
+ raise TypeError(' or '.join(sorted({this_representation, this_representation.casefold()})) +
+ " is not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))
183
+
184
+ this_converter = partial(convert_string_representation,
185
+ input_representation=this_representation.casefold())
186
+
187
+ this_data = (this_data
188
+ .query('compound_collection != "NA"')
189
+ .assign(library_name=this_library_name,
190
+ input_file_format=this_filetype,
191
+ input_string_representation=this_representation,
192
+ plate_id=lambda x: x['plate_id'].astype(str),
193
+ plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
194
+ canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
195
+ is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))
196
+
197
+ report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
198
+ "rows processed": this_data.shape[0]})
199
+
200
+ if drop_invalid:
201
+
202
+ this_data = this_data.query('is_valid_smiles')
203
+
204
+ if id_column_name is not None:
205
+
206
+ this_converter = partial(convert_string_representation,
207
+ output_representation='id',
208
+ options=dict(n=id_n_digits,
209
+ prefix=id_prefix))
210
+ this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})
211
+
212
+ loaded_dataframes.append(this_data)
213
+
214
+ collated_df = concat(loaded_dataframes, axis=0)
215
+
216
+ return report, collated_df
217
+
218
+
219
+ def collate_inventory_from_file(catalog_path: Union[str, TextIO],
220
+ root_dir: Optional[str] = None,
221
+ format: Optional[str] = None,
222
+ *args, **kwargs) -> DataFrame:
223
+
224
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
225
+
226
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
227
+
228
+ - filename is a glob pattern of files to collate
229
+ - file_format is one of {', '.join(FILE_READERS.keys())}
230
+ - smiles_column contains smiles strings
231
+
232
+ Other columns are optional and can have any name, but must contain the name or a pattern
233
+ matching a column (for tabular data) or field (for SDF data) in the files
234
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
235
+
236
+ Optional column contents can be either concatenated or split using the following
237
+ pattern:
238
+
239
+ - col1+col2: concatenates the contents of `col1` and `col2`
240
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
241
+
242
+ Parameters
243
+ ----------
244
+ catalog_path : str
245
+ Path to catalog file in XLSX, TSV or CSV format. Requires
246
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
247
+ format : str, optional
248
+ Format of catalog file. Default: infer from file extension.
249
+ root_dir : str, optional
250
+ Path to look for data files. Default: use directory containing
251
+ the catalog.
252
+
253
+ Returns
254
+ -------
255
+ pd.DataFrame
256
+ Collated chemical data.
257
+
258
+ """
259
+
260
+ root_dir = root_dir or os.path.dirname(catalog_path)
261
+
262
+ data_catalog = read_table(catalog_path, format=format)
263
+
264
+ return collate_inventory(catalog=data_catalog,
265
+ root_dir=root_dir,
266
+ *args, **kwargs)
267
+
268
+
269
+ def deduplicate(df: DataFrame,
270
+ column: str = 'smiles',
271
+ input_representation: str = 'smiles',
272
+ index_columns: Optional[List[str]] = None,
273
+ drop_inchikey: bool = False) -> DataFrame:
274
+
275
+ index_columns = index_columns or []
276
+
277
+ inchikey_converter = partial(convert_string_representation,
278
+ input_representation=input_representation,
279
+ output_representation='inchikey')
280
+
281
+ df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))
282
+
283
+ structure_columns = [column, 'inchikey']
284
+ df_unique = []
285
+
286
+ for (string_rep, inchikey), structure_df in df.groupby(structure_columns):
287
+
288
+ collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
289
+ for col in structure_df if col in index_columns}
290
+ collapsed_indexes.update({column: [string_rep],
291
+ 'inchikey': [inchikey],
292
+ 'instance_count': [structure_df.shape[0]]})
293
+
294
+ df_unique.append(DataFrame(collapsed_indexes))
295
+
296
+ df_unique = concat(df_unique, axis=0)
297
+
298
+ if drop_inchikey:
299
+
300
+ df_unique = df_unique.drop(columns=['inchikey'])
301
+
302
+ report = {'starting rows:': df.shape[0],
303
+ 'ending_rows': df_unique.shape[0]}
304
+
305
+ return report, df_unique
306
+
307
+
308
+ def deduplicate_file(filename: Union[str, TextIO],
309
+ format: Optional[str] = None,
310
+ *args, **kwargs) -> DataFrame:
311
+
312
+ table = read_table(filename, format=format)
313
+
314
+ return deduplicate(table, *args, **kwargs)
315
+
build/lib/schemist/converting.py ADDED
@@ -0,0 +1,308 @@
1
+ """Converting between chemical representation formats."""
2
+
3
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
4
+
5
+ from functools import wraps
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast, flatten
9
+ from carabiner.decorators import return_none_on_error, vectorize
10
+ from carabiner.itertools import batched
11
+
12
+ from datamol import sanitize_smiles
13
+ import nemony as nm
14
+ from pandas import DataFrame
15
+ from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
16
+ MolFromSmiles, MolToInchi, MolToInchiKey,
17
+ MolToSmiles)
18
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
19
+ from requests import Session
20
+ import selfies as sf
21
+
22
+ from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name
23
+
24
+ @vectorize
25
+ @return_none_on_error
26
+ def _seq2mol(s: str) -> Union[Mol, None]:
27
+
28
+ return MolFromSequence(s, sanitize=True)
29
+
30
+
31
+ @vectorize
32
+ @return_none_on_error
33
+ def _helm2mol(s: str) -> Union[Mol, None]:
34
+
35
+ return MolFromHELM(s, sanitize=True)
36
+
37
+
38
+ def mini_helm2helm(s: str) -> str:
39
+
40
+ new_s = []
41
+ token = ''
42
+ between_sq_brackets = False
43
+
44
+ for letter in s:
45
+
46
+ if letter.islower() and not between_sq_brackets:
47
+
48
+ letter = f"[d{letter.upper()}]"
49
+
50
+ token += letter
51
+
52
+ if letter == '[':
53
+ between_sq_brackets = True
54
+ elif letter == ']':
55
+ between_sq_brackets = False
56
+
57
+ if not between_sq_brackets:
58
+ new_s.append(token)
59
+ token = ''
60
+
61
+ return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))
62
+
63
+
64
+ @vectorize
65
+ @return_none_on_error
66
+ def _mini_helm2mol(s: str) -> Mol:
67
+
68
+ s = mini_helm2helm(s)
69
+
70
+ return MolFromHELM(s, sanitize=True)
71
+
72
+
73
+ @vectorize
74
+ @return_none_on_error
75
+ def _inchi2mol(s: str) -> Mol:
76
+
77
+ return MolFromInchi(s,
78
+ sanitize=True,
79
+ removeHs=True)
80
+
81
+ @vectorize
82
+ # @return_none_on_error
83
+ def _smiles2mol(s: str) -> Mol:
84
+
85
+ return MolFromSmiles(sanitize_smiles(s))
86
+
87
+
88
+ @vectorize
89
+ @return_none_on_error
90
+ def _selfies2mol(s: str) -> Mol:
91
+
92
+ return MolFromSmiles(sf.decoder(s))
93
+
94
+
95
+ @vectorize
96
+ @return_none_on_error
97
+ def _mol2nonstandard_inchikey(m: Mol,
98
+ **kwargs) -> str:
99
+
100
+ return MolToInchiKey(m,
101
+ options="/FixedH /SUU /RecMet /KET /15T")
102
+
103
+
104
+ @vectorize
105
+ @return_none_on_error
106
+ def _mol2hash(m: Mol,
107
+ **kwargs) -> str:
108
+
109
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
110
+
111
+ return nm.hash(nonstandard_inchikey)
112
+
113
+
114
+ @vectorize
115
+ @return_none_on_error
116
+ def _mol2id(m: Mol,
117
+ n: int = 8,
118
+ prefix: str = '',
119
+ **kwargs) -> str:
120
+
121
+ return prefix + str(int(_mol2hash(m), 16))[:n]
122
+
123
+
124
+ @vectorize
125
+ @return_none_on_error
126
+ def _mol2isomeric_canonical_smiles(m: Mol,
127
+ **kwargs) -> str:
128
+
129
+ return MolToSmiles(m,
130
+ isomericSmiles=True,
131
+ canonical=True)
132
+
133
+
134
+ @vectorize
135
+ @return_none_on_error
136
+ def _mol2inchi(m: Mol,
137
+ **kwargs) -> str:
138
+
139
+ return MolToInchi(m)
140
+
141
+
142
+ @vectorize
143
+ @return_none_on_error
144
+ def _mol2inchikey(m: Mol,
145
+ **kwargs) -> str:
146
+
147
+ return MolToInchiKey(m)
148
+
149
+
150
+ @vectorize
151
+ @return_none_on_error
152
+ def _mol2random_smiles(m: Mol,
153
+ **kwargs) -> str:
154
+
155
+ return MolToSmiles(m,
156
+ isomericSmiles=True,
157
+ doRandom=True)
158
+
159
+
160
+ @vectorize
161
+ @return_none_on_error
162
+ def _mol2mnemonic(m: Mol,
163
+ **kwargs) -> str:
164
+
165
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
166
+
167
+ return nm.encode(nonstandard_inchikey)
168
+
169
+
170
+ def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
171
+ session: Optional[Session] = None,
172
+ chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:
173
+
174
+ inchikeys = cast(_mol2inchikey(m), to=list)
175
+ pubchem_ids = []
176
+
177
+ for _inchikeys in batched(inchikeys, chunksize):
178
+
179
+ these_ids = _inchikey2pubchem_name_id(_inchikeys,
180
+ session=session)
181
+ pubchem_ids += these_ids
182
+
183
+ return pubchem_ids
184
+
185
+
186
+ @return_none_on_error
187
+ def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
188
+ session: Optional[Session] = None,
189
+ chunksize: int = 32,
190
+ **kwargs) -> Union[str, List[str]]:
191
+
192
+ return flatten([val['pubchem_id']
193
+ for val in _mol2pubchem(m,
194
+ session=session,
195
+ chunksize=chunksize)])
196
+
197
+
198
+ @return_none_on_error
199
+ def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
200
+ session: Optional[Session] = None,
201
+ chunksize: int = 32,
202
+ **kwargs) -> Union[str, List[str]]:
203
+
204
+ return flatten([val['pubchem_name']
205
+ for val in _mol2pubchem(m,
206
+ session=session,
207
+ chunksize=chunksize)])
208
+
209
+ @return_none_on_error
210
+ def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
211
+ session: Optional[Session] = None,
212
+ **kwargs) -> Union[str, List[str]]:
213
+
214
+ return _inchikey2cactus_name(_mol2inchikey(m),
215
+ session=session)
216
+
217
+
218
+ @vectorize
219
+ @return_none_on_error
220
+ def _mol2scaffold(m: Mol,
221
+ chiral: bool = True,
222
+ **kwargs) -> str:
223
+
224
+ return MurckoScaffoldSmiles(mol=m,
225
+ includeChirality=chiral)
226
+
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def _mol2selfies(m: Mol,
231
+ **kwargs) -> str:
232
+
233
+ s = sf.encoder(_mol2isomeric_canonical_smiles(m))
234
+
235
+ return s if s != -1 else None
236
+
237
+
238
+ _TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
239
+ "selfies": _mol2selfies,
240
+ "inchi": _mol2inchi,
241
+ "inchikey": _mol2inchikey,
242
+ "nonstandard_inchikey": _mol2nonstandard_inchikey,
243
+ "hash": _mol2hash,
244
+ "mnemonic": _mol2mnemonic,
245
+ "id": _mol2id,
246
+ "scaffold": _mol2scaffold,
247
+ "permuted_smiles": _mol2random_smiles,
248
+ "pubchem_id": _mol2pubchem_id,
249
+ "pubchem_name": _mol2pubchem_name,
250
+ "cactus_name": _mol2cactus_name}
251
+
252
+ _FROM_FUNCTIONS = {"smiles": _smiles2mol,
253
+ "selfies": _selfies2mol,
254
+ "inchi": _inchi2mol,
255
+ "aa_seq": _seq2mol,
256
+ "helm": _helm2mol,
257
+ "minihelm": _mini_helm2mol}
258
+
259
+
260
+ def _x2mol(strings: Union[Iterable[str], str],
261
+ input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
262
+
263
+ from_function = _FROM_FUNCTIONS[input_representation.casefold()]
264
+
265
+ return from_function(strings)
266
+
267
+
268
+ def _mol2x(mols: Union[Iterable[Mol], Mol],
269
+ output_representation: str = 'smiles',
270
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
271
+
272
+ to_function = _TO_FUNCTIONS[output_representation.casefold()]
273
+
274
+ return to_function(mols, **kwargs)
275
+
276
+
277
+ def convert_string_representation(strings: Union[Iterable[str], str],
278
+ input_representation: str = 'smiles',
279
+ output_representation: str = 'smiles',
280
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
281
+
282
+ """Convert between string representations of chemical structures.
283
+
284
+ """
285
+
286
+ mols = _x2mol(strings, input_representation)
287
+ # print_err(mols)
288
+ outstrings = _mol2x(mols, output_representation, **kwargs)
289
+ # print_err(outstrings)
290
+
291
+ return outstrings
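A brief usage sketch (assuming RDKit is installed); the function accepts a single string or an iterable of strings:

```python
# Sketch only; the exact output string depends on the installed RDKit version.
scaffold = convert_string_representation(
    "Cc1ccc(cc1)C(=O)O",
    input_representation="smiles",
    output_representation="scaffold",   # Murcko scaffold SMILES via _mol2scaffold above
)
print(scaffold)
```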
292
+
293
+
294
+ def _convert_input_to_smiles(f: Callable) -> Callable:
295
+
296
+ @wraps(f)
297
+ def _f(strings: Union[Iterable[str], str],
298
+ input_representation: str = 'smiles',
299
+ *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
300
+
301
+ smiles = convert_string_representation(strings,
302
+ output_representation='smiles',
303
+ input_representation=input_representation)
304
+
305
+ return f(strings=smiles,
306
+ *args, **kwargs)
307
+
308
+ return _f
build/lib/schemist/features.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools for generating chemical features."""
2
+
3
+ from typing import Any, Callable, Iterable, Optional, Union
4
+
5
+ from descriptastorus.descriptors import MakeGenerator
6
+ from pandas import DataFrame, Series
7
+ import numpy as np
8
+ from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
9
+
10
+ from .converting import _smiles2mol, _convert_input_to_smiles
11
+
12
+ def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
13
+
14
+ def _f(prefix: Optional[str] = None,
15
+ *args, **kwargs) -> DataFrame:
16
+
17
+ feature_matrix = f(*args, **kwargs)
18
+
19
+ if prefix is not None:
20
+
21
+ new_cols = {col: f"{prefix}_{col}"
22
+ for col in feature_matrix.columns
23
+ if not col.startswith('_meta')}
24
+ feature_matrix = feature_matrix.rename(columns=new_cols)
25
+
26
+ return feature_matrix
27
+
28
+ return _f
29
+
30
+
31
+ def _get_descriptastorus_features(smiles: Iterable[str],
32
+ generator: str) -> DataFrame:
33
+
34
+ generator = MakeGenerator((generator, ))
35
+ smiles = Series(smiles)
36
+
37
+ features = smiles.apply(lambda z: np.array(generator.process(z)))
38
+ matrix = np.stack(features.values, axis=0)
39
+
40
+ return DataFrame(matrix,
41
+ index=smiles.index,
42
+ columns=[col for col, _ in generator.GetColumns()])
43
+
44
+
45
+ @_feature_matrix
46
+ @_convert_input_to_smiles
47
+ def calculate_2d_features(strings: Union[Iterable[str], str],
48
+ normalized: bool = True,
49
+ histogram_normalized: bool = True) -> DataFrame:
50
+
51
+ """Calculate 2d features from string representation.
52
+
53
+ """
54
+
55
+ if normalized:
56
+ if histogram_normalized:
57
+ generator_name = "RDKit2DHistogramNormalized"
58
+ else:
59
+ generator_name = "RDKit2DNormalized"
60
+ else:
61
+ generator_name = "RDKit2D"
62
+
63
+ feature_matrix = _get_descriptastorus_features(strings,
64
+ generator=generator_name)
65
+
66
+ feature_matrix = (feature_matrix
67
+ .rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
68
+ .assign(meta_feature_type=generator_name,
69
+ meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
70
+ .drop(columns=['meta_feature_valid0']))
71
+
72
+ return feature_matrix
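A usage sketch (assuming descriptastorus is installed); note that because of the `_feature_matrix` wrapper, the first positional argument is `prefix`, so `strings` is passed by keyword here:

```python
# Illustration only; descriptor columns come from descriptastorus's RDKit2D generators.
features = calculate_2d_features(
    strings=["CCO", "c1ccccc1O"],
    prefix="rd2d",                 # optional prefix prepended to output column names
)
print(features.filter(like="meta_feature").head())
```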
73
+
74
+
75
+ def _fast_fingerprint(generator: FingeprintGenerator64,
76
+ mol: Mol,
77
+ to_np: bool = True) -> Union[str, np.ndarray]:
78
+
79
+ try:
80
+ fp_string = generator.GetFingerprint(mol).ToBitString()
81
+ except:
82
+ return None
83
+ else:
84
+ if to_np:
85
+ return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
86
+ else:
87
+ return fp_string
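A sketch of what the helper above returns in each mode (arbitrary molecule; the import path follows this module's own imports):

```python
# Illustration only; assumes RDKit is installed and _fast_fingerprint above is in scope.
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganGenerator

gen = GetMorganGenerator(radius=2)
mol = MolFromSmiles("CCO")

bits = _fast_fingerprint(gen, mol, to_np=True)        # numpy array of 0/1 values
print(bits.shape, int(bits.sum()))                    # e.g. (2048,) and the number of set bits

print(_fast_fingerprint(gen, mol, to_np=False)[:32])  # start of the raw '0'/'1' bit string
```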
88
+
89
+
90
+ @_feature_matrix
91
+ @_convert_input_to_smiles
92
+ def calculate_fingerprints(strings: Union[Iterable[str], str],
93
+ fp_type: str = 'morgan',
94
+ radius: int = 2,
95
+ chiral: bool = True,
96
+ on_bits: bool = True) -> DataFrame:
97
+
98
+ """
99
+
100
+ """
101
+
102
+ if fp_type.casefold() == 'morgan':
103
+ generator_class = GetMorganGenerator
104
+ else:
105
+ raise AttributeError(f"Fingerprint type {fp_type} not supported!")
106
+
107
+ fp_generator = generator_class(radius=radius,
108
+ includeChirality=chiral)
109
+ mols = (_smiles2mol(s) for s in strings)
110
+ fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
111
+ for mol in mols)
112
+
113
+ if on_bits:
114
+
115
+ fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
116
+ for fp_string in fp_strings)
117
+ fingerprints = [';'.join(fp) for fp in fingerprints]
118
+ validity = [len(fp) > 0 for fp in fingerprints]
119
+
120
+ feature_matrix = DataFrame(fingerprints,
121
+ columns=['fp_bits'])
122
+
123
+ else:
124
+
125
+ fingerprints = [np.array([int(digit) for digit in fp_string])
126
+ if fp_string is not None
127
+ else (-np.ones((fp_generator.GetOptions().fpSize, )))
128
+ for fp_string in fp_strings]
129
+ validity = [np.all(fp >= 0) for fp in fingerprints]
130
+
131
+ feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
132
+ columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
133
+
134
+ return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
135
+ meta_feature_valid=validity)
136
+
137
+
138
+ _FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
139
+
140
+ def calculate_feature(feature_type: str,
141
+ *args, **kwargs):
142
+
143
+ """
144
+
145
+ """
146
+
147
+ featurizer = _FEATURE_CALCULATORS[feature_type]
148
+
149
+ return featurizer(*args, **kwargs)
build/lib/schemist/generating.py ADDED
@@ -0,0 +1,262 @@
1
+ """Tools for enumerating compounds. Currently only works with peptides."""
2
+
3
+ from typing import Callable, Iterable, Optional, Tuple, Union
4
+
5
+ from functools import partial
6
+ from itertools import chain, islice, product, repeat
7
+ from math import ceil, expm1, floor
8
+ from random import choice, choices, random, seed
9
+
10
+ from carabiner import print_err
11
+ from carabiner.decorators import vectorize, return_none_on_error
12
+ from carabiner.random import sample_iter
13
+ from rdkit.Chem import Mol, rdChemReactions
14
+ import numpy as np
15
+
16
+ from .converting import (_x2mol, _mol2x,
17
+ _convert_input_to_smiles)
18
+
19
+ AA = tuple('GALVITSMCPFYWHKRDENQ')
20
+ dAA = tuple(aa.casefold() for aa in AA)
21
+
22
+ REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
23
+ 'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
24
+ 'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
25
+
26
+ def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
27
+ d_aa_only: bool = False,
28
+ include_d_aa: bool = False) -> Tuple[str]:
29
+
30
+ alphabet = alphabet or AA
31
+ alphabet_lower = tuple(set(aa.casefold() for aa in AA))
32
+
33
+ if d_aa_only:
34
+ alphabet = alphabet_lower
35
+ elif include_d_aa:
36
+ alphabet = tuple(set(chain(alphabet, alphabet_lower)))
37
+
38
+ return alphabet
39
+
40
+
41
+
42
+ def all_peptides_of_one_length(length: int,
43
+ alphabet: Optional[Iterable[str]] = None,
44
+ d_aa_only: bool = False,
45
+ include_d_aa: bool = False) -> Iterable[str]:
46
+
47
+ """
48
+
49
+ """
50
+
51
+ alphabet = _get_alphabet(alphabet=alphabet,
52
+ d_aa_only=d_aa_only,
53
+ include_d_aa=include_d_aa)
54
+
55
+ return (''.join(peptide)
56
+ for peptide in product(alphabet, repeat=length))
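For a sense of scale, a small sketch of the exhaustive generator above with the default 20-letter alphabet defined at the top of this module:

```python
# Illustration only: 20**3 = 8000 possible tripeptides with the default alphabet.
from itertools import islice

tripeptides = all_peptides_of_one_length(3)
print(list(islice(tripeptides, 5)))   # first five, in itertools.product order
```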
57
+
58
+
59
+ def all_peptides_in_length_range(max_length: int,
60
+ min_length: int = 1,
61
+ by: int = 1,
62
+ alphabet: Optional[Iterable[str]] = None,
63
+ d_aa_only: bool = False,
64
+ include_d_aa: bool = False,
65
+ *args, **kwargs) -> Iterable[str]:
66
+
67
+ """
68
+
69
+ """
70
+
71
+ length_range = range(*sorted([min_length, max_length + 1]), by)
72
+ peptide_maker = partial(all_peptides_of_one_length,
73
+ alphabet=alphabet,
74
+ d_aa_only=d_aa_only,
75
+ include_d_aa=include_d_aa,
76
+ *args, **kwargs)
77
+
78
+ return chain.from_iterable(peptide_maker(length=length)
79
+ for length in length_range)
80
+
81
+
82
+ def _number_of_peptides(max_length: int,
83
+ min_length: int = 1,
84
+ by: int = 1,
85
+ alphabet: Optional[Iterable[str]] = None,
86
+ d_aa_only: bool = False,
87
+ include_d_aa: bool = False):
88
+
89
+ alphabet = _get_alphabet(alphabet=alphabet,
90
+ d_aa_only=d_aa_only,
91
+ include_d_aa=include_d_aa)
92
+ n_peptides = [len(alphabet) ** length
93
+ for length in range(*sorted([min_length, max_length + 1]), by)]
94
+
95
+ return n_peptides
96
+
97
+
98
+ def _naive_sample_peptides_in_length_range(max_length: int,
99
+ min_length: int = 1,
100
+ by: int = 1,
101
+ n: Optional[Union[float, int]] = None,
102
+ alphabet: Optional[Iterable[str]] = None,
103
+ d_aa_only: bool = False,
104
+ include_d_aa: bool = False,
105
+ set_seed: Optional[int] = None):
106
+
107
+ alphabet = _get_alphabet(alphabet=alphabet,
108
+ d_aa_only=d_aa_only,
109
+ include_d_aa=include_d_aa)
110
+ n_peptides = _number_of_peptides(max_length=max_length,
111
+ min_length=min_length,
112
+ by=by,
113
+ alphabet=alphabet,
114
+ d_aa_only=d_aa_only,
115
+ include_d_aa=include_d_aa)
116
+ lengths = list(range(*sorted([min_length, max_length + 1]), by))
117
+ weight_per_length = [n / min(n_peptides) for n in n_peptides]
118
+ weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
119
+
120
+ lengths_sample = (choice(weighted_lengths) for _ in range(n))
121
+ return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
122
+
123
+
124
+ def sample_peptides_in_length_range(max_length: int,
125
+ min_length: int = 1,
126
+ by: int = 1,
127
+ n: Optional[Union[float, int]] = None,
128
+ alphabet: Optional[Iterable[str]] = None,
129
+ d_aa_only: bool = False,
130
+ include_d_aa: bool = False,
131
+ naive_sampling_cutoff: float = 5e-3,
132
+ reservoir_sampling: bool = True,
133
+ indexes: Optional[Iterable[int]] = None,
134
+ set_seed: Optional[int] = None,
135
+ *args, **kwargs) -> Iterable[str]:
136
+
137
+ """
138
+
139
+ """
140
+
141
+ seed(set_seed)
142
+
143
+ alphabet = _get_alphabet(alphabet=alphabet,
144
+ d_aa_only=d_aa_only,
145
+ include_d_aa=include_d_aa)
146
+
147
+ n_peptides = sum(len(alphabet) ** length
148
+ for length in range(*sorted([min_length, max_length + 1]), by))
149
+ if n is None:
150
+ n_requested = n_peptides
151
+ elif n >= 1.:
152
+ n_requested = min(floor(n), n_peptides)
153
+ elif n < 1.:
154
+ n_requested = floor(n * n_peptides)
155
+
156
+ frac_requested = n_requested / n_peptides
157
+
158
+ # approximation of birthday problem
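+ # P(any collision) ~ 1 - exp(-k*(k-1)/(2*N)) for k uniform draws from N possibilities; expm1 keeps precision when the exponent is tiny.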
159
+ p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
160
+ n_collisions = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
161
+ frac_collisions = n_collisions / n_requested
162
+
163
+ print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
164
+ f"length {min_length} to {max_length} ({n_peptides} combinations). "
165
+ f"Probability of collision if drawing randomly is {p_any_collision}, "
166
+ f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
167
+
168
+ if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
169
+
170
+ print_err("> Executing naive sampling. ")
171
+
172
+ peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
173
+ n=n_requested,
174
+ alphabet=alphabet,
175
+ d_aa_only=d_aa_only,
176
+ include_d_aa=include_d_aa)
177
+
178
+ else:
179
+
180
+ print_err("> Executing exhaustive sampling.")
181
+
182
+ all_peptides = all_peptides_in_length_range(max_length, min_length, by,
183
+ alphabet=alphabet,
184
+ d_aa_only=d_aa_only,
185
+ include_d_aa=include_d_aa,
186
+ *args, **kwargs)
187
+
188
+ if n is None:
189
+
190
+ peptides = all_peptides
191
+
192
+ elif n >= 1.:
193
+
194
+ if reservoir_sampling:
195
+ peptides = sample_iter(all_peptides, k=n_requested,
196
+ shuffle_output=False)
197
+ else:
198
+ peptides = (pep for pep in all_peptides
199
+ if random() <= frac_requested)
200
+
201
+ elif n < 1.:
202
+
203
+ peptides = (pep for pep in all_peptides
204
+ if random() <= n)
205
+
206
+ if indexes is not None:
207
+
208
+ indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
209
+ else None
210
+ for ix in islice(indexes, 3))
211
+ indexes = [ix if (ix is None or ix >= 0) else None
212
+ for ix in indexes]
213
+
214
+ if len(indexes) > 1:
215
+ if n is not None and n >=1. and indexes[0] > n:
216
+ raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
217
+
218
+ peptides = islice(peptides, *indexes)
219
+
220
+ return peptides
221
+
222
+
223
+ def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
224
+
225
+ rxn = rdChemReactions.ReactionFromSmarts(smarts)
226
+ reaction_function = rxn.RunReactants
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def reactor(s: Mol) -> Mol:
231
+
232
+ return reaction_function([s])[0][0]
233
+
234
+ return reactor
235
+
236
+
237
+ @_convert_input_to_smiles
238
+ def react(strings: Union[str, Iterable[str]],
239
+ reaction: str = 'N_to_C_cyclization',
240
+ output_representation: str = 'smiles',
241
+ **kwargs) -> Union[str, Iterable[str]]:
242
+
243
+ """
244
+
245
+ """
246
+
247
+ try:
248
+ _this_reaction = REACTIONS[reaction]
249
+ except KeyError:
250
+ raise KeyError(f"Reaction {reaction} is not available. Try: " +
251
+ ", ".join(list(REACTIONS)))
252
+
253
+ # strings = cast(strings, to=list)
254
+ # print_err((strings))
255
+
256
+ reactor = _reactor(_this_reaction)
257
+ mols = _x2mol(strings)
258
+ mols = reactor(mols)
259
+
260
+ return _mol2x(mols,
261
+ output_representation=output_representation,
262
+ **kwargs)
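A minimal usage sketch of the enumeration and reaction helpers above (illustrative only; assumes the installed package exposes them as `schemist.generating`, that RDKit is available, and that the `_convert_input_to_smiles` decorator accepts SMILES input by default):

    from schemist.generating import all_peptides_of_one_length, sample_peptides_in_length_range, react

    # Exhaustive enumeration: 20**2 = 400 dipeptides over the canonical L-amino acid alphabet.
    dipeptides = list(all_peptides_of_one_length(2))

    # Reproducibly sample about 100 peptides of length 3-5; naive vs exhaustive sampling is chosen internally.
    sampled = list(sample_peptides_in_length_range(max_length=5, min_length=3, n=100, set_seed=42))

    # Head-to-tail cyclization of triglycine via the 'N_to_C_cyclization' SMARTS; output defaults to SMILES.
    cyclic = react("NCC(=O)NCC(=O)NCC(=O)O")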
build/lib/schemist/io.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools to facilitate input and output."""
2
+
3
+ from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
4
+
5
+ from collections import defaultdict
+ from io import TextIOWrapper  # used by read_weird_xml when casting file-like inputs
6
+ from functools import partial
7
+ from string import printable
8
+ from tempfile import NamedTemporaryFile
9
+ from xml.etree import ElementTree
10
+
11
+ from carabiner import print_err
12
+ from carabiner.cast import cast
13
+ from carabiner.itertools import tenumerate
14
+ from carabiner.pd import read_table, write_stream
15
+
16
+ from pandas import DataFrame, read_excel
17
+ from rdkit.Chem import SDMolSupplier
18
+
19
+ from .converting import _mol2isomeric_canonical_smiles
20
+
21
+ def _mutate_df_stream(input_file: Union[str, TextIO],
22
+ output_file: Union[str, TextIO],
23
+ function: Callable[[DataFrame], Tuple[Any, DataFrame]],
24
+ file_format: Optional[str] = None,
25
+ chunksize: int = 1000) -> List[Any]:
26
+
27
+ carries = []
28
+
29
+ for i, chunk in tenumerate(read_table(input_file,
30
+ format=file_format,
31
+ progress=False,
32
+ chunksize=chunksize)):
33
+
34
+ result = function(chunk)
35
+
36
+ try:
37
+ carry, df = result
38
+ except ValueError:
39
+ df = result
40
+ carry = 0
41
+
42
+ write_stream(df,
43
+ output=output_file,
44
+ format=file_format,
45
+ header=i == 0,
46
+ mode='w' if i == 0 else 'a')
47
+
48
+ carries.append(carry)
49
+
50
+ return carries
51
+
52
+
53
+ def read_weird_xml(filename: Union[str, TextIO],
54
+ header: bool = True,
55
+ namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
56
+
57
+ """
58
+
59
+ """
60
+
61
+ with cast(filename, TextIOWrapper, mode='r') as f:
62
+
63
+ xml_string = ''.join(filter(printable.__contains__, f.read()))
64
+
65
+ try:
66
+
67
+ root = ElementTree.fromstring(xml_string)
68
+
69
+ except Exception as e:
70
+
71
+ print_err('\n!!! ' + xml_string.split('\n')[1184][377:380])
72
+
73
+ raise e
74
+
75
+ for i, row in enumerate(root.iter(f'{namespace}Row') ):
76
+
77
+ this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
78
+
79
+ if i == 0:
80
+
81
+ if header:
82
+
83
+ heading = this_row
84
+ df = {colname: [] for colname in heading}
85
+
86
+ else:
87
+
88
+ heading = [f'X{j}' for j, _ in enumerate(this_row)]
89
+ df = {colname: [datum] for colname, datum in zip(heading, this_row)}
90
+
91
+ else:
92
+
93
+ for colname, datum in zip(heading, this_row):
94
+
95
+ df[colname].append(datum)
96
+
97
+ return DataFrame(df)
98
+
99
+
100
+ def read_sdf(filename: Union[str, TextIO]):
101
+
102
+ """
103
+
104
+ """
105
+
106
+ filename = cast(filename, str)
107
+
108
+ with open(filename, 'r', errors='replace') as f:
109
+ with NamedTemporaryFile("w") as o:
110
+
111
+ o.write(f.read())
112
+ o.seek(0)
113
+
114
+ df = defaultdict(list)
115
+
116
+ for i, mol in enumerate(SDMolSupplier(o.name)):
117
+
118
+ if mol is None:
119
+
120
+ continue
121
+
122
+ propdict = mol.GetPropsAsDict()
123
+ propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
124
+
125
+ for colname in propdict:
126
+
127
+ df[colname].append(propdict[colname])
128
+
129
+ for colname in df:
130
+
131
+ if colname not in propdict:
132
+
133
+ df[colname].append(None)
134
+
135
+ col_lengths = {col: len(val) for col, val in df.items()}
136
+
137
+ if len(set(col_lengths.values())) > 1:
138
+
139
+ raise ValueError(f"Column lengths not all the same:\n\t" +
140
+ '\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
141
+
142
+ return DataFrame(df)
143
+
144
+
145
+ FILE_READERS = {
146
+ 'bad_xml': read_weird_xml,
147
+ 'xlsx': partial(read_excel, engine='openpyxl'),
148
+ 'sdf': read_sdf
149
+ }
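A sketch of driving the chunked reader/writer defined above (file names and the `smiles` column are hypothetical; assumes carabiner's `read_table` infers the CSV format from the extension). The mapped function may return either a DataFrame or a `(carry, DataFrame)` tuple:

    from pandas import DataFrame
    from schemist.io import _mutate_df_stream

    def add_smiles_length(df: DataFrame):
        # Carry a per-chunk tally alongside the transformed chunk.
        return {"rows": df.shape[0]}, df.assign(smiles_length=df["smiles"].str.len())

    tallies = _mutate_df_stream("compounds.csv", "with_lengths.csv", add_smiles_length, chunksize=500)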
build/lib/schemist/rest_lookup.py ADDED
@@ -0,0 +1,118 @@
1
+ """Tools for querying PubChem."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Union
4
+ from time import sleep
5
+ from xml.etree import ElementTree
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast
9
+ from carabiner.decorators import vectorize
10
+ from requests import Response, Session
11
+
12
+ _PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
13
+ _CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
14
+
15
+ _OVERLOAD_CODES = {500, 501, 503, 504}
16
+
17
+
18
+ def _url_request(inchikeys: Union[str, Iterable[str]],
19
+ url: str,
20
+ session: Optional[Session] = None,
21
+ **kwargs) -> Response:
22
+
23
+ if session is None:
24
+ session = Session()
25
+
26
+ inchikeys = cast(inchikeys, to=list)
27
+
28
+ return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
29
+
30
+
31
+ def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
32
+ session: Optional[Session] = None,
33
+ counter: int = 0,
34
+ max_tries: int = 10,
35
+ namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
36
+
37
+ r = _url_request(inchikeys, url=_PUBCHEM_URL,
38
+ session=session,
39
+ get="Title,InchiKey", format="XML")
40
+
41
+ if r.status_code == 200:
42
+
43
+ root = ElementTree.fromstring(r.text)
44
+ compounds = root.iter(f'{namespace}Properties')
45
+
46
+ result_dict = dict()
47
+
48
+ for cmpd in compounds:
49
+
50
+ cmpd_dict = dict()
51
+
52
+ for child in cmpd:
53
+ cmpd_dict[child.tag.split(namespace)[1]] = child.text
54
+
55
+ try:
56
+ inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
57
+ except KeyError:
58
+ print(cmpd_dict)
59
+ else:
60
+ result_dict[inchikey] = {'pubchem_name': name.casefold(),
61
+ 'pubchem_id': pcid}
62
+
63
+ print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
64
+
65
+ result_list = [result_dict[inchikey]
66
+ if inchikey in result_dict
67
+ else {'pubchem_name': None, 'pubchem_id': None}
68
+ for inchikey in inchikeys]
69
+
70
+ return result_list
71
+
72
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
73
+
74
+ sleep(1.)
75
+
76
+ return _inchikey2pubchem_name_id(inchikeys,
77
+ session=session,
78
+ counter=counter + 1,
79
+ max_tries=max_tries,
80
+ namespace=namespace)
81
+
82
+ else:
83
+
84
+ print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
85
+
86
+ return [{'pubchem_name': None, 'pubchem_id': None}
87
+ for _ in range(len(inchikeys))]
88
+
89
+
90
+ @vectorize
91
+ def _inchikey2cactus_name(inchikeys: str,
92
+ session: Optional[Session] = None,
93
+ counter: int = 0,
94
+ max_tries: int = 10):
95
+
96
+ r = _url_request(inchikeys, url=_CACTUS_URL,
97
+ session=session,
98
+ get="names")
99
+
100
+ if r.status_code == 200:
101
+
102
+ return r.text.split('\n')[0].casefold()
103
+
104
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
105
+
106
+ sleep(1.)
107
+
108
+ return _inchikey2cactus_name(inchikeys,
109
+ session=session,
110
+ counter=counter + 1,
111
+ max_tries=max_tries)
112
+
113
+ else:
114
+
115
+ print_err(f'Cactus: InchiKey {inchikeys} gave status {r.status_code}')
116
+
117
+ return None
118
+
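An illustrative call to the PubChem helper above (assumes the module is importable as `schemist.rest_lookup`; requires network access, and the exact returned strings depend on PubChem). The InChIKey shown is caffeine's:

    from requests import Session
    from schemist.rest_lookup import _inchikey2pubchem_name_id

    records = _inchikey2pubchem_name_id(["RYYVLZVUVIJVGH-UHFFFAOYSA-N"], session=Session())
    # On success: [{'pubchem_name': 'caffeine', 'pubchem_id': '2519'}]; on failure the fields are None.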
build/lib/schemist/splitting.py ADDED
@@ -0,0 +1,204 @@
1
+ """Tools for splitting tabular datasets, optionally based on chemical features."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+ from collections import defaultdict
5
+ from math import ceil
6
+ from random import random, seed
7
+
8
+ try:
9
+ from itertools import batched
10
+ except ImportError:
11
+ from carabiner.itertools import batched
12
+
13
+ from tqdm.auto import tqdm
14
+
15
+ from .converting import convert_string_representation, _convert_input_to_smiles
16
+ from .typing import DataSplits
17
+
18
+ # def _train_test_splits
19
+
20
+ def _train_test_val_sizes(total: int,
21
+ train: float = 1.,
22
+ test: float = 0.) -> Tuple[int]:
23
+
24
+ n_train = int(ceil(train * total))
25
+ n_test = int(ceil(test * total))
26
+ n_val = total - n_train - n_test
27
+
28
+ return n_train, n_test, n_val
29
+
30
+
31
+ def _random_chunk(strings: str,
32
+ train: float = 1.,
33
+ test: float = 0.,
34
+ carry: Optional[Dict[str, List[int]]] = None,
35
+ start_from: int = 0) -> Dict[str, List[int]]:
36
+
37
+ carry = carry or defaultdict(list)
38
+
39
+ train_test: float = train + test
40
+
41
+ for i, _ in enumerate(strings):
42
+
43
+ random_number: float = random()
44
+
45
+ if random_number < train:
46
+
47
+ key = 'train'
48
+
49
+ elif random_number < train_test:
50
+
51
+ key = 'test'
52
+
53
+ else:
54
+
55
+ key = 'validation'
56
+
57
+ carry[key].append(start_from + i)
58
+
59
+ return carry
60
+
61
+
62
+ def split_random(strings: Union[str, Iterable[str]],
63
+ train: float = 1.,
64
+ test: float = 0.,
65
+ chunksize: Optional[int] = None,
66
+ set_seed: Optional[int] = None,
67
+ *args, **kwargs) -> DataSplits:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if set_seed is not None:
74
+
75
+ seed(set_seed)
76
+
77
+
78
+ if chunksize is None:
79
+
80
+ idx = _random_chunk(strings=strings,
81
+ train=train,
82
+ test=test)
83
+
84
+ else:
85
+
86
+ idx = defaultdict(list)
87
+
88
+ for i, chunk in enumerate(batched(strings, chunksize)):
89
+
90
+ idx = _random_chunk(strings=chunk,
91
+ train=train,
92
+ test=test,
93
+ carry=idx,
94
+ start_from=i * chunksize)
95
+
96
+ seed(None)
97
+
98
+ return DataSplits(**idx)
99
+
100
+
101
+ @_convert_input_to_smiles
102
+ def _scaffold_chunk(strings: str,
103
+ carry: Optional[Dict[str, List[int]]] = None,
104
+ start_from: int = 0) -> Dict[str, List[int]]:
105
+
106
+ carry = carry or defaultdict(list)
107
+
108
+ these_scaffolds = convert_string_representation(strings=strings,
109
+ output_representation='scaffold')
110
+
111
+ for j, scaff in enumerate(these_scaffolds):
112
+ carry[scaff].append(start_from + j)
113
+
114
+ return carry
115
+
116
+
117
+ def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
118
+ train: float = 1.,
119
+ test: float = 0.,
120
+ progress: bool = False) -> DataSplits:
121
+
122
+ scaffold_sets = {key: sorted(value)
123
+ for key, value in scaffold_sets.items()}
124
+ scaffold_sets = sorted(scaffold_sets.items(),
125
+ key=lambda x: (len(x[1]), x[1][0]),
126
+ reverse=True)
127
+ nrows = sum(len(idx) for _, idx in scaffold_sets)
128
+ n_train, n_test, n_val = _train_test_val_sizes(nrows,
129
+ train,
130
+ test)
131
+ idx = defaultdict(list)
132
+
133
+ iterator = tqdm(scaffold_sets) if progress else scaffold_sets
134
+ for _, scaffold_idx in iterator:
135
+
136
+ if (len(idx['train']) + len(scaffold_idx)) > n_train:
137
+
138
+ if (len(idx['test']) + len(scaffold_idx)) > n_test:
139
+
140
+ key = 'validation'
141
+
142
+ else:
143
+
144
+ key = 'test'
145
+ else:
146
+
147
+ key = 'train'
148
+
149
+ idx[key] += scaffold_idx
150
+
151
+ return DataSplits(**idx)
152
+
153
+
154
+ def split_scaffold(strings: Union[str, Iterable[str]],
155
+ train: float = 1.,
156
+ test: float = 0.,
157
+ chunksize: Optional[int] = None,
158
+ progress: bool = True) -> DataSplits:
159
+
160
+ """
161
+
162
+ """
163
+
164
+ if chunksize is None:
165
+
166
+ scaffold_sets = _scaffold_chunk(strings)
167
+
168
+ else:
169
+
170
+ scaffold_sets = defaultdict(list)
171
+
172
+ for i, chunk in enumerate(batched(strings, chunksize)):
173
+
174
+ scaffold_sets = _scaffold_chunk(chunk,
175
+ carry=scaffold_sets,
176
+ start_from=i * chunksize)
177
+
178
+ return _scaffold_aggregator(scaffold_sets,
179
+ train=train, test=test,
180
+ progress=progress)
181
+
182
+
183
+ _SPLITTERS = {#'simpd': split_simpd,
184
+ 'scaffold': split_scaffold,
185
+ 'random': split_random}
186
+
187
+ # _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
188
+ # 'random': 'independent'}
189
+
190
+ _GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}
191
+
192
+ assert all(_type in _SPLITTERS
193
+ for _type in _GROUPED_SPLITTERS) ## Should never fail!
194
+
195
+ def split(split_type: str,
196
+ *args, **kwargs) -> DataSplits:
197
+
198
+ """
199
+
200
+ """
201
+
202
+ splitter = _SPLITTERS[split_type]
203
+
204
+ return splitter(*args, **kwargs)
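A small sketch of the splitting API defined above (SMILES chosen arbitrarily; the scaffold split additionally relies on the structure conversion machinery in `schemist.converting`):

    from schemist.splitting import split

    smiles = ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "CCN"]

    # Random 60/20/20 split of row indices, reproducible via set_seed.
    splits = split("random", strings=smiles, train=0.6, test=0.2, set_seed=1)
    print(splits.train, splits.test, splits.validation)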
build/lib/schemist/tables.py ADDED
@@ -0,0 +1,220 @@
1
+ """Tools for processing tabular data."""
2
+
3
+ from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
4
+ from functools import partial
5
+
6
+ try:
7
+ from itertools import batched
8
+ except ImportError:
9
+ from carabiner.itertools import batched
10
+
11
+ from carabiner.cast import cast, clist
12
+ from carabiner import print_err
13
+ from pandas import DataFrame, concat
14
+
15
+ from .cleaning import clean_smiles, clean_selfies
16
+ from .converting import convert_string_representation
17
+ from .features import calculate_feature
18
+ from .generating import sample_peptides_in_length_range, react
19
+ from .splitting import split
20
+ from .typing import DataSplits
21
+
22
+ def _get_error_tally(df: DataFrame,
23
+ cols: Union[str, List[str]]) -> Dict[str, int]:
24
+
25
+ cols = cast(cols, to=list)
26
+
27
+ try:
28
+ tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
29
+ except TypeError:
30
+ tally = {col: df[col].isna().sum() for col in cols}
31
+
32
+ return tally
33
+
34
+
35
+ def converter(df: DataFrame,
36
+ column: str = 'smiles',
37
+ input_representation: str = 'smiles',
38
+ output_representation: Union[str, List[str]] = 'smiles',
39
+ prefix: Optional[str] = None,
40
+ options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:
41
+
42
+ """
43
+
44
+ """
45
+
46
+ prefix = prefix or ''
+ options = options or {}
47
+
48
+ converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
49
+ output_representation=rep_out,
50
+ input_representation=input_representation,
51
+ **options)
52
+ for rep_out in cast(output_representation, to=list)}
53
+
54
+ column_values = df[column]
55
+
56
+ converted = {col: cast(f(column_values), to=list)
57
+ for col, f in converters.items()}
58
+
59
+ df = df.assign(**converted)
60
+
61
+ return _get_error_tally(df, list(converters)), df
62
+
63
+
64
+ def cleaner(df: DataFrame,
65
+ column: str = 'smiles',
66
+ input_representation: str = 'smiles',
67
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if input_representation.casefold() == 'smiles':
74
+ cleaner = clean_smiles
75
+ elif input_representation.casefold() == 'selfies':
76
+ cleaner = clean_selfies
77
+ else:
78
+ raise ValueError(f"Representation {input_representation} is not supported for cleaning.")
79
+
80
+ prefix = prefix or ''
81
+ new_column = f"{prefix}{column}"
82
+
83
+ df = df.assign(**{new_column: lambda x: cleaner(x[column])})
84
+
85
+ return _get_error_tally(df, new_column), df
86
+
87
+
88
+ def featurizer(df: DataFrame,
89
+ feature_type: str,
90
+ column: str = 'smiles',
91
+ ids: Optional[Union[str, List[str]]] = None,
92
+ input_representation: str = 'smiles',
93
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
94
+
95
+ """
96
+
97
+ """
98
+
99
+ if ids is None:
100
+ ids = df.columns.tolist()
101
+ else:
102
+ ids = cast(ids, to=list)
103
+
104
+ feature_df = calculate_feature(feature_type=feature_type,
105
+ strings=df[column],
106
+ prefix=prefix,
107
+ input_representation=input_representation)
108
+
109
+ if len(ids) > 0:
110
+ df = concat([df[ids], feature_df], axis=1)
111
+
112
+ return _get_error_tally(feature_df, 'meta_feature_valid'), df
113
+
114
+
115
+ def assign_groups(df: DataFrame,
116
+ grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
117
+ group_name: str = 'group',
118
+ column: str = 'smiles',
119
+ input_representation: str = 'smiles',
120
+ *args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:
121
+
122
+ group_idx = grouper(strings=df[column],
123
+ input_representation=input_representation,
124
+ *args, **kwargs)
125
+
126
+ inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
127
+ groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]
128
+
129
+ return group_idx, df.assign(**{group_name: groups})
130
+
131
+
132
+ def _assign_splits(df: DataFrame,
133
+ split_idx: DataSplits,
134
+ use_df_index: bool = False) -> DataFrame:
135
+
136
+ row_index = df.index if use_df_index else tuple(range(df.shape[0]))
137
+
138
+ df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
139
+ for key in split_idx._fields})
140
+ split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}
141
+
142
+ return split_counts, df
143
+
144
+
145
+ def splitter(df: DataFrame,
146
+ split_type: str = 'random',
147
+ column: str = 'smiles',
148
+ input_representation: str = 'smiles',
149
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
150
+
151
+ """
152
+
153
+ """
154
+
155
+ split_idx = split(split_type=split_type,
156
+ strings=df[column],
157
+ input_representation=input_representation,
158
+ *args, **kwargs)
159
+
160
+ return _assign_splits(df, split_idx=split_idx)
161
+
162
+
163
+ def reactor(df: DataFrame,
164
+ column: str = 'smiles',
165
+ reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
166
+ prefix: Optional[str] = None,
167
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
168
+
169
+ """
170
+
171
+ """
172
+
173
+ prefix = prefix or ''
174
+
175
+ reactors = {col: partial(react, reaction=col)
176
+ for col in cast(reaction, to=list)}
177
+
178
+ column_values = df[column]
179
+
180
+ new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
181
+ for col, _reactor in reactors.items()}
182
+
183
+ df = df.assign(**new_columns)
184
+
185
+ return _get_error_tally(df, list(new_columns)), df
186
+
187
+
188
+ def _peptide_table(max_length: int,
189
+ min_length: Optional[int] = None,
190
+ by: int = 1,
191
+ n: Optional[Union[float, int]] = None,
192
+ prefix: str = '',
193
+ suffix: str = '',
194
+ generator: bool = False,
195
+ batch_size: int = 1000,
196
+ *args, **kwargs) -> Union[DataFrame, Generator]:
197
+
198
+ min_length = min_length or max_length
199
+
200
+ peptides = sample_peptides_in_length_range(max_length=max_length,
201
+ min_length=min_length,
202
+ by=by,
203
+ n=n,
204
+ *args, **kwargs)
205
+
206
+ if generator:
207
+
208
+ for peps in batched(peptides, batch_size):
209
+
210
+ peps = [f"{prefix}{pep}{suffix}"
211
+ for pep in peps]
212
+
213
+ yield DataFrame(dict(peptide_sequence=peps))
214
+
215
+ else:
216
+
217
+ peps = [f"{prefix}{pep}{suffix}"
218
+ for pep in peptides]
219
+
220
+ return DataFrame(dict(peptide_sequence=peps))
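A hedged sketch of the table-level wrappers above (exact output values depend on `convert_string_representation` in `schemist.converting`):

    from pandas import DataFrame
    from schemist.tables import converter

    df = DataFrame({"smiles": ["CCO", "c1ccccc1"]})
    tally, out = converter(df, column="smiles", output_representation="inchikey", options={})
    # `tally` counts failed conversions per new column; `out` gains an 'inchikey' column.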
build/lib/schemist/typing.py ADDED
@@ -0,0 +1,7 @@
1
+ """Types used in schemist."""
2
+
3
+ from collections import namedtuple
4
+
5
+ DataSplits = namedtuple('DataSplits',
6
+ ['train', 'test', 'validation'],
7
+ defaults=[tuple(), tuple(), tuple()])
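For reference, the namedtuple above defaults every field to an empty tuple:

    from schemist.typing import DataSplits

    splits = DataSplits(train=(0, 1, 2), test=(3,))
    splits.validation  # -> ()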
build/lib/schemist/utils.py ADDED
@@ -0,0 +1 @@
1
+ """Miscellaneous utilities for schemist."""
docs/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ myst_parser
2
+ matplotlib
3
+ numpy
4
+ openpyxl==3.1.0
5
+ pandas
6
+ scipy
7
+ sphinx_rtd_theme
8
+ ./
docs/source/conf.py ADDED
@@ -0,0 +1,45 @@
1
+ # Configuration file for the Sphinx documentation builder.
2
+ #
3
+ # For the full list of built-in configuration values, see the documentation:
4
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
5
+
6
+ # -- Project information -----------------------------------------------------
7
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8
+
9
+ project = 'schemist'
10
+ copyright = '2024, Eachan Johnson'
11
+ author = 'Eachan Johnson'
12
+ release = '0.0.1'
13
+
14
+ # -- General configuration ---------------------------------------------------
15
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
16
+
17
+ extensions = ['sphinx.ext.doctest',
18
+ 'sphinx.ext.autodoc',
19
+ 'sphinx.ext.autosummary',
20
+ 'sphinx.ext.napoleon',
21
+ 'sphinx.ext.viewcode',
22
+ 'myst_parser']
23
+
24
+ myst_enable_extensions = [
25
+ "amsmath",
26
+ "dollarmath",
27
+ ]
28
+
29
+ source_suffix = {
30
+ '.rst': 'restructuredtext',
31
+ '.txt': 'markdown',
32
+ '.md': 'markdown',
33
+ }
34
+
35
+
36
+ templates_path = ['_templates']
37
+ exclude_patterns = []
38
+
39
+
40
+
41
+ # -- Options for HTML output -------------------------------------------------
42
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
43
+
44
+ html_theme = 'sphinx_rtd_theme'
45
+ html_static_path = []
docs/source/index.md ADDED
@@ -0,0 +1,21 @@
1
+ # ⬢⬢⬢ schemist
2
+
3
+ ![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/schemist/python-publish.yml)
4
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/schemist)
5
+ ![PyPI](https://img.shields.io/pypi/v/schemist)
6
+
7
+ Cleaning, collating, and augmenting chemical datasets.
8
+
9
+ ```{toctree}
10
+ :maxdepth: 2
11
+ :caption: Contents:
12
+
13
+ installation
14
+ usage
15
+ python
16
+ modules
17
+ ```
18
+
19
+ ## Source
20
+
21
+ [GitHub](https://github.com/scbirlab/schemist)
pyproject.toml ADDED
@@ -0,0 +1,60 @@
1
+ [project]
2
+ name = "schemist"
3
+ version = "0.0.1"
4
+ authors = [
5
+ { name="Eachan Johnson", email="[email protected]" },
6
+ ]
7
+ description = "Organizing and processing tables of chemical structures."
8
+ readme = "README.md"
9
+ requires-python = ">=3.8"
10
+ license = {file = "LICENSE"}
11
+ keywords = ["science", "chemistry", "SMILES", "SELFIES", "cheminformatics"]
12
+
13
+ classifiers = [
14
+
15
+ "Development Status :: 3 - Alpha",
16
+
17
+ # Indicate who your project is intended for
18
+ "Intended Audience :: Science/Research",
19
+ "Topic :: Scientific/Engineering :: Chemistry",
20
+
21
+ "License :: OSI Approved :: MIT License",
22
+
23
+ "Programming Language :: Python :: 3.8",
24
+ "Programming Language :: Python :: 3.9",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
27
+ "Programming Language :: Python :: 3 :: Only",
28
+ ]
29
+
30
+ dependencies = [
31
+ "git+https://github.com/scbirlab/carabiner.git",
32
+ "datamol",
33
+ "descriptastorus",
34
+ "nemony",
35
+ "openpyxl==3.1.0",
36
+ "pandas",
37
+ "rdkit",
38
+ "requests",
39
+ "selfies"
40
+ ]
41
+
42
+ [project.urls]
43
+ "Homepage" = "https://github.com/scbirlab/schemist"
44
+ "Repository" = "https://github.com/scbirlab/schemist.git"
45
+ "Bug Tracker" = "https://github.com/scbirlab/schemist/issues"
46
+ "Documentation" = "https://readthedocs.org/schemist"
47
+
48
+ [project.scripts] # Optional
49
+ schemist = "schemist.cli:main"
50
+
51
+ [tool.setuptools]
52
+ # If there are data files included in your packages that need to be
53
+ # installed, specify them here.
54
+ # package-data = {"" = ["*.yml"]}
55
+
56
+ [build-system]
57
+ # These are the assumed default build requirements from pip:
58
+ # https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
59
+ requires = ["setuptools>=43.0.0", "wheel"]
60
+ build-backend = "setuptools.build_meta"
schemist/__init__.py ADDED
File without changes
schemist/cleaning.py ADDED
@@ -0,0 +1,27 @@
1
+ """Chemical structure cleaning routines."""
2
+
3
+ from carabiner.decorators import vectorize
4
+
5
+ from datamol import sanitize_smiles
6
+ import selfies as sf
7
+
8
+ @vectorize
9
+ def clean_smiles(smiles: str,
10
+ *args, **kwargs) -> str:
11
+
12
+ """Sanitize a SMILES string or list of SMILES strings.
13
+
14
+ """
15
+
16
+ return sanitize_smiles(smiles, *args, **kwargs)
17
+
18
+
19
+ @vectorize
20
+ def clean_selfies(selfies: str,
21
+ *args, **kwargs) -> str:
22
+
23
+ """Sanitize a SELFIES string or list of SELFIES strings.
24
+
25
+ """
26
+
27
+ return sf.encode(sanitize_smiles(sf.decode(selfies), *args, **kwargs))
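A quick sketch of the vectorized cleaners above (results come from `datamol.sanitize_smiles`, which typically returns None for structures it cannot sanitize):

    from schemist.cleaning import clean_smiles

    clean_smiles("C1=CC=CC=C1")          # single string in, sanitized SMILES out (e.g. 'c1ccccc1')
    clean_smiles(["CCO", "c1ccccc1O"])   # lists are handled by the @vectorize decorator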
schemist/cli.py ADDED
@@ -0,0 +1,536 @@
1
+ """Command-line interface for schemist."""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from argparse import FileType, Namespace
6
+ from collections import Counter, defaultdict
7
+ from functools import partial
8
+ import os
9
+ import sys
10
+ from tempfile import NamedTemporaryFile, TemporaryDirectory
11
+
12
+ from carabiner import pprint_dict, upper_and_lower, print_err
13
+ from carabiner.cliutils import clicommand, CLIOption, CLICommand, CLIApp
14
+ from carabiner.itertools import tenumerate
15
+ from carabiner.pd import get_formats, write_stream
16
+
17
+ from .collating import collate_inventory, deduplicate_file
18
+ from .converting import _TO_FUNCTIONS, _FROM_FUNCTIONS
19
+ from .generating import AA, REACTIONS
20
+ from .io import _mutate_df_stream
21
+ from .tables import (converter, cleaner, featurizer, assign_groups,
22
+ _assign_splits, splitter, _peptide_table, reactor)
23
+ from .splitting import _SPLITTERS, _GROUPED_SPLITTERS
24
+
25
+ __version__ = '0.0.1'
26
+
27
+ def _option_parser(x: Optional[List[str]]) -> Dict[str, Any]:
28
+
29
+ options = {}
30
+
31
+ try:
32
+ for opt in x:
33
+
34
+ try:
35
+ key, value = opt.split('=')
36
+ except ValueError:
37
+ raise ValueError(f"Option {opt} is misformatted. It should be in the format keyword=value.")
38
+
39
+ try:
40
+ value = int(value)
41
+ except ValueError:
42
+ try:
43
+ value = float(value)
44
+ except ValueError:
45
+ pass
46
+
47
+ options[key] = value
48
+
49
+ except TypeError:
50
+
51
+ pass
52
+
53
+ return options
54
+
55
+
56
+ def _sum_tally(tallies: List[Dict[str, Any]],
57
+ message: str = "Error counts",
58
+ use_length: bool = False):
59
+
60
+ total_tally = Counter()
61
+
62
+ for tally in tallies:
63
+
64
+ if use_length:
65
+ total_tally.update({key: len(value) for key, value in tally.items()})
66
+ else:
67
+ total_tally.update(tally)
68
+
69
+ if len(tallies) == 0:
70
+ raise ValueError(f"Nothing generated!")
71
+
72
+ pprint_dict(total_tally, message=message)
73
+
74
+ return total_tally
75
+
76
+
77
+ @clicommand(message="Cleaning file with the following parameters")
78
+ def _clean(args: Namespace) -> None:
79
+
80
+ error_tallies = _mutate_df_stream(input_file=args.input,
81
+ output_file=args.output,
82
+ function=partial(cleaner,
83
+ column=args.column,
84
+ input_representation=args.representation,
85
+ prefix=args.prefix),
86
+ file_format=args.format)
87
+
88
+ _sum_tally(error_tallies)
89
+
90
+ return None
91
+
92
+
93
+ @clicommand(message="Converting between string representations with the following parameters")
94
+ def _convert(args: Namespace) -> None:
95
+
96
+ options = _option_parser(args.options)
97
+
98
+ error_tallies = _mutate_df_stream(input_file=args.input,
99
+ output_file=args.output,
100
+ function=partial(converter,
101
+ column=args.column,
102
+ input_representation=args.representation,
103
+ output_representation=args.to,
104
+ prefix=args.prefix,
105
+ options=options),
106
+ file_format=args.format)
107
+
108
+ _sum_tally(error_tallies)
109
+
110
+ return None
111
+
112
+
113
+ @clicommand(message="Adding features to files with the following parameters")
114
+ def _featurize(args: Namespace) -> None:
115
+
116
+ error_tallies = _mutate_df_stream(input_file=args.input,
117
+ output_file=args.output,
118
+ function=partial(featurizer,
119
+ feature_type=args.feature,
120
+ column=args.column,
121
+ ids=args.id,
122
+ input_representation=args.representation,
123
+ prefix=args.prefix),
124
+ file_format=args.format)
125
+
126
+ _sum_tally(error_tallies)
127
+
128
+ return None
129
+
130
+
131
+ @clicommand(message="Splitting table with the following parameters")
132
+ def _split(args: Namespace) -> None:
133
+
134
+ split_type = args.type.casefold()
135
+
136
+ if split_type in _GROUPED_SPLITTERS:
137
+
138
+ chunk_processor, aggregator = _GROUPED_SPLITTERS[split_type]
139
+
140
+ with TemporaryDirectory() as dir:
141
+
142
+ with NamedTemporaryFile("w", dir=dir, delete=False) as f:
143
+
144
+ group_idxs = _mutate_df_stream(input_file=args.input,
145
+ output_file=f,
146
+ function=partial(assign_groups,
147
+ grouper=chunk_processor,
148
+ group_name=split_type,
149
+ column=args.column,
150
+ input_representation=args.representation),
151
+ file_format=args.format)
152
+ f.close()
153
+ new_group_idx = defaultdict(list)
154
+
155
+ totals = 0
156
+ for group_idx in group_idxs:
157
+ these_totals = 0
158
+ for key, value in group_idx.items():
159
+ these_totals += len(value)
160
+ new_group_idx[key] += [idx + totals for idx in value]
161
+ totals += these_totals
162
+
163
+ group_idx = aggregator(new_group_idx,
164
+ train=args.train,
165
+ test=args.test)
166
+
167
+ split_tallies = _mutate_df_stream(input_file=f.name,
168
+ output_file=args.output,
169
+ function=partial(_assign_splits,
170
+ split_idx=group_idx,
171
+ use_df_index=True),
172
+ file_format=args.format)
173
+ if os.path.exists(f.name):
174
+ os.remove(f.name)
175
+
176
+ else:
177
+
178
+ split_tallies = _mutate_df_stream(input_file=args.input,
179
+ output_file=args.output,
180
+ function=partial(splitter,
181
+ split_type=args.type,
182
+ column=args.column,
183
+ input_representation=args.representation,
184
+ train=args.train,
185
+ test=args.test,
186
+ set_seed=args.seed),
187
+ file_format=args.format)
188
+
189
+ _sum_tally(split_tallies,
190
+ message="Split counts")
191
+
192
+ return None
193
+
194
+
195
+ @clicommand(message="Collating files with the following parameters")
196
+ def _collate(args: Namespace) -> None:
197
+
198
+ root_dir = args.data_dir or '.'
199
+
200
+ error_tallies = _mutate_df_stream(input_file=args.input,
201
+ output_file=args.output,
202
+ function=partial(collate_inventory,
203
+ root_dir=root_dir,
204
+ drop_unmapped=not args.keep_extra_columns,
205
+ catalog_smiles_column=args.column,
206
+ id_column_name=args.id_column,
207
+ id_n_digits=args.digits,
208
+ id_prefix=args.prefix),
209
+ file_format=args.format)
210
+
211
+ _sum_tally(error_tallies,
212
+ message="Collated chemicals:")
213
+
214
+ return None
215
+
216
+
217
+ @clicommand(message="Deduplicating chemical structures with the following parameters")
218
+ def _dedup(args: Namespace) -> None:
219
+
220
+ report, deduped_df = deduplicate_file(args.input,
221
+ format=args.format,
222
+ column=args.column,
223
+ input_representation=args.representation,
224
+ index_columns=args.indexes,
+ drop_inchikey=args.drop_inchikey)
225
+
226
+ if args.prefix is not None and 'inchikey' in deduped_df:
227
+ deduped_df = deduped_df.rename(columns={'inchikey': f'{args.prefix}inchikey'})
228
+
229
+ write_stream(deduped_df,
230
+ output=args.output,
231
+ format=args.format)
232
+
233
+ pprint_dict(report, message="Finished deduplicating:")
234
+
235
+ return None
236
+
237
+
238
+ @clicommand(message="Enumerating peptides with the following parameters")
239
+ def _enum(args: Namespace) -> None:
240
+
241
+ tables = _peptide_table(max_length=args.max_length,
242
+ min_length=args.min_length,
243
+ n=args.number,
244
+ indexes=args.slice,
245
+ set_seed=args.seed,
246
+ prefix=args.prefix,
247
+ suffix=args.suffix,
248
+ d_aa_only=args.d_aa_only,
249
+ include_d_aa=args.include_d_aa,
250
+ alphabet=args.alphabet,
+ generator=True)
251
+
252
+ dAA_use = any(aa.islower() for aa in args.prefix + args.suffix)
253
+ dAA_use = dAA_use or args.include_d_aa or args.d_aa_only
254
+
255
+ tallies, error_tallies = [], []
256
+ options = _option_parser(args.options)
257
+ _converter = partial(converter,
258
+ column='peptide_sequence',
259
+ input_representation='minihelm' if dAA_use else 'aa_seq', ## affects performance
260
+ output_representation=args.to,
261
+ options=options)
262
+
263
+ for i, table in tenumerate(tables):
264
+
265
+ _err_tally, df = _converter(table)
266
+
267
+ tallies.append({"Number of peptides": df.shape[0]})
268
+ error_tallies.append(_err_tally)
269
+
270
+ write_stream(df,
271
+ output=args.output,
272
+ format=args.format,
273
+ mode='w' if i == 0 else 'a',
274
+ header=i == 0)
275
+
276
+ _sum_tally(tallies,
277
+ message="Enumerated peptides")
278
+ _sum_tally(error_tallies,
279
+ message="Conversion errors")
280
+
281
+ return None
282
+
283
+
284
+ @clicommand(message="Reacting peptides with the following parameters")
285
+ def _react(args: Namespace) -> None:
286
+
287
+ error_tallies = _mutate_df_stream(input_file=args.input,
288
+ output_file=args.output,
289
+ function=partial(reactor,
290
+ column=args.column,
291
+ input_representation=args.representation,
292
+ reaction=args.reaction,
293
+ product_name=args.name),
294
+ file_format=args.format)
295
+
296
+ _sum_tally(error_tallies)
297
+
298
+ return None
299
+
300
+
301
+ def main() -> None:
302
+
303
+ inputs = CLIOption('input',
304
+ default=sys.stdin,
305
+ type=FileType('r'),
306
+ nargs='?',
307
+ help='Input columnar Excel, CSV or TSV file. Default: STDIN.')
308
+ representation = CLIOption('--representation', '-r',
309
+ type=str,
310
+ default='SMILES',
311
+ choices=upper_and_lower(_FROM_FUNCTIONS),
312
+ help='Chemical representation to use for input. ')
313
+ column = CLIOption('--column', '-c',
314
+ default='smiles',
315
+ type=str,
316
+ help='Column to use as input string representation. ')
317
+ prefix = CLIOption('--prefix', '-p',
318
+ default=None,
319
+ type=str,
320
+ help='Prefix to add to new column name. Default: no prefix')
321
+ to = CLIOption('--to', '-2',
322
+ type=str,
323
+ default='SMILES',
324
+ nargs='*',
325
+ choices=upper_and_lower(_TO_FUNCTIONS),
326
+ help='Format to convert to.')
327
+ options = CLIOption('--options', '-x',
328
+ type=str,
329
+ default=None,
330
+ nargs='*',
331
+ help='Options to pass to converter, in the format '
332
+ '"keyword1=value1 keyword2=value2"')
333
+ output = CLIOption('--output', '-o',
334
+ type=FileType('w'),
335
+ default=sys.stdout,
336
+ help='Output file. Default: STDOUT')
337
+ formatting = CLIOption('--format', '-f',
338
+ type=str,
339
+ default=None,
340
+ choices=upper_and_lower(get_formats()),
341
+ help='Override file extensions for input and output. '
342
+ 'Default: infer from file extension.')
343
+
344
+ ## featurize
345
+ id_feat = CLIOption('--id', '-i',
346
+ type=str,
347
+ default=None,
348
+ nargs='*',
349
+ help='Columns to retain in output table. Default: use all')
350
+ feature = CLIOption('--feature', '-t',
351
+ type=str,
352
+ default='2d',
353
+ choices=['2d', 'fp'], ## TODO: implement 3d
354
+ help='Which feature type to generate.')
355
+
356
+ ## split
357
+ type_ = CLIOption('--type', '-t',
358
+ type=str,
359
+ default='random',
360
+ choices=upper_and_lower(_SPLITTERS),
361
+ help='Which split type to use.')
362
+ train = CLIOption('--train', '-a',
363
+ type=float,
364
+ default=1.,
365
+ help='Proportion of data to use for training. ')
366
+ test = CLIOption('--test', '-b',
367
+ type=float,
368
+ default=0.,
369
+ help='Proportion of data to use for testing. ')
370
+
371
+ ## collate
372
+ data_dir = CLIOption('--data-dir', '-d',
373
+ type=str,
374
+ default=None,
375
+ help='Directory containing data files. '
376
+ 'Default: current directory')
377
+ id_column = CLIOption('--id-column', '-s',
378
+ default=None,
379
+ type=str,
380
+ help='If provided, add a structure ID column with this name. '
381
+ 'Default: don\'t add structure IDs')
382
+ prefix_collate = CLIOption('--prefix', '-p',
383
+ default='ID-',
384
+ type=str,
385
+ help='Prefix to add to structure IDs. '
386
+ 'Default: no prefix')
387
+ digits = CLIOption('--digits', '-n',
388
+ default=8,
389
+ type=int,
390
+ help='Number of digits in structure IDs. ')
391
+ keep_extra_columns = CLIOption('--keep-extra-columns', '-x',
392
+ action='store_true',
393
+ help='Whether to keep columns not mentioned in the catalog. '
394
+ 'Default: drop extra columns.')
395
+ keep_invalid_smiles = CLIOption('--keep-invalid-smiles', '-y',
396
+ action='store_true',
397
+ help='Whether to keep rows with invalid SMILES. '
398
+ 'Default: drop invalid rows.')
399
+
400
+ ## dedup
401
+ indexes = CLIOption('--indexes', '-x',
402
+ type=str,
403
+ default=None,
404
+ nargs='*',
405
+ help='Columns to retain and collapse (if multiple values per unique structure). '
406
+ 'Default: retain no other columns than structure and InchiKey.')
407
+ drop_inchikey = CLIOption('--drop-inchikey', '-d',
408
+ action='store_true',
409
+ help='Whether to drop the calculated InchiKey column. '
410
+ 'Default: keep InchiKey.')
411
+
412
+ ### enum
413
+ max_length = CLIOption('--max-length', '-l',
414
+ type=int,
415
+ help='Maximum length of enumerated peptide. '
416
+ 'Required.')
417
+ min_length = CLIOption('--min-length', '-m',
418
+ type=int,
419
+ default=None,
420
+ help='Minimum length of enumerated peptide. '
421
+ 'Default: same as maximum, i.e. all peptides same length.')
422
+ number_to_gen = CLIOption('--number', '-n',
423
+ type=float,
424
+ default=None,
425
+ help='Number of peptides to sample from all possible '
426
+ 'within the constraints. If less than 1, sample '
427
+ 'that fraction of all possible. If greater than 1, '
428
+ 'sample that number. '
429
+ 'Default: return all peptides.')
430
+ slicer = CLIOption('--slice', '-z',
431
+ type=str,
432
+ default=None,
433
+ nargs='*',
434
+ help='Subset of (possibly sampled) population to return, in the format <stop> '
435
+ 'or <start> <stop> [<step>]. If "x" is used for <stop>, then it runs to the end. '
436
+ 'For example, 1000 gives the first 1000, 2 600 gives items 2-600, and '
437
+ '3 500 2 gives every other from 3 to 500. Default: return all.')
438
+ alphabet = CLIOption('--alphabet', '-b',
439
+ type=str,
440
+ default=''.join(AA),
441
+ help='Alphabet to use in sampling.')
442
+ suffix = CLIOption('--suffix', '-s',
443
+ type=str,
444
+ default='',
445
+ help='Sequence to add to end. Lowercase for D-amino acids. '
446
+ 'Default: no suffix.')
447
+ set_seed = CLIOption('--seed', '-e',
448
+ type=int,
449
+ default=None,
450
+ help='Seed to use for reproducible randomness. '
451
+ 'Default: don\'t enable reproducibility.')
452
+ d_aa_only = CLIOption('--d-aa-only', '-a',
453
+ action='store_true',
454
+ help='Whether to only use D-amino acids. '
455
+ 'Default: don\'t include.')
456
+ include_d_aa = CLIOption('--include-d-aa', '-y',
457
+ action='store_true',
458
+ help='Whether to include D-amino acids in enumeration. '
459
+ 'Default: don\'t include.')
460
+
461
+ ## reaction
462
+ name = CLIOption('--name', '-n',
463
+ type=str,
464
+ default=None,
465
+ help='Name of column for product. '
466
+ 'Default: same as reaction name.')
467
+ reaction_opt = CLIOption('--reaction', '-x',
468
+ type=str,
469
+ nargs='*',
470
+ choices=list(REACTIONS),
471
+ default='N_to_C_cyclization',
472
+ help='Reaction(s) to apply.')
473
+
474
+ clean = CLICommand('clean',
475
+ description='Clean and normalize SMILES column of a table.',
476
+ main=_clean,
477
+ options=[output, formatting, inputs, representation, column, prefix])
478
+ convert = CLICommand('convert',
479
+ description='Convert between string representations of chemical structures.',
480
+ main=_convert,
481
+ options=[output, formatting, inputs, representation, column, prefix, to, options])
482
+ featurize = CLICommand('featurize',
483
+ description='Calculate chemical features (2D descriptors or fingerprints) from string representations of chemical structures.',
484
+ main=_featurize,
485
+ options=[output, formatting, inputs, representation, column, prefix,
486
+ id_feat, feature])
487
+ collate = CLICommand('collate',
488
+ description='Collect disparate tables or SDF files of libraries into a single table.',
489
+ main=_collate,
490
+ options=[output, formatting, inputs, representation,
491
+ data_dir, column.replace(default='input_smiles'), id_column, prefix_collate,
492
+ digits, keep_extra_columns, keep_invalid_smiles])
493
+ dedup = CLICommand('dedup',
494
+ description='Deduplicate chemical structures and retain references.',
495
+ main=_dedup,
496
+ options=[output, formatting, inputs, representation, column, prefix,
497
+ indexes, drop_inchikey])
498
+ enum = CLICommand('enumerate',
499
+ description='Enumerate bio-chemical structures within length and sequence constraints.',
500
+ main=_enum,
501
+ options=[output, formatting, to, options,
502
+ alphabet, max_length, min_length, number_to_gen,
503
+ slicer, set_seed,
504
+ prefix.replace(default='',
505
+ help='Sequence to prepend. Lowercase for D-amino acids. '
506
+ 'Default: no prefix.'),
507
+ suffix,
508
+ type_.replace(default='aa',
509
+ choices=['aa'],
510
+ help='Type of bio sequence to enumerate. '
511
+ 'Default: %(default)s.'),
512
+ d_aa_only, include_d_aa])
513
+ reaction = CLICommand('react',
514
+ description='React compounds in silico in indicated columns using a named reaction.',
515
+ main=_react,
516
+ options=[output, formatting, inputs, representation, column, name,
517
+ reaction_opt])
518
+ split = CLICommand('split',
519
+ description='Split table based on chosen algorithm, optionally taking account of chemical structure during splits.',
520
+ main=_split,
521
+ options=[output, formatting, inputs, representation, column, prefix,
522
+ type_, train, test, set_seed])
523
+
524
+ app = CLIApp("schemist",
525
+ version=__version__,
526
+ description="Tools for cleaning, collating, and augmenting chemical datasets.",
527
+ commands=[clean, convert, featurize, collate, dedup, enum, reaction, split])
528
+
529
+ app.run()
530
+
531
+ return None
532
+
533
+
534
+ if __name__ == "__main__":
535
+
536
+ main()
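The `--options`/`-x` flag handled above is parsed by `_option_parser`, which splits each `keyword=value` token and coerces values to int or float where possible; for example:

    from schemist.cli import _option_parser

    _option_parser(["n=8", "prefix=ID-", "cutoff=0.5"])
    # -> {'n': 8, 'prefix': 'ID-', 'cutoff': 0.5}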
schemist/collating.py ADDED
@@ -0,0 +1,315 @@
1
+ """Tools to collate chemical data files."""
2
+
3
+ from typing import Callable, Dict, Iterable, List, Optional, Tuple, TextIO, Union
4
+
5
+ from collections import Counter
6
+ from functools import partial
7
+ from glob import glob
8
+ import os
9
+
10
+ from carabiner.pd import read_table, resolve_delim
11
+ from carabiner import print_err
12
+ import numpy as np
13
+ from pandas import DataFrame, concat
14
+
15
+ from .converting import convert_string_representation, _FROM_FUNCTIONS
16
+ from .io import FILE_READERS
17
+
18
+ GROUPING_COLUMNS = ("filename", "file_format", "library_name", "string_representation")
19
+ ESSENTIAL_COLUMNS = GROUPING_COLUMNS + ("compound_collection", "plate_id", "well_id")
20
+
21
+ def _column_mapper(df: DataFrame,
22
+ cols: Iterable[str]) -> Tuple[Callable, Dict]:
23
+
24
+ basic_map = {column: df[column].tolist()[0] for column in cols}
25
+ inv_basic_map = {value: key for key, value in basic_map.items()}
26
+
27
+ def column_mapper(x: DataFrame) -> DataFrame:
28
+
29
+ new_df = DataFrame()
30
+
31
+ for new_col, old_col in basic_map.items():
32
+
33
+ # old_col = str(old_col)
34
+
35
+ if old_col is None or str(old_col) in ('None', 'nan', 'NA'):
36
+
37
+ new_df[new_col] = None
38
+
39
+ elif '+' in old_col:
40
+
41
+ splits = old_col.split('+')
42
+ new_df[new_col] = x[splits[0]].str.cat([x[s].astype(str)
43
+ for s in splits[1:]])
44
+
45
+ elif ';' in old_col:
46
+
47
+ col, char, index = old_col.split(';')
48
+ index = [int(i) for i in index.split(':')]
49
+
50
+ if len(index) == 1:
51
+ index = slice(index[0], index[0] + 1)
52
+ else:
53
+ index = slice(*index)
54
+
55
+ try:
56
+
57
+ new_df[new_col] = (x[col]
58
+ .str.split(char)
59
+ .map(lambda y: char.join(y[index] if y is not np.nan else []))
60
+ .str.strip())
61
+
62
+ except TypeError as e:
63
+
64
+ print_err(x[col].str.split(char))
65
+
66
+ raise e
67
+
68
+ else:
69
+
70
+ new_df[new_col] = x[old_col].copy()
71
+
72
+ return new_df
73
+
74
+ return column_mapper, inv_basic_map
75
+
76
+
77
+ def _check_catalog(catalog: DataFrame,
78
+ catalog_smiles_column: str = 'input_smiles') -> None:
79
+
80
+ essential_columns = (catalog_smiles_column, ) + ESSENTIAL_COLUMNS
81
+ missing_essential_cols = [col for col in essential_columns
82
+ if col not in catalog]
83
+
84
+ if len(missing_essential_cols) > 0:
85
+
86
+ print_err(catalog.columns.tolist())
87
+
88
+ raise KeyError("Missing required columns from catalog: " +
89
+ ", ".join(missing_essential_cols))
90
+
91
+ return None
92
+
93
+
94
+ def collate_inventory(catalog: DataFrame,
95
+ root_dir: Optional[str] = None,
96
+ drop_invalid: bool = True,
97
+ drop_unmapped: bool = False,
98
+ catalog_smiles_column: str = 'input_smiles',
99
+ id_column_name: Optional[str] = None,
100
+ id_n_digits: int = 8,
101
+ id_prefix: str = '') -> DataFrame:
102
+
103
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
104
+
105
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
106
+
107
+ - filename is a glob pattern of files to collate
108
+ - file_format is one of {', '.join(FILE_READERS.keys())}
109
+ - smiles_column contains smiles strings
110
+
111
+ Other columns are optional and can have any name, but must contain the name or a pattern
112
+ matching a column (for tabular data) or field (for SDF data) in the files
113
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
114
+
115
+ Optional column contents can be either concatenated or split using the following
116
+ pattern:
117
+
118
+ - col1+col2: concatenates the contents of `col1` and `col2`
119
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
120
+
121
+ Parameters
122
+ ----------
123
+ catalog : pd.DataFrame
124
+ Table cataloging locations and format of data. Requires
125
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
126
+ root_dir : str, optional
127
+ Path to look for data files. Default: current directory.
128
+ drop_invalid : bool, optional
129
+ Whether to drop rows containing invalid SMILES.
130
+
131
+
132
+ Returns
133
+ -------
134
+ pd.DataFrame
135
+ Collated chemical data.
136
+
137
+ """
138
+
139
+ root_dir = root_dir or '.'
140
+
141
+ _check_catalog(catalog, catalog_smiles_column)
142
+
143
+ nongroup_columns = [col for col in catalog
144
+ if col not in GROUPING_COLUMNS]
145
+ loaded_dataframes = []
146
+ report = Counter({"invalid SMILES": 0,
147
+ "rows processed": 0})
148
+
149
+ grouped_catalog = catalog.groupby(list(GROUPING_COLUMNS))
150
+ for (this_glob, this_filetype,
151
+ this_library_name, this_representation), filename_df in grouped_catalog:
152
+
153
+ print_err(f'\nProcessing {this_glob}:')
154
+
155
+ this_glob = glob(os.path.join(root_dir, this_glob))
156
+
157
+ these_filenames = sorted(f for f in this_glob
158
+ if not os.path.basename(f).startswith('~$'))
159
+ print_err('\t- ' + '\n\t- '.join(these_filenames))
160
+
161
+ column_mapper, mapped_cols = _column_mapper(filename_df,
162
+ nongroup_columns)
163
+
164
+ reader = FILE_READERS[this_filetype]
165
+
166
+ for filename in these_filenames:
167
+
168
+ this_data0 = reader(filename)
169
+
170
+ if not drop_unmapped:
171
+ unmapped_cols = {col: 'x_' + col.casefold().replace(' ', '_')
172
+ for col in this_data0 if col not in mapped_cols}
173
+ this_data = this_data0[list(unmapped_cols)].rename(columns=unmapped_cols)
174
+ this_data = concat([column_mapper(this_data0), this_data],
175
+ axis=1)
176
+ else:
177
+ this_data = column_mapper(this_data0)
178
+
179
+ if this_representation.casefold() not in _FROM_FUNCTIONS:
180
+
181
+ raise TypeError(' or '.join({this_representation, this_representation.casefold()}) +
182
+ " is not a supported string representation. Try one of " + ", ".join(_FROM_FUNCTIONS))
183
+
184
+ this_converter = partial(convert_string_representation,
185
+ input_representation=this_representation.casefold())
186
+
187
+ this_data = (this_data
188
+ .query('compound_collection != "NA"')
189
+ .assign(library_name=this_library_name,
190
+ input_file_format=this_filetype,
191
+ input_string_representation=this_representation,
192
+ plate_id=lambda x: x['plate_id'].astype(str),
193
+ plate_loc=lambda x: x['library_name'].str.cat([x['compound_collection'], x['plate_id'], x['well_id']], sep=':'),
194
+ canonical_smiles=lambda x: this_converter(x[catalog_smiles_column]),
195
+ is_valid_smiles=lambda x: [s is not None for s in x['canonical_smiles']]))
196
+
197
+ report.update({"invalid SMILES": (~this_data['is_valid_smiles']).sum(),
198
+ "rows processed": this_data.shape[0]})
199
+
200
+ if drop_invalid:
201
+
202
+ this_data = this_data.query('is_valid_smiles')
203
+
204
+ if id_column_name is not None:
205
+
206
+ this_converter = partial(convert_string_representation,
207
+ output_representation='id',
208
+ options=dict(n=id_n_digits,
209
+ prefix=id_prefix))
210
+ this_data = this_data.assign(**{id_column_name: lambda x: this_converter(x['canonical_smiles'])})
211
+
212
+ loaded_dataframes.append(this_data)
213
+
214
+ collated_df = concat(loaded_dataframes, axis=0)
215
+
216
+ return report, collated_df
217
+
218
+
219
+ def collate_inventory_from_file(catalog_path: Union[str, TextIO],
220
+ root_dir: Optional[str] = None,
221
+ format: Optional[str] = None,
222
+ *args, **kwargs) -> DataFrame:
223
+
224
+ f"""Process a catalog of files containing chemical libraries into a uniform dataframe.
225
+
226
+ The catalog table needs to have columns {', '.join(ESSENTIAL_COLUMNS)}:
227
+
228
+ - filename is a glob pattern of files to collate
229
+ - file_format is one of {', '.join(FILE_READERS.keys())}
230
+ - smiles_column contains smiles strings
231
+
232
+ Other columns are optional and can have any name, but must contain the name or a pattern
233
+ matching a column (for tabular data) or field (for SDF data) in the files
234
+ of the `filename` column. In the output DataFrame, the named column data will be mapped.
235
+
236
+ Optional column contents can be either concatenated or split using the following
237
+ pattern:
238
+
239
+ - col1+col2: concatenates the contents of `col1` and `col2`
240
+ - col1;-;1:2 : splits the contents of `col1` on the `-` character, and takes splits 1-2 (0-indexed)
241
+
242
+ Parameters
243
+ ----------
244
+ catalog_path : str
245
+ Path to catalog file in XLSX, TSV or CSV format. Requires
246
+ columns {', '.join(ESSENTIAL_COLUMNS)}.
247
+ format : str, optional
248
+ Format of catalog file. Default: infer from file extension.
249
+ root_dir : str, optional
250
+ Path to look for data files. Default: use directory containing
251
+ the catalog.
252
+
253
+ Returns
254
+ -------
255
+ Counter, pd.DataFrame
256
+ Tally of processed rows and invalid SMILES, and the collated chemical data.
257
+
258
+ """
259
+
260
+ root_dir = root_dir or os.path.dirname(catalog_path)
261
+
262
+ data_catalog = read_table(catalog_path, format=format)
263
+
264
+ return collate_inventory(catalog=data_catalog,
265
+ root_dir=root_dir,
266
+ *args, **kwargs)
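+
+ # Usage sketch (not part of the original module; the paths are hypothetical).
+ # The report keys are the ones tallied by collate_inventory above:
+ #
+ #     report, collated = collate_inventory_from_file("catalog.xlsx", root_dir="data/")
+ #     print(report["rows processed"], report["invalid SMILES"])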
267
+
268
+
269
+ def deduplicate(df: DataFrame,
270
+ column: str = 'smiles',
271
+ input_representation: str = 'smiles',
272
+ index_columns: Optional[List[str]] = None,
273
+ drop_inchikey: bool = False) -> DataFrame:
274
+
275
+ index_columns = index_columns or []
276
+
277
+ inchikey_converter = partial(convert_string_representation,
278
+ input_representation=input_representation,
279
+ output_representation='inchikey')
280
+
281
+ df = df.assign(inchikey=lambda x: inchikey_converter(x[column]))
282
+
283
+ structure_columns = [column, 'inchikey']
284
+ df_unique = []
285
+
286
+ for (string_rep, inchikey), structure_df in df.groupby(structure_columns):
287
+
288
+ collapsed_indexes = {col: [';'.join(sorted(map(str, set(structure_df[col].tolist()))))]
289
+ for col in structure_df if col in index_columns}
290
+ collapsed_indexes.update({column: [string_rep],
291
+ 'inchikey': [inchikey],
292
+ 'instance_count': [structure_df.shape[0]]})
293
+
294
+ df_unique.append(DataFrame(collapsed_indexes))
295
+
296
+ df_unique = concat(df_unique, axis=0)
297
+
298
+ if drop_inchikey:
299
+
300
+ df_unique = df_unique.drop(columns=['inchikey'])
301
+
302
+ report = {'starting_rows': df.shape[0],
303
+ 'ending_rows': df_unique.shape[0]}
304
+
305
+ return report, df_unique
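+
+ # Usage sketch (the "plate_id" index column is illustrative):
+ #
+ #     report, unique_df = deduplicate(df, column="smiles", index_columns=["plate_id"])
+ #
+ # Each output row is one unique (smiles, inchikey) pair; the plate_id values of its
+ # duplicates are joined with ";" and their number is recorded in "instance_count".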
306
+
307
+
308
+ def deduplicate_file(filename: Union[str, TextIO],
309
+ format: Optional[str] = None,
310
+ *args, **kwargs) -> DataFrame:
311
+
312
+ table = read_table(filename, format=format)
313
+
314
+ return deduplicate(table, *args, **kwargs)
315
+
schemist/converting.py ADDED
@@ -0,0 +1,308 @@
1
+ """Converting between chemical representation formats."""
2
+
3
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
4
+
5
+ from functools import wraps
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast, flatten
9
+ from carabiner.decorators import return_none_on_error, vectorize
10
+ from carabiner.itertools import batched
11
+
12
+ from datamol import sanitize_smiles
13
+ import nemony as nm
14
+ from pandas import DataFrame
15
+ from rdkit.Chem import (Mol, MolFromInchi, MolFromHELM, MolFromSequence,
16
+ MolFromSmiles, MolToInchi, MolToInchiKey,
17
+ MolToSmiles)
18
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
19
+ from requests import Session
20
+ import selfies as sf
21
+
22
+ from .rest_lookup import _inchikey2pubchem_name_id, _inchikey2cactus_name
23
+
24
+ @vectorize
25
+ @return_none_on_error
26
+ def _seq2mol(s: str) -> Union[Mol, None]:
27
+
28
+ return MolFromSequence(s, sanitize=True)
29
+
30
+
31
+ @vectorize
32
+ @return_none_on_error
33
+ def _helm2mol(s: str) -> Union[Mol, None]:
34
+
35
+ return MolFromHELM(s, sanitize=True)
36
+
37
+
38
+ def mini_helm2helm(s: str) -> str:
39
+
40
+ new_s = []
41
+ token = ''
42
+ between_sq_brackets = False
43
+
44
+ for letter in s:
45
+
46
+ if letter.islower() and not between_sq_brackets:
47
+
48
+ letter = f"[d{letter.upper()}]"
49
+
50
+ token += letter
51
+
52
+ if letter == '[':
53
+ between_sq_brackets = True
54
+ elif letter == ']':
55
+ between_sq_brackets = False
56
+
57
+ if not between_sq_brackets:
58
+ new_s.append(token)
59
+ token = ''
60
+
61
+ return "PEPTIDE1{{{inner_helm}}}$$$$".format(inner_helm='.'.join(new_s))
62
+
63
+
64
+ @vectorize
65
+ @return_none_on_error
66
+ def _mini_helm2mol(s: str) -> Mol:
67
+
68
+ s = mini_helm2helm(s)
69
+
70
+ return MolFromHELM(s, sanitize=True)
71
+
72
+
73
+ @vectorize
74
+ @return_none_on_error
75
+ def _inchi2mol(s: str) -> Mol:
76
+
77
+ return MolFromInchi(s,
78
+ sanitize=True,
79
+ removeHs=True)
80
+
81
+ @vectorize
82
+ # @return_none_on_error
83
+ def _smiles2mol(s: str) -> Mol:
84
+
85
+ return MolFromSmiles(sanitize_smiles(s))
86
+
87
+
88
+ @vectorize
89
+ @return_none_on_error
90
+ def _selfies2mol(s: str) -> Mol:
91
+
92
+ return MolFromSmiles(sf.decoder(s))
93
+
94
+
95
+ @vectorize
96
+ @return_none_on_error
97
+ def _mol2nonstandard_inchikey(m: Mol,
98
+ **kwargs) -> str:
99
+
100
+ return MolToInchiKey(m,
101
+ options="/FixedH /SUU /RecMet /KET /15T")
102
+
103
+
104
+ @vectorize
105
+ @return_none_on_error
106
+ def _mol2hash(m: Mol,
107
+ **kwargs) -> str:
108
+
109
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
110
+
111
+ return nm.hash(nonstandard_inchikey)
112
+
113
+
114
+ @vectorize
115
+ @return_none_on_error
116
+ def _mol2id(m: Mol,
117
+ n: int = 8,
118
+ prefix: str = '',
119
+ **kwargs) -> str:
120
+
121
+ return prefix + str(int(_mol2hash(m), 16))[:n]
122
+
123
+
124
+ @vectorize
125
+ @return_none_on_error
126
+ def _mol2isomeric_canonical_smiles(m: Mol,
127
+ **kwargs) -> str:
128
+
129
+ return MolToSmiles(m,
130
+ isomericSmiles=True,
131
+ canonical=True)
132
+
133
+
134
+ @vectorize
135
+ @return_none_on_error
136
+ def _mol2inchi(m: Mol,
137
+ **kwargs) -> str:
138
+
139
+ return MolToInchi(m)
140
+
141
+
142
+ @vectorize
143
+ @return_none_on_error
144
+ def _mol2inchikey(m: Mol,
145
+ **kwargs) -> str:
146
+
147
+ return MolToInchiKey(m)
148
+
149
+
150
+ @vectorize
151
+ @return_none_on_error
152
+ def _mol2random_smiles(m: Mol,
153
+ **kwargs) -> str:
154
+
155
+ return MolToSmiles(m,
156
+ isomericSmiles=True,
157
+ doRandom=True)
158
+
159
+
160
+ @vectorize
161
+ @return_none_on_error
162
+ def _mol2mnemonic(m: Mol,
163
+ **kwargs) -> str:
164
+
165
+ nonstandard_inchikey = _mol2nonstandard_inchikey(m)
166
+
167
+ return nm.encode(nonstandard_inchikey)
168
+
169
+
170
+ def _mol2pubchem(m: Union[Mol, Iterable[Mol]],
171
+ session: Optional[Session] = None,
172
+ chunksize: int = 32) -> List[Dict[str, Union[None, int, str]]]:
173
+
174
+ inchikeys = cast(_mol2inchikey(m), to=list)
175
+ pubchem_ids = []
176
+
177
+ for _inchikeys in batched(inchikeys, chunksize):
178
+
179
+ these_ids = _inchikey2pubchem_name_id(_inchikeys,
180
+ session=session)
181
+ pubchem_ids += these_ids
182
+
183
+ return pubchem_ids
184
+
185
+
186
+ @return_none_on_error
187
+ def _mol2pubchem_id(m: Union[Mol, Iterable[Mol]],
188
+ session: Optional[Session] = None,
189
+ chunksize: int = 32,
190
+ **kwargs) -> Union[str, List[str]]:
191
+
192
+ return flatten([val['pubchem_id']
193
+ for val in _mol2pubchem(m,
194
+ session=session,
195
+ chunksize=chunksize)])
196
+
197
+
198
+ @return_none_on_error
199
+ def _mol2pubchem_name(m: Union[Mol, Iterable[Mol]],
200
+ session: Optional[Session] = None,
201
+ chunksize: int = 32,
202
+ **kwargs) -> Union[str, List[str]]:
203
+
204
+ return flatten([val['pubchem_name']
205
+ for val in _mol2pubchem(m,
206
+ session=session,
207
+ chunksize=chunksize)])
208
+
209
+ @return_none_on_error
210
+ def _mol2cactus_name(m: Union[Mol, Iterable[Mol]],
211
+ session: Optional[Session] = None,
212
+ **kwargs) -> Union[str, List[str]]:
213
+
214
+ return _inchikey2cactus_name(_mol2inchikey(m),
215
+ session=session)
216
+
217
+
218
+ @vectorize
219
+ @return_none_on_error
220
+ def _mol2scaffold(m: Mol,
221
+ chiral: bool = True,
222
+ **kwargs) -> str:
223
+
224
+ return MurckoScaffoldSmiles(mol=m,
225
+ includeChirality=chiral)
226
+
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def _mol2selfies(m: Mol,
231
+ **kwargs) -> str:
232
+
233
+ s = sf.encoder(_mol2isomeric_canonical_smiles(m))
234
+
235
+ return s if s != -1 else None
236
+
237
+
238
+ _TO_FUNCTIONS = {"smiles": _mol2isomeric_canonical_smiles,
239
+ "selfies": _mol2selfies,
240
+ "inchi": _mol2inchi,
241
+ "inchikey": _mol2inchikey,
242
+ "nonstandard_inchikey": _mol2nonstandard_inchikey,
243
+ "hash": _mol2hash,
244
+ "mnemonic": _mol2mnemonic,
245
+ "id": _mol2id,
246
+ "scaffold": _mol2scaffold,
247
+ "permuted_smiles": _mol2random_smiles,
248
+ "pubchem_id": _mol2pubchem_id,
249
+ "pubchem_name": _mol2pubchem_name,
250
+ "cactus_name": _mol2cactus_name}
251
+
252
+ _FROM_FUNCTIONS = {"smiles": _smiles2mol,
253
+ "selfies": _selfies2mol,
254
+ "inchi": _inchi2mol,
255
+ "aa_seq": _seq2mol,
256
+ "helm": _helm2mol,
257
+ "minihelm": _mini_helm2mol}
258
+
259
+
260
+ def _x2mol(strings: Union[Iterable[str], str],
261
+ input_representation: str = 'smiles') -> Union[Mol, None, Iterable[Union[Mol, None]]]:
262
+
263
+ from_function = _FROM_FUNCTIONS[input_representation.casefold()]
264
+
265
+ return from_function(strings)
266
+
267
+
268
+ def _mol2x(mols: Union[Iterable[Mol], Mol],
269
+ output_representation: str = 'smiles',
270
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
271
+
272
+ to_function = _TO_FUNCTIONS[output_representation.casefold()]
273
+
274
+ return to_function(mols, **kwargs)
275
+
276
+
277
+ def convert_string_representation(strings: Union[Iterable[str], str],
278
+ input_representation: str = 'smiles',
279
+ output_representation: str = 'smiles',
280
+ **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
281
+
282
+ """Convert between string representations of chemical structures.
283
+
284
+ """
285
+
286
+ mols = _x2mol(strings, input_representation)
287
+ # print_err(mols)
288
+ outstrings = _mol2x(mols, output_representation, **kwargs)
289
+ # print_err(outstrings)
290
+
291
+ return outstrings
292
+
293
+
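+
+ # Usage sketch (inputs are illustrative; the supported representation names are the
+ # keys of _FROM_FUNCTIONS and _TO_FUNCTIONS above):
+ #
+ #     convert_string_representation("CCO", output_representation="inchikey")
+ #     convert_string_representation(["CCO", "c1ccccc1"],
+ #                                   input_representation="smiles",
+ #                                   output_representation="selfies")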
294
+ def _convert_input_to_smiles(f: Callable) -> Callable:
295
+
296
+ @wraps(f)
297
+ def _f(strings: Union[Iterable[str], str],
298
+ input_representation: str = 'smiles',
299
+ *args, **kwargs) -> Union[str, None, Iterable[Union[str, None]]]:
300
+
301
+ smiles = convert_string_representation(strings,
302
+ output_representation='smiles',
303
+ input_representation=input_representation)
304
+
305
+ return f(strings=smiles,
306
+ *args, **kwargs)
307
+
308
+ return _f
schemist/features.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools for generating chemical features."""
2
+
3
+ from typing import Any, Callable, Iterable, Optional, Union
4
+
5
+ from descriptastorus.descriptors import MakeGenerator
6
+ from pandas import DataFrame, Series
7
+ import numpy as np
8
+ from rdkit.Chem.AllChem import FingeprintGenerator64, GetMorganGenerator, Mol
9
+
10
+ from .converting import _smiles2mol, _convert_input_to_smiles
11
+
12
+ def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], DataFrame]:
13
+
14
+ def _f(prefix: Optional[str] = None,
15
+ *args, **kwargs) -> DataFrame:
16
+
17
+ feature_matrix = f(*args, **kwargs)
18
+
19
+ if prefix is not None:
20
+
21
+ new_cols = {col: f"{prefix}_{col}"
22
+ for col in feature_matrix.columns
23
+ if not col.startswith('meta_')}
24
+ feature_matrix = feature_matrix.rename(columns=new_cols)
25
+
26
+ return feature_matrix
27
+
28
+ return _f
29
+
30
+
31
+ def _get_descriptastorus_features(smiles: Iterable[str],
32
+ generator: str) -> DataFrame:
33
+
34
+ generator = MakeGenerator((generator, ))
35
+ smiles = Series(smiles)
36
+
37
+ features = smiles.apply(lambda z: np.array(generator.process(z)))
38
+ matrix = np.stack(features.values, axis=0)
39
+
40
+ return DataFrame(matrix,
41
+ index=smiles.index,
42
+ columns=[col for col, _ in generator.GetColumns()])
43
+
44
+
45
+ @_feature_matrix
46
+ @_convert_input_to_smiles
47
+ def calculate_2d_features(strings: Union[Iterable[str], str],
48
+ normalized: bool = True,
49
+ histogram_normalized: bool = True) -> DataFrame:
50
+
51
+ """Calculate 2d features from string representation.
52
+
53
+ """
54
+
55
+ if normalized:
56
+ if histogram_normalized:
57
+ generator_name = "RDKit2DHistogramNormalized"
58
+ else:
59
+ generator_name = "RDKit2DNormalized"
60
+ else:
61
+ generator_name = "RDKit2D"
62
+
63
+ feature_matrix = _get_descriptastorus_features(strings,
64
+ generator=generator_name)
65
+
66
+ feature_matrix = (feature_matrix
67
+ .rename(columns={f"{generator_name}_calculated": "meta_feature_valid0"})
68
+ .assign(meta_feature_type=generator_name,
69
+ meta_feature_valid=lambda x: (x['meta_feature_valid0'] == 1.))
70
+ .drop(columns=['meta_feature_valid0']))
71
+
72
+ return feature_matrix
73
+
74
+
75
+ def _fast_fingerprint(generator: FingeprintGenerator64,
76
+ mol: Mol,
77
+ to_np: bool = True) -> Union[str, np.ndarray]:
78
+
79
+ try:
80
+ fp_string = generator.GetFingerprint(mol).ToBitString()
81
+ except Exception:
82
+ return None
83
+ else:
84
+ if to_np:
85
+ return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
86
+ else:
87
+ return fp_string
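+
+ # Note on the to_np branch above: ToBitString() gives text such as "0110", and
+ # np.frombuffer(fp_string.encode(), 'u1') - ord('0') turns it into a uint8 vector,
+ # e.g. "0110" -> array([0, 1, 1, 0], dtype=uint8).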
88
+
89
+
90
+ @_feature_matrix
91
+ @_convert_input_to_smiles
92
+ def calculate_fingerprints(strings: Union[Iterable[str], str],
93
+ fp_type: str = 'morgan',
94
+ radius: int = 2,
95
+ chiral: bool = True,
96
+ on_bits: bool = True) -> DataFrame:
97
+
98
+ """
99
+
100
+ """
101
+
102
+ if fp_type.casefold() == 'morgan':
103
+ generator_class = GetMorganGenerator
104
+ else:
105
+ raise AttributeError(f"Fingerprint type {fp_type} not supported!")
106
+
107
+ fp_generator = generator_class(radius=radius,
108
+ includeChirality=chiral)
109
+ mols = (_smiles2mol(s) for s in strings)
110
+ fp_strings = (_fast_fingerprint(fp_generator, mol, to_np=on_bits)
111
+ for mol in mols)
112
+
113
+ if on_bits:
114
+
115
+ fingerprints = (map(str, np.flatnonzero(fp_string).tolist())
116
+ for fp_string in fp_strings)
117
+ fingerprints = [';'.join(fp) for fp in fingerprints]
118
+ validity = [len(fp) > 0 for fp in fingerprints]
119
+
120
+ feature_matrix = DataFrame(fingerprints,
121
+ columns=['fp_bits'])
122
+
123
+ else:
124
+
125
+ fingerprints = [np.array([int(digit) for digit in fp_string])
126
+ if fp_string is not None
127
+ else (-np.ones((fp_generator.GetOptions().fpSize, )))
128
+ for fp_string in fp_strings]
129
+ validity = [np.all(fp >= 0) for fp in fingerprints]
130
+
131
+ feature_matrix = DataFrame(np.stack(fingerprints, axis=0),
132
+ columns=[f"fp_{i}" for i in range(len(fingerprints[0]))])
133
+
134
+ return feature_matrix.assign(meta_feature_type=fp_type.casefold(),
135
+ meta_feature_valid=validity)
136
+
137
+
138
+ _FEATURE_CALCULATORS = {"2d": calculate_2d_features, "fp": calculate_fingerprints}
139
+
140
+ def calculate_feature(feature_type: str,
141
+ *args, **kwargs):
142
+
143
+ """
144
+
145
+ """
146
+
147
+ featurizer = _FEATURE_CALCULATORS[feature_type]
148
+
149
+ return featurizer(*args, **kwargs)
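+
+ # Usage sketch (the SMILES inputs are illustrative):
+ #
+ #     fp_df = calculate_feature("fp", strings=["CCO", "c1ccccc1"], on_bits=True)
+ #     desc_df = calculate_feature("2d", strings=["CCO"], prefix="rdkit")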
schemist/generating.py ADDED
@@ -0,0 +1,262 @@
1
+ """Tools for enumerating compounds. Currently only works with peptides."""
2
+
3
+ from typing import Callable, Iterable, Optional, Tuple, Union
4
+
5
+ from functools import partial
6
+ from itertools import chain, islice, product, repeat
7
+ from math import ceil, expm1, floor
8
+ from random import choice, choices, random, seed
9
+
10
+ from carabiner import print_err
11
+ from carabiner.decorators import vectorize, return_none_on_error
12
+ from carabiner.random import sample_iter
13
+ from rdkit.Chem import Mol, rdChemReactions
14
+ import numpy as np
15
+
16
+ from .converting import (_x2mol, _mol2x,
17
+ _convert_input_to_smiles)
18
+
19
+ AA = tuple('GALVITSMCPFYWHKRDENQ')
20
+ dAA = tuple(aa.casefold() for aa in AA)
21
+
22
+ REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
23
+ 'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
24
+ 'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}
25
+
26
+ def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
27
+ d_aa_only: bool = False,
28
+ include_d_aa: bool = False) -> Tuple[str]:
29
+
30
+ alphabet = alphabet or AA
31
+ alphabet_lower = tuple(set(aa.casefold() for aa in alphabet))
32
+
33
+ if d_aa_only:
34
+ alphabet = alphabet_lower
35
+ elif include_d_aa:
36
+ alphabet = tuple(set(chain(alphabet, alphabet_lower)))
37
+
38
+ return alphabet
39
+
40
+
41
+
42
+ def all_peptides_of_one_length(length: int,
43
+ alphabet: Optional[Iterable[str]] = None,
44
+ d_aa_only: bool = False,
45
+ include_d_aa: bool = False) -> Iterable[str]:
46
+
47
+ """
48
+
49
+ """
50
+
51
+ alphabet = _get_alphabet(alphabet=alphabet,
52
+ d_aa_only=d_aa_only,
53
+ include_d_aa=include_d_aa)
54
+
55
+ return (''.join(peptide)
56
+ for peptide in product(alphabet, repeat=length))
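+
+ # With the default 20-letter alphabet, all_peptides_of_one_length(2) lazily yields
+ # 20 ** 2 = 400 sequences ("GG", "GA", "GL", ...), following the order of AA above.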
57
+
58
+
59
+ def all_peptides_in_length_range(max_length: int,
60
+ min_length: int = 1,
61
+ by: int = 1,
62
+ alphabet: Optional[Iterable[str]] = None,
63
+ d_aa_only: bool = False,
64
+ include_d_aa: bool = False,
65
+ *args, **kwargs) -> Iterable[str]:
66
+
67
+ """
68
+
69
+ """
70
+
71
+ length_range = range(*sorted([min_length, max_length + 1]), by)
72
+ peptide_maker = partial(all_peptides_of_one_length,
73
+ alphabet=alphabet,
74
+ d_aa_only=d_aa_only,
75
+ include_d_aa=include_d_aa,
76
+ *args, **kwargs)
77
+
78
+ return chain.from_iterable(peptide_maker(length=length)
79
+ for length in length_range)
80
+
81
+
82
+ def _number_of_peptides(max_length: int,
83
+ min_length: int = 1,
84
+ by: int = 1,
85
+ alphabet: Optional[Iterable[str]] = None,
86
+ d_aa_only: bool = False,
87
+ include_d_aa: bool = False):
88
+
89
+ alphabet = _get_alphabet(alphabet=alphabet,
90
+ d_aa_only=d_aa_only,
91
+ include_d_aa=include_d_aa)
92
+ n_peptides = [len(alphabet) ** length
93
+ for length in range(*sorted([min_length, max_length + 1]), by)]
94
+
95
+ return n_peptides
96
+
97
+
98
+ def _naive_sample_peptides_in_length_range(max_length: int,
99
+ min_length: int = 1,
100
+ by: int = 1,
101
+ n: Optional[Union[float, int]] = None,
102
+ alphabet: Optional[Iterable[str]] = None,
103
+ d_aa_only: bool = False,
104
+ include_d_aa: bool = False,
105
+ set_seed: Optional[int] = None):
106
+
107
+ alphabet = _get_alphabet(alphabet=alphabet,
108
+ d_aa_only=d_aa_only,
109
+ include_d_aa=include_d_aa)
110
+ n_peptides = _number_of_peptides(max_length=max_length,
111
+ min_length=min_length,
112
+ by=by,
113
+ alphabet=alphabet,
114
+ d_aa_only=d_aa_only,
115
+ include_d_aa=include_d_aa)
116
+ lengths = list(range(*sorted([min_length, max_length + 1]), by))
117
+ weight_per_length = [n / min(n_peptides) for n in n_peptides]
118
+ weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))
119
+
120
+ lengths_sample = (choice(weighted_lengths) for _ in range(n))
121
+ return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)
122
+
123
+
124
+ def sample_peptides_in_length_range(max_length: int,
125
+ min_length: int = 1,
126
+ by: int = 1,
127
+ n: Optional[Union[float, int]] = None,
128
+ alphabet: Optional[Iterable[str]] = None,
129
+ d_aa_only: bool = False,
130
+ include_d_aa: bool = False,
131
+ naive_sampling_cutoff: float = 5e-3,
132
+ reservoir_sampling: bool = True,
133
+ indexes: Optional[Iterable[int]] = None,
134
+ set_seed: Optional[int] = None,
135
+ *args, **kwargs) -> Iterable[str]:
136
+
137
+ """
138
+
139
+ """
140
+
141
+ seed(set_seed)
142
+
143
+ alphabet = _get_alphabet(alphabet=alphabet,
144
+ d_aa_only=d_aa_only,
145
+ include_d_aa=include_d_aa)
146
+
147
+ n_peptides = sum(len(alphabet) ** length
148
+ for length in range(*sorted([min_length, max_length + 1]), by))
149
+ if n is None:
150
+ n_requested = n_peptides
151
+ elif n >= 1.:
152
+ n_requested = min(floor(n), n_peptides)
153
+ elif n < 1.:
154
+ n_requested = floor(n * n_peptides)
155
+
156
+ frac_requested = n_requested / n_peptides
157
+
158
+ # approximation of birthday problem
159
+ p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
160
+ n_collisons = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
161
+ frac_collisions = n_collisons / n_requested
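+ # That is, drawing k = n_requested samples uniformly from N = n_peptides possibilities,
+ # P(any collision) ~ 1 - exp(-k * (k - 1) / (2 * N)), and the expected number of
+ # duplicated draws is k * (1 - ((N - 1) / N) ** (k - 1)).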
162
+
163
+ print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
164
+ f"length {min_length} to {max_length} ({n_peptides} combinations). "
165
+ f"Probability of collision if drawing randomly is {p_any_collision}, "
166
+ f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")
167
+
168
+ if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:
169
+
170
+ print_err("> Executing naive sampling. ")
171
+
172
+ peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by,
173
+ n=n_requested,
174
+ alphabet=alphabet,
175
+ d_aa_only=d_aa_only,
176
+ include_d_aa=include_d_aa)
177
+
178
+ else:
179
+
180
+ print_err("> Executing exhaustive sampling.")
181
+
182
+ all_peptides = all_peptides_in_length_range(max_length, min_length, by,
183
+ alphabet=alphabet,
184
+ d_aa_only=d_aa_only,
185
+ include_d_aa=include_d_aa,
186
+ *args, **kwargs)
187
+
188
+ if n is None:
189
+
190
+ peptides = all_peptides
191
+
192
+ elif n >= 1.:
193
+
194
+ if reservoir_sampling:
195
+ peptides = sample_iter(all_peptides, k=n_requested,
196
+ shuffle_output=False)
197
+ else:
198
+ peptides = (pep for pep in all_peptides
199
+ if random() <= frac_requested)
200
+
201
+ elif n < 1.:
202
+
203
+ peptides = (pep for pep in all_peptides
204
+ if random() <= n)
205
+
206
+ if indexes is not None:
207
+
208
+ indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float)
209
+ else None
210
+ for ix in islice(indexes, 3))
211
+ indexes = [ix if (ix is None or ix >= 0) else None
212
+ for ix in indexes]
213
+
214
+ if len(indexes) > 1:
215
+ if n is not None and n >=1. and indexes[0] > n:
216
+ raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")
217
+
218
+ peptides = islice(peptides, *indexes)
219
+
220
+ return peptides
221
+
222
+
223
+ def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:
224
+
225
+ rxn = rdChemReactions.ReactionFromSmarts(smarts)
226
+ reaction_function = rxn.RunReactants
227
+
228
+ @vectorize
229
+ @return_none_on_error
230
+ def reactor(s: Mol) -> Mol:
231
+
232
+ return reaction_function([s])[0][0]
233
+
234
+ return reactor
235
+
236
+
237
+ @_convert_input_to_smiles
238
+ def react(strings: Union[str, Iterable[str]],
239
+ reaction: str = 'N_to_C_cyclization',
240
+ output_representation: str = 'smiles',
241
+ **kwargs) -> Union[str, Iterable[str]]:
242
+
243
+ """
244
+
245
+ """
246
+
247
+ try:
248
+ _this_reaction = REACTIONS[reaction]
249
+ except KeyError:
250
+ raise KeyError(f"Reaction {reaction} is not available. Try: " +
251
+ ", ".join(list(REACTIONS)))
252
+
253
+ # strings = cast(strings, to=list)
254
+ # print_err((strings))
255
+
256
+ reactor = _reactor(_this_reaction)
257
+ mols = _x2mol(strings)
258
+ mols = reactor(mols)
259
+
260
+ return _mol2x(mols,
261
+ output_representation=output_representation,
262
+ **kwargs)
schemist/io.py ADDED
@@ -0,0 +1,149 @@
1
+ """Tools to facilitate input and output."""
2
+
3
+ from typing import Any, Callable, List, Optional, TextIO, Tuple, Union
4
+
5
+ from collections import defaultdict
6
+ from functools import partial
7
+ from io import TextIOWrapper
+ from string import printable
8
+ from tempfile import NamedTemporaryFile
9
+ from xml.etree import ElementTree
10
+
11
+ from carabiner import print_err
12
+ from carabiner.cast import cast
13
+ from carabiner.itertools import tenumerate
14
+ from carabiner.pd import read_table, write_stream
15
+
16
+ from pandas import DataFrame, read_excel
17
+ from rdkit.Chem import SDMolSupplier
18
+
19
+ from .converting import _mol2isomeric_canonical_smiles
20
+
21
+ def _mutate_df_stream(input_file: Union[str, TextIO],
22
+ output_file: Union[str, TextIO],
23
+ function: Callable[[DataFrame], Tuple[Any, DataFrame]],
24
+ file_format: Optional[str] = None,
25
+ chunksize: int = 1000) -> List[Any]:
26
+
27
+ carries = []
28
+
29
+ for i, chunk in tenumerate(read_table(input_file,
30
+ format=file_format,
31
+ progress=False,
32
+ chunksize=chunksize)):
33
+
34
+ result = function(chunk)
35
+
36
+ try:
37
+ carry, df = result
38
+ except ValueError:
39
+ df = result
40
+ carry = 0
41
+
42
+ write_stream(df,
43
+ output=output_file,
44
+ format=file_format,
45
+ header=i == 0,
46
+ mode='w' if i == 0 else 'a')
47
+
48
+ carries.append(carry)
49
+
50
+ return carries
51
+
52
+
53
+ def read_weird_xml(filename: Union[str, TextIO],
54
+ header: bool = True,
55
+ namespace: str = '{urn:schemas-microsoft-com:office:spreadsheet}') -> DataFrame:
56
+
57
+ """
58
+
59
+ """
60
+
61
+ with cast(filename, TextIOWrapper, mode='r') as f:
62
+
63
+ xml_string = ''.join(filter(printable.__contains__, f.read()))
64
+
65
+ try:
66
+
67
+ root = ElementTree.fromstring(xml_string)
68
+
69
+ except Exception as e:
70
+
71
+ print_err('\n!!! Could not parse XML input: ' + str(e))
72
+
73
+ raise e
74
+
75
+ for i, row in enumerate(root.iter(f'{namespace}Row') ):
76
+
77
+ this_row = [datum.text for datum in row.iter(f'{namespace}Data')]
78
+
79
+ if i == 0:
80
+
81
+ if header:
82
+
83
+ heading = this_row
84
+ df = {colname: [] for colname in heading}
85
+
86
+ else:
87
+
88
+ heading = [f'X{j}' for j, _ in enumerate(this_row)]
89
+ df = {colname: [datum] for colname, datum in zip(heading, this_row)}
90
+
91
+ else:
92
+
93
+ for colname, datum in zip(heading, this_row):
94
+
95
+ df[colname].append(datum)
96
+
97
+ return DataFrame(df)
98
+
99
+
100
+ def read_sdf(filename: Union[str, TextIO]):
101
+
102
+ """
103
+
104
+ """
105
+
106
+ filename = cast(filename, str)
107
+
108
+ with open(filename, 'r', errors='replace') as f:
109
+ with NamedTemporaryFile("w") as o:
110
+
111
+ o.write(f.read())
112
+ o.seek(0)
113
+
114
+ df = defaultdict(list)
115
+
116
+ for i, mol in enumerate(SDMolSupplier(o.name)):
117
+
118
+ if mol is None:
119
+
120
+ continue
121
+
122
+ propdict = mol.GetPropsAsDict()
123
+ propdict['SMILES'] = _mol2isomeric_canonical_smiles(mol)
124
+
125
+ for colname in propdict:
126
+
127
+ df[colname].append(propdict[colname])
128
+
129
+ for colname in df:
130
+
131
+ if colname not in propdict:
132
+
133
+ df[colname].append(None)
134
+
135
+ col_lengths = {col: len(val) for col, val in df.items()}
136
+
137
+ if len(set(col_lengths.values())) > 1:
138
+
139
+ raise ValueError(f"Column lengths not all the same:\n\t" +
140
+ '\n\t'.join(f"{key}:{val}" for key, val in col_lengths.items()))
141
+
142
+ return DataFrame(df)
143
+
144
+
145
+ FILE_READERS = {
146
+ 'bad_xml': read_weird_xml,
147
+ 'xlsx': partial(read_excel, engine='openpyxl'),
148
+ 'sdf': read_sdf
149
+ }
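+
+ # Usage sketch (the file path is hypothetical):
+ #
+ #     df = FILE_READERS["sdf"]("plates/library_01.sdf")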
schemist/rest_lookup.py ADDED
@@ -0,0 +1,118 @@
1
+ """Tools for querying PubChem."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Union
4
+ from time import sleep
5
+ from xml.etree import ElementTree
6
+
7
+ from carabiner import print_err
8
+ from carabiner.cast import cast
9
+ from carabiner.decorators import vectorize
10
+ from requests import Response, Session
11
+
12
+ _PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/{get}/{format}"
13
+ _CACTUS_URL = "https://cactus.nci.nih.gov/chemical/structure/{inchikey}/{get}"
14
+
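+ # For example, _PUBCHEM_URL.format(inchikey="<InChIKey>", get="Title,InchiKey", format="XML")
+ # expands to .../compound/inchikey/<InChIKey>/property/Title,InchiKey/XML, which is how
+ # _url_request below builds its queries.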
15
+ _OVERLOAD_CODES = {500, 501, 503, 504}
16
+
17
+
18
+ def _url_request(inchikeys: Union[str, Iterable[str]],
19
+ url: str,
20
+ session: Optional[Session] = None,
21
+ **kwargs) -> Response:
22
+
23
+ if session is None:
24
+ session = Session()
25
+
26
+ inchikeys = cast(inchikeys, to=list)
27
+
28
+ return session.get(url.format(inchikey=','.join(inchikeys), **kwargs))
29
+
30
+
31
+ def _inchikey2pubchem_name_id(inchikeys: Union[str, Iterable[str]],
32
+ session: Optional[Session] = None,
33
+ counter: int = 0,
34
+ max_tries: int = 10,
35
+ namespace: str = "{http://pubchem.ncbi.nlm.nih.gov/pug_rest}") -> List[Dict[str, Union[None, int, str]]]:
36
+
37
+ r = _url_request(inchikeys, url=_PUBCHEM_URL,
38
+ session=session,
39
+ get="Title,InchiKey", format="XML")
40
+
41
+ if r.status_code == 200:
42
+
43
+ root = ElementTree.fromstring(r.text)
44
+ compounds = root.iter(f'{namespace}Properties')
45
+
46
+ result_dict = dict()
47
+
48
+ for cmpd in compounds:
49
+
50
+ cmpd_dict = dict()
51
+
52
+ for child in cmpd:
53
+ cmpd_dict[child.tag.split(namespace)[1]] = child.text
54
+
55
+ try:
56
+ inchikey, name, pcid = cmpd_dict['InChIKey'], cmpd_dict['Title'], cmpd_dict['CID']
57
+ except KeyError:
58
+ print_err(cmpd_dict)
59
+ else:
60
+ result_dict[inchikey] = {'pubchem_name': name.casefold(),
61
+ 'pubchem_id': pcid}
62
+
63
+ print_err(f'PubChem: Looked up InchiKeys: {",".join(inchikeys)}')
64
+
65
+ result_list = [result_dict[inchikey]
66
+ if inchikey in result_dict
67
+ else {'pubchem_name': None, 'pubchem_id': None}
68
+ for inchikey in inchikeys]
69
+
70
+ return result_list
71
+
72
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
73
+
74
+ sleep(1.)
75
+
76
+ return _inchikey2pubchem_name_id(inchikeys,
77
+ session=session,
78
+ counter=counter + 1,
79
+ max_tries=max_tries,
80
+ namespace=namespace)
81
+
82
+ else:
83
+
84
+ print_err(f'PubChem: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
85
+
86
+ return [{'pubchem_name': None, 'pubchem_id': None}
87
+ for _ in range(len(inchikeys))]
88
+
89
+
90
+ @vectorize
91
+ def _inchikey2cactus_name(inchikeys: str,
92
+ session: Optional[Session] = None,
93
+ counter: int = 0,
94
+ max_tries: int = 10):
95
+
96
+ r = _url_request(inchikeys, url=_CACTUS_URL,
97
+ session=session,
98
+ get="names")
99
+
100
+ if r.status_code == 200:
101
+
102
+ return r.text.split('\n')[0].casefold()
103
+
104
+ elif r.status_code in _OVERLOAD_CODES and counter < max_tries:
105
+
106
+ sleep(1.)
107
+
108
+ return _inchikey2cactus_name(inchikeys,
109
+ session=session,
110
+ counter=counter + 1,
111
+ max_tries=max_tries)
112
+
113
+ else:
114
+
115
+ print_err(f'Cactus: InchiKey {",".join(inchikeys)} gave status {r.status_code}')
116
+
117
+ return None
118
+
schemist/splitting.py ADDED
@@ -0,0 +1,204 @@
1
+ """Tools for splitting tabular datasets, optionally based on chemical features."""
2
+
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+ from collections import defaultdict
5
+ from math import ceil
6
+ from random import random, seed
7
+
8
+ try:
9
+ from itertools import batched
10
+ except ImportError:
11
+ from carabiner.itertools import batched
12
+
13
+ from tqdm.auto import tqdm
14
+
15
+ from .converting import convert_string_representation, _convert_input_to_smiles
16
+ from .typing import DataSplits
17
+
18
+ # def _train_test_splits
19
+
20
+ def _train_test_val_sizes(total: int,
21
+ train: float = 1.,
22
+ test: float = 0.) -> Tuple[int]:
23
+
24
+ n_train = int(ceil(train * total))
25
+ n_test = int(ceil(test * total))
26
+ n_val = total - n_train - n_test
27
+
28
+ return n_train, n_test, n_val
29
+
30
+
31
+ def _random_chunk(strings: str,
32
+ train: float = 1.,
33
+ test: float = 0.,
34
+ carry: Optional[Dict[str, List[int]]] = None,
35
+ start_from: int = 0) -> Dict[str, List[int]]:
36
+
37
+ carry = carry or defaultdict(list)
38
+
39
+ train_test: float = train + test
40
+
41
+ for i, _ in enumerate(strings):
42
+
43
+ random_number: float = random()
44
+
45
+ if random_number < train:
46
+
47
+ key = 'train'
48
+
49
+ elif random_number < train_test:
50
+
51
+ key = 'test'
52
+
53
+ else:
54
+
55
+ key = 'validation'
56
+
57
+ carry[key].append(start_from + i)
58
+
59
+ return carry
60
+
61
+
62
+ def split_random(strings: Union[str, Iterable[str]],
63
+ train: float = 1.,
64
+ test: float = 0.,
65
+ chunksize: Optional[int] = None,
66
+ set_seed: Optional[int] = None,
67
+ *args, **kwargs) -> DataSplits:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if set_seed is not None:
74
+
75
+ seed(set_seed)
76
+
77
+
78
+ if chunksize is None:
79
+
80
+ idx = _random_chunk(strings=strings,
81
+ train=train,
82
+ test=test)
83
+
84
+ else:
85
+
86
+ idx = defaultdict(list)
87
+
88
+ for i, chunk in enumerate(batched(strings, chunksize)):
89
+
90
+ idx = _random_chunk(strings=chunk,
91
+ train=train,
92
+ test=test,
93
+ carry=idx,
94
+ start_from=i * chunksize)
95
+
96
+ seed(None)
97
+
98
+ return DataSplits(**idx)
99
+
100
+
101
+ @_convert_input_to_smiles
102
+ def _scaffold_chunk(strings: str,
103
+ carry: Optional[Dict[str, List[int]]] = None,
104
+ start_from: int = 0) -> Dict[str, List[int]]:
105
+
106
+ carry = carry or defaultdict(list)
107
+
108
+ these_scaffolds = convert_string_representation(strings=strings,
109
+ output_representation='scaffold')
110
+
111
+ for j, scaff in enumerate(these_scaffolds):
112
+ carry[scaff].append(start_from + j)
113
+
114
+ return carry
115
+
116
+
117
+ def _scaffold_aggregator(scaffold_sets: Dict[str, List[int]],
118
+ train: float = 1.,
119
+ test: float = 0.,
120
+ progress: bool = False) -> DataSplits:
121
+
122
+ scaffold_sets = {key: sorted(value)
123
+ for key, value in scaffold_sets.items()}
124
+ scaffold_sets = sorted(scaffold_sets.items(),
125
+ key=lambda x: (len(x[1]), x[1][0]),
126
+ reverse=True)
127
+ nrows = sum(len(idx) for _, idx in scaffold_sets)
128
+ n_train, n_test, n_val = _train_test_val_sizes(nrows,
129
+ train,
130
+ test)
131
+ idx = defaultdict(list)
132
+
133
+ iterator = tqdm(scaffold_sets) if progress else scaffold_sets
134
+ for _, scaffold_idx in iterator:
135
+
136
+ if (len(idx['train']) + len(scaffold_idx)) > n_train:
137
+
138
+ if (len(idx['test']) + len(scaffold_idx)) > n_test:
139
+
140
+ key = 'validation'
141
+
142
+ else:
143
+
144
+ key = 'test'
145
+ else:
146
+
147
+ key = 'train'
148
+
149
+ idx[key] += scaffold_idx
150
+
151
+ return DataSplits(**idx)
152
+
153
+
154
+ def split_scaffold(strings: Union[str, Iterable[str]],
155
+ train: float = 1.,
156
+ test: float = 0.,
157
+ chunksize: Optional[int] = None,
158
+ progress: bool = True) -> DataSplits:
159
+
160
+ """
161
+
162
+ """
163
+
164
+ if chunksize is None:
165
+
166
+ scaffold_sets = _scaffold_chunk(strings)
167
+
168
+ else:
169
+
170
+ scaffold_sets = defaultdict(list)
171
+
172
+ for i, chunk in enumerate(batched(strings, chunksize)):
173
+
174
+ scaffold_sets = _scaffold_chunk(chunk,
175
+ carry=scaffold_sets,
176
+ start_from=i * chunksize)
177
+
178
+ return _scaffold_aggregator(scaffold_sets,
179
+ train=train, test=test,
180
+ progress=progress)
181
+
182
+
183
+ _SPLITTERS = {#'simpd': split_simpd,
184
+ 'scaffold': split_scaffold,
185
+ 'random': split_random}
186
+
187
+ # _SPLIT_SUPERTYPES = {'scaffold': 'grouped',
188
+ # 'random': 'independent'}
189
+
190
+ _GROUPED_SPLITTERS = {'scaffold': (_scaffold_chunk, _scaffold_aggregator)}
191
+
192
+ assert all(_type in _SPLITTERS
193
+ for _type in _GROUPED_SPLITTERS) ## Should never fail!
194
+
195
+ def split(split_type: str,
196
+ *args, **kwargs) -> DataSplits:
197
+
198
+ """
199
+
200
+ """
201
+
202
+ splitter = _SPLITTERS[split_type]
203
+
204
+ return splitter(*args, **kwargs)
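+
+ # Usage sketch (the SMILES list is illustrative):
+ #
+ #     splits = split("random", strings=["CCO", "CCN", "c1ccccc1"], train=0.8, test=0.1)
+ #     splits.train, splits.test, splits.validation  # lists of row positions (DataSplits)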
schemist/tables.py ADDED
@@ -0,0 +1,220 @@
1
+ """Tools for processing tabular data."""
2
+
3
+ from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union
4
+ from functools import partial
5
+
6
+ try:
7
+ from itertools import batched
8
+ except ImportError:
9
+ from carabiner.itertools import batched
10
+
11
+ from carabiner.cast import cast, clist
12
+ from carabiner import print_err
13
+ from pandas import DataFrame, concat
14
+
15
+ from .cleaning import clean_smiles, clean_selfies
16
+ from .converting import convert_string_representation
17
+ from .features import calculate_feature
18
+ from .generating import sample_peptides_in_length_range, react
19
+ from .splitting import split
20
+ from .typing import DataSplits
21
+
22
+ def _get_error_tally(df: DataFrame,
23
+ cols: Union[str, List[str]]) -> Dict[str, int]:
24
+
25
+ cols = cast(cols, to=list)
26
+
27
+ try:
28
+ tally = {col: (df[col].isna() | ~df[col]).sum() for col in cols}
29
+ except TypeError:
30
+ tally = {col: df[col].isna().sum() for col in cols}
31
+
32
+ return tally
33
+
34
+
35
+ def converter(df: DataFrame,
36
+ column: str = 'smiles',
37
+ input_representation: str = 'smiles',
38
+ output_representation: Union[str, List[str]] = 'smiles',
39
+ prefix: Optional[str] = None,
40
+ options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, int], DataFrame]:
41
+
42
+ """
43
+
44
+ """
45
+
46
+ prefix = prefix or ''
+ options = options or {}
47
+
48
+ converters = {f"{prefix}{rep_out}": partial(convert_string_representation,
49
+ output_representation=rep_out,
50
+ input_representation=input_representation,
51
+ **options)
52
+ for rep_out in cast(output_representation, to=list)}
53
+
54
+ column_values = df[column]
55
+
56
+ converted = {col: cast(f(column_values), to=list)
57
+ for col, f in converters.items()}
58
+
59
+ df = df.assign(**converted)
60
+
61
+ return _get_error_tally(df, list(converters)), df
62
+
63
+
64
+ def cleaner(df: DataFrame,
65
+ column: str = 'smiles',
66
+ input_representation: str = 'smiles',
67
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
68
+
69
+ """
70
+
71
+ """
72
+
73
+ if input_representation.casefold() == 'smiles':
74
+ cleaner = clean_smiles
75
+ elif input_representation.casefold() == 'selfies':
76
+ cleaner = clean_selfies
77
+ else:
78
+ raise ValueError(f"Representation {input_representation} is not supported for cleaning.")
79
+
80
+ prefix = prefix or ''
81
+ new_column = f"{prefix}{column}"
82
+
83
+ df = df.assign(**{new_column: lambda x: cleaner(x[column])})
84
+
85
+ return _get_error_tally(df, new_column), df
86
+
87
+
88
+ def featurizer(df: DataFrame,
89
+ feature_type: str,
90
+ column: str = 'smiles',
91
+ ids: Optional[Union[str, List[str]]] = None,
92
+ input_representation: str = 'smiles',
93
+ prefix: Optional[str] = None) -> Tuple[Dict[str, int], DataFrame]:
94
+
95
+ """
96
+
97
+ """
98
+
99
+ if ids is None:
100
+ ids = df.columns.tolist()
101
+ else:
102
+ ids = cast(ids, to=list)
103
+
104
+ feature_df = calculate_feature(feature_type=feature_type,
105
+ strings=df[column],
106
+ prefix=prefix,
107
+ input_representation=input_representation)
108
+
109
+ if len(ids) > 0:
110
+ df = concat([df[ids], feature_df], axis=1)
111
+
112
+ return _get_error_tally(feature_df, 'meta_feature_valid'), df
113
+
114
+
115
+ def assign_groups(df: DataFrame,
116
+ grouper: Callable[[Union[str, Iterable[str]]], Dict[str, Tuple[int]]],
117
+ group_name: str = 'group',
118
+ column: str = 'smiles',
119
+ input_representation: str = 'smiles',
120
+ *args, **kwargs) -> Tuple[Dict[str, Tuple[int]], DataFrame]:
121
+
122
+ group_idx = grouper(strings=df[column],
123
+ input_representation=input_representation,
124
+ *args, **kwargs)
125
+
126
+ inv_group_idx = {i: group for group, idx in group_idx.items() for i in idx}
127
+ groups = [inv_group_idx[i] for i in range(len(inv_group_idx))]
128
+
129
+ return group_idx, df.assign(**{group_name: groups})
130
+
131
+
132
+ def _assign_splits(df: DataFrame,
133
+ split_idx: DataSplits,
134
+ use_df_index: bool = False) -> DataFrame:
135
+
136
+ row_index = df.index if use_df_index else tuple(range(df.shape[0]))
137
+
138
+ df = df.assign(**{f'is_{key}': [i in getattr(split_idx, key) for i in row_index]
139
+ for key in split_idx._fields})
140
+ split_counts = {key: sum(df[f'is_{key}'].values) for key in split_idx._fields}
141
+
142
+ return split_counts, df
143
+
144
+
145
+ def splitter(df: DataFrame,
146
+ split_type: str = 'random',
147
+ column: str = 'smiles',
148
+ input_representation: str = 'smiles',
149
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
150
+
151
+ """
152
+
153
+ """
154
+
155
+ split_idx = split(split_type=split_type,
156
+ strings=df[column],
157
+ input_representation=input_representation,
158
+ *args, **kwargs)
159
+
160
+ return _assign_splits(df, split_idx=split_idx)
161
+
162
+
163
+ def reactor(df: DataFrame,
164
+ column: str = 'smiles',
165
+ reaction: Union[str, Iterable[str]] = 'N_to_C_cyclization',
166
+ prefix: Optional[str] = None,
167
+ *args, **kwargs) -> Tuple[Dict[str, int], DataFrame]:
168
+
169
+ """
170
+
171
+ """
172
+
173
+ prefix = prefix or ''
174
+
175
+ reactors = {col: partial(react, reaction=col)
176
+ for col in cast(reaction, to=list)}
177
+
178
+ column_values = df[column]
179
+
180
+ new_columns = {f"{prefix}{col}": list(_reactor(strings=column_values, *args, **kwargs))
181
+ for col, _reactor in reactors.items()}
182
+
183
+ df = df.assign(**new_columns)
184
+
185
+ return _get_error_tally(df, list(new_columns)), df
186
+
187
+
188
+ def _peptide_table(max_length: int,
189
+ min_length: Optional[int] = None,
190
+ by: int = 1,
191
+ n: Optional[Union[float, int]] = None,
192
+ prefix: str = '',
193
+ suffix: str = '',
194
+ generator: bool = False,
195
+ batch_size: int = 1000,
196
+ *args, **kwargs) -> Union[DataFrame, Generator]:
197
+
198
+ min_length = min_length or max_length
199
+
200
+ peptides = sample_peptides_in_length_range(max_length=max_length,
201
+ min_length=min_length,
202
+ by=by,
203
+ n=n,
204
+ *args, **kwargs)
205
+
206
+ if generator:
+
+ # Return a generator expression rather than using `yield` here: a `yield`
+ # anywhere in the body would turn the whole function into a generator, so the
+ # generator=False branch could never hand back a plain DataFrame.
+ return (DataFrame(dict(peptide_sequence=[f"{prefix}{pep}{suffix}"
+ for pep in peps]))
+ for peps in batched(peptides, batch_size))
+
+ else:
+
+ peps = [f"{prefix}{pep}{suffix}"
+ for pep in peptides]
+
+ return DataFrame(dict(peptide_sequence=peps))
schemist/typing.py ADDED
@@ -0,0 +1,7 @@
1
+ """Types used in schemist."""
2
+
3
+ from collections import namedtuple
4
+
5
+ DataSplits = namedtuple('DataSplits',
6
+ ['train', 'test', 'validation'],
7
+ defaults=[tuple(), tuple(), tuple()])
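+
+ # For example, DataSplits(train=(0, 2), test=(1,)) leaves validation at its default ().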
schemist/utils.py ADDED
@@ -0,0 +1 @@
1
+ """Miscellaneous utilities for schemist."""
test/data/AmpC_screen_table_10k.csv.gz ADDED
Binary file (171 kB). View file
 
test/tests.py ADDED
@@ -0,0 +1,6 @@
1
+ import doctest
2
+ import schemist as sch
3
+
4
+ if __name__ == '__main__':
5
+
6
+ doctest.testmod(sch)