from numbers import Number
from typing import Optional, Union

import numpy as np

from deepscreen.utils import get_logger

log = get_logger(__name__)

# Converters from a concentration expressed in the keyed unit to the p scale
# (-log10 of the molar concentration). The 'p' entry is the identity, i.e. the
# value is taken to be in p scale already.
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    'μM': lambda x: -np.log10(x) + 6,
    'uM': lambda x: -np.log10(x) + 6,  # ASCII spelling of micromolar
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}


def _is_provided(value) -> bool:
    """Return whether an optional argument carries a usable value.

    Scalars (strings and numbers) keep plain truthiness; anything else is treated
    as a collection and counts as provided when it is non-empty. This avoids the
    "truth value of an array is ambiguous" error for numpy arrays / pandas Series.
    """
    if value is None:
        return False
    if isinstance(value, (str, Number)):
        return bool(value)
    return np.size(value) > 0


def _check_units(units) -> None:
    """Raise ``ValueError`` if any of ``units`` is not a key of ``MOLARITY_TO_POTENCY``."""
    if any(unit not in MOLARITY_TO_POTENCY for unit in units):
        raise ValueError(f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}.")


def molar_to_p(labels, units):
    """Convert concentration labels to the p scale (-log10[M]).

    :param labels: label value(s), each expressed in the corresponding unit
    :type labels: array_like
    :param units: either a single unit applied to every label, or one unit per label
        (same shape as ``labels``); every unit must be a key of ``MOLARITY_TO_POTENCY``
    :type units: str or array_like of str
    :return: a new float numpy array, shaped like ``labels``, holding the converted values
    :raises ValueError: if a unit is not supported, or if per-label units do not match
        the shape of ``labels``
    """
    values = np.asarray(labels, dtype=float)

    # A single unit for all labels: one vectorized conversion.
    if isinstance(units, str):
        _check_units([units])
        # np.array copies, so the identity converter ('p') never aliases the input.
        return np.array(MOLARITY_TO_POTENCY[units](values), dtype=float)

    unit_array = np.asarray(units, dtype=object)
    if unit_array.shape != values.shape:
        raise ValueError(
            f"Got labels of shape {values.shape} but units of shape {unit_array.shape}; "
            "give one unit per label or a single unit string."
        )

    distinct_units = set(unit_array.ravel().tolist())
    _check_units(distinct_units)

    # Convert all labels sharing a unit in one vectorized call.
    converted = np.empty(values.shape, dtype=float)
    for unit in distinct_units:
        selected = unit_array == unit
        converted[selected] = MOLARITY_TO_POTENCY[unit](values[selected])
    return converted


def label_discretize(labels, thresholds):
    """Map continuous labels to integer levels, in descending order of label value.

    Level 0 is assigned to labels greater than or equal to the greatest threshold and
    the highest level (the number of thresholds) to labels below the lowest threshold.
    With a single threshold this means labels below it become 1 and all others 0.
    A one-element sequence behaves exactly like the bare number it contains.

    :param labels: continuous label value(s)
    :type labels: array_like
    :param thresholds: a single threshold or a sequence of thresholds (in any order)
    :type thresholds: float or array_like
    :return: integer level(s) shaped like ``labels``
    """
    bins = np.sort(np.atleast_1d(np.asarray(thresholds, dtype=float)), axis=None)
    # Number of thresholds that are <= each label; a label equal to a threshold
    # therefore falls on the upper side of it.
    n_reached = np.searchsorted(bins, labels, side='right')
    return bins.size - n_reached


def label_transform(
        labels,
        units: Optional[list[str]] = None,
        thresholds: Optional[Union[float, list[Number]]] = None,
        discard_intermediate: Optional[bool] = None,
):
    """Convert labels of all units to p scale (-log10[M]) and discretize them if specified.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: a single unit or a sequence of label units, each one of
        p, M, mM, μM, uM, nM, pM, fM (the keys of ``MOLARITY_TO_POTENCY``);
        no unit conversion is done when omitted or empty
    :type units: str or array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A sequence of two or more thresholds maps affinities to multiple discrete levels
        descendingly, assigning values below the lowest threshold to the highest level
        (e.g. 2) and values above the greatest threshold to 0.
        No discretization is done when omitted or empty
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level
        if provided an odd number of thresholds (>=3); only consulted when ``thresholds`` is given
    :type discard_intermediate: bool, optional
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels;
        when ``discard_intermediate`` is set the array is float and discarded entries are NaN.
        ``labels`` is returned unchanged if neither ``units`` nor ``thresholds`` is given
    :raises ValueError: on unsupported units, or if ``discard_intermediate`` is set without
        an odd number of (at least 3) thresholds
    """
    # TODO: labels that are already discrete (0/1) are not detected here; units and
    #  thresholds are applied to them like to any other value.
    if _is_provided(units):
        labels = molar_to_p(labels, units)

    if _is_provided(thresholds):
        labels = label_discretize(labels, thresholds)

        if discard_intermediate:
            # np.size also handles a scalar threshold (size 1), which is rejected below.
            n_thresholds = np.size(thresholds)
            if n_thresholds % 2 != 1 or n_thresholds < 3:
                raise ValueError(
                    "Must give an odd number of (at least 3) thresholds "
                    "to discard the intermediate level."
                )
            # NOTE(review): an odd number of thresholds yields an even number of levels,
            # so there is no unique middle level; the level dropped is
            # len(thresholds) // 2 -- confirm this is the intended one.
            intermediate_level = n_thresholds // 2

            # Integer levels cannot hold NaN, so switch to float first.
            levels = np.asarray(labels, dtype=float)
            labels = np.where(
                levels == intermediate_level,
                np.nan,  # intermediate-level labels become NaN (to be filtered out later)
                # Levels above the intermediate one shift down by 1 to stay contiguous.
                np.where(levels > intermediate_level, levels - 1, levels),
            )

    return labels