|
import os |
|
import numpy as np |
|
|
|
from Bio.Align import substitution_matrices |
|
from sklearn.linear_model import LinearRegression, Ridge |
|
from skopt.learning import GaussianProcessRegressor |
|
from skopt.learning.gaussian_process.kernels import ConstantKernel, Matern |
|
from sklearn.model_selection import cross_val_score |
|
from sklearn.metrics import make_scorer |
|
|
|
import utils |
|
|
|
|
|
# Candidate regularization strengths, one per decade from 0.1 to 1000.
REG_COEF_LIST = [0.1, 1.0, 10.0, 100.0, 1000.0]
|
|
|
|
|
class BasePredictor:
    """Common interface for fitness predictors.

    `train` and `predict` are placeholders that raise NotImplementedError;
    the other methods provide simple default behaviour.
    """

    def __init__(self, dataset_name, **kwargs):
        # Additional keyword arguments are accepted but not used here.
        self.dataset_name = dataset_name

    def select_training_data(self, data, n_train):
        """Picks training rows by calling `data.sample(n=n_train)`.

        Args:
            - data: an object exposing a `sample(n=...)` method
            - n_train: number of rows to draw
        Returns:
            Whatever `data.sample` returns.
        """
        subset = data.sample(n=n_train)
        return subset

    def train(self, train_seqs, train_labels):
        """Fits the model (placeholder in this class).

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels
        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def predict(self, predict_seqs):
        """Computes model predictions (placeholder in this class).

        Args:
            - predict_seqs: a list of sequences
        Returns:
            A list of numerical fitness predictions.
        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def predict_unsupervised(self, predict_seqs):
        """Computes predictions that do not require training.

        The default is one standard-normal random draw per sequence.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            A 1-D array with `len(predict_seqs)` random scores.
        """
        n_seqs = len(predict_seqs)
        return np.random.randn(n_seqs)
|
|
|
|
|
class BoostingPredictor(BasePredictor):
    """Boosting by combining predictors as weak learners.

    The weak learners are trained in order, each one on the residual left
    by the learners before it. The final prediction is the sum of all weak
    learners' predictions.
    """

    def __init__(self, dataset_name, weak_learner_classes, **kwargs):
        """Builds one weak learner per class.

        Args:
            - dataset_name: forwarded to every weak learner
            - weak_learner_classes: predictor classes, in training order
            - **kwargs: forwarded unchanged to every weak learner
        """
        super().__init__(dataset_name)
        self.weak_learners = [cls(dataset_name, **kwargs)
                              for cls in weak_learner_classes]

    def train(self, train_seqs, train_labels):
        """Trains the weak learners sequentially on successive residuals.

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels; it is never
              modified by this method
        """
        # Work on a private float copy: the residual update must not write
        # into the caller's labels, and an in-place float subtraction into
        # an integer array would be rejected by numpy.
        residual = np.array(train_labels, dtype=float)
        for model in self.weak_learners:
            model.train(train_seqs, residual)
            # Rebind rather than subtract in place so that a learner which
            # kept a reference to its training labels is not altered later.
            residual = residual - model.predict(train_seqs)

    def predict(self, predict_seqs):
        """Sums the predictions of all weak learners.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            A 1-D float array with one prediction per sequence.
        """
        total = np.zeros(len(predict_seqs))
        for model in self.weak_learners:
            total += model.predict(predict_seqs)
        return total

    def predict_unsupervised(self, predict_seqs):
        """Delegates to the first weak learner's unsupervised prediction."""
        return self.weak_learners[0].predict_unsupervised(predict_seqs)

    def select_training_data(self, data, n_train):
        """Delegates training-data selection to the first weak learner."""
        return self.weak_learners[0].select_training_data(data, n_train)
|
|
|
|
|
class BaseRegressionPredictor(BasePredictor):
    """Base class for predictors that fit a linear model on sequence features.

    Subclasses implement `seq2feat`. The regularization coefficient is either
    given explicitly or, when it is None or 'CV', chosen from REG_COEF_LIST
    by cross-validation the first time `train` runs.
    """

    def __init__(self, dataset_name, reg_coef=None,
                 linear_model_cls=Ridge, **kwargs):
        """Stores the configuration; no model is built until `train`.

        Args:
            - dataset_name: name of the dataset
            - reg_coef: value passed as `alpha` to the linear model, or
              None / 'CV' to select it by cross-validation
            - linear_model_cls: model class, called as `cls(alpha=...)`
            - **kwargs: forwarded to BasePredictor
        """
        super().__init__(dataset_name, **kwargs)
        self.reg_coef = reg_coef
        self.linear_model_cls = linear_model_cls
        self.model = None

    def seq2feat(self, seqs):
        """Converts sequences to a feature matrix; must be overridden."""
        raise NotImplementedError

    def _select_reg_coef(self, X, train_labels):
        """Returns the REG_COEF_LIST entry with the best mean 5-fold CV score."""
        # The scorer does not depend on the candidate, so build it once.
        scorer = make_scorer(utils.spearman)
        best_rc, best_score = None, -np.inf
        for rc in REG_COEF_LIST:
            candidate = self.linear_model_cls(alpha=rc)
            score = cross_val_score(
                candidate, X, train_labels, cv=5, scoring=scorer).mean()
            if score > best_score:
                best_rc, best_score = rc, score
        if best_rc is None:
            # Reached only when every mean score failed the comparison
            # (e.g. all NaN); fail here instead of passing alpha=None on.
            raise ValueError(
                'cross-validation produced no comparable score for any '
                'regularization coefficient')
        return best_rc

    def train(self, train_seqs, train_labels):
        """Fits the linear model on the features of `train_seqs`.

        When `reg_coef` is None or 'CV' it is first replaced by the
        cross-validated choice, so later calls reuse that value.

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels
        Raises:
            ValueError: if cross-validation cannot rank any candidate.
        """
        X = self.seq2feat(train_seqs)
        if self.reg_coef is None or self.reg_coef == 'CV':
            self.reg_coef = self._select_reg_coef(X, train_labels)

        self.model = self.linear_model_cls(alpha=self.reg_coef)
        self.model.fit(X, train_labels)

    def predict(self, predict_seqs):
        """Predicts fitness for `predict_seqs`.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            The fitted model's predictions, or one standard-normal random
            draw per sequence when `train` has not been called yet.
        """
        if self.model is None:
            return np.random.randn(len(predict_seqs))
        X = self.seq2feat(predict_seqs)
        return self.model.predict(X)
|
|
|
|
|
class JointPredictor(BaseRegressionPredictor):
    """Fits one linear model on the concatenated features of several predictors."""

    def __init__(self, dataset_name, predictor_classes, predictor_name,
                 reg_coef='CV', **kwargs):
        """Instantiates the sub-predictors whose features are combined.

        Args:
            - dataset_name: forwarded to every sub-predictor
            - predictor_classes: predictor classes to combine
            - predictor_name: names paired one-to-one with the classes; a
              keyword argument `reg_coef_<name>` sets that sub-predictor's
              own `reg_coef` (converted to float)
            - reg_coef: coefficient of the joint model ('CV' by default)
            - **kwargs: forwarded to the base class and to every sub-predictor
        """
        super().__init__(dataset_name, reg_coef, **kwargs)
        self.predictors = []
        for predictor_cls, name in zip(predictor_classes, predictor_name):
            coef_key = f'reg_coef_{name}'
            if coef_key not in kwargs:
                sub_predictor = predictor_cls(dataset_name, **kwargs)
            else:
                sub_predictor = predictor_cls(
                    dataset_name, reg_coef=float(kwargs[coef_key]), **kwargs)
            self.predictors.append(sub_predictor)

    def seq2feat(self, seqs):
        """Concatenates the scaled feature blocks of all sub-predictors.

        Each block is multiplied by sqrt(1 / reg_coef) of its predictor, so
        a predictor with a larger coefficient contributes smaller features
        to the jointly fitted model.
        """
        scaled_blocks = []
        for predictor in self.predictors:
            block = predictor.seq2feat(seqs)
            scaled_blocks.append(block * np.sqrt(1.0 / predictor.reg_coef))
        return np.concatenate(scaled_blocks, axis=1)
|
|
|
|
|
class BLOSUM62Predictor(BaseRegressionPredictor):
    """Regression predictor whose single feature is a BLOSUM62 score.

    The score of each sequence against the wild type is computed by
    `utils.get_blosum_scores`.
    """

    def __init__(self, dataset_name, reg_coef=1e-8, **kwargs):
        """Loads the wild-type sequence and the BLOSUM62 matrix.

        Args:
            - dataset_name: dataset whose `data/<dataset_name>/wt.fasta`
              holds the wild-type sequence (first record is used)
            - reg_coef: regularization coefficient of the linear model
            - **kwargs: forwarded to BaseRegressionPredictor
        """
        super().__init__(dataset_name, reg_coef, **kwargs)
        self.wt = utils.read_fasta(
            os.path.join('data', dataset_name, 'wt.fasta'))[0]
        self.matrix = substitution_matrices.load('BLOSUM62')
        self.alphabet = self.matrix.alphabet
        # Every wild-type residue must be a symbol the matrix knows about.
        for i, c in enumerate(self.wt):
            assert c in self.alphabet, f'unexpected AA {c} (pos {i})'

    def seq2feat(self, seqs):
        """Returns the scores with a trailing axis of length 1 added."""
        return utils.get_blosum_scores(seqs, self.wt, self.matrix)[:, None]

    def predict_unsupervised(self, predict_seqs):
        """Uses the raw BLOSUM62 score as the untrained prediction.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            The scores with the feature axis removed, one per sequence.
        """
        # Drop only the feature axis: a bare squeeze() would also collapse
        # the sequence axis and return a 0-d array for a single sequence.
        return self.seq2feat(predict_seqs).squeeze(axis=1)
|
|
|
|
|
class BaseGPPredictor(BasePredictor):
    """Base class for Gaussian-process predictors on sequence features.

    Subclasses implement `seq2feat`. The kernel is a constant kernel
    multiplied by a Matern kernel.
    """

    def __init__(self, dataset_name, noise=0.1, kernel_length_scale=1.0,
                 kernel_nu=2.5, kernel_const=1.0, **kwargs):
        """Builds the kernel; the regressor itself is created in `train`.

        Args:
            - dataset_name: name of the dataset
            - noise: its square is passed as `alpha` to the regressor
            - kernel_length_scale: `length_scale` of the Matern kernel
            - kernel_nu: `nu` of the Matern kernel
            - kernel_const: value of the constant kernel
            - **kwargs: forwarded to BasePredictor
        """
        super().__init__(dataset_name, **kwargs)
        self.kernel = ConstantKernel(kernel_const) * Matern(
            length_scale=kernel_length_scale, nu=kernel_nu)
        self.noise = noise
        self.gpr = None

    def seq2feat(self, seqs):
        """Converts sequences to a feature matrix; must be overridden."""
        raise NotImplementedError

    def train(self, train_seqs, train_labels):
        """Fits a fresh Gaussian-process regressor.

        The regressor is fitted on negated labels and `predict` negates its
        output again, so predictions are on the original label scale.
        NOTE(review): the reason for fitting on negated labels is not
        visible in this file - presumably a minimization convention; confirm.

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels
        """
        gpr = GaussianProcessRegressor(kernel=self.kernel,
                                       alpha=self.noise**2)
        X = self.seq2feat(train_seqs)
        # Convert first so that plain Python lists can be negated too.
        neg_labels = -np.asarray(train_labels, dtype=float)
        # Publish the regressor only once fitted, so a failure above does
        # not leave an unfitted model behind for `predict`.
        self.gpr = gpr.fit(X, neg_labels)

    def predict(self, predict_seqs):
        """Predicts fitness for `predict_seqs`.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            The negated regressor predictions, or one standard-normal
            random draw per sequence when `train` has not been called yet.
        """
        if self.gpr is None:
            return np.random.randn(len(predict_seqs))
        X = self.seq2feat(predict_seqs)
        return -self.gpr.predict(X, return_std=False)
|
|
|
|
|
class RandomPredictor(BasePredictor):
    """Baseline that predicts by resampling the training labels at random."""

    # Labels stored by `train`; None until the predictor has been trained.
    train_labels = None

    def train(self, train_seqs, train_labels):
        """Stores the labels; the sequences are not used.

        Args:
            - train_seqs: a list of sequences (ignored)
            - train_labels: a list of numerical fitness labels
        """
        self.train_labels = train_labels

    def predict(self, predict_seqs):
        """Draws one stored training label, with replacement, per sequence.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            An array of `len(predict_seqs)` sampled labels, or
            standard-normal random draws when `train` has not been called
            (the same fallback the regression and GP predictors use).
        """
        n_seqs = len(predict_seqs)
        if self.train_labels is None:
            return np.random.randn(n_seqs)
        return np.random.choice(self.train_labels, size=n_seqs,
                                replace=True)
|
|
|
|
|
class MutationRadiusPredictor(BaseRegressionPredictor):
    """Regression predictor based on the number of mutations from wild type.

    The single feature is the negated count of positions at which a
    sequence differs from the wild type, so fewer mutations score higher.
    """

    def __init__(self, dataset_name, reg_coef=1e-8, **kwargs):
        """Loads the wild-type sequence.

        Args:
            - dataset_name: dataset whose `data/<dataset_name>/wt.fasta`
              holds the wild-type sequence (first record is used)
            - reg_coef: regularization coefficient of the linear model
            - **kwargs: forwarded to BaseRegressionPredictor
        """
        super().__init__(dataset_name, reg_coef, **kwargs)
        self.wt = utils.read_fasta(
            os.path.join('data', dataset_name, 'wt.fasta'))[0]

    def seq2feat(self, seqs):
        """Returns the negated mutation counts as an (n, 1) float array.

        Positions are compared over the length of the wild type, so a
        sequence shorter than the wild type raises IndexError.
        """
        wt = self.wt
        mutation_counts = np.array(
            [sum(wt_aa != s[j] for j, wt_aa in enumerate(wt)) for s in seqs],
            dtype=float)
        return -mutation_counts[:, None]

    def predict_unsupervised(self, predict_seqs):
        """Uses the negated mutation count as the untrained prediction.

        Args:
            - predict_seqs: a list of sequences
        Returns:
            A 1-D array with one score per sequence.
        """
        # Drop only the feature axis: a bare squeeze() would also collapse
        # the sequence axis and return a 0-d array for a single sequence.
        return self.seq2feat(predict_seqs).squeeze(axis=1)
|
|
|
|
|
def select_training_data(data, n_train, scores):
    """Selects the rows of `data` that have the highest `scores`.

    Args:
        - data: a table supporting `.iloc` (e.g. a pandas DataFrame)
        - n_train: number of rows to keep
        - scores: one numerical score per row of `data`
    Returns:
        The rows at the last `n_train` positions of the ascending score
        order, shuffled by `sample`.
    """
    ranking = np.argsort(scores)
    top_rows = data.iloc[ranking[-n_train:], :]
    return top_rows.sample(n=n_train)
|
|