gzhong's picture
Upload folder using huggingface_hub
7718235 verified
import os
import numpy as np
from Bio.Align import substitution_matrices
from sklearn.linear_model import LinearRegression, Ridge
from skopt.learning import GaussianProcessRegressor
from skopt.learning.gaussian_process.kernels import ConstantKernel, Matern
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import utils
# Candidate regularization strengths (passed to the linear model as ``alpha``)
# that are tried when a regression predictor selects its coefficient by
# cross-validation.
REG_COEF_LIST = [1e-1, 1e0, 1e1, 1e2, 1e3]
class BasePredictor:
    """Common interface shared by all predictors.

    Subclasses are expected to override ``train`` and ``predict``; the other
    methods come with simple random defaults.
    """

    def __init__(self, dataset_name, **kwargs):
        """Remember which dataset this predictor belongs to.

        Args:
            - dataset_name: identifier of the dataset.
            - **kwargs: accepted and ignored, so that subclasses can forward
              their own keyword arguments unchanged.
        """
        self.dataset_name = dataset_name

    def select_training_data(self, data, n_train):
        """Pick ``n_train`` random rows of ``data`` via ``data.sample``."""
        chosen = data.sample(n=n_train)
        return chosen

    def train(self, train_seqs, train_labels):
        """Fit the model (must be implemented by subclasses).

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def predict(self, predict_seqs):
        """Predict fitness for sequences (must be implemented by subclasses).

        Args:
            - predict_seqs: a list of sequences

        Returns:
            A list of numerical fitness predictions.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def predict_unsupervised(self, predict_seqs):
        """Predict fitness without any training.

        The default has no model to rely on and returns standard-normal noise.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            An array with one random value per sequence.
        """
        n_seqs = len(predict_seqs)
        return np.random.randn(n_seqs)
class BoostingPredictor(BasePredictor):
    """Boosting by combining predictors as weak learners.

    The weak learners are trained in order, each one on the residual left
    over by the learners before it. The final prediction is the sum of all
    weak learners' predictions.
    """

    def __init__(self, dataset_name, weak_learner_classes, **kwargs):
        """Instantiate one weak learner per class.

        Args:
            - dataset_name: identifier passed to every weak learner.
            - weak_learner_classes: iterable of predictor classes; each is
              called as ``cls(dataset_name, **kwargs)``.
            - **kwargs: forwarded unchanged to every weak learner.
        """
        super(BoostingPredictor, self).__init__(dataset_name)
        self.weak_learners = [c(dataset_name, **kwargs)
                              for c in weak_learner_classes]

    def train(self, train_seqs, train_labels):
        """Fit the weak learners sequentially on successive residuals.

        Args:
            - train_seqs: a list of sequences
            - train_labels: numerical fitness labels; left unmodified.
        """
        # Keep the residual separate from the caller's labels and rebind it
        # (instead of ``-=``) on every round: an in-place update would
        # overwrite ``train_labels`` and any label array a weak learner kept a
        # reference to, and would fail outright for integer-typed labels.
        residual = np.asarray(train_labels, dtype=float)
        for model in self.weak_learners:
            model.train(train_seqs, residual)
            residual = residual - model.predict(train_seqs)

    def predict(self, predict_seqs):
        """Return the sum of all weak learners' predictions.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            An array with one summed prediction per sequence.
        """
        total = np.zeros(len(predict_seqs))
        for model in self.weak_learners:
            total += model.predict(predict_seqs)
        return total

    def predict_unsupervised(self, predict_seqs):
        """Delegate to the first weak learner's unsupervised prediction."""
        return self.weak_learners[0].predict_unsupervised(predict_seqs)

    def select_training_data(self, data, n_train):
        """Delegate training-data selection to the first weak learner."""
        return self.weak_learners[0].select_training_data(data, n_train)
class BaseRegressionPredictor(BasePredictor):
    """Base class for predictors that fit a linear model on sequence features.

    Subclasses provide the featurization through ``seq2feat``. The linear
    model is built as ``linear_model_cls(alpha=reg_coef)``.
    """

    def __init__(self, dataset_name, reg_coef=None,
                 linear_model_cls=Ridge, **kwargs):
        """Store the model configuration; no model is built until ``train``.

        Args:
            - dataset_name: identifier of the dataset.
            - reg_coef: regularization coefficient passed to the model as
              ``alpha``. ``None`` or ``'CV'`` means it is chosen by
              cross-validation on the first call to ``train``.
            - linear_model_cls: model class, called with ``alpha=...``.
            - **kwargs: forwarded to ``BasePredictor.__init__`` (ignored there).
        """
        super(BaseRegressionPredictor, self).__init__(dataset_name, **kwargs)
        self.reg_coef = reg_coef
        self.linear_model_cls = linear_model_cls
        self.model = None

    def seq2feat(self, seqs):
        """Turn sequences into a feature matrix (implemented by subclasses)."""
        raise NotImplementedError

    def _select_reg_coef(self, X, train_labels):
        """Return the entry of ``REG_COEF_LIST`` with the best mean CV score.

        Each candidate is scored with 5-fold cross-validation using
        ``utils.spearman`` as the metric; higher is treated as better and the
        first candidate wins ties.

        Raises:
            ValueError: if no candidate yields a comparable score (e.g. every
                mean score is NaN), since no coefficient can be selected.
        """
        # The scorer does not depend on the candidate, so build it once.
        scorer = make_scorer(utils.spearman)
        best_rc, best_score = None, -np.inf
        for rc in REG_COEF_LIST:
            score = cross_val_score(
                self.linear_model_cls(alpha=rc), X, train_labels, cv=5,
                scoring=scorer).mean()
            # NaN never compares greater, so NaN scores are skipped here.
            if score > best_score:
                best_rc, best_score = rc, score
        if best_rc is None:
            raise ValueError(
                'Cross-validation produced no valid score for any '
                'regularization coefficient; cannot select reg_coef.')
        return best_rc

    def train(self, train_seqs, train_labels):
        """Fit the linear model on the featurized training sequences.

        If ``reg_coef`` is ``None`` or ``'CV'`` it is first replaced by the
        cross-validated choice, which is then kept on the instance and reused
        by later calls.

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list of numerical fitness labels

        Raises:
            ValueError: if cross-validation cannot select a coefficient.
        """
        X = self.seq2feat(train_seqs)
        if self.reg_coef is None or self.reg_coef == 'CV':
            self.reg_coef = self._select_reg_coef(X, train_labels)
        self.model = self.linear_model_cls(alpha=self.reg_coef)
        self.model.fit(X, train_labels)

    def predict(self, predict_seqs):
        """Predict fitness for the given sequences.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            The fitted model's predictions, or standard-normal noise (one
            value per sequence) if ``train`` has not been called yet.
        """
        if self.model is None:
            return np.random.randn(len(predict_seqs))
        X = self.seq2feat(predict_seqs)
        return self.model.predict(X)
class JointPredictor(BaseRegressionPredictor):
    """Combining regression predictors by training jointly.

    The features of all member predictors are concatenated column-wise and a
    single linear model is fit on top of them.
    """

    def __init__(self, dataset_name, predictor_classes, predictor_name,
                 reg_coef='CV', **kwargs):
        """Build the member predictors.

        Args:
            - dataset_name: identifier passed to every member predictor.
            - predictor_classes: iterable of predictor classes.
            - predictor_name: iterable of names, paired with
              ``predictor_classes`` by position.
            - reg_coef: coefficient of the joint model (``'CV'`` by default).
            - **kwargs: forwarded to the base class and to every member. A
              ``reg_coef_<name>`` entry, when present, is converted to float
              and additionally passed to that member as ``reg_coef``.
        """
        super().__init__(dataset_name, reg_coef, **kwargs)
        self.predictors = []
        for predictor_cls, name in zip(predictor_classes, predictor_name):
            coef_key = f'reg_coef_{name}'
            override = {}
            if coef_key in kwargs:
                override['reg_coef'] = float(kwargs[coef_key])
            self.predictors.append(
                predictor_cls(dataset_name, **override, **kwargs))

    def seq2feat(self, seqs):
        """Concatenate the members' features, each scaled by its reg_coef."""
        # To apply different regularization coefficients we scale the features
        # by a multiplier in Ridge regression.
        scaled_blocks = []
        for member in self.predictors:
            multiplier = np.sqrt(1.0 / member.reg_coef)
            scaled_blocks.append(member.seq2feat(seqs) * multiplier)
        return np.concatenate(scaled_blocks, axis=1)
class BLOSUM62Predictor(BaseRegressionPredictor):
    """Regression on a single BLOSUM62-based feature per sequence.

    The feature comes from ``utils.get_blosum_scores`` applied to the
    sequences, the wild-type sequence and the BLOSUM62 matrix.
    """

    def __init__(self, dataset_name, reg_coef=1e-8, **kwargs):
        """Load the wild-type sequence and the BLOSUM62 matrix.

        The wild type is the first entry returned by ``utils.read_fasta`` for
        ``data/<dataset_name>/wt.fasta``.

        Args:
            - dataset_name: dataset identifier, also the data sub-directory.
            - reg_coef: regularization coefficient of the linear model.
            - **kwargs: forwarded to ``BaseRegressionPredictor.__init__``.

        Raises:
            AssertionError: if the wild type contains a character that is not
                in the substitution matrix alphabet.
        """
        super(BLOSUM62Predictor, self).__init__(dataset_name,
                                                reg_coef, **kwargs)
        self.wt = utils.read_fasta(
            os.path.join('data', dataset_name, 'wt.fasta'))[0]
        self.matrix = substitution_matrices.load('BLOSUM62')
        self.alphabet = self.matrix.alphabet
        for i, c in enumerate(self.wt):
            assert c in self.alphabet, f'unexpected AA {c} (pos {i})'

    def seq2feat(self, seqs):
        """Return the BLOSUM scores as a column (a new axis 1 is inserted)."""
        return utils.get_blosum_scores(seqs, self.wt, self.matrix)[:, None]

    def predict_unsupervised(self, predict_seqs):
        """Use the raw BLOSUM score as the prediction, one per sequence.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            The feature column with its singleton axis removed.
        """
        # Drop only the axis added by ``seq2feat``: a bare ``squeeze()`` would
        # also collapse the sequence axis when a single sequence is passed,
        # yielding a 0-d array instead of a length-1 array.
        return self.seq2feat(predict_seqs).squeeze(axis=1)
class BaseGPPredictor(BasePredictor):
    """Base class for Gaussian-process predictors on sequence features.

    Subclasses provide the featurization through ``seq2feat``.
    """

    def __init__(self, dataset_name, noise=0.1, kernel_length_scale=1.0,
                 kernel_nu=2.5, kernel_const=1.0, **kwargs):
        """Configure the kernel; the regressor itself is built in ``train``.

        Args:
            - dataset_name: identifier of the dataset.
            - noise: noise level; its square is passed to the regressor as
              ``alpha``.
            - kernel_length_scale: ``length_scale`` of the Matern kernel.
            - kernel_nu: ``nu`` of the Matern kernel.
            - kernel_const: value of the constant kernel that multiplies the
              Matern kernel.
            - **kwargs: forwarded to ``BasePredictor.__init__`` (ignored there).
        """
        super(BaseGPPredictor, self).__init__(dataset_name, **kwargs)
        self.kernel = ConstantKernel(kernel_const) * Matern(
            length_scale=kernel_length_scale, nu=kernel_nu)
        self.noise = noise
        self.gpr = None

    def seq2feat(self, seqs):
        """Turn sequences into a feature matrix (implemented by subclasses)."""
        raise NotImplementedError

    def train(self, train_seqs, train_labels):
        """Fit a fresh Gaussian-process regressor on the training data.

        Args:
            - train_seqs: a list of sequences
            - train_labels: a list (or array) of numerical fitness labels
        """
        self.gpr = GaussianProcessRegressor(kernel=self.kernel,
                                            alpha=self.noise**2)
        X = self.seq2feat(train_seqs)
        # Use negative labels for minimization. Convert to an array first so
        # that a plain Python list of labels can be negated as well.
        negated_labels = -np.asarray(train_labels, dtype=float)
        self.gpr = self.gpr.fit(X, negated_labels)

    def predict(self, predict_seqs):
        """Predict fitness for the given sequences.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            The regressor's mean prediction with the sign flipped back (the
            model is fit on negated labels), or standard-normal noise if
            ``train`` has not been called yet.
        """
        if self.gpr is None:
            return np.random.randn(len(predict_seqs))
        X = self.seq2feat(predict_seqs)
        return -self.gpr.predict(X, return_std=False)
class RandomPredictor(BasePredictor):
    """Baseline that predicts by randomly resampling the training labels."""

    def train(self, train_seqs, train_labels):
        """Keep the training labels; the sequences are not used."""
        self.train_labels = train_labels

    def predict(self, predict_seqs):
        """Draw one stored training label per sequence, with replacement.

        Must be called after ``train``, which sets ``self.train_labels``.
        """
        n_predictions = len(predict_seqs)
        return np.random.choice(
            self.train_labels, replace=True, size=n_predictions)
class MutationRadiusPredictor(BaseRegressionPredictor):
    """Regression on the negated number of positions differing from wild type."""

    def __init__(self, dataset_name, reg_coef=1e-8, **kwargs):
        """Load the wild-type sequence.

        The wild type is the first entry returned by ``utils.read_fasta`` for
        ``data/<dataset_name>/wt.fasta``.

        Args:
            - dataset_name: dataset identifier, also the data sub-directory.
            - reg_coef: regularization coefficient of the linear model.
            - **kwargs: forwarded to ``BaseRegressionPredictor.__init__``.
        """
        super(MutationRadiusPredictor, self).__init__(dataset_name,
                                                      reg_coef, **kwargs)
        self.wt = utils.read_fasta(
            os.path.join('data', dataset_name, 'wt.fasta'))[0]

    def seq2feat(self, seqs):
        """Return a one-column matrix of negated mutation counts.

        Every position of the wild type is compared with the same position of
        the sequence; positions beyond the wild type's length are ignored.

        Raises:
            IndexError: if a sequence is shorter than the wild type.
        """
        wt = self.wt
        # Index into ``s`` (rather than zipping) so that a sequence shorter
        # than the wild type still fails loudly instead of being truncated.
        mutation_counts = np.array(
            [sum(1 for j, wt_aa in enumerate(wt) if wt_aa != s[j])
             for s in seqs],
            dtype=float)
        return -mutation_counts[:, None]

    def predict_unsupervised(self, predict_seqs):
        """Use the negated mutation count as the prediction, one per sequence.

        Args:
            - predict_seqs: a list of sequences

        Returns:
            A 1-D array with one value per sequence.
        """
        # Drop only the column axis added by ``seq2feat``: a bare ``squeeze()``
        # would turn the result for a single sequence into a 0-d array.
        return self.seq2feat(predict_seqs).squeeze(axis=1)
def select_training_data(data, n_train, scores):
    """Return the ``n_train`` highest-scoring rows of ``data`` in random order.

    Args:
        - data: table supporting ``.iloc`` and ``.sample`` (e.g. a DataFrame).
        - n_train: number of rows to keep.
        - scores: one score per row of ``data``, matched by row position.

    Returns:
        The rows at the positions of the ``n_train`` largest scores, passed
        through ``sample(n=n_train)``.
    """
    ranking = np.argsort(scores)
    top_positions = ranking[-n_train:]
    top_rows = data.iloc[top_positions, :]
    return top_rows.sample(n=n_train)