Spaces:
No application file
No application file
# Copyright 2013 by Michiel de Hoon. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Support for various forms of sequence motif matrices. | |
Implementation of frequency (count) matrices, position-weight matrices, | |
and position-specific scoring matrices. | |
""" | |
import math | |
import numbers | |
try: | |
import numpy as np | |
except ImportError: | |
from Bio import MissingPythonDependencyError | |
raise MissingPythonDependencyError( | |
"Install NumPy if you want to use Bio.motifs.matrix." | |
) | |
from Bio.Seq import Seq | |
from . import _pwm | |
class GenericPositionMatrix(dict): | |
"""Base class for the support of position matrix operations.""" | |
def __init__(self, alphabet, values): | |
"""Initialize the class.""" | |
self.length = None | |
for letter in alphabet: | |
if self.length is None: | |
self.length = len(values[letter]) | |
elif self.length != len(values[letter]): | |
raise Exception("data has inconsistent lengths") | |
self[letter] = list(values[letter]) | |
self.alphabet = alphabet | |
def __str__(self): | |
"""Return a string containing nucleotides and counts of the alphabet in the Matrix.""" | |
words = ["%6d" % i for i in range(self.length)] | |
line = " " + " ".join(words) | |
lines = [line] | |
for letter in self.alphabet: | |
words = ["%6.2f" % value for value in self[letter]] | |
line = "%c: " % letter + " ".join(words) | |
lines.append(line) | |
text = "\n".join(lines) + "\n" | |
return text | |
def __getitem__(self, key): | |
"""Return the position matrix of index key.""" | |
if isinstance(key, tuple): | |
if len(key) == 2: | |
key1, key2 = key | |
if isinstance(key1, slice): | |
start1, stop1, stride1 = key1.indices(len(self.alphabet)) | |
indices1 = range(start1, stop1, stride1) | |
letters1 = [self.alphabet[i] for i in indices1] | |
dim1 = 2 | |
elif isinstance(key1, numbers.Integral): | |
letter1 = self.alphabet[key1] | |
dim1 = 1 | |
elif isinstance(key1, tuple): | |
letters1 = [self.alphabet[i] for i in key1] | |
dim1 = 2 | |
elif isinstance(key1, str): | |
if len(key1) == 1: | |
letter1 = key1 | |
dim1 = 1 | |
else: | |
raise KeyError(key1) | |
else: | |
raise KeyError("Cannot understand key %s" % key1) | |
if isinstance(key2, slice): | |
start2, stop2, stride2 = key2.indices(self.length) | |
indices2 = range(start2, stop2, stride2) | |
dim2 = 2 | |
elif isinstance(key2, numbers.Integral): | |
index2 = key2 | |
dim2 = 1 | |
else: | |
raise KeyError("Cannot understand key %s" % key2) | |
if dim1 == 1 and dim2 == 1: | |
return dict.__getitem__(self, letter1)[index2] | |
elif dim1 == 1 and dim2 == 2: | |
values = dict.__getitem__(self, letter1) | |
return tuple(values[index2] for index2 in indices2) | |
elif dim1 == 2 and dim2 == 1: | |
d = {} | |
for letter1 in letters1: | |
d[letter1] = dict.__getitem__(self, letter1)[index2] | |
return d | |
else: | |
d = {} | |
for letter1 in letters1: | |
values = dict.__getitem__(self, letter1) | |
d[letter1] = [values[_] for _ in indices2] | |
if sorted(letters1) == self.alphabet: | |
return self.__class__(self.alphabet, d) | |
else: | |
return d | |
elif len(key) == 1: | |
key = key[0] | |
else: | |
raise KeyError("keys should be 1- or 2-dimensional") | |
if isinstance(key, slice): | |
start, stop, stride = key.indices(len(self.alphabet)) | |
indices = range(start, stop, stride) | |
letters = [self.alphabet[i] for i in indices] | |
dim = 2 | |
elif isinstance(key, numbers.Integral): | |
letter = self.alphabet[key] | |
dim = 1 | |
elif isinstance(key, tuple): | |
letters = [self.alphabet[i] for i in key] | |
dim = 2 | |
elif isinstance(key, str): | |
if len(key) == 1: | |
letter = key | |
dim = 1 | |
else: | |
raise KeyError(key) | |
else: | |
raise KeyError("Cannot understand key %s" % key) | |
if dim == 1: | |
return dict.__getitem__(self, letter) | |
elif dim == 2: | |
d = {} | |
for letter in letters: | |
d[letter] = dict.__getitem__(self, letter) | |
return d | |
else: | |
raise RuntimeError("Should not get here") | |
def consensus(self): | |
"""Return the consensus sequence.""" | |
sequence = "" | |
for i in range(self.length): | |
maximum = -math.inf | |
for letter in self.alphabet: | |
count = self[letter][i] | |
if count > maximum: | |
maximum = count | |
sequence_letter = letter | |
sequence += sequence_letter | |
return Seq(sequence) | |
def anticonsensus(self): | |
"""Return the anticonsensus sequence.""" | |
sequence = "" | |
for i in range(self.length): | |
minimum = math.inf | |
for letter in self.alphabet: | |
count = self[letter][i] | |
if count < minimum: | |
minimum = count | |
sequence_letter = letter | |
sequence += sequence_letter | |
return Seq(sequence) | |
def degenerate_consensus(self): | |
"""Return the degenerate consensus sequence.""" | |
# Following the rules adapted from | |
# D. R. Cavener: "Comparison of the consensus sequence flanking | |
# translational start sites in Drosophila and vertebrates." | |
# Nucleic Acids Research 15(4): 1353-1361. (1987). | |
# The same rules are used by TRANSFAC. | |
degenerate_nucleotide = { | |
"A": "A", | |
"C": "C", | |
"G": "G", | |
"T": "T", | |
"AC": "M", | |
"AG": "R", | |
"AT": "W", | |
"CG": "S", | |
"CT": "Y", | |
"GT": "K", | |
"ACG": "V", | |
"ACT": "H", | |
"AGT": "D", | |
"CGT": "B", | |
"ACGT": "N", | |
} | |
sequence = "" | |
for i in range(self.length): | |
def get(nucleotide): | |
return self[nucleotide][i] # noqa: B023 | |
nucleotides = sorted(self, key=get, reverse=True) | |
counts = [self[c][i] for c in nucleotides] | |
# Follow the Cavener rules: | |
if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]: | |
key = nucleotides[0] | |
elif 4 * sum(counts[:2]) > 3 * sum(counts): | |
key = "".join(sorted(nucleotides[:2])) | |
elif counts[3] == 0: | |
key = "".join(sorted(nucleotides[:3])) | |
else: | |
key = "ACGT" | |
nucleotide = degenerate_nucleotide.get(key, key) | |
sequence += nucleotide | |
return Seq(sequence) | |
def gc_content(self): | |
"""Compute the fraction GC content.""" | |
alphabet = self.alphabet | |
gc_total = 0.0 | |
total = 0.0 | |
for i in range(self.length): | |
for letter in alphabet: | |
if letter in "CG": | |
gc_total += self[letter][i] | |
total += self[letter][i] | |
return gc_total / total | |
def reverse_complement(self): | |
"""Compute reverse complement.""" | |
values = {} | |
if self.alphabet == "ACGU": | |
values["A"] = self["U"][::-1] | |
values["U"] = self["A"][::-1] | |
else: | |
values["A"] = self["T"][::-1] | |
values["T"] = self["A"][::-1] | |
values["G"] = self["C"][::-1] | |
values["C"] = self["G"][::-1] | |
alphabet = self.alphabet | |
return self.__class__(alphabet, values) | |
class FrequencyPositionMatrix(GenericPositionMatrix): | |
"""Class for the support of frequency calculations on the Position Matrix.""" | |
def normalize(self, pseudocounts=None): | |
"""Create and return a position-weight matrix by normalizing the counts matrix. | |
If pseudocounts is None (default), no pseudocounts are added | |
to the counts. | |
If pseudocounts is a number, it is added to the counts before | |
calculating the position-weight matrix. | |
Alternatively, the pseudocounts can be a dictionary with a key | |
for each letter in the alphabet associated with the motif. | |
""" | |
counts = {} | |
if pseudocounts is None: | |
for letter in self.alphabet: | |
counts[letter] = [0.0] * self.length | |
elif isinstance(pseudocounts, dict): | |
for letter in self.alphabet: | |
counts[letter] = [float(pseudocounts[letter])] * self.length | |
else: | |
for letter in self.alphabet: | |
counts[letter] = [float(pseudocounts)] * self.length | |
for i in range(self.length): | |
for letter in self.alphabet: | |
counts[letter][i] += self[letter][i] | |
# Actual normalization is done in the PositionWeightMatrix initializer | |
return PositionWeightMatrix(self.alphabet, counts) | |
class PositionWeightMatrix(GenericPositionMatrix): | |
"""Class for the support of weight calculations on the Position Matrix.""" | |
def __init__(self, alphabet, counts): | |
"""Initialize the class.""" | |
GenericPositionMatrix.__init__(self, alphabet, counts) | |
for i in range(self.length): | |
total = sum(self[letter][i] for letter in alphabet) | |
for letter in alphabet: | |
self[letter][i] /= total | |
for letter in alphabet: | |
self[letter] = tuple(self[letter]) | |
def log_odds(self, background=None): | |
"""Return the Position-Specific Scoring Matrix. | |
The Position-Specific Scoring Matrix (PSSM) contains the log-odds | |
scores computed from the probability matrix and the background | |
probabilities. If the background is None, a uniform background | |
distribution is assumed. | |
""" | |
values = {} | |
alphabet = self.alphabet | |
if background is None: | |
background = dict.fromkeys(self.alphabet, 1.0) | |
else: | |
background = dict(background) | |
total = sum(background.values()) | |
for letter in alphabet: | |
background[letter] /= total | |
values[letter] = [] | |
for i in range(self.length): | |
for letter in alphabet: | |
b = background[letter] | |
if b > 0: | |
p = self[letter][i] | |
if p > 0: | |
logodds = math.log(p / b, 2) | |
else: | |
logodds = -math.inf | |
else: | |
p = self[letter][i] | |
if p > 0: | |
logodds = math.inf | |
else: | |
logodds = math.nan | |
values[letter].append(logodds) | |
pssm = PositionSpecificScoringMatrix(alphabet, values) | |
return pssm | |
class PositionSpecificScoringMatrix(GenericPositionMatrix): | |
"""Class for the support of Position Specific Scoring Matrix calculations.""" | |
def calculate(self, sequence): | |
"""Return the PWM score for a given sequence for all positions. | |
Notes: | |
- the sequence can only be a DNA sequence | |
- the search is performed only on one strand | |
- if the sequence and the motif have the same length, a single | |
number is returned | |
- otherwise, the result is a one-dimensional numpy array | |
""" | |
# TODO - Code itself tolerates ambiguous bases (as NaN). | |
if sorted(self.alphabet) != ["A", "C", "G", "T"]: | |
raise ValueError( | |
"PSSM has wrong alphabet: %s - Use only with DNA motifs" % self.alphabet | |
) | |
# NOTE: The C code handles mixed case input as this could be large | |
# (e.g. contig or chromosome), so requiring it be all upper or lower | |
# case would impose an overhead to allocate the extra memory. | |
try: | |
sequence = bytes(sequence) | |
except TypeError: # str | |
try: | |
sequence = bytes(sequence, "ASCII") | |
except TypeError: | |
raise ValueError( | |
"sequence should be a Seq, MutableSeq, string, or bytes-like object" | |
) from None | |
except UnicodeEncodeError: | |
raise ValueError( | |
"sequence should contain ASCII characters only" | |
) from None | |
except Exception: | |
raise ValueError( | |
"sequence should be a Seq, MutableSeq, string, or bytes-like object" | |
) from None | |
n = len(sequence) | |
m = self.length | |
# Create the numpy arrays here; the C module then does not rely on numpy | |
# Use a float32 for the scores array to save space | |
scores = np.empty(n - m + 1, np.float32) | |
logodds = np.array( | |
[[self[letter][i] for letter in "ACGT"] for i in range(m)], float | |
) | |
_pwm.calculate(sequence, logodds, scores) | |
if len(scores) == 1: | |
return scores[0] | |
else: | |
return scores | |
def search(self, sequence, threshold=0.0, both=True, chunksize=10**6): | |
"""Find hits with PWM score above given threshold. | |
A generator function, returning found hits in the given sequence | |
with the pwm score higher than the threshold. | |
""" | |
sequence = sequence.upper() | |
seq_len = len(sequence) | |
motif_l = self.length | |
chunk_starts = np.arange(0, seq_len, chunksize) | |
if both: | |
rc = self.reverse_complement() | |
for chunk_start in chunk_starts: | |
subseq = sequence[chunk_start : chunk_start + chunksize + motif_l - 1] | |
pos_scores = self.calculate(subseq) | |
pos_ind = pos_scores >= threshold | |
pos_positions = np.where(pos_ind)[0] + chunk_start | |
pos_scores = pos_scores[pos_ind] | |
if both: | |
neg_scores = rc.calculate(subseq) | |
neg_ind = neg_scores >= threshold | |
neg_positions = np.where(neg_ind)[0] + chunk_start | |
neg_scores = neg_scores[neg_ind] | |
else: | |
neg_positions = np.empty((0), dtype=int) | |
neg_scores = np.empty((0), dtype=int) | |
chunk_positions = np.append(pos_positions, neg_positions - seq_len) | |
chunk_scores = np.append(pos_scores, neg_scores) | |
order = np.argsort(np.append(pos_positions, neg_positions)) | |
chunk_positions = chunk_positions[order] | |
chunk_scores = chunk_scores[order] | |
yield from zip(chunk_positions, chunk_scores) | |
def max(self): | |
"""Maximal possible score for this motif. | |
returns the score computed for the consensus sequence. | |
""" | |
score = 0.0 | |
letters = self.alphabet | |
for position in range(0, self.length): | |
score += max(self[letter][position] for letter in letters) | |
return score | |
def min(self): | |
"""Minimal possible score for this motif. | |
returns the score computed for the anticonsensus sequence. | |
""" | |
score = 0.0 | |
letters = self.alphabet | |
for position in range(0, self.length): | |
score += min(self[letter][position] for letter in letters) | |
return score | |
def gc_content(self): | |
"""Compute the GC-ratio.""" | |
raise Exception("Cannot compute the %GC composition of a PSSM") | |
def mean(self, background=None): | |
"""Return expected value of the score of a motif.""" | |
if background is None: | |
background = dict.fromkeys(self.alphabet, 1.0) | |
else: | |
background = dict(background) | |
total = sum(background.values()) | |
for letter in self.alphabet: | |
background[letter] /= total | |
sx = 0.0 | |
for i in range(self.length): | |
for letter in self.alphabet: | |
logodds = self[letter, i] | |
if math.isnan(logodds): | |
continue | |
if math.isinf(logodds) and logodds < 0: | |
continue | |
b = background[letter] | |
p = b * math.pow(2, logodds) | |
sx += p * logodds | |
return sx | |
def std(self, background=None): | |
"""Return standard deviation of the score of a motif.""" | |
if background is None: | |
background = dict.fromkeys(self.alphabet, 1.0) | |
else: | |
background = dict(background) | |
total = sum(background.values()) | |
for letter in self.alphabet: | |
background[letter] /= total | |
variance = 0.0 | |
for i in range(self.length): | |
sx = 0.0 | |
sxx = 0.0 | |
for letter in self.alphabet: | |
logodds = self[letter, i] | |
if math.isnan(logodds): | |
continue | |
if math.isinf(logodds) and logodds < 0: | |
continue | |
b = background[letter] | |
p = b * math.pow(2, logodds) | |
sx += p * logodds | |
sxx += p * logodds * logodds | |
sxx -= sx * sx | |
variance += sxx | |
variance = max(variance, 0) # to avoid roundoff problems | |
return math.sqrt(variance) | |
def dist_pearson(self, other): | |
"""Return the similarity score based on pearson correlation for the given motif against self. | |
We use the Pearson's correlation of the respective probabilities. | |
""" | |
if self.alphabet != other.alphabet: | |
raise ValueError("Cannot compare motifs with different alphabets") | |
max_p = -2 | |
for offset in range(-self.length + 1, other.length): | |
if offset < 0: | |
p = self.dist_pearson_at(other, -offset) | |
else: # offset>=0 | |
p = other.dist_pearson_at(self, offset) | |
if max_p < p: | |
max_p = p | |
max_o = -offset | |
return 1 - max_p, max_o | |
def dist_pearson_at(self, other, offset): | |
"""Return the similarity score based on pearson correlation at the given offset.""" | |
letters = self.alphabet | |
sx = 0.0 # \sum x | |
sy = 0.0 # \sum y | |
sxx = 0.0 # \sum x^2 | |
sxy = 0.0 # \sum x \cdot y | |
syy = 0.0 # \sum y^2 | |
norm = max(self.length, offset + other.length) * len(letters) | |
for pos in range(min(self.length - offset, other.length)): | |
xi = [self[letter, pos + offset] for letter in letters] | |
yi = [other[letter, pos] for letter in letters] | |
sx += sum(xi) | |
sy += sum(yi) | |
sxx += sum(x * x for x in xi) | |
sxy += sum(x * y for x, y in zip(xi, yi)) | |
syy += sum(y * y for y in yi) | |
sx /= norm | |
sy /= norm | |
sxx /= norm | |
sxy /= norm | |
syy /= norm | |
numerator = sxy - sx * sy | |
denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy)) | |
return numerator / denominator | |
def distribution(self, background=None, precision=10**3): | |
"""Calculate the distribution of the scores at the given precision.""" | |
from .thresholds import ScoreDistribution | |
if background is None: | |
background = dict.fromkeys(self.alphabet, 1.0) | |
else: | |
background = dict(background) | |
total = sum(background.values()) | |
for letter in self.alphabet: | |
background[letter] /= total | |
return ScoreDistribution(precision=precision, pssm=self, background=background) | |