Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /motifs /matrix.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

20.6 kB

	# Copyright 2013 by Michiel de Hoon. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.

	"""Support for various forms of sequence motif matrices.

	Implementation of frequency (count) matrices, position-weight matrices,
	and position-specific scoring matrices.
	"""

	import math
	import numbers

	try:
	import numpy as np
	except ImportError:
	from Bio import MissingPythonDependencyError

	raise MissingPythonDependencyError(
	"Install NumPy if you want to use Bio.motifs.matrix."
	)

	from Bio.Seq import Seq

	from . import _pwm


	class GenericPositionMatrix(dict):
	    """Base class for the support of position matrix operations.

	    The matrix is a dictionary mapping every letter of ``alphabet`` to a
	    sequence that holds one value per motif position. ``length`` is the
	    number of positions and ``alphabet`` defines the row order.
	    """

	    def __init__(self, alphabet, values):
	        """Initialize the class.

	        Arguments:
	         - alphabet - the letters of the matrix, in row order.
	         - values   - mapping from each letter to a sized sequence of
	           per-position values; a list copy of every row is stored.

	        Raises ValueError if the rows do not all have the same length.
	        ``length`` stays None if the alphabet is empty.
	        """
	        self.length = None
	        for letter in alphabet:
	            if self.length is None:
	                self.length = len(values[letter])
	            elif self.length != len(values[letter]):
	                raise ValueError("data has inconsistent lengths")
	            self[letter] = list(values[letter])
	        self.alphabet = alphabet

	    def __str__(self):
	        """Return a string containing nucleotides and counts of the alphabet in the Matrix."""
	        words = ["%6d" % i for i in range(self.length)]
	        # Three leading spaces so the position header lines up with the
	        # three-character "X: " prefix of the rows below.
	        lines = ["   " + " ".join(words)]
	        for letter in self.alphabet:
	            words = ["%6.2f" % value for value in self[letter]]
	            lines.append("%c: " % letter + " ".join(words))
	        return "\n".join(lines) + "\n"

	    def _select_letters(self, key):
	        """Translate a row key into ``(single, selection)``.

	        ``single`` is True when the key designates exactly one letter, in
	        which case ``selection`` is that letter; otherwise ``selection`` is
	        the list of selected letters. No dictionary lookup is done here.
	        """
	        if isinstance(key, slice):
	            start, stop, stride = key.indices(len(self.alphabet))
	            return False, [self.alphabet[i] for i in range(start, stop, stride)]
	        if isinstance(key, numbers.Integral):
	            return True, self.alphabet[key]
	        if isinstance(key, tuple):
	            return False, [self.alphabet[i] for i in key]
	        if isinstance(key, str):
	            if len(key) == 1:
	                return True, key
	            raise KeyError(key)
	        raise KeyError("Cannot understand key %s" % (key,))

	    def _getitem_2d(self, row_key, column_key):
	        """Return the entries selected by a (letters, positions) key pair."""
	        # Resolve the row key first so that its errors take precedence.
	        single_row, rows = self._select_letters(row_key)
	        if isinstance(column_key, slice):
	            start, stop, stride = column_key.indices(self.length)
	            columns = range(start, stop, stride)
	            single_column = False
	        elif isinstance(column_key, numbers.Integral):
	            columns = column_key
	            single_column = True
	        else:
	            # Wrap in a 1-tuple: a bare tuple key would otherwise be taken as
	            # the argument list of the % operator and raise TypeError.
	            raise KeyError("Cannot understand key %s" % (column_key,))
	        if single_row:
	            values = dict.__getitem__(self, rows)
	            if single_column:
	                return values[columns]
	            return tuple(values[i] for i in columns)
	        if single_column:
	            return {
	                letter: dict.__getitem__(self, letter)[columns] for letter in rows
	            }
	        d = {}
	        for letter in rows:
	            values = dict.__getitem__(self, letter)
	            d[letter] = [values[i] for i in columns]
	        # The alphabet may be a string, so sort both sides before comparing;
	        # comparing a list with the string itself is never equal.
	        if sorted(rows) == sorted(self.alphabet):
	            return self.__class__(self.alphabet, d)
	        return d

	    def __getitem__(self, key):
	        """Return the position matrix of index key.

	        One-dimensional keys select rows: a single letter or an integer
	        index into the alphabet returns that row; a slice or a tuple of
	        integer indices returns a plain dictionary of the selected rows.

	        Two-dimensional keys ``[letters, positions]`` accept the same row
	        keys plus an integer or slice for the positions. The result is a
	        single value, a tuple (one letter, several positions), a
	        dictionary (several letters), or, when several positions of every
	        letter of the alphabet are selected, a new matrix of the same class.

	        Raises KeyError if a key cannot be interpreted.
	        """
	        if isinstance(key, tuple):
	            if len(key) == 2:
	                return self._getitem_2d(*key)
	            if len(key) != 1:
	                raise KeyError("keys should be 1- or 2-dimensional")
	            key = key[0]
	        single, selection = self._select_letters(key)
	        if single:
	            return dict.__getitem__(self, selection)
	        return {letter: dict.__getitem__(self, letter) for letter in selection}

	    @property
	    def consensus(self):
	        """Return the consensus sequence.

	        For every position the letter with the largest value is used; ties
	        go to the letter that comes first in the alphabet.
	        """
	        letters = []
	        for i in range(self.length):
	            maximum = -math.inf
	            for letter in self.alphabet:
	                count = self[letter][i]
	                if count > maximum:
	                    maximum = count
	                    sequence_letter = letter
	            letters.append(sequence_letter)
	        return Seq("".join(letters))

	    @property
	    def anticonsensus(self):
	        """Return the anticonsensus sequence.

	        For every position the letter with the smallest value is used; ties
	        go to the letter that comes first in the alphabet.
	        """
	        letters = []
	        for i in range(self.length):
	            minimum = math.inf
	            for letter in self.alphabet:
	                count = self[letter][i]
	                if count < minimum:
	                    minimum = count
	                    sequence_letter = letter
	            letters.append(sequence_letter)
	        return Seq("".join(letters))

	    @property
	    def degenerate_consensus(self):
	        """Return the degenerate consensus sequence."""
	        # Following the rules adapted from
	        # D. R. Cavener: "Comparison of the consensus sequence flanking
	        # translational start sites in Drosophila and vertebrates."
	        # Nucleic Acids Research 15(4): 1353-1361. (1987).
	        # The same rules are used by TRANSFAC.
	        degenerate_nucleotide = {
	            "A": "A",
	            "C": "C",
	            "G": "G",
	            "T": "T",
	            "AC": "M",
	            "AG": "R",
	            "AT": "W",
	            "CG": "S",
	            "CT": "Y",
	            "GT": "K",
	            "ACG": "V",
	            "ACT": "H",
	            "AGT": "D",
	            "CGT": "B",
	            "ACGT": "N",
	        }
	        letters = []
	        for i in range(self.length):
	            # Snapshot the column so the sort key needs no closure over i.
	            column = {nucleotide: self[nucleotide][i] for nucleotide in self}
	            nucleotides = sorted(column, key=column.get, reverse=True)
	            counts = [column[c] for c in nucleotides]
	            # Follow the Cavener rules:
	            if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]:
	                # One letter outweighs all others together and is more than
	                # twice as frequent as the runner-up.
	                key = nucleotides[0]
	            elif 4 * sum(counts[:2]) > 3 * sum(counts):
	                # The two most frequent letters exceed 75% of the total.
	                key = "".join(sorted(nucleotides[:2]))
	            elif counts[3] == 0:
	                # The least frequent letter does not occur at all.
	                key = "".join(sorted(nucleotides[:3]))
	            else:
	                key = "ACGT"
	            letters.append(degenerate_nucleotide.get(key, key))
	        return Seq("".join(letters))

	    @property
	    def gc_content(self):
	        """Compute the fraction GC content.

	        This is the sum of the C and G rows divided by the sum of all rows.
	        Raises ZeroDivisionError if the matrix sums to zero.
	        """
	        alphabet = self.alphabet
	        gc_total = 0.0
	        total = 0.0
	        for i in range(self.length):
	            for letter in alphabet:
	                if letter in "CG":
	                    gc_total += self[letter][i]
	                total += self[letter][i]
	        return gc_total / total

	    def reverse_complement(self):
	        """Compute reverse complement.

	        Returns a new matrix of the same class in which every row is the
	        reversed row of the complementary letter. A pairs with U if the
	        alphabet is exactly "ACGU", and with T otherwise.
	        """
	        values = {}
	        if self.alphabet == "ACGU":
	            values["A"] = self["U"][::-1]
	            values["U"] = self["A"][::-1]
	        else:
	            values["A"] = self["T"][::-1]
	            values["T"] = self["A"][::-1]
	        values["G"] = self["C"][::-1]
	        values["C"] = self["G"][::-1]
	        return self.__class__(self.alphabet, values)


	class FrequencyPositionMatrix(GenericPositionMatrix):
	    """Position matrix that stores raw counts for every letter and position."""

	    def normalize(self, pseudocounts=None):
	        """Build a position-weight matrix from the counts.

	        The optional pseudocounts are added to every count first:

	         - None (default): nothing is added;
	         - a dictionary: the value stored under each letter of the
	           alphabet is added to that letter's counts;
	         - anything else: it is converted to float and added to all counts.

	        The division that turns the sums into weights happens in the
	        PositionWeightMatrix initializer, not here.
	        """
	        alphabet = self.alphabet
	        width = self.length
	        # Work out the starting value of each row before touching the counts.
	        if pseudocounts is None:
	            seeds = {letter: 0.0 for letter in alphabet}
	        elif isinstance(pseudocounts, dict):
	            seeds = {letter: float(pseudocounts[letter]) for letter in alphabet}
	        else:
	            seeds = {letter: float(pseudocounts) for letter in alphabet}
	        totals = {letter: [seed] * width for letter, seed in seeds.items()}
	        for column in range(width):
	            for letter in alphabet:
	                totals[letter][column] += self[letter][column]
	        return PositionWeightMatrix(alphabet, totals)


	class PositionWeightMatrix(GenericPositionMatrix):
	    """Position matrix whose columns are normalized to sum to one."""

	    def __init__(self, alphabet, counts):
	        """Store the counts and divide every column by its total.

	        After normalization each row is frozen into a tuple.
	        """
	        super().__init__(alphabet, counts)
	        # The rows are the mutable lists created by the base initializer,
	        # so dividing in place updates the matrix itself.
	        rows = [self[symbol] for symbol in alphabet]
	        for column in range(self.length):
	            column_sum = sum(row[column] for row in rows)
	            for row in rows:
	                row[column] /= column_sum
	        for symbol in alphabet:
	            self[symbol] = tuple(self[symbol])

	    def log_odds(self, background=None):
	        """Return the Position-Specific Scoring Matrix.

	        Each entry of the PSSM is the base-2 logarithm of the probability
	        in this matrix divided by the background probability of the
	        letter. The background is normalized to sum to one before use;
	        if it is None, all letters are given equal weight.

	        Special cases per entry:
	         - probability 0, background > 0: -infinity;
	         - probability > 0, background not > 0: +infinity;
	         - both not > 0: NaN.
	        """
	        alphabet = self.alphabet
	        if background is None:
	            weights = dict.fromkeys(self.alphabet, 1.0)
	        else:
	            weights = dict(background)  # copy; the caller's mapping is untouched
	        norm = sum(weights.values())
	        scores = {}
	        for symbol in alphabet:
	            weights[symbol] /= norm
	            scores[symbol] = []
	        for position in range(self.length):
	            for symbol in alphabet:
	                expected = weights[symbol]
	                observed = self[symbol][position]
	                if expected > 0:
	                    entry = math.log(observed / expected, 2) if observed > 0 else -math.inf
	                else:
	                    entry = math.inf if observed > 0 else math.nan
	                scores[symbol].append(entry)
	        return PositionSpecificScoringMatrix(alphabet, scores)


	class PositionSpecificScoringMatrix(GenericPositionMatrix):
	    """Class for the support of Position Specific Scoring Matrix calculations."""

	    def _normalized_background(self, background):
	        """Return a normalized copy of the background distribution.

	        If background is None, every letter of the alphabet gets the same
	        weight. Otherwise the mapping is copied (the caller's object is
	        left untouched) and the entries for the letters of the alphabet
	        are divided by the sum of all values in the mapping.

	        Raises KeyError if a letter of the alphabet is missing.
	        """
	        if background is None:
	            background = dict.fromkeys(self.alphabet, 1.0)
	        else:
	            background = dict(background)
	        total = sum(background.values())
	        for letter in self.alphabet:
	            background[letter] /= total
	        return background

	    def calculate(self, sequence):
	        """Return the PWM score for a given sequence for all positions.

	        Notes:
	         - the sequence can only be a DNA sequence
	         - the search is performed only on one strand
	         - if the sequence and the motif have the same length, a single
	           number is returned
	         - otherwise, the result is a one-dimensional numpy array

	        Raises ValueError if the alphabet is not made up of A, C, G and T,
	        or if the sequence cannot be converted to ASCII bytes.
	        """
	        # TODO - Code itself tolerates ambiguous bases (as NaN).
	        if sorted(self.alphabet) != ["A", "C", "G", "T"]:
	            raise ValueError(
	                "PSSM has wrong alphabet: %s - Use only with DNA motifs" % self.alphabet
	            )

	        # NOTE: The C code handles mixed case input as this could be large
	        # (e.g. contig or chromosome), so requiring it be all upper or lower
	        # case would impose an overhead to allocate the extra memory.
	        try:
	            sequence = bytes(sequence)
	        except TypeError:  # str
	            try:
	                sequence = bytes(sequence, "ASCII")
	            except TypeError:
	                raise ValueError(
	                    "sequence should be a Seq, MutableSeq, string, or bytes-like object"
	                ) from None
	            except UnicodeEncodeError:
	                raise ValueError(
	                    "sequence should contain ASCII characters only"
	                ) from None
	        except Exception:
	            raise ValueError(
	                "sequence should be a Seq, MutableSeq, string, or bytes-like object"
	            ) from None

	        n = len(sequence)
	        m = self.length
	        # Create the numpy arrays here; the C module then does not rely on numpy
	        # Use a float32 for the scores array to save space
	        scores = np.empty(n - m + 1, np.float32)
	        logodds = np.array(
	            [[self[letter][i] for letter in "ACGT"] for i in range(m)], float
	        )
	        _pwm.calculate(sequence, logodds, scores)

	        if len(scores) == 1:
	            return scores[0]
	        else:
	            return scores

	    def search(self, sequence, threshold=0.0, both=True, chunksize=10**6):
	        """Find hits with PWM score at or above the given threshold.

	        A generator function, yielding (position, score) pairs for the
	        hits found in the given sequence.

	        Arguments:
	         - sequence  - the sequence to search; it is converted to upper
	           case first.
	         - threshold - minimal score for a position to be reported.
	         - both      - if True, the reverse complement of the matrix is
	           searched as well; such hits are reported with the negative
	           position ``position - len(sequence)``.
	         - chunksize - number of start positions scored per chunk.

	        Within a chunk, hits are yielded in order of their position on the
	        sequence. A sequence shorter than the motif yields no hits.
	        """
	        sequence = sequence.upper()
	        seq_len = len(sequence)
	        motif_l = self.length
	        chunk_starts = np.arange(0, seq_len, chunksize)
	        if both:
	            rc = self.reverse_complement()
	        for chunk_start in chunk_starts:
	            # Chunks overlap by motif_l - 1 letters so that no window that
	            # straddles a chunk boundary is missed.
	            subseq = sequence[chunk_start : chunk_start + chunksize + motif_l - 1]
	            if len(subseq) < motif_l:
	                # No complete window starts here (and none will in any later
	                # chunk); calculate() cannot size its score array for this.
	                break
	            # calculate() returns a bare scalar if the chunk is exactly one
	            # motif long; promote it so the array operations below apply.
	            pos_scores = np.atleast_1d(self.calculate(subseq))
	            pos_ind = pos_scores >= threshold
	            pos_positions = np.where(pos_ind)[0] + chunk_start
	            pos_scores = pos_scores[pos_ind]
	            if both:
	                neg_scores = np.atleast_1d(rc.calculate(subseq))
	                neg_ind = neg_scores >= threshold
	                neg_positions = np.where(neg_ind)[0] + chunk_start
	                neg_scores = neg_scores[neg_ind]
	            else:
	                neg_positions = np.empty(0, dtype=int)
	                neg_scores = np.empty(0, dtype=int)
	            chunk_positions = np.append(pos_positions, neg_positions - seq_len)
	            chunk_scores = np.append(pos_scores, neg_scores)
	            # Sort on the actual positions, not on the negative labels used
	            # for reverse-strand hits.
	            order = np.argsort(np.append(pos_positions, neg_positions))
	            chunk_positions = chunk_positions[order]
	            chunk_scores = chunk_scores[order]
	            yield from zip(chunk_positions, chunk_scores)

	    @property
	    def max(self):
	        """Maximal possible score for this motif.

	        returns the score computed for the consensus sequence.
	        """
	        score = 0.0
	        letters = self.alphabet
	        for position in range(self.length):
	            score += max(self[letter][position] for letter in letters)
	        return score

	    @property
	    def min(self):
	        """Minimal possible score for this motif.

	        returns the score computed for the anticonsensus sequence.
	        """
	        score = 0.0
	        letters = self.alphabet
	        for position in range(self.length):
	            score += min(self[letter][position] for letter in letters)
	        return score

	    @property
	    def gc_content(self):
	        """Compute the GC-ratio.

	        Always raises an Exception: a PSSM holds log-odds scores, for which
	        this ratio is not computed.
	        """
	        raise Exception("Cannot compute the %GC composition of a PSSM")

	    def mean(self, background=None):
	        """Return expected value of the score of a motif.

	        The probability of each letter is recovered from its log-odds score
	        and the (normalized) background. Entries that are NaN or -infinity
	        are skipped.
	        """
	        background = self._normalized_background(background)
	        sx = 0.0
	        for i in range(self.length):
	            for letter in self.alphabet:
	                logodds = self[letter, i]
	                if math.isnan(logodds):
	                    continue
	                if math.isinf(logodds) and logodds < 0:
	                    continue
	                b = background[letter]
	                p = b * math.pow(2, logodds)
	                sx += p * logodds
	        return sx

	    def std(self, background=None):
	        """Return standard deviation of the score of a motif.

	        The variance is accumulated position by position; entries that are
	        NaN or -infinity are skipped.
	        """
	        background = self._normalized_background(background)
	        variance = 0.0
	        for i in range(self.length):
	            sx = 0.0
	            sxx = 0.0
	            for letter in self.alphabet:
	                logodds = self[letter, i]
	                if math.isnan(logodds):
	                    continue
	                if math.isinf(logodds) and logodds < 0:
	                    continue
	                b = background[letter]
	                p = b * math.pow(2, logodds)
	                sx += p * logodds
	                sxx += p * logodds * logodds
	            sxx -= sx * sx
	            variance += sxx
	        variance = max(variance, 0)  # to avoid roundoff problems
	        return math.sqrt(variance)

	    def dist_pearson(self, other):
	        """Return the similarity score based on pearson correlation for the given motif against self.

	        Every relative offset of the two matrices is tried. The result is
	        the pair ``(1 - r, offset)`` for the offset with the highest
	        Pearson correlation ``r`` between the entries of the two matrices.

	        Raises ValueError if the two matrices have different alphabets.
	        """
	        if self.alphabet != other.alphabet:
	            raise ValueError("Cannot compare motifs with different alphabets")

	        max_p = -2  # below any correlation coefficient
	        for offset in range(-self.length + 1, other.length):
	            if offset < 0:
	                p = self.dist_pearson_at(other, -offset)
	            else:  # offset>=0
	                p = other.dist_pearson_at(self, offset)
	            if max_p < p:
	                max_p = p
	                max_o = -offset
	        return 1 - max_p, max_o

	    def dist_pearson_at(self, other, offset):
	        """Return the similarity score based on pearson correlation at the given offset.

	        Position ``pos + offset`` of self is paired with position ``pos``
	        of other. The sums run over the overlapping positions only, but are
	        divided by the number of entries of the full span of the alignment.
	        """
	        letters = self.alphabet
	        sx = 0.0  # \sum x
	        sy = 0.0  # \sum y
	        sxx = 0.0  # \sum x^2
	        sxy = 0.0  # \sum x \cdot y
	        syy = 0.0  # \sum y^2
	        norm = max(self.length, offset + other.length) * len(letters)
	        for pos in range(min(self.length - offset, other.length)):
	            xi = [self[letter, pos + offset] for letter in letters]
	            yi = [other[letter, pos] for letter in letters]
	            sx += sum(xi)
	            sy += sum(yi)
	            sxx += sum(x * x for x in xi)
	            sxy += sum(x * y for x, y in zip(xi, yi))
	            syy += sum(y * y for y in yi)
	        sx /= norm
	        sy /= norm
	        sxx /= norm
	        sxy /= norm
	        syy /= norm
	        numerator = sxy - sx * sy
	        denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy))
	        return numerator / denominator

	    def distribution(self, background=None, precision=10**3):
	        """Calculate the distribution of the scores at the given precision.

	        Returns a ScoreDistribution built from this matrix and the
	        normalized background.
	        """
	        # Imported here rather than at module level.
	        from .thresholds import ScoreDistribution

	        background = self._normalized_background(background)
	        return ScoreDistribution(precision=precision, pssm=self, background=background)