Spaces:
No application file
No application file
# Copyright 2003, 2007 by Sebastian Bassi. [email protected]
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Local Composition Complexity.""" | |
import math | |
def lcc_mult(seq, wsize):
    """Calculate Local Composition Complexity (LCC) values over sliding window.

    Returns a list with one LCC value per window position, i.e.
    ``len(seq) - wsize + 1`` values when the sequence is at least
    ``wsize`` long.

    seq - an unambiguous DNA sequence (a string or Seq object)
    wsize - window size, positive integer

    The result is the same as applying lcc_simp to every window, but this
    version is optimized for speed: the base counts of the previous window
    are updated incrementally instead of recounting each window.

    Characters other than A, C, G and T occupy a window position but do
    not contribute to the complexity value. An empty sequence yields an
    empty list. A non-empty sequence shorter than ``wsize`` yields a single
    value computed from the bases present, still using ``wsize`` as the
    denominator.

    Raises ValueError if ``wsize`` is smaller than 1.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")
    l4 = math.log(4)
    seq = seq.upper()
    tamseq = len(seq)
    if tamseq == 0:
        # No sequence means no windows at all.
        return []

    # compone[k] is the (non-positive) contribution of a base that occurs
    # k times in a window of size wsize; k == 0 contributes nothing and is
    # stored explicitly to avoid evaluating log(0).
    compone = [0]
    for k in range(1, wsize + 1):
        freq = k / wsize
        compone.append(freq * math.log(freq) / l4)

    window = seq[0:wsize]
    counts = {base: window.count(base) for base in "ACTG"}

    def window_lcc():
        # Fixed A, C, T, G summation order keeps floating point results
        # reproducible from one window to the next.
        return -(
            compone[counts["A"]]
            + compone[counts["C"]]
            + compone[counts["T"]]
            + compone[counts["G"]]
        )

    lccsal = [window_lcc()]
    for x in range(tamseq - wsize):
        outgoing = seq[x]  # base leaving the window on the left
        incoming = seq[x + wsize]  # base entering the window on the right
        if outgoing == incoming:
            # Composition is unchanged, so the previous value still holds.
            lccsal.append(lccsal[-1])
            continue
        # Only A/C/G/T are tracked; any other symbol merely shifts the
        # window without touching the counts.
        if outgoing in counts:
            counts[outgoing] -= 1
        if incoming in counts:
            counts[incoming] += 1
        lccsal.append(window_lcc())
    return lccsal
def lcc_simp(seq):
    """Calculate Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence (as a float). Each of the bases A, C, T and G contributes
    ``-(f * log(f) / log(4))`` where ``f`` is its count divided by the
    sequence length; bases that do not occur contribute nothing. An empty
    sequence therefore gives 0.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    https://doi.org/10.1038/npg.els.0005260
    """
    wsize = len(seq)
    seq = seq.upper()
    l4 = math.log(4)
    total = 0
    # Accumulate in A, C, T, G order, counting each base only once.
    for base in "ACTG":
        occurrences = seq.count(base)
        # Skip absent bases to avoid calculating the log of 0 (this also
        # protects the division when the sequence is empty).
        if occurrences:
            freq = occurrences / wsize
            total += freq * math.log(freq) / l4
    return -total