# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2003, 2007 by Sebastian Bassi. [email protected]
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Local Composition Complexity."""
import math
def lcc_mult(seq, wsize):
    """Calculate Local Composition Complexity (LCC) values over sliding window.

    Returns a list of floats, the LCC values for a sliding window over
    the sequence (one value per window position).

    seq - an unambiguous DNA sequence (a string or Seq object)
    wsize - window size, integer (must be at least 1)

    The result is the same as applying lcc_simp multiple times, but this
    version is optimized for speed. The optimization works by using the
    base counts of the previous window to compute the next one.

    Letters other than A, C, G and T add nothing to the complexity (as in
    lcc_simp), but every window still yields exactly one value. An empty
    sequence gives an empty list. If the sequence is shorter than wsize,
    a single value is returned for the whole sequence, with frequencies
    still taken relative to wsize.

    Raises ValueError if wsize is smaller than 1.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")
    seq = seq.upper()
    tamseq = len(seq)
    if tamseq == 0:
        return []

    l4 = math.log(4)
    # compone[k] is the term (k / wsize) * log4(k / wsize) contributed by a
    # base that occurs k times in a window; compone[0] is 0 by convention
    # so that log(0) is never evaluated.
    compone = [0]
    for k in range(1, wsize + 1):
        compone.append((k / wsize) * math.log(k / wsize) / l4)

    window = seq[0:wsize]
    counts = {base: window.count(base) for base in "ACTG"}

    def current_lcc():
        # Fixed summation order (A, C, T, G) keeps floating point results
        # reproducible from window to window.
        return -(
            compone[counts["A"]]
            + compone[counts["C"]]
            + compone[counts["T"]]
            + compone[counts["G"]]
        )

    lccsal = [current_lcc()]
    for x in range(tamseq - wsize):
        leaving = seq[x]
        entering = seq[x + wsize]
        if leaving == entering:
            # Composition unchanged, so the previous value is still valid.
            lccsal.append(lccsal[-1])
            continue
        # Only the four DNA bases are tracked; any other letter simply
        # leaves the counts alone instead of skipping the window.
        if leaving in counts:
            counts[leaving] -= 1
        if entering in counts:
            counts[entering] += 1
        lccsal.append(current_lcc())
    return lccsal
def lcc_simp(seq):
    """Calculate Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence (as a float). Each of the bases A, C, T and G contributes
    -(f * log4(f)), where f is its frequency over the full sequence
    length; a base that does not occur contributes zero.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    https://doi.org/10.1038/npg.els.0005260
    """
    seq = seq.upper()
    length = len(seq)
    log4 = math.log(4)

    def term(base):
        occurrences = seq.count(base)
        # An absent base is skipped to avoid calculating the log of 0.
        if not occurrences:
            return 0
        fraction = occurrences / length
        return fraction * math.log(fraction) / log4

    return -(term("A") + term("C") + term("T") + term("G"))