Spaces:
No application file
No application file
# Copyright 2003 Yair Benita. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Methods for codon usage calculations.""" | |
import math | |
import warnings | |
from .CodonUsageIndices import SharpEcoliIndex | |
from Bio import SeqIO # To parse a FASTA file | |
from Bio import BiopythonDeprecationWarning | |
# Module-level deprecation notice: this statement runs when the module is
# executed (i.e. on import) and directs users to the class in Bio.SeqUtils.
warnings.warn(
    category=BiopythonDeprecationWarning,
    message=(
        "This module has been DEPRECATED. "
        "Please use the CodonAdaptationIndex class in Bio.SeqUtils instead. "
        "Note that this class has been updated to use modern Python, and may "
        "give slightly different results from the CodonAdaptationIndex class "
        "in Bio.SeqUtils.CodonUsage, as the code was changed to be consistent "
        "with the published paper by Sharp and Li. "
        "The code in the old CodonAdaptationIndex class in "
        "Bio.SeqUtils.CodonUsage was not changed."
    ),
)
# Turn black code style off | |
# fmt: off | |
# Zero-initialised counter template covering all 64 DNA codons.
# The nesting order (second base, then first base, then third base) yields
# the keys in the order TTT, TTC, TTA, TTG, CTT, ... TAT, ... TCT, ... GGG.
CodonsDict = {
    first + second + third: 0
    for second in "TACG"
    for first in "TCAG"
    for third in "TCAG"
}
# Turn black code style on | |
# fmt: on | |
# Maps each residue (three-letter code, plus "STOP") to the list of codons
# that encode it, i.e. the groups of synonymous codons.
SynonymousCodons = {
    residue: codons.split()
    for residue, codons in (
        ("CYS", "TGT TGC"),
        ("ASP", "GAT GAC"),
        ("SER", "TCT TCG TCA TCC AGC AGT"),
        ("GLN", "CAA CAG"),
        ("MET", "ATG"),
        ("ASN", "AAC AAT"),
        ("PRO", "CCT CCG CCA CCC"),
        ("LYS", "AAG AAA"),
        ("STOP", "TAG TGA TAA"),
        ("THR", "ACC ACA ACG ACT"),
        ("PHE", "TTT TTC"),
        ("ALA", "GCA GCC GCG GCT"),
        ("GLY", "GGT GGG GGA GGC"),
        ("ILE", "ATC ATA ATT"),
        ("LEU", "TTA TTG CTC CTT CTG CTA"),
        ("HIS", "CAT CAC"),
        ("ARG", "CGA CGC CGG CGT AGG AGA"),
        ("TRP", "TGG"),
        ("VAL", "GTA GTC GTG GTT"),
        ("GLU", "GAG GAA"),
        ("TYR", "TAT TAC"),
    )
}
class CodonAdaptationIndex:
    """A codon adaptation index (CAI) implementation.

    Implements the codon adaptation index (CAI) described by Sharp and
    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).

    NOTE - This implementation does not currently cope with alternative genetic
    codes: only the synonymous codons in the standard table are considered.
    """

    def __init__(self):
        """Initialize the class with an empty index and no codon counts."""
        # codon -> weight (count relative to the most used synonymous codon);
        # generate_index() stores None for codons of residues never observed.
        self.index = {}
        # codon -> number of occurrences, filled in by generate_index().
        self.codon_count = {}

    # use this method with predefined CAI index
    def set_cai_index(self, index):
        """Set up an index to be used when calculating CAI for a gene.

        Just pass a dictionary similar to the SharpEcoliIndex in the
        CodonUsageIndices module.
        """
        self.index = index

    def generate_index(self, fasta_file):
        """Generate a codon usage index from a FASTA file of CDS sequences.

        Takes a location of a Fasta file containing CDS sequences
        (which must all have a whole number of codons) and generates a codon
        usage index.

        Raises ValueError if an index or a codon count is already present on
        this object, or if the file contains a codon that is not one of the
        64 standard codons. In the latter case the object is left unchanged.
        """
        # first make sure we're not overwriting an existing index:
        if self.index != {} or self.codon_count != {}:
            raise ValueError(
                "an index has already been set or a codon count "
                "has been done. Cannot overwrite either."
            )

        # count codon occurrences in the file.
        self._count_codons(fasta_file)

        # the weight of a codon is its count relative to the most frequently
        # used codon for the same residue: W = RSCUi/RSCUmax = COUNTi/COUNTmax
        # (see equation 2 in Sharp & Li 1987 NAR).
        for codons in SynonymousCodons.values():
            count_max = max(self.codon_count[codon] for codon in codons)
            if count_max == 0:  # the residue does not occur at all
                for codon in codons:
                    self.index[codon] = None
            else:
                for codon in codons:
                    self.index[codon] = self.codon_count[codon] / count_max

    def cai_for_gene(self, dna_sequence):
        """Calculate the CAI (float) for the provided DNA sequence (string).

        This method uses the Index (either the one you set or the one you
        generated) and returns the CAI for the DNA sequence. If no index has
        been set or generated, SharpEcoliIndex is installed as the index of
        this object and used.

        The sequence is upper-cased and read in steps of three letters. ATG
        and TGG are skipped, as are the stop codons TGA, TAA and TAG when
        they are missing from the index.

        NOTE - the summed log-weights are divided by the number of scored
        codons minus one. This legacy behaviour is kept on purpose: the
        deprecation notice of this module states that the old code was not
        changed. A sequence without any scored codon therefore gives 1.0.

        Raises TypeError for a codon that is neither in the index nor a stop
        codon, or whose weight in the index is None; ValueError for a weight
        that is not positive; ZeroDivisionError if exactly one codon is
        scored.
        """
        cai_value, cai_length = 0, 0

        # if no index is set or generated, the default SharpEcoliIndex will
        # be used.
        if self.index == {}:
            self.set_cai_index(SharpEcoliIndex)

        dna_sequence = dna_sequence.upper()

        for i in range(0, len(dna_sequence), 3):
            codon = dna_sequence[i : i + 3]
            if codon in self.index:
                # these two codons are always one, exclude them:
                if codon not in ("ATG", "TGG"):
                    weight = self.index[codon]
                    # Same exception types math.log() would raise, but with
                    # a message that names the offending codon.
                    if weight is None:
                        raise TypeError(
                            f"codon {codon} has no weight (None) in the index"
                        )
                    if weight <= 0:
                        raise ValueError(
                            f"codon {codon} has a non-positive weight "
                            f"({weight}) in the index"
                        )
                    cai_value += math.log(weight)
                    cai_length += 1
            # some indices may not include stop codons:
            elif codon not in ("TGA", "TAA", "TAG"):
                raise TypeError(f"illegal codon in sequence: {codon}.\n{self.index}")

        if cai_length == 1:
            # The divisor below would be zero; keep the exception type the
            # bare division raised, but say why.
            raise ZeroDivisionError(
                "cannot calculate the CAI of a sequence with exactly one "
                "scored codon (the divisor is the number of scored codons "
                "minus one)"
            )
        return math.exp(cai_value / (cai_length - 1.0))

    def _count_codons(self, fasta_file):
        """Count the codons of all records in a FASTA file (PRIVATE).

        The counts are stored in self.codon_count only after the whole file
        has been read, so a ValueError raised for an illegal codon leaves the
        object untouched.
        """
        # work on a private copy of the zero-initialised codon table
        counts = CodonsDict.copy()
        with open(fasta_file) as handle:
            # iterate over sequence and count all the codons in the FastaFile.
            for record in SeqIO.parse(handle, "fasta"):
                # plain str, so the slices are str keys like those of counts
                sequence = str(record.seq).upper()
                for i in range(0, len(sequence), 3):
                    codon = sequence[i : i + 3]
                    try:
                        counts[codon] += 1
                    except KeyError:
                        raise ValueError(
                            f"illegal codon '{codon}' in gene: {record.id}"
                        ) from None
        self.codon_count = counts

    def __str__(self):
        """Return the index as one 'codon<TAB>weight' line per codon, sorted.

        Weights are shown with three decimals; codons whose weight is None
        are shown as 'None'.
        """
        lines = []
        for codon in sorted(self.index):
            weight = self.index[codon]
            # None cannot be formatted with ".3f"
            text = "None" if weight is None else f"{weight:.3f}"
            lines.append(f"{codon}\t{text}")
        return "\n".join(lines) + "\n"

    def print_index(self):
        """Print out the index used.

        This just gives the index when the objects is printed.
        Deprecated: use print(self) instead.
        """
        warnings.warn(
            "The print_index method is deprecated; instead of "
            "self.print_index(), please use print(self).",
            BiopythonDeprecationWarning,
        )
        print(self)