aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2000 Andrew Dalke. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Information about the IUPAC alphabets."""
# One-letter codes of the twenty standard amino acids.
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
# The standard alphabet followed by the ambiguity and rare-residue codes
# described below (B, X, Z, J, U, O).
extended_protein_letters = protein_letters + "BXZJUO"
# B = "Asx"; aspartic acid or asparagine (D or N)
# X = "Xaa"; unknown or 'other' amino acid
# Z = "Glx"; glutamic acid or glutamine (E or Q)
# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
# Also the International Nucleotide Sequence Database Collaboration (INSDC)
# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
# Xle (J); Leucine or Isoleucine
# The residue abbreviations, Xle (the three-letter abbreviation) and J
# (the one-letter abbreviation) are reserved for the case that cannot
# experimentally distinguish leucine from isoleucine.
#
# U = "Sec"; selenocysteine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
# O = "Pyl"; pyrrolysine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35

# One-letter -> three-letter code for the standard residues, listed in the
# same order as ``protein_letters``.
protein_letters_1to3 = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
}
# The standard table plus the six extended codes, appended in the same order
# as they appear at the end of ``extended_protein_letters``.
protein_letters_1to3_extended = {
    **protein_letters_1to3,
    "B": "Asx",
    "X": "Xaa",
    "Z": "Glx",
    "J": "Xle",
    "U": "Sec",
    "O": "Pyl",
}
# Reverse lookups: three-letter code -> one-letter code.
protein_letters_3to1 = {
    three: one for one, three in protein_letters_1to3.items()
}
protein_letters_3to1_extended = {
    three: one for one, three in protein_letters_1to3_extended.items()
}
# Nucleotide alphabets. The "unambiguous" strings hold the four concrete
# bases; the "ambiguous" strings append the degenerate codes
# R, Y, W, S, M, K, H, B, V, D and N to them.
ambiguous_dna_letters = "GATCRYWSMKHBVDN"
unambiguous_dna_letters = "GATC"
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
unambiguous_rna_letters = "GAUC"
# B == 5-bromouridine
# D == 5,6-dihydrouridine
# S == thiouridine
# W == wyosine
# NOTE(review): B, D, S and W are also ambiguity codes in
# ambiguous_dna_letters above, and the modified bases listed are uridine
# derivatives although the name says DNA - confirm the intended use.
extended_dna_letters = "GATCBDSW"
# are there extended forms?
# extended_rna_letters = "GAUCBDSW"
# "X" is included in the following _values and _complement dictionaries,
# for historical reasons although it is not an IUPAC nucleotide,
# and so is not in the corresponding _letters strings above

# Maps each DNA letter to the string of unambiguous bases it can stand for
# (e.g. "M" -> "AC"); "X" and "N" both expand to all four bases.
ambiguous_dna_values = {
"A": "A",
"C": "C",
"G": "G",
"T": "T",
"M": "AC",
"R": "AG",
"W": "AT",
"S": "CG",
"Y": "CT",
"K": "GT",
"V": "ACG",
"H": "ACT",
"D": "AGT",
"B": "CGT",
"X": "GATC",
"N": "GATC",
}
# RNA counterpart of ambiguous_dna_values, with "U" in place of "T".
ambiguous_rna_values = {
"A": "A",
"C": "C",
"G": "G",
"U": "U",
"M": "AC",
"R": "AG",
"W": "AU",
"S": "CG",
"Y": "CU",
"K": "GU",
"V": "ACG",
"H": "ACU",
"D": "AGU",
"B": "CGU",
"X": "GAUC",
"N": "GAUC",
}
# Maps each DNA letter to its complement. An ambiguity code maps to the code
# covering the complements of its bases (e.g. "M" = A/C -> "K" = G/T), so
# "W", "S", "X" and "N" are their own complements.
ambiguous_dna_complement = {
"A": "T",
"C": "G",
"G": "C",
"T": "A",
"M": "K",
"R": "Y",
"W": "W",
"S": "S",
"Y": "R",
"K": "M",
"V": "B",
"H": "D",
"D": "H",
"B": "V",
"X": "X",
"N": "N",
}
# RNA counterpart of ambiguous_dna_complement, with "U" in place of "T".
ambiguous_rna_complement = {
"A": "U",
"C": "G",
"G": "C",
"U": "A",
"M": "K",
"R": "Y",
"W": "W",
"S": "S",
"Y": "R",
"K": "M",
"V": "B",
"H": "D",
"D": "H",
"B": "V",
"X": "X",
"N": "N",
}
def _make_ranges(mydict):
    """Return a new dict pairing every value of *mydict* with itself.

    Each key of *mydict* is kept and its value ``v`` becomes the tuple
    ``(v, v)``, i.e. a (minimum, maximum) range of zero width.
    """
    return {letter: (weight, weight) for letter, weight in mydict.items()}
# Mass data taken from PubChem
# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085}
# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
"A": 331.06817,
"C": 307.056936,
"G": 347.063084,
"T": 322.056602,
}
# Each average DNA weight w expressed as the zero-width range (w, w).
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
# RNA counterparts of the two DNA tables above (average first, then
# monoisotopic) - presumably monophosphate ribonucleotides; verify against
# PubChem.
unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813}
monoisotopic_unambiguous_rna_weights = {
"A": 347.063084,
"C": 323.051851,
"G": 363.057999,
"U": 324.035867,
}
# Each average RNA weight w expressed as the zero-width range (w, w).
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
def _make_ambiguous_ranges(mydict, weight_table):
    """Derive weight ranges and mean weights for ambiguous letters.

    *mydict* maps each letter to the sequence of concrete letters it may
    represent; *weight_table* maps concrete letters to their weights.

    Returns a ``(ranges, averages)`` pair of dicts keyed by letter:
    ``ranges[letter]`` is ``(lightest, heaviest)`` over the candidates and
    ``averages[letter]`` is their arithmetic mean.
    """
    ranges = {}
    averages = {}
    for symbol, candidates in mydict.items():
        # Quick hack: a letter that only stands for a single residue with no
        # known weight (historically U and O) is left out of both results.
        stands_alone = len(candidates) == 1
        if stands_alone and candidates[0] not in weight_table:
            continue
        masses = [weight_table.get(candidate) for candidate in candidates]
        ranges[symbol] = (min(masses), max(masses))
        # Plain left-to-right float accumulation starting from 0.0.
        running = 0.0
        for mass in masses:
            running = running + mass
        averages[symbol] = running / len(masses)
    return ranges, averages
# For every letter in the ambiguous DNA/RNA value tables (including "X" and
# "N"): the (min, max) average weight over the bases the letter can stand
# for, and the mean of those weights.
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
ambiguous_dna_values, unambiguous_dna_weights
)
ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
ambiguous_rna_values, unambiguous_rna_weights
)
# Weight per amino acid, keyed by one-letter code: the twenty standard
# residues plus "O" and "U".
# NOTE(review): presumably average masses of the free amino acids, by
# analogy with the nucleotide tables - the source is not stated here; confirm.
protein_weights = {
"A": 89.0932,
"C": 121.1582,
"D": 133.1027,
"E": 147.1293,
"F": 165.1891,
"G": 75.0666,
"H": 155.1546,
"I": 131.1729,
"K": 146.1876,
"L": 131.1729,
"M": 149.2113,
"N": 132.1179,
"O": 255.3134,
"P": 115.1305,
"Q": 146.1445,
"R": 174.201,
"S": 105.0926,
"T": 119.1192,
"U": 168.0532,
"V": 117.1463,
"W": 204.2252,
"Y": 181.1885,
}
# Monoisotopic counterpart of protein_weights, with the same set of keys.
monoisotopic_protein_weights = {
"A": 89.047678,
"C": 121.019749,
"D": 133.037508,
"E": 147.053158,
"F": 165.078979,
"G": 75.032028,
"H": 155.069477,
"I": 131.094629,
"K": 146.105528,
"L": 131.094629,
"M": 149.051049,
"N": 132.053492,
"O": 255.158292,
"P": 115.063329,
"Q": 146.069142,
"R": 174.111676,
"S": 105.042593,
"T": 119.058243,
"U": 168.964203,
"V": 117.078979,
"W": 204.089878,
"Y": 181.073893,
}
# Maps each extended protein letter to the string of concrete residues it may
# represent: "B" -> N or D, "J" -> I or L, "Z" -> Q or E, "X" -> any of the
# twenty standard residues; every other letter maps to itself.
extended_protein_values = {
"A": "A",
"B": "ND",
"C": "C",
"D": "D",
"E": "E",
"F": "F",
"G": "G",
"H": "H",
"I": "I",
"J": "IL",
"K": "K",
"L": "L",
"M": "M",
"N": "N",
"O": "O",
"P": "P",
"Q": "Q",
"R": "R",
"S": "S",
"T": "T",
"U": "U",
"V": "V",
"W": "W",
"X": "ACDEFGHIKLMNPQRSTVWY",
# TODO - Include U and O in the possible values of X?
# This could alter the extended_protein_weight_ranges ...
# by MP: Won't do this, because they are so rare.
"Y": "Y",
"Z": "QE",
}
# Each protein weight w expressed as the zero-width range (w, w).
protein_weight_ranges = _make_ranges(protein_weights)
# (min, max) weight range and mean weight for every extended letter, taken
# over the residues listed for it in extended_protein_values.
extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
extended_protein_values, protein_weights
)
# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Keyed by element symbol, running from "H" to "Mt".
# NOTE(review): "D" is listed separately right after "H" - presumably
# deuterium; confirm.
atom_weights = {
"H": 1.00794,
"D": 2.01410,
"He": 4.002602,
"Li": 6.941,
"Be": 9.012182,
"B": 10.811,
"C": 12.0107,
"N": 14.0067,
"O": 15.9994,
"F": 18.9984032,
"Ne": 20.1797,
"Na": 22.989770,
"Mg": 24.3050,
"Al": 26.981538,
"Si": 28.0855,
"P": 30.973761,
"S": 32.065,
"Cl": 35.453,
"Ar": 39.948,
"K": 39.0983,
"Ca": 40.078,
"Sc": 44.955910,
"Ti": 47.867,
"V": 50.9415,
"Cr": 51.9961,
"Mn": 54.938049,
"Fe": 55.845,
"Co": 58.933200,
"Ni": 58.6934,
"Cu": 63.546,
"Zn": 65.39,
"Ga": 69.723,
"Ge": 72.64,
"As": 74.92160,
"Se": 78.96,
"Br": 79.904,
"Kr": 83.80,
"Rb": 85.4678,
"Sr": 87.62,
"Y": 88.90585,
"Zr": 91.224,
"Nb": 92.90638,
"Mo": 95.94,
"Tc": 98.0,
"Ru": 101.07,
"Rh": 102.90550,
"Pd": 106.42,
"Ag": 107.8682,
"Cd": 112.411,
"In": 114.818,
"Sn": 118.710,
"Sb": 121.760,
"Te": 127.60,
"I": 126.90447,
"Xe": 131.293,
"Cs": 132.90545,
"Ba": 137.327,
"La": 138.9055,
"Ce": 140.116,
"Pr": 140.90765,
"Nd": 144.24,
"Pm": 145.0,
"Sm": 150.36,
"Eu": 151.964,
"Gd": 157.25,
"Tb": 158.92534,
"Dy": 162.50,
"Ho": 164.93032,
"Er": 167.259,
"Tm": 168.93421,
"Yb": 173.04,
"Lu": 174.967,
"Hf": 178.49,
"Ta": 180.9479,
"W": 183.84,
"Re": 186.207,
"Os": 190.23,
"Ir": 192.217,
"Pt": 195.078,
"Au": 196.96655,
"Hg": 200.59,
"Tl": 204.3833,
"Pb": 207.2,
"Bi": 208.98038,
"Po": 208.98,
"At": 209.99,
"Rn": 222.02,
"Fr": 223.02,
"Ra": 226.03,
"Ac": 227.03,
"Th": 232.0381,
"Pa": 231.03588,
"U": 238.02891,
"Np": 237.05,
"Pu": 244.06,
"Am": 243.06,
"Cm": 247.07,
"Bk": 247.07,
"Cf": 251.08,
"Es": 252.08,
"Fm": 257.10,
"Md": 258.10,
"No": 259.10,
"Lr": 262.11,
"Rf": 261.11,
"Db": 262.11,
"Sg": 266.12,
"Bh": 264.12,
"Hs": 269.13,
"Mt": 268.14,
}