Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /Data /IUPACData.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

9.22 kB

	# Copyright 2000 Andrew Dalke. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Information about the IUPAC alphabets."""


	# One-letter codes of the 20 standard amino acids.
	protein_letters = "ACDEFGHIKLMNPQRSTVWY"

	# The extended alphabet is the standard one followed by six extra codes:
	#
	# B = "Asx"; aspartic acid or asparagine (D or N)
	# X = "Xaa" (also written "Xxx"); unknown or 'other' amino acid
	# Z = "Glx"; glutamic acid or glutamine (E or Q)
	# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
	#
	# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
	# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
	# The International Nucleotide Sequence Database Collaboration (INSDC)
	# (i.e. GenBank, EMBL, DDBJ) also adopted this in 2006
	# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
	#
	# Xle (J); Leucine or Isoleucine
	# The residue abbreviations Xle (three-letter) and J (one-letter) are
	# reserved for the case where leucine cannot be experimentally
	# distinguished from isoleucine.
	#
	# U = "Sec"; selenocysteine
	# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
	#
	# O = "Pyl"; pyrrolysine
	# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
	extended_protein_letters = protein_letters + "BXZJUO"

	# One-letter -> three-letter codes for the standard amino acids.
	protein_letters_1to3 = {
	    "A": "Ala",
	    "C": "Cys",
	    "D": "Asp",
	    "E": "Glu",
	    "F": "Phe",
	    "G": "Gly",
	    "H": "His",
	    "I": "Ile",
	    "K": "Lys",
	    "L": "Leu",
	    "M": "Met",
	    "N": "Asn",
	    "P": "Pro",
	    "Q": "Gln",
	    "R": "Arg",
	    "S": "Ser",
	    "T": "Thr",
	    "V": "Val",
	    "W": "Trp",
	    "Y": "Tyr",
	}

	# The standard entries followed by the six extended codes, in that order.
	protein_letters_1to3_extended = {
	    **protein_letters_1to3,
	    "B": "Asx",
	    "X": "Xaa",
	    "Z": "Glx",
	    "J": "Xle",
	    "U": "Sec",
	    "O": "Pyl",
	}

	# Reverse lookups: three-letter -> one-letter.
	protein_letters_3to1 = {
	    three: one for one, three in protein_letters_1to3.items()
	}
	protein_letters_3to1_extended = {
	    three: one for one, three in protein_letters_1to3_extended.items()
	}

	# The four unambiguous bases of each nucleic acid.
	unambiguous_dna_letters = "GATC"
	unambiguous_rna_letters = "GAUC"

	# Ambiguous alphabets: the unambiguous bases followed by the ambiguity codes.
	ambiguous_dna_letters = unambiguous_dna_letters + "RYWSMKHBVDN"
	ambiguous_rna_letters = unambiguous_rna_letters + "RYWSMKHBVDN"

	# Extended DNA alphabet; the additional letters stand for:
	# B == 5-bromouridine
	# D == 5,6-dihydrouridine
	# S == thiouridine
	# W == wyosine
	extended_dna_letters = unambiguous_dna_letters + "BDSW"

	# are there extended forms for RNA?
	# extended_rna_letters = "GAUCBDSW"

	# "X" is included in the following _values and _complement dictionaries,
	# for historical reasons although it is not an IUPAC nucleotide,
	# and so is not in the corresponding _letters strings above
	# Each DNA letter mapped to the string of unambiguous bases it can stand for.
	ambiguous_dna_values = {
	    # Unambiguous bases stand for themselves.
	    "A": "A",
	    "C": "C",
	    "G": "G",
	    "T": "T",
	    # Two possible bases.
	    "M": "AC",
	    "R": "AG",
	    "W": "AT",
	    "S": "CG",
	    "Y": "CT",
	    "K": "GT",
	    # Three possible bases.
	    "V": "ACG",
	    "H": "ACT",
	    "D": "AGT",
	    "B": "CGT",
	    # Any base. "X" is not an IUPAC nucleotide; it is kept for
	    # historical reasons.
	    "X": "GATC",
	    "N": "GATC",
	}

	# The RNA counterpart of the table above, with U in place of T.
	ambiguous_rna_values = {
	    # Unambiguous bases stand for themselves.
	    "A": "A",
	    "C": "C",
	    "G": "G",
	    "U": "U",
	    # Two possible bases.
	    "M": "AC",
	    "R": "AG",
	    "W": "AU",
	    "S": "CG",
	    "Y": "CU",
	    "K": "GU",
	    # Three possible bases.
	    "V": "ACG",
	    "H": "ACU",
	    "D": "AGU",
	    "B": "CGU",
	    # Any base. "X" is not an IUPAC nucleotide; it is kept for
	    # historical reasons.
	    "X": "GAUC",
	    "N": "GAUC",
	}

	# Complement of every DNA letter, ambiguity codes included.
	ambiguous_dna_complement = {
	    # Unambiguous bases.
	    "A": "T",
	    "C": "G",
	    "G": "C",
	    "T": "A",
	    # Two-base codes (W and S map to themselves).
	    "M": "K",
	    "R": "Y",
	    "W": "W",
	    "S": "S",
	    "Y": "R",
	    "K": "M",
	    # Three-base codes.
	    "V": "B",
	    "H": "D",
	    "D": "H",
	    "B": "V",
	    # Any-base codes map to themselves.
	    "X": "X",
	    "N": "N",
	}

	# Complement of every RNA letter; same as the DNA table with U for T.
	ambiguous_rna_complement = {
	    # Unambiguous bases.
	    "A": "U",
	    "C": "G",
	    "G": "C",
	    "U": "A",
	    # Two-base codes (W and S map to themselves).
	    "M": "K",
	    "R": "Y",
	    "W": "W",
	    "S": "S",
	    "Y": "R",
	    "K": "M",
	    # Three-base codes.
	    "V": "B",
	    "H": "D",
	    "D": "H",
	    "B": "V",
	    # Any-base codes map to themselves.
	    "X": "X",
	    "N": "N",
	}


	def _make_ranges(mydict):
	d = {}
	for key, value in mydict.items():
	d[key] = (value, value)
	return d


	# Mass data taken from PubChem


	# Average masses of monophosphate deoxy nucleotides, keyed by DNA letter.
	unambiguous_dna_weights = {
	    "A": 331.2218,
	    "C": 307.1971,
	    "G": 347.2212,
	    "T": 322.2085,
	}

	# Monoisotopic masses of monophosphate deoxy nucleotides.
	monoisotopic_unambiguous_dna_weights = {
	    "A": 331.06817,
	    "C": 307.056936,
	    "G": 347.063084,
	    "T": 322.056602,
	}

	# The average DNA weights as (weight, weight) pairs.
	unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

	# The corresponding weights keyed by RNA letter.
	unambiguous_rna_weights = {
	    "A": 347.2212,
	    "C": 323.1965,
	    "G": 363.2206,
	    "U": 324.1813,
	}

	# The corresponding monoisotopic weights keyed by RNA letter.
	monoisotopic_unambiguous_rna_weights = {
	    "A": 347.063084,
	    "C": 323.051851,
	    "G": 363.057999,
	    "U": 324.035867,
	}

	# The RNA weights as (weight, weight) pairs.
	unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


	def _make_ambiguous_ranges(mydict, weight_table):
	range_d = {}
	avg_d = {}
	for letter, values in mydict.items():
	# Following line is a quick hack to skip undefined weights for U and O
	if len(values) == 1 and values[0] not in weight_table:
	continue

	weights = [weight_table.get(x) for x in values]
	range_d[letter] = (min(weights), max(weights))
	total_w = 0.0
	for w in weights:
	total_w = total_w + w
	avg_d[letter] = total_w / len(weights)
	return range_d, avg_d


	# Weight ranges and average weights for the DNA letters, ambiguity codes
	# included.
	(
	    ambiguous_dna_weight_ranges,
	    avg_ambiguous_dna_weights,
	) = _make_ambiguous_ranges(ambiguous_dna_values, unambiguous_dna_weights)

	# Weight ranges and average weights for the RNA letters, ambiguity codes
	# included.
	(
	    ambiguous_rna_weight_ranges,
	    avg_ambiguous_rna_weights,
	) = _make_ambiguous_ranges(ambiguous_rna_values, unambiguous_rna_weights)

	# Weights keyed by one-letter amino acid code; U and O are included.
	# NOTE(review): presumably average masses, by analogy with the nucleotide
	# weight tables in this module - confirm against the data source.
	protein_weights = {
	    "A": 89.0932,
	    "C": 121.1582,
	    "D": 133.1027,
	    "E": 147.1293,
	    "F": 165.1891,
	    "G": 75.0666,
	    "H": 155.1546,
	    "I": 131.1729,
	    "K": 146.1876,
	    "L": 131.1729,
	    "M": 149.2113,
	    "N": 132.1179,
	    "O": 255.3134,
	    "P": 115.1305,
	    "Q": 146.1445,
	    "R": 174.201,
	    "S": 105.0926,
	    "T": 119.1192,
	    "U": 168.0532,
	    "V": 117.1463,
	    "W": 204.2252,
	    "Y": 181.1885,
	}

	# Monoisotopic weights for the same set of one-letter codes.
	monoisotopic_protein_weights = {
	    "A": 89.047678,
	    "C": 121.019749,
	    "D": 133.037508,
	    "E": 147.053158,
	    "F": 165.078979,
	    "G": 75.032028,
	    "H": 155.069477,
	    "I": 131.094629,
	    "K": 146.105528,
	    "L": 131.094629,
	    "M": 149.051049,
	    "N": 132.053492,
	    "O": 255.158292,
	    "P": 115.063329,
	    "Q": 146.069142,
	    "R": 174.111676,
	    "S": 105.042593,
	    "T": 119.058243,
	    "U": 168.964203,
	    "V": 117.078979,
	    "W": 204.089878,
	    "Y": 181.073893,
	}

	# Each extended protein letter mapped to the string of letters it can stand
	# for. Only B, J, X and Z expand to more than themselves.
	extended_protein_values = {
	    "A": "A",
	    "B": "ND",
	    "C": "C",
	    "D": "D",
	    "E": "E",
	    "F": "F",
	    "G": "G",
	    "H": "H",
	    "I": "I",
	    "J": "IL",
	    "K": "K",
	    "L": "L",
	    "M": "M",
	    "N": "N",
	    "O": "O",
	    "P": "P",
	    "Q": "Q",
	    "R": "R",
	    "S": "S",
	    "T": "T",
	    "U": "U",
	    "V": "V",
	    "W": "W",
	    # TODO - Include U and O in the possible values of X?
	    # This could alter the extended_protein_weight_ranges ...
	    # by MP: Won't do this, because they are so rare.
	    "X": "ACDEFGHIKLMNPQRSTVWY",
	    "Y": "Y",
	    "Z": "QE",
	}

	# The protein weights as (weight, weight) pairs.
	protein_weight_ranges = _make_ranges(protein_weights)

	# Weight ranges and average weights for the extended protein letters.
	(
	    extended_protein_weight_ranges,
	    avg_extended_protein_weights,
	) = _make_ambiguous_ranges(extended_protein_values, protein_weights)


	# Atom weights keyed by element symbol, used for center of mass calculation.
	# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
	# NOTE(review): "D" (listed right after "H") is presumably deuterium - confirm.
	atom_weights = {
	    "H": 1.00794,
	    "D": 2.01410,
	    "He": 4.002602,
	    "Li": 6.941,
	    "Be": 9.012182,
	    "B": 10.811,
	    "C": 12.0107,
	    "N": 14.0067,
	    "O": 15.9994,
	    "F": 18.9984032,
	    "Ne": 20.1797,
	    "Na": 22.989770,
	    "Mg": 24.3050,
	    "Al": 26.981538,
	    "Si": 28.0855,
	    "P": 30.973761,
	    "S": 32.065,
	    "Cl": 35.453,
	    "Ar": 39.948,
	    "K": 39.0983,
	    "Ca": 40.078,
	    "Sc": 44.955910,
	    "Ti": 47.867,
	    "V": 50.9415,
	    "Cr": 51.9961,
	    "Mn": 54.938049,
	    "Fe": 55.845,
	    "Co": 58.933200,
	    "Ni": 58.6934,
	    "Cu": 63.546,
	    "Zn": 65.39,
	    "Ga": 69.723,
	    "Ge": 72.64,
	    "As": 74.92160,
	    "Se": 78.96,
	    "Br": 79.904,
	    "Kr": 83.80,
	    "Rb": 85.4678,
	    "Sr": 87.62,
	    "Y": 88.90585,
	    "Zr": 91.224,
	    "Nb": 92.90638,
	    "Mo": 95.94,
	    "Tc": 98.0,
	    "Ru": 101.07,
	    "Rh": 102.90550,
	    "Pd": 106.42,
	    "Ag": 107.8682,
	    "Cd": 112.411,
	    "In": 114.818,
	    "Sn": 118.710,
	    "Sb": 121.760,
	    "Te": 127.60,
	    "I": 126.90447,
	    "Xe": 131.293,
	    "Cs": 132.90545,
	    "Ba": 137.327,
	    "La": 138.9055,
	    "Ce": 140.116,
	    "Pr": 140.90765,
	    "Nd": 144.24,
	    "Pm": 145.0,
	    "Sm": 150.36,
	    "Eu": 151.964,
	    "Gd": 157.25,
	    "Tb": 158.92534,
	    "Dy": 162.50,
	    "Ho": 164.93032,
	    "Er": 167.259,
	    "Tm": 168.93421,
	    "Yb": 173.04,
	    "Lu": 174.967,
	    "Hf": 178.49,
	    "Ta": 180.9479,
	    "W": 183.84,
	    "Re": 186.207,
	    "Os": 190.23,
	    "Ir": 192.217,
	    "Pt": 195.078,
	    "Au": 196.96655,
	    "Hg": 200.59,
	    "Tl": 204.3833,
	    "Pb": 207.2,
	    "Bi": 208.98038,
	    "Po": 208.98,
	    "At": 209.99,
	    "Rn": 222.02,
	    "Fr": 223.02,
	    "Ra": 226.03,
	    "Ac": 227.03,
	    "Th": 232.0381,
	    "Pa": 231.03588,
	    "U": 238.02891,
	    "Np": 237.05,
	    "Pu": 244.06,
	    "Am": 243.06,
	    "Cm": 247.07,
	    "Bk": 247.07,
	    "Cf": 251.08,
	    "Es": 252.08,
	    "Fm": 257.10,
	    "Md": 258.10,
	    "No": 259.10,
	    "Lr": 262.11,
	    "Rf": 261.11,
	    "Db": 262.11,
	    "Sg": 266.12,
	    "Bh": 264.12,
	    "Hs": 269.13,
	    "Mt": 268.14,
	}