Spaces:
No application file
No application file
# Copyright 2002 by Yves Bastide and Brad Chapman.
# Copyright 2007 by Sebastian Bassi
# All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Functions to calculate assorted sequence checksums."""
# crc32, crc64, gcg, and seguid
# crc64 is adapted from BioPerl
import binascii | |
def crc32(seq):
    """Compute the CRC-32 checksum (int) of a sequence.

    The sequence may be a plain string or any object that ``bytes()``
    accepts directly (for example a Seq object).

    The result is case sensitive:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    try:
        # Seq objects and other bytes-like inputs convert without help.
        payload = bytes(seq)
    except TypeError:
        # bytes() rejects a str given without an encoding, so encode it.
        payload = seq.encode()
    checksum = binascii.crc32(payload)
    return checksum
def _init_table_h():
    """Build the 256-entry lookup table consumed by crc64.

    Each byte value is pushed, least significant bit first, through a
    register kept as two 32-bit halves. Whenever a set bit is shifted out
    of the low half, the constant 0xD8000000 is XORed into the high half.
    Only the final high half is stored for each byte value.
    """
    table = []
    for byte in range(256):
        low, high = byte, 0
        for _ in range(8):
            bit_out = low & 1
            low >>= 1
            if high & 1:
                # Carry the bit leaving the high half into the low half.
                low |= 1 << 31
            high >>= 1
            if bit_out:
                high ^= 0xD8000000
        table.append(high)
    return table


# Built once at import time; crc64 indexes into this table.
_table_h = _init_table_h()
def crc64(s):
    """Compute the CRC-64 checksum of a sequence (string or Seq object).

    The checksum is returned as a string: the prefix ``CRC-`` followed by
    sixteen upper-case hexadecimal digits (high half, then low half).

    The result is case sensitive:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'

    """
    low = 0
    high = 0
    for char in s:
        # Table slot is chosen from the low byte before the register shifts.
        table_index = (low ^ ord(char)) & 0xFF
        # The byte leaving the high half becomes the top byte of the low half.
        carry = (high & 0xFF) << 24
        low = (low >> 8) | carry
        high = (high >> 8) ^ _table_h[table_index]
    return f"CRC-{high:08X}{low:08X}"
def gcg(seq):
    """Compute the GCG checksum (int) of a sequence (string or Seq object).

    Works for a nucleotide or amino-acid sequence (or any string) and
    yields the checksum used by the GCG program.

    seq type = str.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    Every character is upper-cased before it is summed, so case does not
    affect the result:

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688

    """
    total = 0
    for position, residue in enumerate(seq):
        # Weights run 1..57 and then start again from 1.
        weight = position % 57 + 1
        total += weight * ord(residue.upper())
    return total % 10000
def seguid(seq):
    """Compute the SEGUID (string) of a sequence (string or Seq object).

    Works for a nucleotide or amino-acid sequence (or any string) and
    yields its SEGUID (A SEquence Globally Unique IDentifier): the SHA-1
    digest of the upper-cased sequence, base64 encoded with the trailing
    ``=`` padding removed.

    seq type = str.

    Case does not affect the result:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    https://doi.org/10.1002/pmic.200600032

    """
    import base64
    import hashlib

    try:
        # Seq objects and other bytes-like inputs convert without help.
        raw = bytes(seq)
    except TypeError:
        # bytes() rejects a str given without an encoding, so encode it.
        raw = seq.encode()
    digest = hashlib.sha1(raw.upper()).digest()
    encoded = base64.b64encode(digest).decode()
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Reached only when this file is executed as a script, not on import.
    # run_doctest is defined in Bio._utils (not visible here); presumably it
    # runs the doctest examples in this module's docstrings -- TODO confirm.
    from Bio._utils import run_doctest

    run_doctest()