aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2002 by Yves Bastide and Brad Chapman.
# Copyright 2007 by Sebastian Bassi
# All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Functions to calculate assorted sequence checksums."""
# crc32, crc64, gcg, and seguid
# crc64 is adapted from BioPerl
import binascii
def crc32(seq):
    """Compute the CRC-32 checksum (int) of a sequence (string or Seq object).

    The sequence is converted to bytes and handed to ``binascii.crc32``.
    The checksum is case sensitive:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    try:
        # Works for anything bytes() can convert directly (e.g. a Seq object).
        data = bytes(seq)
    except TypeError:
        # bytes() rejects a plain str without an encoding; encode it instead.
        data = seq.encode()
    return binascii.crc32(data)
def _init_table_h():
_table_h = []
for i in range(256):
part_l = i
part_h = 0
for j in range(8):
rflag = part_l & 1
part_l >>= 1
if part_h & 1:
part_l |= 1 << 31
part_h >>= 1
if rflag:
part_h ^= 0xD8000000
_table_h.append(part_h)
return _table_h
# Initialisation
_table_h = _init_table_h()
def crc64(s):
"""Return the crc64 checksum for a sequence (string or Seq object).
Note that the case is important:
>>> crc64("ACGTACGTACGT")
'CRC-C4FBB762C4A87EBD'
>>> crc64("acgtACGTacgt")
'CRC-DA4509DC64A87EBD'
"""
crcl = 0
crch = 0
for c in s:
shr = (crch & 0xFF) << 24
temp1h = crch >> 8
temp1l = (crcl >> 8) | shr
idx = (crcl ^ ord(c)) & 0xFF
crch = temp1h ^ _table_h[idx]
crcl = temp1l
return f"CRC-{crch:08X}{crcl:08X}"
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string, bytes or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.

    Each character code is multiplied by a weight that runs from 1 to 57
    and then starts again at 1; the weighted sum modulo 10000 is returned.
    An empty sequence gives 0.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All sequences are converted to uppercase.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688
    >>> gcg(b"acgtACGTacgt")
    5688

    """
    checksum = 0
    for position, char in enumerate(seq):
        if isinstance(char, int):
            # Iterating over bytes / bytearray yields ints, which have no
            # upper() method; map ASCII a-z onto A-Z by hand.
            code = char - 32 if 97 <= char <= 122 else char
        else:
            code = ord(char.upper())
        # The weight cycles through 1, 2, ..., 57, 1, 2, ...
        checksum += (position % 57 + 1) * code
    return checksum % 10000
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    A SEGUID (SEquence Globally Unique IDentifier) is the base64 encoding
    of the SHA-1 digest of the upper-cased sequence, with the trailing
    ``=`` padding removed.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    https://doi.org/10.1002/pmic.200600032
    """
    import base64
    import hashlib

    try:
        # Works for anything bytes() can convert directly (e.g. a Seq object).
        raw = bytes(seq)
    except TypeError:
        # bytes() rejects a plain str without an encoding; encode it instead.
        raw = seq.encode()
    digest = hashlib.sha1(raw.upper()).digest()
    encoded = base64.b64encode(digest).decode()
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Script entry point: only runs when this file is executed directly.
    # NOTE(review): run_doctest is a Biopython-internal helper; presumably it
    # executes the doctest examples in the docstrings above -- confirm.
    from Bio._utils import run_doctest

    run_doctest()