File size: 3,511 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Copyright 2002 by Yves Bastide and Brad Chapman.
# Copyright 2007 by Sebastian Bassi
# All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Functions to calculate assorted sequence checksums."""

# crc32, crc64, gcg, and seguid
# crc64 is adapted from BioPerl


import binascii


def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    try:
        # Assume it's a Seq object
        s = bytes(seq)
    except TypeError:
        # Assume it's a string
        s = seq.encode()
    return binascii.crc32(s)


def _init_table_h():
    _table_h = []
    for i in range(256):
        part_l = i
        part_h = 0
        for j in range(8):
            rflag = part_l & 1
            part_l >>= 1
            if part_h & 1:
                part_l |= 1 << 31
            part_h >>= 1
            if rflag:
                part_h ^= 0xD8000000
        _table_h.append(part_h)
    return _table_h


# Initialisation
_table_h = _init_table_h()


def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    Note that the case is important:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'

    """
    crcl = 0
    crch = 0
    for c in s:
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return f"CRC-{crch:08X}{crcl:08X}"


def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All sequences are converted to uppercase.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688

    """
    index = checksum = 0
    for char in seq:
        index += 1
        checksum += index * ord(char.upper())
        if index == 57:
            index = 0
    return checksum % 10000


def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    https://doi.org/10.1002/pmic.200600032
    """
    import hashlib
    import base64

    m = hashlib.sha1()
    try:
        # Assume it's a Seq object
        seq = bytes(seq)
    except TypeError:
        # Assume it's a string
        seq = seq.encode()
    m.update(seq.upper())
    tmp = base64.encodebytes(m.digest())
    return tmp.decode().replace("\n", "").rstrip("=")


if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()