Spaces:
No application file
No application file
/*
 * Lookup table from one packed byte to the four bases it encodes.
 *
 * Each base takes two bits (T = 00, C = 01, A = 10, G = 11); the first base
 * of a group sits in the two most significant bits of the byte.  Rows are
 * exactly four characters wide and are NOT NUL-terminated, so they must be
 * copied with memcpy and an explicit length, never treated as C strings.
 *
 * The row count is spelled out because the table is indexed directly with an
 * unsigned char: all 256 byte values must map to a row.
 *
 * In the comments below "xx" stands for the lowest bit pair, which selects
 * the column (T, C, A, G from left to right).
 */
static const char bases[256][4] = {
    /* 00 00 00 xx */ "TTTT", "TTTC", "TTTA", "TTTG",
    /* 00 00 01 xx */ "TTCT", "TTCC", "TTCA", "TTCG",
    /* 00 00 10 xx */ "TTAT", "TTAC", "TTAA", "TTAG",
    /* 00 00 11 xx */ "TTGT", "TTGC", "TTGA", "TTGG",
    /* 00 01 00 xx */ "TCTT", "TCTC", "TCTA", "TCTG",
    /* 00 01 01 xx */ "TCCT", "TCCC", "TCCA", "TCCG",
    /* 00 01 10 xx */ "TCAT", "TCAC", "TCAA", "TCAG",
    /* 00 01 11 xx */ "TCGT", "TCGC", "TCGA", "TCGG",
    /* 00 10 00 xx */ "TATT", "TATC", "TATA", "TATG",
    /* 00 10 01 xx */ "TACT", "TACC", "TACA", "TACG",
    /* 00 10 10 xx */ "TAAT", "TAAC", "TAAA", "TAAG",
    /* 00 10 11 xx */ "TAGT", "TAGC", "TAGA", "TAGG",
    /* 00 11 00 xx */ "TGTT", "TGTC", "TGTA", "TGTG",
    /* 00 11 01 xx */ "TGCT", "TGCC", "TGCA", "TGCG",
    /* 00 11 10 xx */ "TGAT", "TGAC", "TGAA", "TGAG",
    /* 00 11 11 xx */ "TGGT", "TGGC", "TGGA", "TGGG",
    /* 01 00 00 xx */ "CTTT", "CTTC", "CTTA", "CTTG",
    /* 01 00 01 xx */ "CTCT", "CTCC", "CTCA", "CTCG",
    /* 01 00 10 xx */ "CTAT", "CTAC", "CTAA", "CTAG",
    /* 01 00 11 xx */ "CTGT", "CTGC", "CTGA", "CTGG",
    /* 01 01 00 xx */ "CCTT", "CCTC", "CCTA", "CCTG",
    /* 01 01 01 xx */ "CCCT", "CCCC", "CCCA", "CCCG",
    /* 01 01 10 xx */ "CCAT", "CCAC", "CCAA", "CCAG",
    /* 01 01 11 xx */ "CCGT", "CCGC", "CCGA", "CCGG",
    /* 01 10 00 xx */ "CATT", "CATC", "CATA", "CATG",
    /* 01 10 01 xx */ "CACT", "CACC", "CACA", "CACG",
    /* 01 10 10 xx */ "CAAT", "CAAC", "CAAA", "CAAG",
    /* 01 10 11 xx */ "CAGT", "CAGC", "CAGA", "CAGG",
    /* 01 11 00 xx */ "CGTT", "CGTC", "CGTA", "CGTG",
    /* 01 11 01 xx */ "CGCT", "CGCC", "CGCA", "CGCG",
    /* 01 11 10 xx */ "CGAT", "CGAC", "CGAA", "CGAG",
    /* 01 11 11 xx */ "CGGT", "CGGC", "CGGA", "CGGG",
    /* 10 00 00 xx */ "ATTT", "ATTC", "ATTA", "ATTG",
    /* 10 00 01 xx */ "ATCT", "ATCC", "ATCA", "ATCG",
    /* 10 00 10 xx */ "ATAT", "ATAC", "ATAA", "ATAG",
    /* 10 00 11 xx */ "ATGT", "ATGC", "ATGA", "ATGG",
    /* 10 01 00 xx */ "ACTT", "ACTC", "ACTA", "ACTG",
    /* 10 01 01 xx */ "ACCT", "ACCC", "ACCA", "ACCG",
    /* 10 01 10 xx */ "ACAT", "ACAC", "ACAA", "ACAG",
    /* 10 01 11 xx */ "ACGT", "ACGC", "ACGA", "ACGG",
    /* 10 10 00 xx */ "AATT", "AATC", "AATA", "AATG",
    /* 10 10 01 xx */ "AACT", "AACC", "AACA", "AACG",
    /* 10 10 10 xx */ "AAAT", "AAAC", "AAAA", "AAAG",
    /* 10 10 11 xx */ "AAGT", "AAGC", "AAGA", "AAGG",
    /* 10 11 00 xx */ "AGTT", "AGTC", "AGTA", "AGTG",
    /* 10 11 01 xx */ "AGCT", "AGCC", "AGCA", "AGCG",
    /* 10 11 10 xx */ "AGAT", "AGAC", "AGAA", "AGAG",
    /* 10 11 11 xx */ "AGGT", "AGGC", "AGGA", "AGGG",
    /* 11 00 00 xx */ "GTTT", "GTTC", "GTTA", "GTTG",
    /* 11 00 01 xx */ "GTCT", "GTCC", "GTCA", "GTCG",
    /* 11 00 10 xx */ "GTAT", "GTAC", "GTAA", "GTAG",
    /* 11 00 11 xx */ "GTGT", "GTGC", "GTGA", "GTGG",
    /* 11 01 00 xx */ "GCTT", "GCTC", "GCTA", "GCTG",
    /* 11 01 01 xx */ "GCCT", "GCCC", "GCCA", "GCCG",
    /* 11 01 10 xx */ "GCAT", "GCAC", "GCAA", "GCAG",
    /* 11 01 11 xx */ "GCGT", "GCGC", "GCGA", "GCGG",
    /* 11 10 00 xx */ "GATT", "GATC", "GATA", "GATG",
    /* 11 10 01 xx */ "GACT", "GACC", "GACA", "GACG",
    /* 11 10 10 xx */ "GAAT", "GAAC", "GAAA", "GAAG",
    /* 11 10 11 xx */ "GAGT", "GAGC", "GAGA", "GAGG",
    /* 11 11 00 xx */ "GGTT", "GGTC", "GGTA", "GGTG",
    /* 11 11 01 xx */ "GGCT", "GGCC", "GGCA", "GGCG",
    /* 11 11 10 xx */ "GGAT", "GGAC", "GGAA", "GGAG",
    /* 11 11 11 xx */ "GGGT", "GGGC", "GGGA", "GGGG",
};
static int | |
extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) { | |
uint32_t i; | |
const uint32_t size = end - start; | |
const uint32_t byteStart = start / 4; | |
const uint32_t byteEnd = (end + 3) / 4; | |
if (byteSize != byteEnd - byteStart) { | |
PyErr_Format(PyExc_RuntimeError, | |
"unexpected number of bytes %u (expected %u)", | |
byteSize, byteEnd - byteStart); | |
return -1; | |
} | |
start -= byteStart * 4; | |
if (byteStart + 1 == byteEnd) { | |
/* one byte only */ | |
memcpy(sequence, &(bases[*bytes][start]), size); | |
} | |
else { | |
end -= byteEnd * 4; | |
/* end is now a negative number equal to the distance to the byte end */ | |
memcpy(sequence, &(bases[*bytes][start]), 4 - start); | |
bytes++; | |
sequence += (4 - start); | |
for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4) | |
memcpy(sequence, bases[*bytes], 4); | |
memcpy(sequence, bases[*bytes], end + 4); | |
bytes++; | |
bytes -= byteSize; | |
} | |
return 0; | |
} | |
static void | |
applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks) | |
{ | |
const Py_ssize_t nBlockCount = nBlocks->shape[0]; | |
const uint32_t* const nBlockPositions = nBlocks->buf; | |
Py_ssize_t i; | |
for (i = 0; i < nBlockCount; i++) { | |
uint32_t nBlockStart = nBlockPositions[2*i]; | |
uint32_t nBlockEnd = nBlockPositions[2*i+1]; | |
if (nBlockEnd < start) continue; | |
if (end < nBlockStart) break; | |
if (nBlockStart < start) nBlockStart = start; | |
if (end < nBlockEnd) nBlockEnd = end; | |
memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart); | |
} | |
} | |
static void | |
applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks) | |
{ | |
const Py_ssize_t maskBlockCount = maskBlocks->shape[0]; | |
const uint32_t* const maskBlockPositions = maskBlocks->buf; | |
const char diff = 'a' - 'A'; | |
Py_ssize_t i; | |
for (i = 0; i < maskBlockCount; i++) { | |
uint32_t j; | |
uint32_t maskBlockStart = maskBlockPositions[2*i]; | |
uint32_t maskBlockEnd = maskBlockPositions[2*i+1]; | |
if (maskBlockEnd < start) continue; | |
if (end < maskBlockStart) break; | |
if (maskBlockStart < start) maskBlockStart = start; | |
if (end < maskBlockEnd) maskBlockEnd = end; | |
for (j = maskBlockStart - start; j < maskBlockEnd - start; j++) | |
sequence[j] += diff; | |
} | |
} | |
static int | |
blocks_converter(PyObject* object, void* pointer) | |
{ | |
const int flag = PyBUF_ND | PyBUF_FORMAT; | |
Py_buffer *view = pointer; | |
if (object == NULL) goto exit; | |
if (PyObject_GetBuffer(object, view, flag) == -1) { | |
PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format."); | |
return 0; | |
} | |
if (view->itemsize != sizeof(uint32_t) | |
|| (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) { | |
PyErr_Format(PyExc_RuntimeError, | |
"blocks have incorrect data type (itemsize %zd, format %s)", | |
view->itemsize, view->format); | |
goto exit; | |
} | |
if (view->ndim != 2) { | |
PyErr_Format(PyExc_RuntimeError, | |
"blocks have incorrect rank %d (expected 2)", view->ndim); | |
goto exit; | |
} | |
if (view->shape[1] != 2) { | |
PyErr_Format(PyExc_RuntimeError, | |
"blocks should have two columns (found %zd)", | |
view->shape[1]); | |
goto exit; | |
} | |
return Py_CLEANUP_SUPPORTED; | |
exit: | |
PyBuffer_Release(view); | |
return 0; | |
} | |
static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object"; | |
static PyObject* | |
TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords) | |
{ | |
const unsigned char *data; | |
Py_ssize_t start; | |
Py_ssize_t end; | |
Py_ssize_t step; | |
Py_ssize_t size; | |
Py_ssize_t length; | |
Py_buffer nBlocks; | |
Py_buffer maskBlocks; | |
PyObject *object; | |
char *sequence; | |
static char* kwlist[] = {"data", "start", "end", "step", | |
"nBlocks", "maskBlocks", NULL}; | |
if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist, | |
&data, &length, &start, &end, &step, | |
&blocks_converter, &nBlocks, | |
&blocks_converter, &maskBlocks)) | |
return NULL; | |
size = (end - start) / step; | |
object = PyBytes_FromStringAndSize(NULL, size); | |
if (!object) goto exit; | |
sequence = PyBytes_AS_STRING(object); | |
if (step == 1) { | |
if (extract(data, length, start, end, sequence) < 0) { | |
Py_DECREF(object); | |
object = NULL; | |
goto exit; | |
} | |
applyNs(sequence, start, end, &nBlocks); | |
applyMask(sequence, start, end, &maskBlocks); | |
} | |
else { | |
Py_ssize_t current, i; | |
Py_ssize_t full_start, full_end; | |
char* full_sequence; | |
if (start <= end) { | |
full_start = start; | |
full_end = end; | |
current = 0; /* first position in sequence */ | |
} | |
else { | |
full_start = end + 1; | |
full_end = start + 1; | |
current = start - end - 1; /* last position in sequence */ | |
} | |
full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char)); | |
full_sequence[full_end-full_start] = '\0'; | |
if (!full_sequence) { | |
Py_DECREF(object); | |
object = NULL; | |
goto exit; | |
} | |
if (extract(data, length, full_start, full_end, full_sequence) < 0) { | |
PyMem_Free(full_sequence); | |
Py_DECREF(object); | |
object = NULL; | |
goto exit; | |
} | |
applyNs(full_sequence, full_start, full_end, &nBlocks); | |
applyMask(full_sequence, full_start, full_end, &maskBlocks); | |
for (i = 0; i < size; current += step, i++) | |
sequence[i] = full_sequence[current]; | |
PyMem_Free(full_sequence); | |
} | |
exit: | |
blocks_converter(NULL, &nBlocks); | |
blocks_converter(NULL, &maskBlocks); | |
return object; | |
} | |
static struct PyMethodDef _twoBitIO_methods[] = { | |
{"convert", | |
(PyCFunction)TwoBit_convert, | |
METH_VARARGS | METH_KEYWORDS, | |
TwoBit_convert__doc__ | |
}, | |
{NULL, NULL, 0, NULL} /* sentinel */ | |
}; | |
static struct PyModuleDef moduledef = { | |
PyModuleDef_HEAD_INIT, | |
"_twoBitIO", | |
"Parser for DNA sequence data in 2bit format", | |
-1, | |
_twoBitIO_methods, | |
NULL, | |
NULL, | |
NULL, | |
NULL | |
}; | |
PyObject * | |
PyInit__twoBitIO(void) | |
{ | |
return PyModule_Create(&moduledef); | |
} | |