# Spaces:
# No application file
# No application file
# Copyright 2004 by Frank Kauff and Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parser for ACE files output by PHRAP.

Written by Frank Kauff ([email protected]) and
Cymon J. Cox ([email protected])

Usage:

There are two ways of reading an ace file:

1. The function 'read' reads the whole file at once;
2. The function 'parse' reads the file contig after contig.

First option, parse whole ace file at once::

    from Bio.Sequencing import Ace
    acefilerecord = Ace.read(open('my_ace_file.ace'))

This gives you:

- acefilerecord.ncontigs (the number of contigs in the ace file)
- acefilerecord.nreads (the number of reads in the ace file)
- acefilerecord.contigs[] (one instance of the Contig class for each contig)

The Contig class holds the info of the CO tag, CT and WA tags, and all the
reads used for this contig in a list of instances of the Reads class, e.g.::

    contig3 = acefilerecord.contigs[2]
    read4 = contig3.reads[3]
    RD_of_read4 = read4.rd
    DS_of_read4 = read4.ds

CT, WA, RT and WR tags can appear anywhere in the file (usually at the end);
when 'read' is used they are automatically sorted into the right place.
See ACEFileRecord.sort for details.

The second option is to iterate over the contigs of an ace file one by one
in the usual way::

    from Bio.Sequencing import Ace
    contigs = Ace.parse(open('my_ace_file.ace'))
    for contig in contigs:
        print(contig.name)
        ...

Please note that for memory efficiency, when using the iterator approach, only
one contig is kept in memory at once. However, there can be a footer to the ACE
file containing WA, CT, RT or WR tags which contain additional meta-data on the
contigs. Because the parser doesn't see this data until the final record, it
cannot be added to the appropriate records. Instead these tags will be returned
with the last contig record. Thus an ace file does not entirely suit the concept
of iterating. If WA, CT, RT, WR tags are needed, the 'read' function rather than
the 'parse' function might be more appropriate.
"""
class rd:
    """RD (reads), store a read with its name, sequence etc.

    The location and strand each read is mapped to is held in the AF lines.

    Instances start out empty; the numeric fields stay ``None`` and the text
    fields stay ``""`` until a caller fills them in.
    """

    def __init__(self):
        """Initialize the class."""
        self.name = ""
        # Header counters of the RD line; None until a header is parsed.
        self.padded_bases = None
        self.info_items = None
        self.read_tags = None
        # Read sequence, accumulated line by line by the parser.
        self.sequence = ""
class qa:
    """QA (read quality), including which part if any was used as the consensus."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace; field 0 is the line
        tag and fields 1-4 are converted to int and stored as the quality
        clipping start/end and the alignment clipping start/end.
        """
        self.qual_clipping_start = None
        self.qual_clipping_end = None
        self.align_clipping_start = None
        self.align_clipping_end = None
        if line:
            header = line.split()
            self.qual_clipping_start = int(header[1])
            self.qual_clipping_end = int(header[2])
            self.align_clipping_start = int(header[3])
            self.align_clipping_end = int(header[4])
class ds:
    """DS lines, include file name of a read's chromatogram file."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given, the text following each recognised tag
        (CHROMAT_FILE, PHD_FILE, TIME, CHEM, DYE, TEMPLATE, DIRECTION) up to
        the next recognised tag is stripped and stored in the attribute named
        after the lower-cased tag. Tags missing from the line keep ``""``.
        """
        self.chromat_file = ""
        self.phd_file = ""
        self.time = ""
        self.chem = ""
        self.dye = ""
        self.template = ""
        self.direction = ""
        if line:
            tags = [
                "CHROMAT_FILE",
                "PHD_FILE",
                "TIME",
                "CHEM",
                "DYE",
                "TEMPLATE",
                "DIRECTION",
            ]
            # Map start position -> tag for every tag present in the line.
            tagpos = {}
            for tag in tags:
                # Look for "TAG:" first so that a value which merely contains
                # the tag text (e.g. a chromat file called "CHEM_run1.scf")
                # is not mistaken for the tag itself. Fall back to the bare
                # tag for lines written without the colon.
                pos = line.find(tag + ":")
                if pos == -1:
                    pos = line.find(tag)
                if pos != -1:
                    tagpos[pos] = tag
            ps = sorted(tagpos)  # the keys
            # Each value runs from just after its tag (plus one character for
            # the separator) up to the start of the next tag or end of line.
            for p1, p2 in zip(ps, ps[1:] + [len(line) + 1]):
                setattr(
                    self,
                    tagpos[p1].lower(),
                    line[p1 + len(tagpos[p1]) + 1 : p2].strip(),
                )
class af:
    """AF lines, define the location of the read within the contig.

    Note attribute coru is short for complemented (C) or uncomplemented (U),
    since the strand information is stored in an ACE file using either the
    C or U character.
    """

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace: field 1 is the read
        name, field 2 the C/U flag and field 3 the padded start (as int).
        """
        self.name = ""
        self.coru = None
        self.padded_start = None
        if line:
            header = line.split()
            self.name = header[1]
            self.coru = header[2]
            self.padded_start = int(header[3])
class bs:
    """BS (base segment), which read was chosen as the consensus at each position."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace: fields 1 and 2 are the
        padded start and end (as int) and field 3 is the read name.
        """
        self.name = ""
        self.padded_start = None
        self.padded_end = None
        if line:
            header = line.split()
            self.padded_start = int(header[1])
            self.padded_end = int(header[2])
            self.name = header[3]
class rt:
    """RT (transient read tags), generated by crossmatch and phrap."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace into, in order: read
        name, tag type, program, padded start (int), padded end (int), date.
        ``comment`` always starts as an empty list; the parser appends to it.
        """
        self.name = ""
        self.tag_type = ""
        self.program = ""
        self.padded_start = None
        self.padded_end = None
        self.date = ""
        self.comment = []
        if line:
            header = line.split()
            self.name = header[0]
            self.tag_type = header[1]
            self.program = header[2]
            self.padded_start = int(header[3])
            self.padded_end = int(header[4])
            self.date = header[5]
class ct:
    """CT (consensus tags)."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace into, in order: contig
        name, tag type, program, padded start (int), padded end (int), date
        and - only when exactly seven fields are present - the notrans flag.
        ``info`` and ``comment`` start as empty lists filled by the parser.
        """
        self.name = ""
        self.tag_type = ""
        self.program = ""
        self.padded_start = None
        self.padded_end = None
        self.date = ""
        self.notrans = ""
        self.info = []
        self.comment = []
        if line:
            header = line.split()
            self.name = header[0]
            self.tag_type = header[1]
            self.program = header[2]
            self.padded_start = int(header[3])
            self.padded_end = int(header[4])
            self.date = header[5]
            if len(header) == 7:
                self.notrans = header[6]
class wa:
    """WA (whole assembly tag), holds the assembly program name, version, etc."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace into tag type, program
        and date. ``info`` starts as an empty list filled by the parser.
        """
        self.tag_type = ""
        self.program = ""
        self.date = ""
        self.info = []
        if line:
            header = line.split()
            self.tag_type = header[0]
            self.program = header[1]
            self.date = header[2]
class wr:
    """WR lines."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` is given it is split on whitespace into read name,
        aligned flag, program and date.
        """
        self.name = ""
        self.aligned = ""
        self.program = ""
        # NOTE(review): the unparsed default is a list although a parsed date
        # is a string; kept as-is so existing callers see the same default.
        self.date = []
        if line:
            header = line.split()
            self.name = header[0]
            self.aligned = header[1]
            self.program = header[2]
            self.date = header[3]
class Reads:
    """Holds information about a read supporting an ACE contig."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` (an RD header line) is given, an ``rd`` instance is
        created and filled from its whitespace-separated fields: field 1 is
        the read name; fields 2-4 are converted to int and stored as
        padded_bases, info_items and read_tags.
        """
        self.rd = None  # one per read
        self.qa = None  # one per read
        self.ds = None  # none or one per read
        self.rt = None  # none or many per read
        self.wr = None  # none or many per read
        if line:
            self.rd = rd()
            header = line.split()
            self.rd.name = header[1]
            self.rd.padded_bases = int(header[2])
            self.rd.info_items = int(header[3])
            self.rd.read_tags = int(header[4])
class Contig:
    """Holds information about a contig from an ACE record."""

    def __init__(self, line=None):
        """Initialize the class.

        If ``line`` (a CO header line) is given it is split on whitespace:
        field 1 is the contig name, fields 2-4 are converted to int and stored
        as nbases, nreads and nsegments, and field 5 is the U/C flag.
        """
        self.name = ""
        self.nbases = None
        self.nreads = None
        self.nsegments = None
        self.uorc = None
        self.sequence = ""
        self.quality = []
        self.af = []
        self.bs = []
        self.reads = []
        self.ct = None  # none or many
        self.wa = None  # none or many
        if line:
            header = line.split()
            self.name = header[1]
            self.nbases = int(header[2])
            self.nreads = int(header[3])
            self.nsegments = int(header[4])
            self.uorc = header[5]
def parse(source):
    """Iterate over an ACE file contig by contig.

    Argument source is a file-like object or a path to a file.

    This function returns an iterator that allows you to iterate
    over the ACE file record by record::

        records = parse(source)
        for record in records:
            # do something with the record

    where each record is a Contig object.

    Raises ValueError if a file-like object was not opened in text mode, or
    if an expected BQ, RD or QA line (or the end of an AF, BS, WA or CT
    block) cannot be found. A handle opened here from a path is closed when
    the generator finishes; a handle passed in by the caller is left open.
    """
    try:
        handle = open(source)
    except TypeError:
        # Not something open() accepts as a path: treat it as a handle.
        handle = source
        if handle.read(0) != "":
            raise ValueError("Ace files must be opened in text mode.") from None

    try:
        line = ""
        while True:
            # at beginning, skip the AS and look for first CO command
            try:
                while True:
                    if line.startswith("CO"):
                        break
                    line = next(handle)
            except StopIteration:
                return

            record = Contig(line)

            # consensus sequence: everything up to the first blank line
            for line in handle:
                line = line.strip()
                if not line:
                    break
                record.sequence += line

            # skip blank lines; the next non-blank line must be BQ
            for line in handle:
                if line.strip():
                    break
            if not line.startswith("BQ"):
                raise ValueError("Failed to find BQ line")

            # base qualities: integers up to the next blank line
            for line in handle:
                if not line.strip():
                    break
                record.quality.extend(int(x) for x in line.split())

            for line in handle:
                if line.strip():
                    break

            # consecutive AF lines
            while True:
                if not line.startswith("AF "):
                    break
                record.af.append(af(line))
                try:
                    line = next(handle)
                except StopIteration:
                    raise ValueError("Unexpected end of AF block") from None

            # skip blank lines between the AF and BS blocks
            while True:
                if line.strip():
                    break
                try:
                    line = next(handle)
                except StopIteration:
                    raise ValueError("Unexpected end of file") from None

            # consecutive BS lines
            while True:
                if not line.startswith("BS "):
                    break
                record.bs.append(bs(line))
                try:
                    line = next(handle)
                except StopIteration:
                    raise ValueError("Failed to find end of BS block") from None

            # now read all the read data
            # it starts with a 'RD', and then a mandatory QA
            # then follows an optional DS
            # CT,RT,WA,WR may or may not be there in unlimited quantity.
            # They might refer to the actual read or contig, or, if
            # encountered at the end of file, to any previous read or contig.
            # The sort() method deals with that later.
            while True:
                # each read must have a rd and qa
                try:
                    while True:
                        # If I've met the condition, then stop reading the line.
                        if line.startswith("RD "):
                            break
                        line = next(handle)
                except StopIteration:
                    raise ValueError("Failed to find RD line") from None

                record.reads.append(Reads(line))

                # read sequence: everything up to the first blank line
                for line in handle:
                    line = line.strip()
                    if not line:
                        break
                    record.reads[-1].rd.sequence += line

                for line in handle:
                    if line.strip():
                        break
                if not line.startswith("QA "):
                    raise ValueError("Failed to find QA line")
                record.reads[-1].qa = qa(line)

                # now one ds can follow
                for line in handle:
                    if line.strip():
                        break
                else:
                    # handle exhausted right after the QA line: last read
                    break

                if line.startswith("DS "):
                    record.reads[-1].ds = ds(line)
                    line = ""
                # the file could just end, or there's some more stuff.
                # In ace files, anything can happen.
                # the following tags are interspersed between reads and can appear multiple times.
                while True:
                    # something left
                    try:
                        while True:
                            if line.strip():
                                break
                            line = next(handle)
                    except StopIteration:
                        # file ends here
                        break
                    if line.startswith("RT{"):
                        # now if we're at the end of the file, this rt could
                        # belong to a previous read, not the actual one.
                        # we store it here were it appears, the user can sort later.
                        if record.reads[-1].rt is None:
                            record.reads[-1].rt = []
                        for line in handle:
                            line = line.strip()
                            # if line=="COMMENT{":
                            if line.startswith("COMMENT{"):
                                if line[8:].strip():
                                    # MIRA 3.0.5 would miss the new line out :(
                                    record.reads[-1].rt[-1].comment.append(line[8:])
                                for line in handle:
                                    line = line.strip()
                                    if line.endswith("C}"):
                                        break
                                    record.reads[-1].rt[-1].comment.append(line)
                            elif line == "}":
                                break
                            else:
                                record.reads[-1].rt.append(rt(line))
                        line = ""
                    elif line.startswith("WR{"):
                        if record.reads[-1].wr is None:
                            record.reads[-1].wr = []
                        for line in handle:
                            line = line.strip()
                            if line == "}":
                                break
                            record.reads[-1].wr.append(wr(line))
                        line = ""
                    elif line.startswith("WA{"):
                        if record.wa is None:
                            record.wa = []
                        try:
                            line = next(handle)
                        except StopIteration:
                            raise ValueError("Failed to read WA block") from None
                        record.wa.append(wa(line))
                        for line in handle:
                            line = line.strip()
                            if line == "}":
                                break
                            record.wa[-1].info.append(line)
                        line = ""
                    elif line.startswith("CT{"):
                        if record.ct is None:
                            record.ct = []
                        try:
                            line = next(handle)
                        except StopIteration:
                            raise ValueError("Failed to read CT block") from None
                        record.ct.append(ct(line))
                        for line in handle:
                            line = line.strip()
                            if line == "COMMENT{":
                                for line in handle:
                                    line = line.strip()
                                    if line.endswith("C}"):
                                        break
                                    record.ct[-1].comment.append(line)
                            elif line == "}":
                                break
                            else:
                                record.ct[-1].info.append(line)
                        line = ""
                    else:
                        # not a tag block: hand the line back to the read loop
                        break

                if not line.startswith("RD"):  # another read?
                    break

            yield record

    finally:
        # only close what this function opened itself
        if handle is not source:
            handle.close()
class ACEFileRecord:
    """Holds data of an ACE file."""

    def __init__(self):
        """Initialize the class."""
        self.ncontigs = None
        self.nreads = None
        self.contigs = []
        self.wa = None  # none or many

    @staticmethod
    def _split_misplaced(tags, owner_name, strays):
        """Keep tags named owner_name in tags (in place); file the rest in strays by name."""
        kept = []
        for tag in tags:
            if tag.name == owner_name:
                kept.append(tag)
            else:
                strays.setdefault(tag.name, []).append(tag)
        # Slice-assign so the caller's list object itself is updated.
        tags[:] = kept

    def sort(self):
        """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.

        WA tags found on any contig are additionally collected in ``self.wa``
        (they are not removed from the contig). Tags whose name matches no
        contig or read are dropped from where they were found and not
        re-attached anywhere.
        """
        # Misplaced tags grouped by the name they refer to. Grouping by name
        # keeps the original encounter order per target and avoids comparing
        # every stray tag with every contig and read.
        stray_ct = {}
        stray_rt = {}
        stray_wr = {}

        # search for tags that aren't in the right position
        for contig in self.contigs:
            if contig.wa:
                if not self.wa:
                    self.wa = []
                self.wa.extend(contig.wa)
            if contig.ct:
                self._split_misplaced(contig.ct, contig.name, stray_ct)
            for read in contig.reads:
                if read.rt:
                    self._split_misplaced(read.rt, read.rd.name, stray_rt)
                if read.wr:
                    self._split_misplaced(read.wr, read.rd.name, stray_wr)

        # now sort them into their proper place
        for contig in self.contigs:
            found = stray_ct.get(contig.name)
            if found:
                if contig.ct is None:
                    contig.ct = []
                contig.ct.extend(found)
            if stray_rt or stray_wr:
                for read in contig.reads:
                    read_name = read.rd.name
                    found = stray_rt.get(read_name)
                    if found:
                        if read.rt is None:
                            read.rt = []
                        read.rt.extend(found)
                    found = stray_wr.get(read_name)
                    if found:
                        if read.wr is None:
                            read.wr = []
                        read.wr.extend(found)
def read(handle):
    """Parse a full ACE file into a list of contigs.

    ``handle`` is iterated line by line. The first line must start with
    ``AS``; its second and third whitespace-separated fields are stored as
    ``ncontigs`` and ``nreads`` (as int). The rest is handed to ``parse`` and
    the resulting contigs are stored in ``contigs``, after which the tags are
    redistributed with ``ACEFileRecord.sort``.

    Returns the populated ACEFileRecord. Raises ValueError if the input is
    empty or does not start with ``AS``.
    """
    handle = iter(handle)

    record = ACEFileRecord()

    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Premature end of file") from None

    # check if the file starts correctly
    if not line.startswith("AS"):
        raise ValueError("File does not start with 'AS'.")

    words = line.split()
    record.ncontigs = int(words[1])
    record.nreads = int(words[2])

    # now read all the records
    record.contigs = list(parse(handle))
    # wa, ct, rt tags are usually at the end of the file, but not necessarily (correct?).
    # If the iterator is used, the tags are returned with the contig or the read after which they appear,
    # if all tags are at the end, they are read with the last contig. The concept of an
    # iterator leaves no other choice. But if the user uses the ACEParser, we can check
    # them and put them into the appropriate contig/read instance.
    # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable...
    record.sort()
    return record