Spaces:
No application file
No application file
# Copyright 2008-2016 by Peter Cock. All rights reserved. | |
# Revisions copyright 2009 by Cymon J. Cox. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the "phd" file format. | |

PHD files are output by PHRED and used by PHRAP and CONSED.

You are expected to use this module via the Bio.SeqIO functions, under the
format name "phd". See also the underlying Bio.Sequencing.Phd module.

For example, using Bio.SeqIO we can read in one of the example PHRED files
from the Biopython unit tests:

>>> from Bio import SeqIO
>>> for record in SeqIO.parse("Phd/phd1", "phd"):
...     print(record.id)
...     print("%s..." % record.seq[:10])
...     print("%s..." % record.letter_annotations["phred_quality"][:10])
34_222_(80-A03-19).b.ab1
ctccgtcgga...
[9, 9, 10, 19, 22, 37, 28, 28, 24, 22]...
425_103_(81-A03-19).g.ab1
cgggatccca...
[14, 17, 22, 10, 10, 10, 15, 8, 8, 9]...
425_7_(71-A03-19).b.ab1
acataaatca...
[10, 10, 10, 10, 8, 8, 6, 6, 6, 6]...

Since PHRED files contain quality scores, you can save them as FASTQ or as
QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
method of the SeqRecord object:

>>> print(record[:50].format("fastq"))
@425_7_(71-A03-19).b.ab1
acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
+
++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
<BLANKLINE>

Or,

>>> print(record[:50].format("qual"))
>425_7_(71-A03-19).b.ab1
10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
<BLANKLINE>

Note these examples only show the first 50 bases to keep the output short.
""" | |
from Bio.SeqIO import QualityIO | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Sequencing import Phd | |
from .Interfaces import SequenceWriter | |
def PhdIterator(source):
    """Iterate over a PHD file, yielding one SeqRecord per parsed record.

    Arguments:
     - source - input stream opened in text mode, or a path to a file

    The actual parsing is delegated to Bio.Sequencing.Phd; this generator
    only repackages each parsed record as a SeqRecord.
    """
    for parsed in Phd.parse(source):
        # Convert the PHD record into a SeqRecord...
        # The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
        # from unit test example file phd_solexa. Using it verbatim as the
        # record identifier would cause problems (e.g. in FASTQ output), so
        # only the first whitespace-delimited token becomes the id/name and
        # the complete string is kept as the description.
        title = parsed.file_name
        identifier = title.split(None, 1)[0]
        converted = SeqRecord(
            parsed.seq, id=identifier, name=identifier, description=title
        )

        # The parsed comments dictionary itself (not a copy) is re-used as
        # the SeqRecord's annotations.
        converted.annotations = parsed.comments
        converted.annotations["molecule_type"] = "DNA"

        # Qualities and peak locations are stored as per-letter annotations.
        per_letter = converted.letter_annotations
        sites = parsed.sites
        per_letter["phred_quality"] = [int(entry[1]) for entry in sites]
        try:
            per_letter["peak_location"] = [int(entry[2]) for entry in sites]
        except IndexError:
            # Peak locations are not always there according to
            # David Gordon (the Consed author), so their absence is fine.
            pass

        yield converted
    # All done
class PhdWriter(SequenceWriter):
    """Class to write Phd format files."""

    def __init__(self, handle):
        """Initialize the class.

        Arguments:
         - handle - output target, passed unchanged to SequenceWriter
        """
        super().__init__(handle)

    def write_record(self, record):
        """Write a single Phd record to the file.

        Arguments:
         - record - SeqRecord with a non-empty sequence and quality scores;
           a "peak_location" per-letter annotation is written as a third
           column when present.

        Raises ValueError if the record has no sequence, if the number of
        quality scores or peak locations differs from the sequence length,
        or if any quality score is None.
        """
        # Explicit check rather than ``assert`` so that the validation is
        # not stripped when Python runs with -O, and so that it matches the
        # ValueError raised by the other checks in this method.
        if not record.seq:
            raise ValueError("No sequence present in SeqRecord")

        # This method returns the 'phred_quality' scores or converted
        # 'solexa_quality' scores if present, else raises a value error
        phred_qualities = QualityIO._get_phred_quality(record)
        peak_locations = record.letter_annotations.get("peak_location")
        if len(record.seq) != len(phred_qualities):
            raise ValueError(
                "Number of phd quality scores does not match length of sequence"
            )
        if peak_locations:
            if len(record.seq) != len(peak_locations):
                raise ValueError(
                    "Number of peak location scores does not "
                    "match length of sequence"
                )
        if None in phred_qualities:
            raise ValueError("A quality value of None was found")

        # Avoid repeating the identifier when the description already
        # starts with it.
        if record.description.startswith(f"{record.id} "):
            title = record.description
        else:
            title = f"{record.id} {record.description}"
        self.handle.write(f"BEGIN_SEQUENCE {self.clean(title)}\nBEGIN_COMMENT\n")

        # Emit the comment block in the keyword order given by Phd.CKEYWORDS;
        # annotations are looked up under the lower-case keyword.
        for annot in [k.lower() for k in Phd.CKEYWORDS]:
            value = None
            if annot == "trim":
                if record.annotations.get("trim"):
                    value = "%s %s %.4f" % record.annotations["trim"]
            elif annot == "trace_peak_area_ratio":
                if record.annotations.get("trace_peak_area_ratio"):
                    value = f"{record.annotations['trace_peak_area_ratio']:.4f}"
            else:
                value = record.annotations.get(annot)
            # Skip missing/empty values, but keep a legitimate zero.
            if value or value == 0:
                self.handle.write(f"{annot.upper()}: {value}\n")

        self.handle.write("END_COMMENT\nBEGIN_DNA\n")
        # The lengths were validated above, so zip() visits every base.
        # Whether peak locations exist is decided once, not once per base.
        if peak_locations:
            for site, quality, peak in zip(
                record.seq, phred_qualities, peak_locations
            ):
                self.handle.write("%s %i %i\n" % (site, round(quality), peak))
        else:
            for site, quality in zip(record.seq, phred_qualities):
                self.handle.write("%s %i\n" % (site, round(quality)))
        self.handle.write("END_DNA\nEND_SEQUENCE\n")
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio import _utils

    _utils.run_doctest()