# Spaces:
# No application file
# No application file
# Copyright 2011 by Wibowo Arindrarto ([email protected])
# Revisions copyright 2011-2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO parser for the ABI format.

ABI is the format used by Applied Biosystems' sequencing machines to store
sequencing results.

For more details on the format specification, visit:
http://www6.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf
"""
import datetime
import struct
import sys
from os.path import basename

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from .Interfaces import SequenceIterator
# Tags whose data is copied from the raw ABIF directory into the SeqRecord
# annotations dictionary.
# Each key is tag_name + tag_number; each value is the annotations key the
# tag's data is stored under. To expose another tag, add an entry here.
_EXTRACT = {
    "TUBE1": "sample_well",
    "DySN1": "dye",
    "GTyp1": "polymer",
    "MODL1": "machine_model",
}
# Complete data structure representing 98% of the API. The general section
# represents the part of the API that's common to ALL instruments, whereas the
# instrument specific sections are labelled as they are in the ABIF spec
#
# Keys don't seem to clash from machine to machine, so when we parse, we look
# for ANY key, and store that in the raw ABIF data structure attached to the
# annotations, with the assumption that anyone parsing the data can look up the
# spec themselves
#
# Key definitions are retained in case end users want "nice" labels pre-made
# for them for all of the available fields.
# Human-readable descriptions of known tags, grouped by section name.
# Inner keys are tag_name + tag_number; inner values are description strings.
_INSTRUMENT_SPECIFIC_TAGS = {}

# fmt: off
_INSTRUMENT_SPECIFIC_TAGS["general"] = {
    "APFN2": "Sequencing Analysis parameters file name",
    "APXV1": "Analysis Protocol XML schema version",
    "APrN1": "Analysis Protocol settings name",
    "APrV1": "Analysis Protocol settings version",
    "APrX1": "Analysis Protocol XML string",
    "CMNT1": "Sample Comment",
    "CTID1": "Container Identifier, a.k.a. plate barcode",
    "CTNM1": "Container name, usually identical to CTID, but not necessarily so",
    "CTTL1": "Comment Title",
    "CpEP1": "Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.",
    "DATA1": "Channel 1 raw data",
    "DATA2": "Channel 2 raw data",
    "DATA3": "Channel 3 raw data",
    "DATA4": "Channel 4 raw data",
    "DATA5": "Short Array holding measured volts/10 (EP voltage) during run",
    "DATA6": "Short Array holding measured milliAmps trace (EP current) during run",
    "DATA7": "Short Array holding measured milliWatts trace (Laser EP Power) during run",
    "DATA8": "Short Array holding measured oven Temperature (polymer temperature) trace during run",
    "DATA9": "Channel 9 processed data",
    "DATA10": "Channel 10 processed data",
    "DATA11": "Channel 11 processed data",
    "DATA12": "Channel 12 processed data",
    # Prism 3100/3100-Avant may provide DATA105
    # 3130/3130-XL may provide DATA105
    # 3530/3530-XL may provide DATA105-199, 9-12, 205-299
    "DSam1": "Downsampling factor",
    "DySN1": "Dye set name",
    "Dye#1": "Number of dyes",
    "DyeN1": "Dye 1 name",
    "DyeN2": "Dye 2 name",
    "DyeN3": "Dye 3 name",
    "DyeN4": "Dye 4 name",
    "DyeW1": "Dye 1 wavelength",
    "DyeW2": "Dye 2 wavelength",
    "DyeW3": "Dye 3 wavelength",
    "DyeW4": "Dye 4 wavelength",
    # 'DyeN5-N': 'Dye 5-N Name',
    # 'DyeW5-N': 'Dye 5-N Wavelength',
    "EPVt1": "Electrophoresis voltage setting (volts)",
    "EVNT1": "Start Run event",
    "EVNT2": "Stop Run event",
    "EVNT3": "Start Collection event",
    "EVNT4": "Stop Collection event",
    "FWO_1": 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"',
    "GTyp1": "Gel or polymer Type",
    "InSc1": "Injection time (seconds)",
    "InVt1": "Injection voltage (volts)",
    "LANE1": "Lane/Capillary",
    "LIMS1": "Sample tracking ID",
    "LNTD1": "Length to detector",
    "LsrP1": "Laser Power setting (micro Watts)",
    "MCHN1": "Instrument name and serial number",
    "MODF1": "Data collection module file",
    "MODL1": "Model number",
    "NAVG1": "Pixels averaged per lane",
    "NLNE1": "Number of capillaries",
    "OfSc1": "List of scans that are marked off scale in Collection. (optional)",
    # OvrI and OvrV are listed as "1-N", and "One for each dye (unanalyzed
    # and/or analyzed data)"
    "OvrI1": "List of scan number indexes that have values greater than 32767 but did not "
             "saturate the camera. In Genemapper samples, this can have indexes with "
             "values greater than 32000. In sequencing samples, this cannot have "
             "indexes with values greater than 32000.",
    "OvrI2": "List of scan number indexes that have values greater than 32767 but did not "
             "saturate the camera. In Genemapper samples, this can have indexes with "
             "values greater than 32000. In sequencing samples, this cannot have "
             "indexes with values greater than 32000.",
    "OvrI3": "List of scan number indexes that have values greater than 32767 but did not "
             "saturate the camera. In Genemapper samples, this can have indexes with "
             "values greater than 32000. In sequencing samples, this cannot have "
             "indexes with values greater than 32000.",
    "OvrI4": "List of scan number indexes that have values greater than 32767 but did not "
             "saturate the camera. In Genemapper samples, this can have indexes with "
             "values greater than 32000. In sequencing samples, this cannot have "
             "indexes with values greater than 32000.",
    "OvrV1": "List of color data values found at the locations listed in the OvrI tag. "
             "There must be exactly as many numbers in this array as in the OvrI array.",
    "OvrV2": "List of color data values found at the locations listed in the OvrI tag. "
             "There must be exactly as many numbers in this array as in the OvrI array.",
    "OvrV3": "List of color data values found at the locations listed in the OvrI tag. "
             "There must be exactly as many numbers in this array as in the OvrI array.",
    "OvrV4": "List of color data values found at the locations listed in the OvrI tag. "
             "There must be exactly as many numbers in this array as in the OvrI array.",
    "PDMF1": "Sequencing Analysis Mobility file name chosen in collection",
    "RMXV1": "Run Module XML schema version",
    "RMdN1": "Run Module name (same as MODF)",
    "RMdX1": "Run Module XML string",
    "RPrN1": "Run Protocol name",
    "RPrV1": "Run Protocol version",
    "RUND1": "Run Started Date",
    "RUND2": "Run Stopped Date",
    "RUND3": "Data Collection Started Date",
    "RUND4": "Data Collection Stopped date",
    "RUNT1": "Run Started Time",
    "RUNT2": "Run Stopped Time",
    "RUNT3": "Data Collection Started Time",
    "RUNT4": "Data Collection Stopped Time",
    "Rate1": "Scanning Rate. Milliseconds per frame.",
    "RunN1": "Run Name",
    "SCAN1": "Number of scans",
    "SMED1": "Polymer lot expiration date",
    "SMLt1": "Polymer lot number",
    "SMPL1": "Sample name",
    "SVER1": "Data collection software version",
    "SVER3": "Data collection firmware version",
    "Satd1": "Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)",
    "Scal1": "Rescaling divisor for color data",
    "Scan1": "Number of scans (legacy - use SCAN)",
    "TUBE1": "Well ID",
    "Tmpr1": "Run temperature setting",
    "User1": "Name of user who created the plate (optional)",
}
# No instrument specific tags
# _INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = {
# }

# Tag descriptions specific to the 3130/3130xl section.
_INSTRUMENT_SPECIFIC_TAGS["abi_3130/3130xl"] = {
    "CTOw1": "Container owner",
    "HCFG1": "Instrument Class",
    "HCFG2": "Instrument Family",
    "HCFG3": "Official Instrument Name",
    "HCFG4": "Instrument Parameters",
    # Keys are a 4-character tag name plus the tag number, so the previous
    # five-letter "RMdVa1" spelling could never match a parsed key.
    "RMdV1": "Run Module version",
}
# Tag descriptions specific to the "abi_3530/3530xl" section.
# Keys are tag_name + tag_number; values are description strings.
_INSTRUMENT_SPECIFIC_TAGS["abi_3530/3530xl"] = {
    "AAct1": "Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, "
             "false if system auditing was disabled.",
    "ABED1": "Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.",
    "ABID1": "Anode buffer tray first installed date",
    "ABLt1": "Anode buffer lot number",
    "ABRn1": "Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)",
    "ABTp1": "Anode buffer type",
    "AEPt1": "Analysis Ending scan number for basecalling on initial analysis",
    "AEPt2": "Analysis Ending scan number for basecalling on last analysis",
    "APCN1": "Amplicon name",
    "ARTN1": "Analysis Return code. Produced only by 5 Prime basecaller 1.0b3",
    "ASPF1": "Flag to indicate whether adaptive processing worked or not",
    "ASPt1": "Analysis Starting scan number for first analysis",
    "ASPt2": "Analysis Starting scan number for last analysis",
    "AUDT2": "Audit log used across 3500 software (optional)",
    "AVld1": "Assay validation flag (true or false)",
    "AmbT1": "Record of ambient temperature readings",
    "AsyC1": "The assay contents (xml format)",
    "AsyN1": "The assay name",
    "AsyV1": "The assay version",
    "B1Pt1": "Reference scan number for mobility and spacing curves for first analysis",
    "B1Pt2": "Reference scan number for mobility and spacing curves for last analysis",
    "BCTS1": "Basecaller timestamp. Time of completion of most recent analysis",
    "BcRn1": "Basecalling qc code",
    "BcRs1": "Basecalling warnings, a concatenated comma separated string",
    "BcRs2": "Basecalling errors, a concatenated comma separated string",
    "CAED1": "Capillary array expiration",
    "CALt1": "Capillary array lot number",
    "CARn1": "Number of injections processed (including the one of which this sample was a part) through the capillary array",
    "CASN1": "Capillary array serial number",
    "CBED1": "Cathode buffer expiration date",
    "CBID1": "Cathode buffer tray first installed date",
    "CBLt1": "Cathode buffer lot number",
    "CBRn1": "Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)",
    "CBTp1": "Cathode buffer type",
    "CLRG1": "Start of the clear range (inclusive).",
    "CLRG2": "Clear range length",
    "CRLn1": "Contiguous read length",
    "CRLn2": 'One of "Pass", "Fail", or "Check"',
    "CTOw1": "The name entered as the Owner of a plate, in the plate editor",
    "CkSm1": "File checksum",
    "DCEv1": "A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.",
    "DCHT1": "Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.",
    "DOEv1": "A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.",
    "ESig2": "Electronic signature record used across 3500 software",
    "FTab1": "Feature table. Can be created by Nibbler for Clear Range.",
    "FVoc1": "Feature table vocabulary. Can be created by Nibbler for Clear Range.",
    "Feat1": "Features. Can be created by Nibbler for Clear Range.",
    "HCFG1": "The Instrument Class. All upper case, no spaces. Initial valid value: CE",
    "HCFG2": "The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)",
    "HCFG3": "The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.",
    "HCFG4": "Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. "
             "Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, "
             "ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.",
    "InjN1": "Injection name",
    "LAST1": "Parameter settings information",
    "NOIS1": "The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. "
             "Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.",
    "P1AM1": "Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position",
    "P1RL1": "Deviation of primary peak position from (PLoc,2), times 100, rounded to integer",
    "P1WD1": "Full-width Half-max of primary peak, times 100, rounded to integer. "
             "Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude",
    "P2AM1": "Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position",
    "P2BA1": "Base of secondary peak",
    "P2RL1": "Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer",
    "PBAS1": "Array of sequence characters edited by user",
    "PBAS2": "Array of sequence characters as called by Basecaller",
    "PCON1": "Array of quality Values (0-255) as edited by user",
    "PCON2": "Array of quality values (0-255) as called by Basecaller",
    "PDMF2": "Mobility file name chosen in most recent analysis (identical to PDMF1)",
    "PLOC1": "Array of peak locations edited by user",
    "PLOC2": "Array of peak locations as called by Basecaller",
    "PRJT1": "SeqScape 2.0 project template name",
    "PROJ4": "SeqScape 2.0 project name",
    "PSZE1": "Plate size. The number of sample positions in the container. Current allowed values: 96, 384.",
    "PTYP1": "Plate type. Current allowed values: 96-Well, 384-Well.",
    "PuSc1": "Median pupscore",
    "QV201": "QV20+ value",
    "QV202": 'One of "Pass", "Fail", or "Check"',
    "QcPa1": "QC parameters",
    "QcRn1": "Trimming and QC code",
    "QcRs1": "QC warnings, a concatenated comma separated string",
    "QcRs2": "QC errors, a concatenated comma separated string",
    "RGOw1": "The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.",
    "RInj1": "Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.",
    "RNmF1": "Raman normalization factor",
    "RevC1": "for whether the sequence has been complemented",
    "RunN1": "Run name (which, for 3500, is different from injection name)",
    "S/N%1": "Signal strength for each dye",
    "SMID1": "Polymer first installed date",
    "SMRn1": "Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)",
    "SPAC1": "Average peak spacing used in last analysis",
    "SPAC2": "Basecaller name - corresponds to name of bcp file.",
    "SPAC3": "Average peak spacing last calculated by the Basecaller.",
    "SPEC1": "Sequencing Analysis Specimen Name",
    "SVER2": "Basecaller version number",
    "SVER4": "Sample File Format Version String",
    "ScPa1": "The parameter string of size caller",
    "ScSt1": "Raw data start point. Set to 0 for 3500 data collection.",
    "SpeN1": "Active spectral calibration name",
    "TrPa1": "Trimming parameters",
    "TrSc1": "Trace score.",
    "TrSc2": 'One of "Pass", "Fail", or "Check"',
    "phAR1": "Trace peak aria ratio",
    "phCH1": 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information',
    "phDY1": 'Dye ("big", "d-rhod", "unknown"), based on mob file information',
    "phQL1": "Maximum Quality Value",
    "phTR1": "Set Trim region",
    "phTR2": "Trim probability",
}
# Tag descriptions specific to the "abi_3730/3730xl" section.
_INSTRUMENT_SPECIFIC_TAGS["abi_3730/3730xl"] = {
    "BufT1": "Buffer tray heater temperature (degrees C)",
}
# fmt: on
# Maps an ABIF element type code to the struct format used to unpack one
# element of that type. The byte-order prefix and the element count are
# prepended in _parse_tag_data; codes missing from this table are not parsed.
_BYTEFMT = {
    # NOTE(review): "b" unpacks a *signed* 8-bit value; confirm against the
    # ABIF spec whether the "byte" type is meant to be unsigned ("B").
    1: "b",  # byte
    2: "s",  # char
    3: "H",  # word
    4: "h",  # short
    5: "i",  # long
    6: "2i",  # rational, legacy unsupported
    7: "f",  # float
    8: "d",  # double
    10: "h2B",  # date
    11: "4B",  # time
    # NOTE(review): the two trailing one-byte fields are unpacked as signed
    # here; confirm against the ABIF spec's definition of "thumb".
    12: "2i2b",  # thumb
    13: "B",  # bool
    14: "2h",  # point, legacy unsupported
    15: "4h",  # rect, legacy unsupported
    16: "2i",  # vPoint, legacy unsupported
    17: "4i",  # vRect, legacy unsupported
    18: "s",  # pString
    19: "s",  # cString
    20: "2i",  # tag, legacy unsupported
}
# header data structure (excluding 4 byte ABIF marker), big-endian:
# file version (H), tag name (4s), tag number (I), element type code and
# element size (2H), number of elements, data size and data offset (3I)
_HEADFMT = ">H4sI2H3I"
# directory entry data structure, big-endian:
# tag name (4s), tag number (I), element type code and element size (2H),
# number of elements, data size, data offset and one trailing field (4I)
_DIRFMT = ">4sI2H4I"
# Flat list of every known tag key across all sections of
# _INSTRUMENT_SPECIFIC_TAGS, in section order. Built with a comprehension so
# that no loop variable is left behind in the module namespace.
__global_tag_listing = [
    key for tags in _INSTRUMENT_SPECIFIC_TAGS.values() for key in tags
]
def _get_string_tag(opt_bytes_value, default=None):
    """Return the string value of the given optional raw bytes tag value.

    Arguments:
     - opt_bytes_value - raw tag value as bytes, or None if the tag is absent
     - default - value returned when opt_bytes_value is None

    The bytes are decoded as UTF-8 first. If that fails they are decoded as
    Latin-1, which maps every byte to a character and therefore cannot fail.
    """
    if opt_bytes_value is None:
        return default
    try:
        return opt_bytes_value.decode()
    except UnicodeDecodeError:
        # The previous fallback used sys.getdefaultencoding(), which is
        # UTF-8 again on Python 3 and so re-raised the very same error.
        return opt_bytes_value.decode("latin-1")
class AbiIterator(SequenceIterator):
    """Parser for Abi files."""

    def __init__(self, source, trim=False):
        """Return an iterator for the Abi file format.

        Arguments:
         - source - input passed on to SequenceIterator (opened in binary mode)
         - trim - if True, records with base calls are quality-trimmed with
           _abi_trim before being yielded
        """
        self.trim = trim
        super().__init__(source, mode="b", fmt="ABI")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Raises ValueError if the handle is empty and OSError if the first
        four bytes are not the ``ABIF`` marker.
        """
        # check if input file is a valid Abi file
        marker = handle.read(4)
        if not marker:
            # handle empty file gracefully
            raise ValueError("Empty file.")
        if marker != b"ABIF":
            raise OSError(f"File should start ABIF, not {marker!r}")
        return self.iterate(handle)

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects.

        Expects the handle to be positioned just after the 4 byte ABIF
        marker. Yields a single SeqRecord whose annotations hold the tags
        selected by _EXTRACT, the run start/finish times and, under
        ``abif_raw``, every parsed directory entry.
        """
        # dirty hack for handling time information
        times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

        # initialize annotations: every extracted annotation defaults to None
        annot = dict.fromkeys(_EXTRACT.values())

        # parse header and extract data from directories
        header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

        # Set default sample ID value, which we expect to be present in most
        # cases in the SMPL1 tag, but may be missing.
        sample_id = "<unknown id>"

        raw = {}
        seq = qual = None
        for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
            key = tag_name + str(tag_number)
            raw[key] = tag_data

            if key == "PBAS2":
                # PBAS2 is base-called sequence, only available in 3530
                seq = tag_data.decode()
            elif key == "PCON2":
                # PCON2 is quality values of base-called sequence. Iterating
                # the raw data yields the integer values directly; decoding
                # it as text first breaks for values above 127.
                qual = list(tag_data)
            elif key == "SMPL1":
                # SMPL1 is sample id entered before sequencing run, it must
                # be a string.
                sample_id = _get_string_tag(tag_data)
            elif key in times:
                times[key] = tag_data
            elif key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

        # set time annotations
        annot["run_start"] = f"{times['RUND1']} {times['RUNT1']}"
        annot["run_finish"] = f"{times['RUND2']} {times['RUNT2']}"

        # raw data (for advanced end users benefit)
        annot["abif_raw"] = raw

        # fsa check: a file without any base calls is treated as an FSA file
        is_fsa_file = all(tn not in raw for tn in ("PBAS1", "PBAS2"))

        # use the file name (minus its extension) as SeqRecord.name if the
        # handle exposes one
        extension = ".fsa" if is_fsa_file else ".ab1"
        try:
            file_name = basename(handle.name).replace(extension, "")
        except AttributeError:
            file_name = ""

        if is_fsa_file:
            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
            description = _get_string_tag(raw.get("CTID1"), "<unknown description>")
            record = SeqRecord(
                Seq(""),
                id=sample_id,
                name=file_name,
                description=description,
                annotations=annot,
            )
        else:
            # NOTE(review): seq is still None here when the file has PBAS1
            # but no PBAS2 - confirm that case is handled as intended.
            record = SeqRecord(
                Seq(seq),
                id=sample_id,
                name=file_name,
                description="",
                annotations=annot,
            )

        if qual:
            # Expect this to be missing for FSA files.
            record.letter_annotations["phred_quality"] = qual
        elif self.trim and not is_fsa_file:
            raise ValueError(
                "The 'abi-trim' format can not be used for files without"
                " quality values."
            )

        if self.trim and not is_fsa_file:
            record = _abi_trim(record)

        record.annotations["molecule_type"] = "DNA"
        yield record
def _AbiTrimIterator(handle):
    """Return an iterator for the Abi file format that yields trimmed SeqRecord objects (PRIVATE).

    Equivalent to constructing AbiIterator with ``trim=True``.
    """
    return AbiIterator(handle, trim=True)
def _abi_parse_header(header, handle):
    """Yield the contents of each directory entry (PRIVATE).

    Arguments:
     - header - tuple unpacked from the file header with _HEADFMT
     - handle - seekable binary handle of the file being parsed

    Yields ``(tag_name, tag_number, data)`` tuples, where ``tag_name`` is the
    decoded tag name and ``data`` is whatever _parse_tag_data returns for the
    entry.
    """
    # header fields unpacked by _HEADFMT (after the ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements,
    # data size, data offset
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]

    dir_entry_size = struct.calcsize(_DIRFMT)

    for index in range(head_elem_num):
        start = head_offset + index * head_elem_size
        handle.seek(start)
        (
            raw_tag_name,
            tag_number,
            elem_code,
            _elem_size,
            elem_num,
            data_size,
            data_offset,
            _data_handle,
        ) = struct.unpack(_DIRFMT, handle.read(dir_entry_size))
        tag_name = raw_tag_name.decode()

        # if data size <= 4 bytes, data is stored inside the directory entry
        # itself (in place of the offset field, 20 bytes into the entry),
        # so the offset needs to be changed
        if data_size <= 4:
            data_offset = start + 20

        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
def _abi_trim(seq_record, cutoff=0.05, segment=20):
    """Trims the sequence using Richard Mott's modified trimming algorithm (PRIVATE).

    Arguments:
     - seq_record - SeqRecord object to be trimmed.
     - cutoff - value each base's error probability is subtracted from to
       obtain its score (default 0.05).
     - segment - records of this length or shorter are returned untrimmed
       (default 20).

    Trimmed bases are determined from their segment score, which is a
    cumulative sum of each base's score. Base scores are calculated from
    their quality values.

    More about the trimming algorithm:
    http://www.phrap.org/phredphrap/phred.html
    http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Quality_trimming.html
    """
    if len(seq_record) <= segment:
        return seq_record

    # base score = cutoff - error probability implied by the quality value
    base_scores = [
        cutoff - (10 ** (qual / -10.0))
        for qual in seq_record.letter_annotations["phred_quality"]
    ]

    # Cumulative score, reset to 0 whenever it would drop below 0. The first
    # value is fixed at 0, because of the assumption that the first base
    # will always be trimmed out.
    cumulative = [0]
    trim_start = 0
    started = False
    for index, base_score in enumerate(base_scores[1:], start=1):
        running = cumulative[-1] + base_score
        if running < 0:
            cumulative.append(0)
            continue
        cumulative.append(running)
        if not started:
            # trim_start = first index whose cumulative score is not negative
            trim_start = index
            started = True

    # trim_finish = index of highest cumulative score,
    # marking the end of sequence segment with highest cumulative score
    # NOTE(review): the slice end is exclusive, so the base at trim_finish
    # itself is dropped - confirm this is intended.
    trim_finish = cumulative.index(max(cumulative))
    return seq_record[trim_start:trim_finish]
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Return single data value (PRIVATE).

    Arguments:
     - elem_code - What kind of data
     - elem_num - How many data points
     - raw_data - bytes holding the packed data of the tag

    Returns None when elem_code has no entry in _BYTEFMT. Raises ValueError
    when the length of raw_data does not match the size implied by elem_code
    and elem_num.
    """
    if elem_code not in _BYTEFMT:
        return None

    # because '>1s' unpack differently from '>s'
    count = "" if elem_num == 1 else str(elem_num)
    fmt = ">" + count + _BYTEFMT[elem_code]

    # Explicit check instead of an assert, which is stripped under -O.
    expected_size = struct.calcsize(fmt)
    if len(raw_data) != expected_size:
        raise ValueError(
            f"Unexpected data size for ABIF element type {elem_code}: "
            f"expected {expected_size} bytes, got {len(raw_data)}"
        )
    data = struct.unpack(fmt, raw_data)

    # date and time always need all of their unpacked fields
    if elem_code == 10:
        return str(datetime.date(*data))
    if elem_code == 11:
        # only the first three fields are passed to datetime.time
        return str(datetime.time(*data[:3]))

    # no need to use tuple if len(data) == 1
    if len(data) == 1:
        data = data[0]

    # account for different data types
    if elem_code == 13:
        return bool(data)
    if elem_code == 18:
        # pString: drop the first byte
        return data[1:]
    if elem_code == 19:
        # cString: drop the last byte
        return data[:-1]
    return data
if __name__ == "__main__":
    # Nothing to do when this module is executed directly.
    pass