# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2011 by Wibowo Arindrarto ([email protected])
# Revisions copyright 2011-2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO parser for the ABI format.
ABI is the format used by Applied Biosystem's sequencing machines to store
sequencing results.
For more details on the format specification, visit:
http://www6.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf
"""
import datetime
import struct
import sys
from os.path import basename
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
# Dictionary determining which tags go into the SeqRecord annotations.
# Each key is tag_name + tag_number.
# If a tag entry needs to be added, just add its key, with its key
# for the annotations dictionary as the value.
# dictionary for tags that require preprocessing before use in creating
# seqrecords
# Maps an ABIF tag key (tag name + tag number) to the SeqRecord annotation
# key under which that tag's value is stored.
_EXTRACT = dict(
    TUBE1="sample_well",
    DySN1="dye",
    GTyp1="polymer",
    MODL1="machine_model",
)
# Complete data structure representing 98% of the API. The general section
# represents the part of the API that's common to ALL instruments, whereas the
# instrument specific sections are labelled as they are in the ABIF spec
#
# Keys don't seem to clash from machine to machine, so when we parse, we look
# for ANY key, and store that in the raw ABIF data structure attached to the
# annotations, with the assumption that anyone parsing the data can look up the
# spec themselves
#
# Key definitions are retained in case end users want "nice" labels pre-made
# for them for all of the available fields.
# Registry of human-readable tag labels, grouped by instrument section.
# Every inner dictionary maps a tag key (tag name + tag number, the same key
# format used for the raw ABIF data) to a descriptive label.
_INSTRUMENT_SPECIFIC_TAGS = {}
# fmt: off
# "general": tags common to all instruments.
_INSTRUMENT_SPECIFIC_TAGS["general"] = {
"APFN2": "Sequencing Analysis parameters file name",
"APXV1": "Analysis Protocol XML schema version",
"APrN1": "Analysis Protocol settings name",
"APrV1": "Analysis Protocol settings version",
"APrX1": "Analysis Protocol XML string",
"CMNT1": "Sample Comment",
"CTID1": "Container Identifier, a.k.a. plate barcode",
"CTNM1": "Container name, usually identical to CTID, but not necessarily so",
"CTTL1": "Comment Title",
"CpEP1": "Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.",
"DATA1": "Channel 1 raw data",
"DATA2": "Channel 2 raw data",
"DATA3": "Channel 3 raw data",
"DATA4": "Channel 4 raw data",
"DATA5": "Short Array holding measured volts/10 (EP voltage) during run",
"DATA6": "Short Array holding measured milliAmps trace (EP current) during run",
"DATA7": "Short Array holding measured milliWatts trace (Laser EP Power) during run",
"DATA8": "Short Array holding measured oven Temperature (polymer temperature) trace during run",
"DATA9": "Channel 9 processed data",
"DATA10": "Channel 10 processed data",
"DATA11": "Channel 11 processed data",
"DATA12": "Channel 12 processed data",
# Prism 3100/3100-Avant may provide DATA105
# 3130/3130-XL may provide DATA105
# 3530/3530-XL may provide DATA105-199, 9-12, 205-299
"DSam1": "Downsampling factor",
"DySN1": "Dye set name",
"Dye#1": "Number of dyes",
"DyeN1": "Dye 1 name",
"DyeN2": "Dye 2 name",
"DyeN3": "Dye 3 name",
"DyeN4": "Dye 4 name",
"DyeW1": "Dye 1 wavelength",
"DyeW2": "Dye 2 wavelength",
"DyeW3": "Dye 3 wavelength",
"DyeW4": "Dye 4 wavelength",
# 'DyeN5-N': 'Dye 5-N Name',
# 'DyeW5-N': 'Dye 5-N Wavelength',
"EPVt1": "Electrophoresis voltage setting (volts)",
"EVNT1": "Start Run event",
"EVNT2": "Stop Run event",
"EVNT3": "Start Collection event",
"EVNT4": "Stop Collection event",
"FWO_1": 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"',
"GTyp1": "Gel or polymer Type",
"InSc1": "Injection time (seconds)",
"InVt1": "Injection voltage (volts)",
"LANE1": "Lane/Capillary",
"LIMS1": "Sample tracking ID",
"LNTD1": "Length to detector",
"LsrP1": "Laser Power setting (micro Watts)",
"MCHN1": "Instrument name and serial number",
"MODF1": "Data collection module file",
"MODL1": "Model number",
"NAVG1": "Pixels averaged per lane",
"NLNE1": "Number of capillaries",
"OfSc1": "List of scans that are marked off scale in Collection. (optional)",
# OvrI and OvrV are listed as "1-N", and "One for each dye (unanalyzed
# and/or analyzed data)"
"OvrI1": "List of scan number indexes that have values greater than 32767 but did not "
"saturate the camera. In Genemapper samples, this can have indexes with "
"values greater than 32000. In sequencing samples, this cannot have "
"indexes with values greater than 32000.",
"OvrI2": "List of scan number indexes that have values greater than 32767 but did not "
"saturate the camera. In Genemapper samples, this can have indexes with "
"values greater than 32000. In sequencing samples, this cannot have "
"indexes with values greater than 32000.",
"OvrI3": "List of scan number indexes that have values greater than 32767 but did not "
"saturate the camera. In Genemapper samples, this can have indexes with "
"values greater than 32000. In sequencing samples, this cannot have "
"indexes with values greater than 32000.",
"OvrI4": "List of scan number indexes that have values greater than 32767 but did not "
"saturate the camera. In Genemapper samples, this can have indexes with "
"values greater than 32000. In sequencing samples, this cannot have "
"indexes with values greater than 32000.",
"OvrV1": "List of color data values found at the locations listed in the OvrI tag. "
"There must be exactly as many numbers in this array as in the OvrI array.",
"OvrV2": "List of color data values found at the locations listed in the OvrI tag. "
"There must be exactly as many numbers in this array as in the OvrI array.",
"OvrV3": "List of color data values found at the locations listed in the OvrI tag. "
"There must be exactly as many numbers in this array as in the OvrI array.",
"OvrV4": "List of color data values found at the locations listed in the OvrI tag. "
"There must be exactly as many numbers in this array as in the OvrI array.",
"PDMF1": "Sequencing Analysis Mobility file name chosen in collection",
"RMXV1": "Run Module XML schema version",
"RMdN1": "Run Module name (same as MODF)",
"RMdX1": "Run Module XML string",
"RPrN1": "Run Protocol name",
"RPrV1": "Run Protocol version",
"RUND1": "Run Started Date",
"RUND2": "Run Stopped Date",
"RUND3": "Data Collection Started Date",
"RUND4": "Data Collection Stopped date",
"RUNT1": "Run Started Time",
"RUNT2": "Run Stopped Time",
"RUNT3": "Data Collection Started Time",
"RUNT4": "Data Collection Stopped Time",
"Rate1": "Scanning Rate. Milliseconds per frame.",
"RunN1": "Run Name",
"SCAN1": "Number of scans",
"SMED1": "Polymer lot expiration date",
"SMLt1": "Polymer lot number",
"SMPL1": "Sample name",
"SVER1": "Data collection software version",
"SVER3": "Data collection firmware version",
"Satd1": "Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)",
"Scal1": "Rescaling divisor for color data",
"Scan1": "Number of scans (legacy - use SCAN)",
"TUBE1": "Well ID",
"Tmpr1": "Run temperature setting",
"User1": "Name of user who created the plate (optional)",
}
# No instrument specific tags
# _INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = {
# }
# Tags specific to the 3130/3130xl instruments.
# Keys are the 4-character tag name followed by the tag number. The former
# "RMdVa1" key carried a 5-character name, which can never be produced from a
# 4-byte directory tag name, so it is spelled "RMdV1" here.
_INSTRUMENT_SPECIFIC_TAGS["abi_3130/3130xl"] = {
    "CTOw1": "Container owner",
    "HCFG1": "Instrument Class",
    "HCFG2": "Instrument Family",
    "HCFG3": "Official Instrument Name",
    "HCFG4": "Instrument Parameters",
    "RMdV1": "Run Module version",
}
# Tags listed under the "abi_3530/3530xl" instrument section.
# Keys are the tag name followed by the tag number; values are descriptive
# labels only and are not used for parsing decisions in this module.
_INSTRUMENT_SPECIFIC_TAGS["abi_3530/3530xl"] = {
"AAct1": "Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, "
"false if system auditing was disabled.",
"ABED1": "Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.",
"ABID1": "Anode buffer tray first installed date",
"ABLt1": "Anode buffer lot number",
"ABRn1": "Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)",
"ABTp1": "Anode buffer type",
"AEPt1": "Analysis Ending scan number for basecalling on initial analysis",
"AEPt2": "Analysis Ending scan number for basecalling on last analysis",
"APCN1": "Amplicon name",
"ARTN1": "Analysis Return code. Produced only by 5 Prime basecaller 1.0b3",
"ASPF1": "Flag to indicate whether adaptive processing worked or not",
"ASPt1": "Analysis Starting scan number for first analysis",
"ASPt2": "Analysis Starting scan number for last analysis",
"AUDT2": "Audit log used across 3500 software (optional)",
"AVld1": "Assay validation flag (true or false)",
"AmbT1": "Record of ambient temperature readings",
"AsyC1": "The assay contents (xml format)",
"AsyN1": "The assay name",
"AsyV1": "The assay version",
"B1Pt1": "Reference scan number for mobility and spacing curves for first analysis",
"B1Pt2": "Reference scan number for mobility and spacing curves for last analysis",
"BCTS1": "Basecaller timestamp. Time of completion of most recent analysis",
"BcRn1": "Basecalling qc code",
"BcRs1": "Basecalling warnings, a concatenated comma separated string",
"BcRs2": "Basecalling errors, a concatenated comma separated string",
"CAED1": "Capillary array expiration",
"CALt1": "Capillary array lot number",
"CARn1": "Number of injections processed (including the one of which this sample was a part) through the capillary array",
"CASN1": "Capillary array serial number",
"CBED1": "Cathode buffer expiration date",
"CBID1": "Cathode buffer tray first installed date",
"CBLt1": "Cathode buffer lot number",
"CBRn1": "Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)",
"CBTp1": "Cathode buffer type",
"CLRG1": "Start of the clear range (inclusive).",
"CLRG2": "Clear range length",
"CRLn1": "Contiguous read length",
"CRLn2": 'One of "Pass", "Fail", or "Check"',
"CTOw1": "The name entered as the Owner of a plate, in the plate editor",
"CkSm1": "File checksum",
"DCEv1": "A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.",
"DCHT1": "Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.",
"DOEv1": "A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.",
"ESig2": "Electronic signature record used across 3500 software",
"FTab1": "Feature table. Can be created by Nibbler for Clear Range.",
"FVoc1": "Feature table vocabulary. Can be created by Nibbler for Clear Range.",
"Feat1": "Features. Can be created by Nibbler for Clear Range.",
"HCFG1": "The Instrument Class. All upper case, no spaces. Initial valid value: CE",
"HCFG2": "The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)",
"HCFG3": "The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.",
"HCFG4": "Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. "
"Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, "
"ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.",
"InjN1": "Injection name",
"LAST1": "Parameter settings information",
"NOIS1": "The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. "
"Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.",
"P1AM1": "Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position",
"P1RL1": "Deviation of primary peak position from (PLoc,2), times 100, rounded to integer",
"P1WD1": "Full-width Half-max of primary peak, times 100, rounded to integer. "
"Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude",
"P2AM1": "Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position",
"P2BA1": "Base of secondary peak",
"P2RL1": "Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer",
"PBAS1": "Array of sequence characters edited by user",
"PBAS2": "Array of sequence characters as called by Basecaller",
"PCON1": "Array of quality Values (0-255) as edited by user",
"PCON2": "Array of quality values (0-255) as called by Basecaller",
"PDMF2": "Mobility file name chosen in most recent analysis (identical to PDMF1)",
"PLOC1": "Array of peak locations edited by user",
"PLOC2": "Array of peak locations as called by Basecaller",
"PRJT1": "SeqScape 2.0 project template name",
"PROJ4": "SeqScape 2.0 project name",
"PSZE1": "Plate size. The number of sample positions in the container. Current allowed values: 96, 384.",
"PTYP1": "Plate type. Current allowed values: 96-Well, 384-Well.",
"PuSc1": "Median pupscore",
"QV201": "QV20+ value",
"QV202": 'One of "Pass", "Fail", or "Check"',
"QcPa1": "QC parameters",
"QcRn1": "Trimming and QC code",
"QcRs1": "QC warnings, a concatenated comma separated string",
"QcRs2": "QC errors, a concatenated comma separated string",
"RGOw1": "The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.",
"RInj1": "Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.",
"RNmF1": "Raman normalization factor",
"RevC1": "for whether the sequence has been complemented",
"RunN1": "Run name (which, for 3500, is different from injection name)",
"S/N%1": "Signal strength for each dye",
"SMID1": "Polymer first installed date",
"SMRn1": "Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)",
"SPAC1": "Average peak spacing used in last analysis",
"SPAC2": "Basecaller name - corresponds to name of bcp file.",
"SPAC3": "Average peak spacing last calculated by the Basecaller.",
"SPEC1": "Sequencing Analysis Specimen Name",
"SVER2": "Basecaller version number",
"SVER4": "Sample File Format Version String",
"ScPa1": "The parameter string of size caller",
"ScSt1": "Raw data start point. Set to 0 for 3500 data collection.",
"SpeN1": "Active spectral calibration name",
"TrPa1": "Trimming parameters",
"TrSc1": "Trace score.",
"TrSc2": 'One of "Pass", "Fail", or "Check"',
"phAR1": "Trace peak aria ratio",
"phCH1": 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information',
"phDY1": 'Dye ("big", "d-rhod", "unknown"), based on mob file information',
"phQL1": "Maximum Quality Value",
"phTR1": "Set Trim region",
"phTR2": "Trim probability",
}
# Tags listed under the "abi_3730/3730xl" instrument section
# (tag key -> descriptive label).
_INSTRUMENT_SPECIFIC_TAGS["abi_3730/3730xl"] = dict(
    BufT1="Buffer tray heater temperature (degrees C)",
)
# fmt: on
# dictionary for data unpacking format
# Maps an ABIF element type code to the struct format used to unpack one
# element of that type (big-endian is applied by the caller).
# The ABIF "byte" type and the last two members of "thumb" are unsigned 8-bit
# integers, so they use "B"; the signed "b" turned values >= 128 negative.
_BYTEFMT = {
    1: "B",  # byte (unsigned 8-bit)
    2: "s",  # char
    3: "H",  # word
    4: "h",  # short
    5: "i",  # long
    6: "2i",  # rational, legacy unsupported
    7: "f",  # float
    8: "d",  # double
    10: "h2B",  # date
    11: "4B",  # time
    12: "2i2B",  # thumb
    13: "B",  # bool
    14: "2h",  # point, legacy unsupported
    15: "4h",  # rect, legacy unsupported
    16: "2i",  # vPoint, legacy unsupported
    17: "4i",  # vRect, legacy unsupported
    18: "s",  # pString
    19: "s",  # cString
    20: "2i",  # tag, legacy unsupported
}
# struct layout of the file header that follows the 4-byte ABIF marker
_HEADFMT = ">H4sI2H3I"
# struct layout of a single directory entry
_DIRFMT = ">4sI2H4I"
# flat list of every tag key known to any section of
# _INSTRUMENT_SPECIFIC_TAGS (keys shared by several sections appear repeatedly)
__global_tag_listing = []
for tag in _INSTRUMENT_SPECIFIC_TAGS.values():
    __global_tag_listing.extend(tag)
def _get_string_tag(opt_bytes_value, default=None):
    """Return the string value of the given optional raw bytes tag value.

    Arguments:
     - opt_bytes_value - raw tag value as bytes, or None if the tag is absent
     - default - value returned when opt_bytes_value is None

    The bytes are decoded as UTF-8 first. If that fails they are decoded as
    Latin-1, which maps every byte to a character and therefore cannot fail.
    (The previous fallback used sys.getdefaultencoding(), which is always
    "utf-8" on Python 3, so it simply raised the same UnicodeDecodeError.)
    """
    if opt_bytes_value is None:
        return default
    try:
        return opt_bytes_value.decode()
    except UnicodeDecodeError:
        return opt_bytes_value.decode("latin-1")
class AbiIterator(SequenceIterator):
    """Parser for Abi files."""

    def __init__(self, source, trim=False):
        """Return an iterator for the Abi file format.

        Arguments:
         - source - input passed on to SequenceIterator, opened in binary mode
         - trim - if true, sequence records (not FSA records) are trimmed
           with _abi_trim before being yielded
        """
        self.trim = trim
        super().__init__(source, mode="b", fmt="ABI")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Raises ValueError for an empty file and OSError if the file does not
        start with the ABIF marker.
        """
        # check if input file is a valid Abi file
        marker = handle.read(4)
        if not marker:
            # handle empty file gracefully
            raise ValueError("Empty file.")
        if marker != b"ABIF":
            raise OSError(f"File should start ABIF, not {marker!r}")
        return self.iterate(handle)

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects.

        Yields a single SeqRecord. All parsed tags are stored under the
        "abif_raw" annotation. Files without any PBAS tag are treated as FSA
        files and get an empty sequence.

        Raises ValueError if trimming was requested for a sequence file that
        has no quality values.
        """
        # dirty hack for handling time information
        times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}
        # initialize annotations, every extracted annotation defaults to None
        annot = dict.fromkeys(_EXTRACT.values())
        # parse header and extract data from directories
        header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))
        # Set default sample ID value, which we expect to be present in most
        # cases in the SMPL1 tag, but may be missing.
        sample_id = "<unknown id>"
        raw = {}
        seq = qual = None
        for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
            key = tag_name + str(tag_number)
            raw[key] = tag_data
            # PBAS2 is base-called sequence, only available in 3530
            if key == "PBAS2":
                seq = tag_data.decode()
            # PCON2 is quality values of base-called sequence. Each byte is
            # one quality value (0-255), so read the byte values directly;
            # decoding as text breaks on values >= 128.
            elif key == "PCON2":
                qual = list(tag_data)
            # SMPL1 is sample id entered before sequencing run, it must be
            # a string. Keep the placeholder id if the tag has no value.
            elif key == "SMPL1":
                sample_id = _get_string_tag(tag_data, sample_id)
            elif key in times:
                times[key] = tag_data
            elif key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data
        # Files holding only the user-edited base calls (PBAS1) but no
        # basecaller output (PBAS2) are still sequence files: fall back to
        # the edited calls and their quality values instead of failing on a
        # missing sequence.
        if seq is None and "PBAS1" in raw:
            seq = raw["PBAS1"].decode()
            if qual is None and "PCON1" in raw:
                qual = list(raw["PCON1"])
        # set time annotations
        annot["run_start"] = f"{times['RUND1']} {times['RUNT1']}"
        annot["run_finish"] = f"{times['RUND2']} {times['RUNT2']}"
        # raw data (for advanced end users benefit)
        annot["abif_raw"] = raw
        # fsa check
        is_fsa_file = "PBAS1" not in raw and "PBAS2" not in raw
        # use the file name (minus its extension) as SeqRecord.name if the
        # handle exposes one
        extension = ".fsa" if is_fsa_file else ".ab1"
        try:
            file_name = basename(handle.name).replace(extension, "")
        except AttributeError:
            file_name = ""
        if is_fsa_file:
            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
            description = _get_string_tag(raw.get("CTID1"), "<unknown description>")
            record = SeqRecord(
                Seq(""),
                id=sample_id,
                name=file_name,
                description=description,
                annotations=annot,
            )
        else:
            record = SeqRecord(
                Seq(seq),
                id=sample_id,
                name=file_name,
                description="",
                annotations=annot,
            )
        if qual:
            # Expect this to be missing for FSA files.
            record.letter_annotations["phred_quality"] = qual
        elif self.trim and not is_fsa_file:
            raise ValueError(
                "The 'abi-trim' format can not be used for files without"
                " quality values."
            )
        if self.trim and not is_fsa_file:
            record = _abi_trim(record)
        record.annotations["molecule_type"] = "DNA"
        yield record
def _AbiTrimIterator(handle):
    """Build an AbiIterator that has trimming switched on (PRIVATE).

    The given handle is passed straight through to AbiIterator together with
    trim=True, so the records it yields are trimmed SeqRecord objects.
    """
    trimming_iterator = AbiIterator(handle, trim=True)
    return trimming_iterator
def _abi_parse_header(header, handle):
    """Yield the contents of each directory entry (PRIVATE).

    Arguments:
     - header - tuple unpacked from the file header using _HEADFMT
     - handle - seekable binary handle of the file being parsed

    Yields (tag_name, tag_number, tag_data) tuples, where tag_data is the
    value returned by _parse_tag_data for that entry.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    # compile the directory layout once instead of on every iteration
    dir_struct = struct.Struct(_DIRFMT)
    for index in range(head_elem_num):
        start = head_offset + index * head_elem_size
        handle.seek(start)
        dir_entry = dir_struct.unpack(handle.read(dir_struct.size))
        tag_name = dir_entry[0].decode()
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        # if data size <= 4 bytes, data is stored inside the directory entry
        # itself (in place of the offset field, 20 bytes into the entry),
        # so the offset needs to be changed
        if data_size <= 4:
            data_offset = start + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
def _abi_trim(seq_record):
    """Trims the sequence using Richard Mott's modified trimming algorithm (PRIVATE).

    Arguments:
     - seq_record - SeqRecord object to be trimmed.

    Every base gets a score derived from its quality value, and the scores
    are summed into a running (segment) score that is never allowed to drop
    below zero. The returned slice starts where the running score first
    stays non-negative and stops at the position of its maximum. Records of
    20 bases or fewer are returned untouched.

    More about the trimming algorithm:
    http://www.phrap.org/phredphrap/phred.html
    http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Quality_trimming.html
    """
    min_length = 20  # records this short (or shorter) are never trimmed
    cutoff = 0.05  # default cutoff value for calculating base score

    if len(seq_record) <= min_length:
        return seq_record

    # per-base score: cutoff minus the error probability of the base
    base_scores = [
        cutoff - (10 ** (quality / -10.0))
        for quality in seq_record.letter_annotations["phred_quality"]
    ]

    # running score, clamped at zero; position 0 is fixed at 0 because the
    # first base is assumed to always be trimmed out
    running_scores = [0]
    first_kept = None  # position where the running score first was not negative
    for position, base_score in enumerate(base_scores[1:], start=1):
        total = running_scores[-1] + base_score
        if total < 0:
            running_scores.append(0)
            continue
        running_scores.append(total)
        if first_kept is None:
            first_kept = position

    left = 0 if first_kept is None else first_kept
    # right edge: first position holding the highest running score
    right = running_scores.index(max(running_scores))
    return seq_record[left:right]
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Return single data value (PRIVATE).

    Arguments:
     - elem_code - What kind of data
     - elem_num - How many data points
     - raw_data - bytes read from the abi file, from which the tag value
       is unpacked

    Returns None for element codes missing from _BYTEFMT. Otherwise the
    unpacked value is returned: a string for dates and times, a bool for
    booleans, bytes for char/pString/cString data (without the pString
    length byte or the cString terminator), and a number or tuple of
    numbers for everything else.

    Raises ValueError if raw_data does not have the size implied by
    elem_code and elem_num.
    """
    if elem_code not in _BYTEFMT:
        return None
    # because '>1s' unpack differently from '>s'
    num = "" if elem_num == 1 else str(elem_num)
    fmt = ">" + num + _BYTEFMT[elem_code]
    expected_size = struct.calcsize(fmt)
    # explicit check rather than assert, which is stripped under -O
    if len(raw_data) != expected_size:
        raise ValueError(
            f"Expected {expected_size} bytes for element type {elem_code}"
            f" x {elem_num}, got {len(raw_data)}"
        )
    data = struct.unpack(fmt, raw_data)
    # no need to use tuple if len(data) == 1
    # (dates and times stay tuples, they are expanded as arguments below)
    if elem_code not in (10, 11) and len(data) == 1:
        data = data[0]
    # account for different data types
    if elem_code == 10:
        return str(datetime.date(*data))
    if elem_code == 11:
        # fourth field (hundredths of a second) is dropped
        return str(datetime.time(*data[:3]))
    if elem_code == 13:
        return bool(data)
    if elem_code == 18:
        # pString: strip the leading length byte
        return data[1:]
    if elem_code == 19:
        # cString: strip the trailing terminator byte
        return data[:-1]
    return data
# Running this module as a script intentionally does nothing.
if __name__ == "__main__":
    pass