# Copyright 2004 by Harry Zuzan. All rights reserved.
# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Reading information from Affymetrix CEL files version 3 and 4."""

import struct

try:
    import numpy
except ImportError:
    from Bio import MissingPythonDependencyError

    # NumPy is a hard requirement of this module; ``from None`` hides the
    # original ImportError so the user only sees the actionable message.
    raise MissingPythonDependencyError(
        "Install NumPy if you want to use Bio.Affy.CelFile"
    ) from None


class ParserError(ValueError):
    """Affymetrix parser error.

    Subclass of ValueError, so callers that catch ValueError also catch
    ParserError. Constructor arguments are handled by ValueError itself.
    """


class Record:
    """Stores the information in a cel file.

    Every attribute is initialised to None; the parsing functions of this
    module assign the actual values.

    Example usage:

    >>> from Bio.Affy import CelFile
    >>> with open("Affy/affy_v3_example.CEL") as handle:
    ...     c = CelFile.read(handle)
    ...
    >>> print(c.ncols, c.nrows)
    5 5
    >>> print(c.intensities)  # doctest: +NORMALIZE_WHITESPACE
    [[  234.   170. 22177.   164. 22104.]
     [  188.   188. 21871.   168. 21883.]
     [  188.   193. 21455.   198. 21300.]
     [  188.   182. 21438.   188. 20945.]
     [  193. 20370.   174. 20605.   168.]]
    >>> print(c.stdevs)  # doctest: +NORMALIZE_WHITESPACE
    [[  24.    34.5 2669.    19.7 3661.2]
     [  29.8   29.8 2795.9   67.9 2792.4]
     [  29.8   88.7 2976.5   62.  2914.5]
     [  29.8   76.2 2759.5   49.2 2762. ]
     [  38.8 2611.8   26.6 2810.7   24.1]]
    >>> print(c.npix)  # doctest: +NORMALIZE_WHITESPACE
    [[25 25 25 25 25]
     [25 25 25 25 25]
     [25 25 25 25 25]
     [25 25 25 25 25]
     [25 25 25 25 25]]

    """

    def __init__(self):
        """Initialize the class."""
        # CEL format version of the parsed file (3 or 4 in the examples of
        # this module).
        self.version = None

        # Header information. The version 3 parser fills DatHeader key by
        # key and stores AlgorithmParameters as a dict of converted values.
        self.GridCornerUL = None
        self.GridCornerUR = None
        self.GridCornerLR = None
        self.GridCornerLL = None
        self.DatHeader = None
        self.Algorithm = None
        self.AlgorithmParameters = None

        # INTENSITY section: the NumberCells entry, plus per-cell values
        # taken from the MEAN, STDV and NPIXELS columns.
        self.NumberCells = None
        self.intensities = None
        self.stdevs = None
        self.npix = None

        # Array dimensions.
        self.nrows = None
        self.ncols = None

        # MASKS section: the NumberCells entry and the per-cell flags.
        self.nmask = None
        self.mask = None

        # OUTLIERS section: the NumberCells entry and the per-cell flags.
        self.noutliers = None
        self.outliers = None

        # MODIFIED section: the NumberCells entry and the per-cell ORIGMEAN
        # values. nmodified is assigned by the version 3 parser, so it is
        # declared here like nmask and noutliers; otherwise reading it on a
        # record without that entry raises AttributeError.
        self.nmodified = None
        self.modified = None


# NOTE(review): in the collapsed source, this physical line ends with the
# opening of read(). Its signature and first docstring sentence are kept
# verbatim below, still inside a comment as they were on the original "#"
# line, because the rest of that definition lives on the following lines.
# def read(handle, version=None): """Read Affymetrix CEL file and return Record object.
CEL files format versions 3 and 4 are supported. Please specify the CEL file format as 3 or 4 if known for the version argument. If the version number is not specified, the parser will attempt to detect the version from the file contents. The Record object returned by this function stores the intensities from the CEL file in record.intensities. Currently, record.mask and record.outliers are not set in when parsing version 4 CEL files. Example Usage: >>> from Bio.Affy import CelFile >>> with open("Affy/affy_v3_example.CEL") as handle: ... record = CelFile.read(handle) ... >>> record.version == 3 True >>> print("%i by %i array" % record.intensities.shape) 5 by 5 array >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: ... record = CelFile.read(handle, version=4) ... >>> record.version == 4 True >>> print("%i by %i array" % record.intensities.shape) 5 by 5 array """ try: data = handle.read(0) except AttributeError: raise ValueError("handle should be a file handle") from None data = handle.read(4) if not data: raise ValueError("Empty file.") if data == b"[CEL": raise ValueError("CEL file in version 3 format should be opened in text mode") if data == "[CEL": # Version 3 format. Continue to read the header here before passing # control to _read_v3 to avoid having to seek to the beginning of # the file. 
data += next(handle) if data.strip() != "[CEL]": raise ValueError("Failed to parse Affy Version 3 CEL file.") line = next(handle) keyword, value = line.split("=", 1) if keyword != "Version": raise ValueError("Failed to parse Affy Version 3 CEL file.") version = int(value) if version != 3: raise ValueError("Incorrect version number in Affy Version 3 CEL file.") return _read_v3(handle) try: magicNumber = struct.unpack("= 0: min_max_pixel_intensity, filename = value[:i].split() record.DatHeader["filename"] = filename assert min_max_pixel_intensity[0] == "[" assert min_max_pixel_intensity[-1] == "]" ( min_pixel_intensity, max_pixel_intensity, ) = min_max_pixel_intensity[1:-1].split("..") record.DatHeader["min-pixel_intensity"] = int( min_pixel_intensity ) record.DatHeader["max-pixel_intensity"] = int( max_pixel_intensity ) value = value[i + 1 :] index = 0 field = value[index : index + 9] if field[:4] != "CLS=" or field[8] != " ": raise ValueError( "Field does not start with 'CLS=' or have a blank space at position 8" ) record.DatHeader["CLS"] = int(field[4:8]) index += 9 field = value[index : index + 9] if field[:4] != "RWS=" or field[8] != " ": raise ValueError( "Field does not start with 'RWS=' or have a blank space at position 8" ) record.DatHeader["RWS"] = int(field[4:8]) index += 9 field = value[index : index + 7] if field[:4] != "XIN=" or field[6] != " ": raise ValueError( "Field does not start with 'XIN=' or have a blank space at position 6" ) record.DatHeader["XIN"] = int(field[4:6]) index += 7 field = value[index : index + 7] if field[:4] != "YIN=" or field[6] != " ": raise ValueError( "Field does not start with 'YIN=' or have a blank space at poition 6" ) record.DatHeader["YIN"] = int(field[4:6]) index += 7 field = value[index : index + 6] if field[:3] != "VE=" or field[5] != " ": raise ValueError( "Field does not start with 'VE=' or have a blank space at position 5" ) record.DatHeader["VE"] = int(field[3:5]) index += 6 field = value[index : index + 7] if 
field[6] != " ": raise ValueError( "Field value for position 6 isn't a blank space" ) temperature = field[:6].strip() if temperature: record.DatHeader["temperature"] = int(temperature) else: record.DatHeader["temperature"] = None index += 7 field = value[index : index + 4] if not field.endswith(" "): raise ValueError("Field doesn't end with a blank space") record.DatHeader["laser-power"] = float(field) index += 4 field = value[index : index + 18] if field[8] != " ": raise ValueError( "Field value for position 8 isn't a blank space" ) record.DatHeader["scan-date"] = field[:8] if field[17] != " ": raise ValueError( "Field value for position 17 isn't a blank space" ) record.DatHeader["scan-date"] = field[:8] record.DatHeader["scan-time"] = field[9:17] index += 18 value = value[index:] subfields = value.split("\x14") if len(subfields) != 12: ValueError("Subfields length isn't 12") subfield = subfields[0] try: scanner_id, scanner_type = subfield.split() except ValueError: scanner_id = subfield.strip() else: record.DatHeader["scanner-type"] = scanner_type record.DatHeader["scanner-id"] = scanner_id record.DatHeader["array-type"] = subfields[2].strip() field = subfields[7].strip() if field: record.DatHeader["filter-wavelength"] = int(field) field = subfields[8].strip() if field: record.DatHeader["arc-radius"] = float(field) field = subfields[9].strip() if field: record.DatHeader["laser-spotsize"] = float(field) field = subfields[10].strip() if field: record.DatHeader["pixel-size"] = float(field) field = subfields[11].strip() if field: record.DatHeader["image-orientation"] = int(field) elif key == "Algorithm": record.Algorithm = value elif key == "AlgorithmParameters": parameters = value.split(";") values = {} for parameter in parameters: key, value = parameter.split(":", 1) if key in ( "Percentile", "CellMargin", "FullFeatureWidth", "FullFeatureHeight", "PoolWidthExtenstion", "PoolHeightExtension", "NumPixelsToUse", "ExtendPoolWidth", "ExtendPoolHeight", 
"OutlierRatioLowPercentile", "OutlierRatioHighPercentile", "HalfCellRowsDivisor", "HalfCellRowsRemainder", "HighCutoff", "LowCutoff", "featureRows", "featureColumns", ): values[key] = int(value) elif key in ( "OutlierHigh", "OutlierLow", "StdMult", "PercentileSpread", "PairCutoff", "featureWidth", "featureHeight", ): values[key] = float(value) elif key in ( "FixedCellSize", "IgnoreOutliersInShiftRows", "FeatureExtraction", "UseSubgrids", "RandomizePixels", "ImageCalibration", "IgnoreShiftRowOutliers", ): if value == "TRUE": value = True elif value == "FALSE": value = False else: raise ValueError("Unexpected boolean value") values[key] = value elif key in ( "AlgVersion", "ErrorBasis", "CellIntensityCalculationType", ): values[key] = value else: raise ValueError("Unexpected tag in AlgorithmParameters") record.AlgorithmParameters = values elif section == "INTENSITY": if line.startswith("NumberCells="): key, value = line.split("=", 1) record.NumberCells = int(value) elif line.startswith("CellHeader="): key, value = line.split("=", 1) if value.split() != ["X", "Y", "MEAN", "STDV", "NPIXELS"]: raise ParserError( "Unexpected CellHeader in INTENSITY " "section CEL version 3 file" ) else: words = line.split() y = int(words[0]) x = int(words[1]) record.intensities[x, y] = float(words[2]) record.stdevs[x, y] = float(words[3]) record.npix[x, y] = int(words[4]) elif section == "MASKS": if line.startswith("NumberCells="): key, value = line.split("=", 1) record.nmask = int(value) elif line.startswith("CellHeader="): key, value = line.split("=", 1) if value.split() != ["X", "Y"]: raise ParserError( "Unexpected CellHeader in MASKS " "section in CEL version 3 file" ) else: words = line.split() y = int(words[0]) x = int(words[1]) record.mask[x, y] = True elif section == "OUTLIERS": if line.startswith("NumberCells="): key, value = line.split("=", 1) record.noutliers = int(value) elif line.startswith("CellHeader="): key, value = line.split("=", 1) if value.split() != ["X", "Y"]: raise 
ParserError( "Unexpected CellHeader in OUTLIERS " "section in CEL version 3 file" ) else: words = line.split() y = int(words[0]) x = int(words[1]) record.outliers[x, y] = True elif section == "MODIFIED": if line.startswith("NumberCells="): key, value = line.split("=", 1) record.nmodified = int(value) elif line.startswith("CellHeader="): key, value = line.split("=", 1) if value.split() != ["X", "Y", "ORIGMEAN"]: raise ParserError( "Unexpected CellHeader in MODIFIED " "section in CEL version 3 file" ) else: words = line.split() y = int(words[0]) x = int(words[1]) record.modified[x, y] = float(words[2]) return record if __name__ == "__main__": from Bio._utils import run_doctest run_doctest()