aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2004 by Harry Zuzan. All rights reserved.
# Copyright 2016 by Adam Kurkiewicz. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Reading information from Affymetrix CEL files version 3 and 4."""
import struct
try:
import numpy
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install NumPy if you want to use Bio.Affy.CelFile"
) from None
class ParserError(ValueError):
    """Error raised when the contents of a CEL file cannot be parsed.

    The class derives from ValueError, so code that catches ValueError
    also catches ParserError.
    """

    def __init__(self, *args):
        """Pass all positional arguments on to ValueError unchanged."""
        super(ParserError, self).__init__(*args)
class Record:
    """Stores the information in a cel file.

    Every attribute starts out as None and is filled in by the parser
    functions of this module where the file provides the information.

    Example usage:

    >>> from Bio.Affy import CelFile
    >>> with open("Affy/affy_v3_example.CEL") as handle:
    ...     c = CelFile.read(handle)
    ...
    >>> print(c.ncols, c.nrows)
    5 5
    >>> print(c.intensities)
    [[ 234. 170. 22177. 164. 22104.]
    [ 188. 188. 21871. 168. 21883.]
    [ 188. 193. 21455. 198. 21300.]
    [ 188. 182. 21438. 188. 20945.]
    [ 193. 20370. 174. 20605. 168.]]
    >>> print(c.stdevs)
    [[ 24. 34.5 2669. 19.7 3661.2]
    [ 29.8 29.8 2795.9 67.9 2792.4]
    [ 29.8 88.7 2976.5 62. 2914.5]
    [ 29.8 76.2 2759.5 49.2 2762. ]
    [ 38.8 2611.8 26.6 2810.7 24.1]]
    >>> print(c.npix)
    [[25 25 25 25 25]
    [25 25 25 25 25]
    [25 25 25 25 25]
    [25 25 25 25 25]
    [25 25 25 25 25]]

    """

    def __init__(self):
        """Initialize the class with every attribute set to None."""
        # File format version (3 or 4).
        self.version = None
        # Header information.
        self.GridCornerUL = None
        self.GridCornerUR = None
        self.GridCornerLR = None
        self.GridCornerLL = None
        self.DatHeader = None
        self.Algorithm = None
        self.AlgorithmParameters = None
        self.NumberCells = None
        # Per-cell data, stored as arrays of shape (nrows, ncols).
        self.intensities = None
        self.stdevs = None
        self.npix = None
        self.nrows = None
        self.ncols = None
        # Mask, outlier and modified-cell information.
        self.nmask = None
        self.mask = None
        self.noutliers = None
        self.outliers = None
        # The version 3 parser assigns nmodified when it reads the MODIFIED
        # section; initialise it here so that the attribute always exists.
        self.nmodified = None
        self.modified = None
def read(handle, version=None):
    """Read Affymetrix CEL file and return Record object.

    CEL files format versions 3 and 4 are supported. The format is detected
    from the first four characters or bytes read from the handle: version 3
    files start with the text "[CEL" and must be opened in text mode, while
    version 4 files start with the little-endian integer 64 and must be
    opened in binary mode.

    The version argument is accepted for backward compatibility; it is not
    used to select the parser, as the format is always detected from the
    file contents.

    The Record object returned by this function stores the intensities from
    the CEL file in record.intensities.
    Currently, record.mask and record.outliers are not set when parsing
    version 4 CEL files.

    A ValueError is raised if handle is not a file handle, if the file is
    empty or truncated, if it was opened in the wrong mode for its format,
    or if the start of the file is not a valid CEL version 3 or version 4
    header.

    Example Usage:

    >>> from Bio.Affy import CelFile
    >>> with open("Affy/affy_v3_example.CEL") as handle:
    ...     record = CelFile.read(handle)
    ...
    >>> record.version == 3
    True
    >>> print("%i by %i array" % record.intensities.shape)
    5 by 5 array

    >>> with open("Affy/affy_v4_example.CEL", "rb") as handle:
    ...     record = CelFile.read(handle, version=4)
    ...
    >>> record.version == 4
    True
    >>> print("%i by %i array" % record.intensities.shape)
    5 by 5 array

    """
    # Reading zero bytes is a cheap way to check that handle has a read
    # method without consuming any data.
    try:
        handle.read(0)
    except AttributeError:
        raise ValueError("handle should be a file handle") from None
    data = handle.read(4)
    if not data:
        raise ValueError("Empty file.")
    if data == b"[CEL":
        raise ValueError("CEL file in version 3 format should be opened in text mode")
    if data == "[CEL":
        # Version 3 format. Continue to read the header here before passing
        # control to _read_v3 to avoid having to seek to the beginning of
        # the file.
        try:
            data += next(handle)
            if data.strip() != "[CEL]":
                raise ValueError("Failed to parse Affy Version 3 CEL file.")
            line = next(handle)
        except StopIteration:
            # The file ends before the version line; report this in the same
            # way as any other malformed version 3 header.
            raise ValueError("Failed to parse Affy Version 3 CEL file.") from None
        keyword, separator, value = line.partition("=")
        if not separator or keyword != "Version":
            raise ValueError("Failed to parse Affy Version 3 CEL file.")
        version = int(value)
        if version != 3:
            raise ValueError("Incorrect version number in Affy Version 3 CEL file.")
        return _read_v3(handle)
    try:
        magicNumber = struct.unpack("<i", data)
    except TypeError:
        # struct.unpack rejects str, which is what a text-mode handle returns.
        raise ValueError(
            "CEL file in version 4 format should be opened in binary mode"
        ) from None
    except struct.error:
        # Fewer than four bytes were available.
        raise ValueError(
            "Failed to read magic number from Affy Version 4 CEL file"
        ) from None
    if magicNumber != (64,):
        raise ValueError("Incorrect magic number in Affy Version 4 CEL file")
    return _read_v4(handle)
def _read_v4(f):
    """Parse a version 4 (binary) CEL file and return a Record object.

    The argument f is a binary file handle from which the 4-byte magic
    number has already been read by the calling function.

    The returned Record has version, the grid corners, DatHeader, Algorithm
    and AlgorithmParameters (all kept as the raw header strings),
    NumberCells, nrows, ncols and the intensities, stdevs and npix arrays
    set. The mask, outlier and modified attributes are set to None.

    A ParserError is raised if the fixed-size pre-headers cannot be read, if
    the version number is not 4, if one of the axis-inversion or offset
    headers is not 0, if the terminator of the undocumented binary blob is
    not found, or if the cell data are truncated or inconsistent with the
    Rows and Cols headers. A KeyError is raised if an expected text header
    is missing.
    """
    # We follow the documentation here:
    # http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
    record = Record()
    preHeaders = ["version", "columns", "rows", "cellNo", "headerLen"]
    preHeadersMap = {}
    headersMap = {}

    # Load pre-headers. The magic number was already parsed in the read
    # function calling _read_v4.
    preHeadersMap["magic"] = 64
    try:
        for name in preHeaders:
            preHeadersMap[name] = struct.unpack("<i", f.read(4))[0]
    except struct.error:
        raise ParserError("Failed to parse CEL version 4 file") from None

    # The text header consists of key=value lines; everything after the
    # first "=" on a line belongs to the value.
    header = f.read(preHeadersMap["headerLen"]).decode("ascii", "ignore")
    for line in header.split("\n"):
        key, separator, value = line.partition("=")
        if separator:
            headersMap[key] = value

    record.version = preHeadersMap["version"]
    if record.version != 4:
        raise ParserError("Incorrect version number in CEL version 4 file")

    record.GridCornerUL = headersMap["GridCornerUL"]
    record.GridCornerUR = headersMap["GridCornerUR"]
    record.GridCornerLR = headersMap["GridCornerLR"]
    record.GridCornerLL = headersMap["GridCornerLL"]
    record.DatHeader = headersMap["DatHeader"]
    record.Algorithm = headersMap["Algorithm"]
    record.AlgorithmParameters = headersMap["AlgorithmParameters"]
    record.NumberCells = preHeadersMap["cellNo"]
    # record.intensities are set below
    # record.stdevs are set below
    # record.npix are set below
    record.nrows = int(headersMap["Rows"])
    record.ncols = int(headersMap["Cols"])

    # These cannot be reliably set in v4, because of discrepancies between real
    # data and the documented format.
    record.nmask = None
    record.mask = None
    record.noutliers = None
    record.outliers = None
    record.modified = None

    # Real data never seems to have anything but zeros here, but we don't want
    # to take chances. Raising an error is better than returning unreliable
    # data.
    def raiseBadHeader(field, expected):
        actual = int(headersMap[field])
        if actual != expected:
            raise ParserError(
                f"The header {field} is expected to be {expected}, not {actual}"
            )

    raiseBadHeader("Axis-invertX", 0)
    raiseBadHeader("AxisInvertY", 0)
    raiseBadHeader("OffsetX", 0)
    raiseBadHeader("OffsetY", 0)

    # This is unfortunately undocumented, but it turns out that real data has
    # the record.AlgorithmParameters repeated in the data section, until an
    # EOF, i.e. b"\x04".
    blobError = (
        "Parse Error. The parser expects a short, "
        "undocumented binary blob terminating with "
        "ASCII EOF, x04"
    )
    safetyValve = 10**4
    for _ in range(safetyValve):
        char = f.read(1)
        if char == b"\x04":
            break
        if not char:
            # The file ended before the terminator was found.
            raise ParserError(blobError)
    else:
        # The terminator was not found within safetyValve bytes.
        raise ParserError(blobError)

    # After that there are precisely 15 bytes padded. Again, undocumented.
    f.read(15)

    # Each cell is stored as a triplet of the form float, float, signed
    # short (little-endian, 10 bytes in total, no padding).
    cellType = numpy.dtype([("intensity", "<f4"), ("stdev", "<f4"), ("npix", "<i2")])
    if (
        record.NumberCells < 0
        or record.nrows < 0
        or record.ncols < 0
        or record.nrows * record.ncols != record.NumberCells
    ):
        raise ParserError(
            "Number of cells is inconsistent with the Rows and Cols headers "
            "in CEL version 4 file"
        )
    expectedSize = cellType.itemsize * record.NumberCells
    b = f.read(expectedSize)
    if len(b) != expectedSize:
        raise ParserError("Failed to parse CEL version 4 file: cell data truncated")
    cells = numpy.frombuffer(b, dtype=cellType)

    # Convert each column to a new contiguous array; reshaping a contiguous
    # array returns a view and so does not copy the data again.
    shape = (record.nrows, record.ncols)
    record.intensities = cells["intensity"].astype(float).reshape(shape)
    record.stdevs = cells["stdev"].astype(float).reshape(shape)
    record.npix = cells["npix"].astype(int).reshape(shape)

    return record
# Names of the tags in the AlgorithmParameters header line of a version 3
# CEL file, grouped by the type their values are converted to.
_V3_INTEGER_PARAMETERS = frozenset(
    (
        "Percentile",
        "CellMargin",
        "FullFeatureWidth",
        "FullFeatureHeight",
        "PoolWidthExtenstion",
        "PoolHeightExtension",
        "NumPixelsToUse",
        "ExtendPoolWidth",
        "ExtendPoolHeight",
        "OutlierRatioLowPercentile",
        "OutlierRatioHighPercentile",
        "HalfCellRowsDivisor",
        "HalfCellRowsRemainder",
        "HighCutoff",
        "LowCutoff",
        "featureRows",
        "featureColumns",
    )
)
_V3_FLOAT_PARAMETERS = frozenset(
    (
        "OutlierHigh",
        "OutlierLow",
        "StdMult",
        "PercentileSpread",
        "PairCutoff",
        "featureWidth",
        "featureHeight",
    )
)
_V3_BOOLEAN_PARAMETERS = frozenset(
    (
        "FixedCellSize",
        "IgnoreOutliersInShiftRows",
        "FeatureExtraction",
        "UseSubgrids",
        "RandomizePixels",
        "ImageCalibration",
        "IgnoreShiftRowOutliers",
    )
)
_V3_STRING_PARAMETERS = frozenset(
    (
        "AlgVersion",
        "ErrorBasis",
        "CellIntensityCalculationType",
    )
)


def _parse_v3_dat_header(value):
    """Parse the value of the DatHeader line into a dictionary (PRIVATE).

    Raises ValueError if a field does not have the expected layout. A value
    that is too short for the fixed-width fields raises IndexError.
    """
    # not sure if all parameters here are interpreted correctly
    dat_header = {}
    i = value.find(":")
    if i >= 0:
        # The text before the colon is "[min..max] filename".
        min_max_pixel_intensity, filename = value[:i].split()
        dat_header["filename"] = filename
        if not (
            min_max_pixel_intensity.startswith("[")
            and min_max_pixel_intensity.endswith("]")
        ):
            raise ValueError(
                "Pixel intensity range is not enclosed in square brackets"
            )
        min_pixel_intensity, max_pixel_intensity = min_max_pixel_intensity[
            1:-1
        ].split("..")
        dat_header["min-pixel_intensity"] = int(min_pixel_intensity)
        dat_header["max-pixel_intensity"] = int(max_pixel_intensity)
        value = value[i + 1 :]
    # What follows is a sequence of fixed-width fields, each terminated by a
    # blank space.
    index = 0
    field = value[index : index + 9]
    if field[:4] != "CLS=" or field[8] != " ":
        raise ValueError(
            "Field does not start with 'CLS=' or have a blank space at position 8"
        )
    dat_header["CLS"] = int(field[4:8])
    index += 9
    field = value[index : index + 9]
    if field[:4] != "RWS=" or field[8] != " ":
        raise ValueError(
            "Field does not start with 'RWS=' or have a blank space at position 8"
        )
    dat_header["RWS"] = int(field[4:8])
    index += 9
    field = value[index : index + 7]
    if field[:4] != "XIN=" or field[6] != " ":
        raise ValueError(
            "Field does not start with 'XIN=' or have a blank space at position 6"
        )
    dat_header["XIN"] = int(field[4:6])
    index += 7
    field = value[index : index + 7]
    if field[:4] != "YIN=" or field[6] != " ":
        raise ValueError(
            "Field does not start with 'YIN=' or have a blank space at position 6"
        )
    dat_header["YIN"] = int(field[4:6])
    index += 7
    field = value[index : index + 6]
    if field[:3] != "VE=" or field[5] != " ":
        raise ValueError(
            "Field does not start with 'VE=' or have a blank space at position 5"
        )
    dat_header["VE"] = int(field[3:5])
    index += 6
    field = value[index : index + 7]
    if field[6] != " ":
        raise ValueError("Field value for position 6 isn't a blank space")
    # The temperature field may be blank.
    temperature = field[:6].strip()
    dat_header["temperature"] = int(temperature) if temperature else None
    index += 7
    field = value[index : index + 4]
    if not field.endswith(" "):
        raise ValueError("Field doesn't end with a blank space")
    dat_header["laser-power"] = float(field)
    index += 4
    field = value[index : index + 18]
    if field[8] != " ":
        raise ValueError("Field value for position 8 isn't a blank space")
    if field[17] != " ":
        raise ValueError("Field value for position 17 isn't a blank space")
    dat_header["scan-date"] = field[:8]
    dat_header["scan-time"] = field[9:17]
    index += 18
    # The remainder consists of subfields separated by the \x14 character.
    subfields = value[index:].split("\x14")
    if len(subfields) != 12:
        raise ValueError("Subfields length isn't 12")
    subfield = subfields[0]
    try:
        scanner_id, scanner_type = subfield.split()
    except ValueError:
        # Only a scanner ID is present (or the subfield is blank).
        scanner_id = subfield.strip()
    else:
        dat_header["scanner-type"] = scanner_type
    dat_header["scanner-id"] = scanner_id
    dat_header["array-type"] = subfields[2].strip()
    # The remaining subfields are optional; blank ones are skipped.
    for position, name, convert in (
        (7, "filter-wavelength", int),
        (8, "arc-radius", float),
        (9, "laser-spotsize", float),
        (10, "pixel-size", float),
        (11, "image-orientation", int),
    ):
        field = subfields[position].strip()
        if field:
            dat_header[name] = convert(field)
    return dat_header


def _parse_v3_algorithm_parameters(value):
    """Parse the value of the AlgorithmParameters line into a dict (PRIVATE).

    The value is a semicolon-separated list of tag:value pairs. Raises
    ValueError for an unknown tag or an invalid value.
    """
    values = {}
    for parameter in value.split(";"):
        tag, text = parameter.split(":", 1)
        if tag in _V3_INTEGER_PARAMETERS:
            values[tag] = int(text)
        elif tag in _V3_FLOAT_PARAMETERS:
            values[tag] = float(text)
        elif tag in _V3_BOOLEAN_PARAMETERS:
            if text == "TRUE":
                values[tag] = True
            elif text == "FALSE":
                values[tag] = False
            else:
                raise ValueError("Unexpected boolean value")
        elif tag in _V3_STRING_PARAMETERS:
            values[tag] = text
        else:
            raise ValueError("Unexpected tag in AlgorithmParameters")
    return values


def _parse_v3_header_line(record, line):
    """Store the information of one HEADER section line in record (PRIVATE).

    Lines with a key that is not recognized are ignored.
    """
    key, value = line.split("=", 1)
    if key == "Cols":
        record.ncols = int(value)
    elif key == "Rows":
        record.nrows = int(value)
    elif key in ("GridCornerUL", "GridCornerUR", "GridCornerLR", "GridCornerLL"):
        x, y = value.split()
        setattr(record, key, (int(x), int(y)))
    elif key == "DatHeader":
        record.DatHeader = _parse_v3_dat_header(value)
    elif key == "Algorithm":
        record.Algorithm = value
    elif key == "AlgorithmParameters":
        record.AlgorithmParameters = _parse_v3_algorithm_parameters(value)


def _read_v3(handle):
    """Parse a version 3 (text) CEL file and return a Record object.

    The argument handle is a text file handle from which the [CEL] section
    (including the version line) has already been read by the calling
    function.

    The file is read line by line. A line starting with "[" opens a new
    section; the HEADER, INTENSITY, MASKS, OUTLIERS and MODIFIED sections
    are supported. The data arrays of a section are created when the section
    starts, using the Rows and Cols values from the HEADER section, so the
    HEADER section must come first. Data lines store their value at the
    array position [Y, X].

    A ParserError is raised for an unknown section or an unexpected
    CellHeader line; a ValueError is raised if a header line is malformed.
    """
    # Needs error handling.
    # Needs to know the chip design.
    record = Record()
    # The version number was already obtained when the read function calling
    # _read_v3 parsed the CEL section.
    record.version = 3
    section = ""
    for line in handle:
        line = line.rstrip("\r\n")
        if not line:
            continue
        # Set current section
        if line.startswith("[HEADER]"):
            section = "HEADER"
        elif line.startswith("[INTENSITY]"):
            section = "INTENSITY"
            record.intensities = numpy.zeros((record.nrows, record.ncols))
            record.stdevs = numpy.zeros((record.nrows, record.ncols))
            record.npix = numpy.zeros((record.nrows, record.ncols), int)
        elif line.startswith("[MASKS]"):
            section = "MASKS"
            record.mask = numpy.zeros((record.nrows, record.ncols), bool)
        elif line.startswith("[OUTLIERS]"):
            section = "OUTLIERS"
            record.outliers = numpy.zeros((record.nrows, record.ncols), bool)
        elif line.startswith("[MODIFIED]"):
            section = "MODIFIED"
            record.modified = numpy.zeros((record.nrows, record.ncols))
        elif line.startswith("["):
            raise ParserError("Unknown section found in version 3 CEL file")
        # Otherwise, read the data in the current section. Lines found before
        # the first section are ignored.
        elif section == "HEADER":
            _parse_v3_header_line(record, line)
        elif section == "INTENSITY":
            if line.startswith("NumberCells="):
                record.NumberCells = int(line.split("=", 1)[1])
            elif line.startswith("CellHeader="):
                columns = line.split("=", 1)[1].split()
                if columns != ["X", "Y", "MEAN", "STDV", "NPIXELS"]:
                    raise ParserError(
                        "Unexpected CellHeader in INTENSITY "
                        "section CEL version 3 file"
                    )
            else:
                words = line.split()
                # The first column is X and the second is Y; the arrays are
                # indexed as [Y, X].
                col = int(words[0])
                row = int(words[1])
                record.intensities[row, col] = float(words[2])
                record.stdevs[row, col] = float(words[3])
                record.npix[row, col] = int(words[4])
        elif section == "MASKS":
            if line.startswith("NumberCells="):
                record.nmask = int(line.split("=", 1)[1])
            elif line.startswith("CellHeader="):
                if line.split("=", 1)[1].split() != ["X", "Y"]:
                    raise ParserError(
                        "Unexpected CellHeader in MASKS "
                        "section in CEL version 3 file"
                    )
            else:
                words = line.split()
                col = int(words[0])
                row = int(words[1])
                record.mask[row, col] = True
        elif section == "OUTLIERS":
            if line.startswith("NumberCells="):
                record.noutliers = int(line.split("=", 1)[1])
            elif line.startswith("CellHeader="):
                if line.split("=", 1)[1].split() != ["X", "Y"]:
                    raise ParserError(
                        "Unexpected CellHeader in OUTLIERS "
                        "section in CEL version 3 file"
                    )
            else:
                words = line.split()
                col = int(words[0])
                row = int(words[1])
                record.outliers[row, col] = True
        elif section == "MODIFIED":
            if line.startswith("NumberCells="):
                record.nmodified = int(line.split("=", 1)[1])
            elif line.startswith("CellHeader="):
                if line.split("=", 1)[1].split() != ["X", "Y", "ORIGMEAN"]:
                    raise ParserError(
                        "Unexpected CellHeader in MODIFIED "
                        "section in CEL version 3 file"
                    )
            else:
                words = line.split()
                col = int(words[0])
                row = int(words[1])
                record.modified[row, col] = float(words[2])
    return record
if __name__ == "__main__":
    # When executed as a script, run the doctests found in this module.
    from Bio import _utils

    _utils.run_doctest()