# Copyright 2016 by Stephen Marshall. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """Parser for the cellosaurus.txt file from ExPASy. See https://web.expasy.org/cellosaurus/ Tested with the release of Version 18 (July 2016). Functions: - read Reads a file containing one cell line entry - parse Reads a file containing multiple cell line entries Classes: - Record Holds cell line data. Examples -------- This example downloads the Cellosaurus database and parses it. Note that urlopen returns a stream of bytes, while the parser expects a stream of plain string, so we use TextIOWrapper to convert bytes to string using the UTF-8 encoding. This is not needed if you download the cellosaurus.txt file in advance and open it (see the comment below). >>> from urllib.request import urlopen >>> from io import TextIOWrapper >>> from Bio.ExPASy import cellosaurus >>> url = "ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt" >>> bytestream = urlopen(url) >>> textstream = TextIOWrapper(bytestream, "UTF-8") >>> # alternatively, use >>> # textstream = open("cellosaurus.txt") >>> # if you downloaded the cellosaurus.txt file in advance. >>> records = cellosaurus.parse(textstream) >>> for record in records: ... if 'Homo sapiens' in record['OX'][0]: ... print(record['ID']) # doctest:+ELLIPSIS ... #15310-LN #W7079 (L)PC6 0.5alpha ... """ def parse(handle): """Parse cell line records. This function is for parsing cell line files containing multiple records. Arguments: - handle - handle to the file. """ while True: record = __read(handle) if not record: break yield record def read(handle): """Read one cell line record. This function is for parsing cell line files containing exactly one record. Arguments: - handle - handle to the file. """ record = __read(handle) # We should have reached the end of the record by now remainder = handle.read() if remainder: raise ValueError("More than one cell line record found") return record class Record(dict): """Holds information from an ExPASy Cellosaurus record as a Python dictionary. Each record contains the following keys: --------- --------------------------- ---------------------- Line code Content Occurrence in an entry --------- --------------------------- ---------------------- ID Identifier (cell line name) Once; starts an entry AC Accession (CVCL_xxxx) Once AS Secondary accession number(s) Optional; once SY Synonyms Optional; once DR Cross-references Optional; once or more RX References identifiers Optional: once or more WW Web pages Optional; once or more CC Comments Optional; once or more ST STR profile data Optional; once or more DI Diseases Optional; once or more OX Species of origin Once or more HI Hierarchy Optional; once or more OI Originate from same individual Optional; once or more SX Sex (gender) of cell Optional; once CA Category Once // Terminator Once; ends an entry """ def __init__(self): """Initialize the class.""" dict.__init__(self) self["ID"] = "" self["AC"] = "" self["AS"] = "" self["SY"] = "" self["DR"] = [] self["RX"] = [] self["WW"] = [] self["CC"] = [] self["ST"] = [] self["DI"] = [] self["OX"] = [] self["HI"] = [] self["OI"] = [] self["SX"] = "" self["CA"] = "" def __repr__(self): """Return the canonical string representation of the Record object.""" if self["ID"]: if self["AC"]: return f"{self.__class__.__name__} ({self['ID']}, {self['AC']})" else: return f"{self.__class__.__name__} ({self['ID']})" else: return f"{self.__class__.__name__} ( )" def __str__(self): """Return a readable string representation of the Record object.""" output = "ID: " + self["ID"] output += " AC: " + self["AC"] output += " AS: " + self["AS"] output += " SY: " + self["SY"] output += " DR: " + repr(self["DR"]) output += " RX: " + repr(self["RX"]) output += " WW: " + repr(self["WW"]) output += " CC: " + repr(self["CC"]) output += " ST: " + repr(self["ST"]) output += " DI: " + repr(self["DI"]) output += " OX: " + repr(self["OX"]) output += " HI: " + repr(self["HI"]) output += " OI: " + repr(self["OI"]) output += " SX: " + self["SX"] output += " CA: " + self["CA"] return output # Everything below is private def __read(handle): record = None for line in handle: key, value = line[:2], line[5:].rstrip() if key == "ID": record = Record() record["ID"] = value elif key in ["AC", "AS", "SY", "SX", "CA"]: record[key] += value elif key in [ "AC", "AS", "SY", "RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI", "SX", "CA", ]: record[key].append(value) elif key == "DR": k, v = value.split(";") record["DR"].append((k.strip(), v.strip())) elif key == "//": if record: return record else: continue if record: raise ValueError("Unexpected end of stream") if __name__ == "__main__": from Bio._utils import run_doctest run_doctest()