Spaces:
No application file
No application file
# Copyright 2016 by Stephen Marshall.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for the cellosaurus.txt file from ExPASy. | |

See https://web.expasy.org/cellosaurus/

Tested with the release of Version 18 (July 2016).

Functions:
 - read       Reads a file containing one cell line entry
 - parse      Reads a file containing multiple cell line entries

Classes:
 - Record     Holds cell line data.

Examples
--------
This example downloads the Cellosaurus database and parses it. Note that
urlopen returns a stream of bytes, while the parser expects a stream of plain
strings, so we use TextIOWrapper to convert bytes to strings using the UTF-8
encoding. This is not needed if you download the cellosaurus.txt file in
advance and open it (see the comment below).

    >>> from urllib.request import urlopen
    >>> from io import TextIOWrapper
    >>> from Bio.ExPASy import cellosaurus
    >>> url = "ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt"
    >>> bytestream = urlopen(url)
    >>> textstream = TextIOWrapper(bytestream, "UTF-8")
    >>> # alternatively, use
    >>> # textstream = open("cellosaurus.txt")
    >>> # if you downloaded the cellosaurus.txt file in advance.
    >>> records = cellosaurus.parse(textstream)
    >>> for record in records:
    ...     if 'Homo sapiens' in record['OX'][0]:
    ...         print(record['ID'])  # doctest:+ELLIPSIS
    ...
    #15310-LN
    #W7079
    (L)PC6
    0.5alpha
    ...

"""
def parse(handle):
    """Iterate over the cell line records in a multi-record file.

    Records are produced lazily, one per iteration; iteration stops as soon
    as the private reader returns a false value instead of a record.

    Arguments:
     - handle - handle to the file.

    """
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)
def read(handle):
    """Return the single cell line record contained in a file.

    Use this for files that hold exactly one record.

    Arguments:
     - handle - handle to the file.

    Raises ValueError if any unread text is left in the handle once the
    first record has been consumed.

    """
    entry = __read(handle)
    # Anything still unread means the file held more than the one record.
    if handle.read():
        raise ValueError("More than one cell line record found")
    return entry
class Record(dict):
    """Store one ExPASy Cellosaurus entry as a Python dictionary.

    A freshly created record already contains every key listed below.
    ID, AC, AS, SY, SX and CA start out as empty strings; all other keys
    start out as empty lists.

    ---------  ---------------------------     ----------------------
    Line code  Content                         Occurrence in an entry
    ---------  ---------------------------     ----------------------
    ID         Identifier (cell line name)     Once; starts an entry
    AC         Accession (CVCL_xxxx)           Once
    AS         Secondary accession number(s)   Optional; once
    SY         Synonyms                        Optional; once
    DR         Cross-references                Optional; once or more
    RX         References identifiers          Optional; once or more
    WW         Web pages                       Optional; once or more
    CC         Comments                        Optional; once or more
    ST         STR profile data                Optional; once or more
    DI         Diseases                        Optional; once or more
    OX         Species of origin               Once or more
    HI         Hierarchy                       Optional; once or more
    OI         Originate from same individual  Optional; once or more
    SX         Sex (gender) of cell            Optional; once
    CA         Category                        Once
    //         Terminator                      Once; ends an entry

    """

    def __init__(self):
        """Create an empty record with all line codes preset."""
        super().__init__()
        text_codes = ("ID", "AC", "AS", "SY", "SX", "CA")
        # Insertion order matters: it is the order in which keys iterate.
        for code in (
            "ID", "AC", "AS", "SY",
            "DR", "RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI",
            "SX", "CA",
        ):
            # A new list is built per key so that no two keys share one.
            self[code] = "" if code in text_codes else []

    def __repr__(self):
        """Return the class name followed by the ID and, if set, the AC."""
        if not self["ID"]:
            return f"{self.__class__.__name__} ( )"
        if not self["AC"]:
            return f"{self.__class__.__name__} ({self['ID']})"
        return f"{self.__class__.__name__} ({self['ID']}, {self['AC']})"

    def __str__(self):
        """Return all fields on one line as space-separated 'CODE: value' pairs."""
        text_codes = ("ID", "AC", "AS", "SY", "SX", "CA")
        pieces = []
        for code in (
            "ID", "AC", "AS", "SY",
            "DR", "RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI",
            "SX", "CA",
        ):
            if code in text_codes:
                # String fields are concatenated as they are.
                pieces.append(code + ": " + self[code])
            else:
                # List fields are shown through their repr.
                pieces.append(code + ": " + repr(self[code]))
        return " ".join(pieces)
# Everything below is private
def __read(handle):
    """Read the next cell line record from the handle.

    Lines are consumed one at a time. The first two characters of a line
    are its line code and everything from the sixth character onwards
    (with trailing whitespace removed) is its value. An ``ID`` line starts
    a new Record; a ``//`` line ends it and the Record is returned. Lines
    whose code is not recognised (header text, blank lines) are skipped.

    Arguments:
     - handle - handle to the file, positioned at or before an entry.

    Returns the Record that was read, or None if the stream ends before
    any ``ID`` line is found.

    Raises ValueError if a data line appears before the ``ID`` line of an
    entry, if a ``DR`` line has no ``;`` separator, or if the stream ends
    in the middle of an entry.

    """
    # Codes stored as one string; repeated lines are concatenated.
    text_keys = ("AC", "AS", "SY", "SX", "CA")
    # Codes stored as a list with one item per line.
    list_keys = ("RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI")

    record = None
    for line in handle:
        key, value = line[:2], line[5:].rstrip()
        if key == "ID":
            record = Record()
            record["ID"] = value
            continue
        if key == "//":
            if record:
                return record
            # A terminator with no open entry: keep scanning.
            continue
        if key != "DR" and key not in text_keys and key not in list_keys:
            continue
        if record is None:
            # Fail with a clear message instead of subscripting None.
            raise ValueError(f"Found {key} line before the ID line of an entry")
        if key in text_keys:
            record[key] += value
        elif key in list_keys:
            record[key].append(value)
        else:
            # DR: "database; identifier". Split on the first ';' only so an
            # identifier that itself contains ';' is kept intact.
            fields = value.split(";", 1)
            if len(fields) != 2:
                raise ValueError(f"Malformed DR line: {value!r}")
            record["DR"].append((fields[0].strip(), fields[1].strip()))
    if record:
        raise ValueError("Unexpected end of stream")
    return None
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio._utils import run_doctest

    run_doctest()