File size: 6,381 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Copyright 2016 by Stephen Marshall.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parser for the cellosaurus.txt file from ExPASy.

See https://web.expasy.org/cellosaurus/

Tested with the release of Version 18 (July 2016).

Functions:
 - read       Reads a file containing one cell line entry
 - parse      Reads a file containing multiple cell line entries

Classes:
 - Record     Holds cell line data.

Examples
--------
This example downloads the Cellosaurus database and parses it. Note that
urlopen returns a stream of bytes, while the parser expects a stream of plain
string, so we use TextIOWrapper to convert bytes to string using the UTF-8
encoding. This is not needed if you download the cellosaurus.txt file in
advance and open it (see the comment below).

    >>> from urllib.request import urlopen
    >>> from io import TextIOWrapper
    >>> from Bio.ExPASy import cellosaurus
    >>> url = "ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt"
    >>> bytestream = urlopen(url)
    >>> textstream = TextIOWrapper(bytestream, "UTF-8")
    >>> # alternatively, use
    >>> # textstream = open("cellosaurus.txt")
    >>> # if you downloaded the cellosaurus.txt file in advance.
    >>> records = cellosaurus.parse(textstream)
    >>> for record in records:
    ...     if 'Homo sapiens' in record['OX'][0]:
    ...         print(record['ID'])  # doctest:+ELLIPSIS
    ...
    #15310-LN
    #W7079
    (L)PC6
    0.5alpha
    ...

"""


def parse(handle):
    """Parse cell line records.

    This function is for parsing cell line files containing multiple
    records.

    Arguments:
     - handle   - handle to the file.

    """
    while True:
        record = __read(handle)
        if not record:
            break
        yield record


def read(handle):
    """Read one cell line record.

    This function is for parsing cell line files containing
    exactly one record.

    Arguments:
     - handle   - handle to the file.

    """
    record = __read(handle)
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one cell line record found")
    return record


class Record(dict):
    """Holds information from an ExPASy Cellosaurus record as a Python dictionary.

    Each record contains the following keys:

     ---------  ---------------------------     ----------------------
     Line code  Content                         Occurrence in an entry
     ---------  ---------------------------     ----------------------
     ID         Identifier (cell line name)     Once; starts an entry
     AC         Accession (CVCL_xxxx)           Once
     AS         Secondary accession number(s)   Optional; once
     SY         Synonyms                        Optional; once
     DR         Cross-references                Optional; once or more
     RX         References identifiers          Optional: once or more
     WW         Web pages                       Optional; once or more
     CC         Comments                        Optional; once or more
     ST         STR profile data                Optional; once or more
     DI         Diseases                        Optional; once or more
     OX         Species of origin               Once or more
     HI         Hierarchy                       Optional; once or more
     OI         Originate from same individual  Optional; once or more
     SX         Sex (gender) of cell            Optional; once
     CA         Category                        Once
     //         Terminator                      Once; ends an entry

    """

    def __init__(self):
        """Initialize the class."""
        dict.__init__(self)
        self["ID"] = ""
        self["AC"] = ""
        self["AS"] = ""
        self["SY"] = ""
        self["DR"] = []
        self["RX"] = []
        self["WW"] = []
        self["CC"] = []
        self["ST"] = []
        self["DI"] = []
        self["OX"] = []
        self["HI"] = []
        self["OI"] = []
        self["SX"] = ""
        self["CA"] = ""

    def __repr__(self):
        """Return the canonical string representation of the Record object."""
        if self["ID"]:
            if self["AC"]:
                return f"{self.__class__.__name__} ({self['ID']}, {self['AC']})"
            else:
                return f"{self.__class__.__name__} ({self['ID']})"
        else:
            return f"{self.__class__.__name__} ( )"

    def __str__(self):
        """Return a readable string representation of the Record object."""
        output = "ID: " + self["ID"]
        output += " AC: " + self["AC"]
        output += " AS: " + self["AS"]
        output += " SY: " + self["SY"]
        output += " DR: " + repr(self["DR"])
        output += " RX: " + repr(self["RX"])
        output += " WW: " + repr(self["WW"])
        output += " CC: " + repr(self["CC"])
        output += " ST: " + repr(self["ST"])
        output += " DI: " + repr(self["DI"])
        output += " OX: " + repr(self["OX"])
        output += " HI: " + repr(self["HI"])
        output += " OI: " + repr(self["OI"])
        output += " SX: " + self["SX"]
        output += " CA: " + self["CA"]
        return output


# Everything below is private


def __read(handle):
    record = None

    for line in handle:

        key, value = line[:2], line[5:].rstrip()
        if key == "ID":
            record = Record()
            record["ID"] = value
        elif key in ["AC", "AS", "SY", "SX", "CA"]:
            record[key] += value
        elif key in [
            "AC",
            "AS",
            "SY",
            "RX",
            "WW",
            "CC",
            "ST",
            "DI",
            "OX",
            "HI",
            "OI",
            "SX",
            "CA",
        ]:
            record[key].append(value)
        elif key == "DR":
            k, v = value.split(";")
            record["DR"].append((k.strip(), v.strip()))
        elif key == "//":
            if record:
                return record
            else:
                continue
    if record:
        raise ValueError("Unexpected end of stream")


if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()