File size: 3,468 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code to parse the keywlist.txt file from SwissProt/UniProt.

See:
 - https://www.uniprot.org/docs/keywlist.txt

Classes:
 - Record            Stores the information about one keyword or one category
   in the keywlist.txt file.

Functions:
 - parse             Parses the keywlist.txt file and returns an iterator to
   the records it contains.

"""


class Record(dict):
    """Store information of one keyword or category from the keywords list.

    This record stores the information of one keyword or category in the
    keywlist.txt as a Python dictionary. The keys in this dictionary are
    the line codes that can appear in the keywlist.txt file::

        ---------  ---------------------------     ----------------------
        Line code  Content                         Occurrence in an entry
        ---------  ---------------------------     ----------------------
        ID         Identifier (keyword)            Once; starts a keyword entry
        IC         Identifier (category)           Once; starts a category entry
        AC         Accession (KW-xxxx)             Once
        DE         Definition                      Once or more
        SY         Synonyms                        Optional; once or more
        GO         Gene ontology (GO) mapping      Optional; once or more
        HI         Hierarchy                       Optional; once or more
        WW         Relevant WWW site               Optional; once or more
        CA         Category                        Once per keyword entry; absent
                                                   in category entries

    """

    def __init__(self):
        """Initialize the class."""
        dict.__init__(self)
        for keyword in ("DE", "SY", "GO", "HI", "WW"):
            self[keyword] = []


def parse(handle):
    """Parse the keyword list from file handle.

    Returns a generator object which yields keyword entries as
    Bio.SwissProt.KeyWList.Record() object.
    """
    record = Record()
    # First, skip the header - look for start of a record
    for line in handle:
        if line.startswith("ID   "):
            # Looks like there was no header
            record["ID"] = line[5:].strip()
            break
        if line.startswith("IC   "):
            # Looks like there was no header
            record["IC"] = line[5:].strip()
            break
    # Now parse the records
    for line in handle:
        if line.startswith("-------------------------------------"):
            # We have reached the footer
            break
        key = line[:2]
        if key == "//":
            record["DE"] = " ".join(record["DE"])
            record["SY"] = " ".join(record["SY"])
            yield record
            record = Record()
        elif line[2:5] == "   ":
            value = line[5:].strip()
            if key in ("ID", "IC", "AC", "CA"):
                record[key] = value
            elif key in ("DE", "SY", "GO", "HI", "WW"):
                record[key].append(value)
            else:
                raise ValueError(f"Cannot parse line '{line.strip()}'")
    # Read the footer and throw it away
    for line in handle:
        pass