# Copyright (C) 2002, Thomas Hamelryck ([email protected])
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Turn an mmCIF file into a dictionary.""" | |
from Bio.File import as_handle | |
class MMCIF2Dict(dict):
    """Parse a mmCIF file and return a dictionary."""

    def __init__(self, filename):
        """Parse a mmCIF file and return a dictionary.

        Arguments:
         - file - name of the PDB file OR an open filehandle

        """
        self.quote_chars = ["'", '"']
        self.whitespace_chars = [" ", "\t"]
        with as_handle(filename) as handle:
            token_stream = self._tokenize(handle)
            try:
                first = next(token_stream)
            except StopIteration:
                # Empty token stream; return here so StopIteration cannot
                # escape __init__ (PEP 479, Python 3.7+).
                return
            # The first token is the data-block header, e.g. "data_1ABC",
            # stored as {"data_": "1ABC"}.
            self[first[0:5]] = first[5:]
            in_loop = False
            pending_key = None
            columns = []
            n_columns = 0
            n_values = 0
            for tok in token_stream:
                if tok.lower() == "loop_":
                    in_loop = True
                    columns = []
                    n_columns = 0
                    n_values = 0
                    continue
                if in_loop:
                    # An underscore token only names a column when we are at
                    # the first column of a row; some mmCIF files (e.g. 4q9r)
                    # have later-column values starting with an underscore,
                    # and those must be read as plain values, not keys.
                    if tok.startswith("_") and (
                        n_columns == 0 or n_values % n_columns == 0
                    ):
                        if n_values > 0:
                            # Data rows were already read, so this token
                            # starts a new key after the loop: drop out of
                            # loop mode and fall through to key handling.
                            in_loop = False
                        else:
                            self[tok] = []
                            columns.append(tok)
                            n_columns += 1
                            continue
                    else:
                        self[columns[n_values % n_columns]].append(tok)
                        n_values += 1
                        continue
                # Plain "_key value" pair: remember the key, then store the
                # next token as its (single-element list) value.
                if pending_key is None:
                    pending_key = tok
                else:
                    self[pending_key] = [tok]
                    pending_key = None

    # Private methods

    def _splitline(self, line):
        # Tokenize one line following the CIF 1.1 syntax:
        # https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
        token_open = False
        # Quote character of the currently open quote, or None if no quote
        # is open.
        active_quote = None
        begin = 0
        for pos, ch in enumerate(line):
            if ch in self.whitespace_chars:
                if token_open and active_quote is None:
                    token_open = False
                    yield line[begin:pos]
            elif ch in self.quote_chars:
                if active_quote is None and not token_open:
                    # Opening quote: the token itself starts after it.
                    active_quote = ch
                    token_open = True
                    begin = pos + 1
                elif ch == active_quote and (
                    pos + 1 == len(line) or line[pos + 1] in self.whitespace_chars
                ):
                    # A quote only closes when followed by whitespace or the
                    # end of the line; otherwise it is a literal character.
                    active_quote = None
                    token_open = False
                    yield line[begin:pos]
            elif ch == "#" and not token_open:
                # Skip comments. "#" is a valid non-comment char inside of a
                # quote and inside of an unquoted token (!?!?), so only stop
                # when it appears between tokens.
                return
            elif not token_open:
                token_open = True
                begin = pos
        if token_open:
            yield line[begin:]
        if active_quote:
            raise ValueError("Line ended with quote open: " + line)

    def _tokenize(self, handle):
        # Yield tokens from the handle, folding ";"-delimited multi-line
        # values into single newline-joined tokens.
        saw_a_line = False
        for line in handle:
            saw_a_line = True
            if line.startswith("#"):
                continue
            elif line.startswith(";"):
                # The spec says that leading whitespace on each line must be
                # preserved while trailing whitespace may be stripped. The
                # trailing newline must be stripped.
                buffered = [line[1:].rstrip()]
                for line in handle:
                    line = line.rstrip()
                    if line.startswith(";"):
                        yield "\n".join(buffered)
                        line = line[1:]
                        if line and line[0] not in self.whitespace_chars:
                            raise ValueError("Missing whitespace")
                        break
                    buffered.append(line)
                else:
                    raise ValueError("Missing closing semicolon")
            # Tokenize the line (or, after a ";" block, whatever remains of
            # the closing line past the semicolon).
            yield from self._splitline(line.strip())
        if not saw_a_line:
            raise ValueError("Empty file.")