Hukuna's picture
Upload 221 files
ce7bf5b verified
raw
history blame
6.32 kB
# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shlex
from dataclasses import dataclass
@dataclass
class PeekedLine:
    """Mutable record of the most recent line read by ``peek_line``.

    ``peek_line`` overwrites both fields in place on every call, and
    ``advance`` seeks the stream to ``next_position``.
    """

    # Text of the line without its trailing newline ("" at end of file).
    line: str
    # Stream offset that ``advance`` will seek to.
    next_position: int
def peek_line(f, peeked: PeekedLine, rewind=True):
    """Read one line from ``f`` and record it in ``peeked``.

    A trailing newline, if present, is dropped before the text is stored in
    ``peeked.line``.

    With ``rewind=True`` the stream is restored to where it was before the
    read and ``peeked.next_position`` is the offset just past the line, so a
    later ``advance`` consumes the line. With ``rewind=False`` the stream
    stays past the line and ``peeked.next_position`` is the offset where the
    line began, so a later ``advance`` un-reads it.

    Args:
        f: Text stream supporting ``tell``, ``readline`` and ``seek``.
        peeked: Record updated in place.
        rewind: Whether to restore the stream position after reading.

    Returns:
        ``False`` if ``readline`` returned an empty string (end of file),
        ``True`` otherwise.
    """
    start = f.tell()
    text = f.readline()
    got_line = text != ""
    if got_line and text[-1] == "\n":
        text = text[:-1]
    peeked.line = text

    if not rewind:
        # Stay past the line; remember where it began.
        peeked.next_position = start
        return got_line

    # Remember where the line ends, then put the stream back.
    peeked.next_position = f.tell()
    f.seek(start)
    return got_line
def advance(f, peeked: PeekedLine):
    """Seek ``f`` to the offset stored in ``peeked.next_position``."""
    target = peeked.next_position
    f.seek(target)
def star_item_parse(line: str):
    """Split a STAR data item line into category, item name and value.

    Only the first ``.`` separates the category from the rest of the line, so
    a value that itself contains a dot (for example ``50.5``) is returned
    whole rather than truncated.

    Args:
        line: Text of the form ``<category>.<name>``, optionally followed by
            whitespace and a value.

    Returns:
        A ``(category, name, value)`` tuple. ``value`` is the first
        whitespace-delimited token after the name, or ``""`` if there is none.

    Raises:
        Exception: If ``line`` contains no ``.`` or nothing follows the dot.
    """
    cat, dot, rest = line.partition(".")
    if not dot:
        raise Exception(f"expected at least two parts in the STAR data line {line}")
    tokens = rest.split()
    if not tokens:
        # Avoid an opaque IndexError on input such as "_category.".
        raise Exception(f"expected an item name after '.' in the STAR data line {line}")
    name = tokens[0]
    val = tokens[1] if len(tokens) >= 2 else ""
    return (cat, name, val)
def star_read_data(f, names: list, in_loop: bool, cols=False, has_blocks=True):
    """Read selected items of one STAR category from the stream ``f``.

    Args:
        f: Seekable text stream. When ``in_loop`` is true it is expected to be
            positioned at the ``_category.name`` header lines of a loop;
            otherwise at the first ``_category.name value`` item.
        names: Item names (without the category prefix) to extract.
        in_loop: Whether the data is a loop table (header lines followed by
            rows) or a run of single key/value items.
        cols: If true, return one list per requested name (column-major);
            otherwise return one list per row (row-major).
        has_blocks: Forwarded to ``star_read_data_row``, where it selects
            quote-aware tokenising of data lines.

    Returns:
        A list of lists. Requested names that are absent from the data yield
        ``""``. For key/value data with ``cols=False`` the result holds a
        single row.

    Raises:
        Exception: If a loop header line does not split into exactly two
            parts on ``.``; errors from ``star_read_data_row`` and
            ``star_item_parse`` propagate unchanged.
    """
    tab = [[] for _ in names] if cols else []
    peeked = PeekedLine("", 0)

    if in_loop:
        # Collect the column headers; the first non-header line is peeked but
        # left unread for the row reader.
        heads = []
        while peek_line(f, peeked):
            if not peeked.line.startswith("_"):
                break
            parts = peeked.line.split(".")
            if len(parts) != 2:
                raise Exception(
                    f"expected two parts in the STAR data line {peeked.line}"
                )
            heads.append(parts[1].strip())
            advance(f, peeked)

        # With no headers there is nothing to read, and asking the row reader
        # for zero tokens would report success forever.
        if not heads:
            return tab

        # figure out which columns we want (-1 marks a name that is absent)
        indices = [heads.index(name) if name in heads else -1 for name in names]

        # read each row and get the corresponding columns; the row buffer is
        # reused, so values are copied out before the next read
        row = [None] * len(heads)
        while star_read_data_row(f, row, in_loop, has_blocks):
            picked = [row[index] if index >= 0 else "" for index in indices]
            if cols:
                for column, value in zip(tab, picked):
                    column.append(value)
            else:
                tab.append(picked)
    else:
        if not cols:
            tab = [[""] * len(names)]
        category = ""
        row = ["", ""]
        while star_read_data_row(f, row, in_loop, has_blocks, peeked):
            cat, name, _ = star_item_parse(row[0])
            if category == "":
                category = cat
            elif category != cat:
                # A different category has started: seek back to the start of
                # the most recently read line and stop.
                advance(f, peeked)
                break
            if name not in names:
                continue
            idx = names.index(name)
            if cols:
                tab[idx].append(row[1])
            else:
                tab[0][idx] = row[1]

    return tab
def star_read_data_row(
    f, row: list, in_loop: bool, has_blocks: bool, peeked: PeekedLine = None
):
    """Fill ``row`` in place with the next ``len(row)`` tokens read from ``f``.

    Lines are consumed one at a time:

    * a line starting with ``#`` is skipped;
    * a line starting with ``;`` opens a text field: the rest of that line and
      all following lines up to the next line starting with ``;`` (or end of
      file) are concatenated, without newlines, into a single token;
    * any other line is split into tokens, with ``shlex.split`` when
      ``has_blocks`` is true and plain whitespace splitting otherwise.

    Args:
        f: Seekable text stream.
        row: Pre-sized list whose length is the number of tokens wanted.
        in_loop: If true, a line starting with ``_`` also ends the data.
        has_blocks: Whether to tokenise with ``shlex.split``.
        peeked: Optional record updated with the last line read; a private
            one is created when omitted.

    Returns:
        ``True`` once ``row`` is full. ``False`` if, before any token was
        read, the stream hit end of file or a line starting with ``loop_`` /
        ``data_`` (or ``_`` when ``in_loop``); in the latter case the stream
        is moved back to the start of that line.

    Raises:
        Exception: If end of file or a new section is reached after only part
            of the row was read, or if a line supplies more tokens than
            remain to be filled.
    """
    if peeked is None:
        peeked = PeekedLine("", 0)

    # ``row`` is only assigned by index below, so its length is fixed.
    wanted = len(row)
    filled = 0
    while filled < wanted:
        if not peek_line(f, peeked, rewind=False):
            if peeked.line == "" and filled == 0:
                return False
            raise Exception(
                f"read {filled} tokens when {len(row)} were requested: {row}"
            )

        current = peeked.line
        section_start = current.startswith(("loop_", "data_")) or (
            in_loop and current.startswith("_")
        )
        if section_start:
            if filled != 0:
                raise Exception(
                    f"data block ended while reading requested number of tokens: {len(row)}"
                )
            # Un-read the marker line so the caller can process it.
            advance(f, peeked)
            return False

        if current.startswith("#"):
            continue

        if current.startswith(";"):
            pieces = [current[1:]]
            while peek_line(f, peeked, rewind=False):
                if peeked.line.startswith(";"):
                    break
                pieces.append(peeked.line)
            row[filled] = "".join(pieces)
            filled += 1
            continue

        stripped = current.strip()
        elems = shlex.split(stripped) if has_blocks else stripped.split()
        if filled + len(elems) > wanted:
            raise Exception(
                f"too many elements when trying to read {len(row)} tokens; last read: {elems}, row was: {row}, i = {filled}"
            )
        row[filled : filled + len(elems)] = elems
        filled += len(elems)

    return True
def star_string_escape(text):
    """Quote ``text`` so it can be written as a single STAR/CIF value.

    Args:
        text: The raw string value.

    Returns:
        * ``text`` unchanged when it has no quote characters and needs no
          quoting;
        * ``text`` in single quotes when it contains no single quote;
        * ``text`` in double quotes when it contains single but no double
          quotes;
        * a ``;``-delimited text field (``"\\n;" + text + "\\n;"``) when it
          contains both kinds of quote.
    """
    # NOTE: needs_quoting designates whether the string really should be
    # quoted, not based on having quote characters within it, but just because
    # of some other reason (e.g., it has spaces or is empty or starts with
    # underscore, which can have special meaning in CIF).
    needs_quoting = (" " in text) or (text == "") or text.startswith("_")
    has_single = "'" in text
    has_double = '"' in text

    if not has_single:
        if has_double or needs_quoting:
            return f"'{text}'"
        return text
    if not has_double:
        return '"' + text + '"'
    # Both quote characters are present: fall back to a semicolon text field.
    return "\n;" + text + "\n;"
def star_loop_header_write(f, category, names):
    """Write a ``loop_`` line followed by one ``<category>.<name> `` line per name.

    Args:
        f: Writable text stream.
        category: Category prefix placed before the dot.
        names: Iterable of item names, written in order.
    """
    f.write("loop_\n")
    for item in names:
        header = "{}.{} \n".format(category, item)
        f.write(header)
def star_value_defined(val):
    """Return ``True`` unless ``val`` is one of the placeholders ``.`` or ``?``."""
    is_placeholder = (val == ".") or (val == "?")
    return not is_placeholder
def star_value(val, default):
    """Return ``val`` if ``star_value_defined`` accepts it, else ``default``."""
    return val if star_value_defined(val) else default
def atom_site_token(value):
    """Return ``"."`` for a single-space ``value``; otherwise return ``value`` unchanged."""
    if value == " ":
        return "."
    return value