Spaces:
Sleeping
Sleeping
File size: 6,320 Bytes
ce7bf5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shlex
from dataclasses import dataclass
@dataclass
class PeekedLine:
    """Mutable record describing the line most recently read by ``peek_line``.

    ``peek_line`` fills both fields in place and ``advance`` seeks a stream to
    ``next_position``.
    """

    # Text of the line with its trailing newline (if any) removed.
    line: str
    # Stream offset that ``advance`` will seek to.
    next_position: int
def peek_line(f, peeked: PeekedLine, rewind=True):
    """Read the next line of ``f`` into ``peeked``.

    Args:
        f: Stream supporting ``tell``, ``readline`` and ``seek``.
        peeked: Record updated in place. ``peeked.line`` receives the line
            without its trailing newline (an empty string at end of file).
        rewind: When true, ``f`` is put back where it was before the read and
            ``peeked.next_position`` is the offset just past the line. When
            false, ``f`` stays past the line and ``peeked.next_position`` is
            the offset at which the line started.

    Returns:
        ``False`` if ``readline`` returned an empty string (end of file),
        ``True`` otherwise.
    """
    start = f.tell()
    raw = f.readline()
    has_line = raw != ""
    peeked.line = raw[:-1] if raw.endswith("\n") else raw

    if not rewind:
        # Stream stays after the line; remember where the line began.
        peeked.next_position = start
        return has_line

    # Remember where the line ends, then restore the stream position.
    peeked.next_position = f.tell()
    f.seek(start)
    return has_line
def advance(f, peeked: PeekedLine):
    """Seek ``f`` to the offset stored in ``peeked.next_position``."""
    target = peeked.next_position
    f.seek(target)
def star_item_parse(line: str):
    """Split a STAR data item line into ``(category, name, value)``.

    Only the first ``"."`` separates the category from the item name, so dots
    that appear later on the line (for example in a decimal value such as
    ``10.5``) are left intact.

    Args:
        line: Text of the form ``<category>.<name>`` optionally followed by
            whitespace and a value.

    Returns:
        A tuple ``(category, name, value)``. ``value`` is the second
        whitespace-separated token after the dot, or ``""`` when there is
        none.

    Raises:
        Exception: If ``line`` contains no ``"."`` or nothing follows it.
    """
    category, dot, rest = line.partition(".")
    if not dot:
        raise Exception(f"expected at least two parts in the STAR data line {line}")

    tokens = rest.split()
    if not tokens:
        # Previously surfaced as a bare IndexError with no context.
        raise Exception(f"expected an item name in the STAR data line {line}")

    name = tokens[0]
    val = tokens[1] if len(tokens) >= 2 else ""
    return (category, name, val)
def star_read_data(f, names: list, in_loop: bool, cols=False, has_blocks=True):
    """Read the values of selected items from a STAR/CIF stream.

    Args:
        f: Seekable text stream. With ``in_loop`` it must be positioned at the
            first ``_category.name`` header line of a loop; otherwise at the
            first ``_category.name value`` data item.
        names: Item names (the part after the ``"."``) to extract.
        in_loop: Whether to read a loop (header lines followed by rows) or a
            run of individual data items belonging to one category.
        cols: If true, return one list per requested name; otherwise return
            one list per row.
        has_blocks: Forwarded to ``star_read_data_row`` (selects
            ``shlex.split`` versus plain whitespace splitting of data lines).

    Returns:
        With ``cols`` false: a list of rows, each holding one value per entry
        of ``names`` (``""`` for names that were not found). Outside a loop
        there is exactly one such row.
        With ``cols`` true: a list with one list per entry of ``names``
        holding the values found for that name.

    Raises:
        Exception: If a loop header line does not consist of exactly two
            ``"."``-separated parts, or if ``star_read_data_row`` /
            ``star_item_parse`` reject the input.
    """
    tab = [[] for _ in names] if cols else []
    peeked = PeekedLine("", 0)

    if in_loop:
        # Consume the header lines (those starting with "_"); the stream is
        # left at the first line that is not a header.
        heads = []
        while peek_line(f, peeked):
            if not peeked.line.startswith("_"):
                break
            parts = peeked.line.split(".")
            if len(parts) != 2:
                raise Exception(
                    f"expected two parts in the STAR data line {peeked.line}"
                )
            heads.append(parts[1].strip())
            advance(f, peeked)

        if not heads:
            # A zero-length row makes star_read_data_row return True without
            # consuming input, which would loop forever.
            return tab

        # Map every requested name to the first header with that name
        # (-1 when the loop does not provide it).
        first_index = {}
        for position, head in enumerate(heads):
            first_index.setdefault(head, position)
        indices = [first_index.get(name, -1) for name in names]

        # Read each row and pick out the requested columns.
        row = [None] * len(heads)
        while star_read_data_row(f, row, in_loop, has_blocks):
            values = [row[index] if index >= 0 else "" for index in indices]
            if cols:
                for column, value in zip(tab, values):
                    column.append(value)
            else:
                tab.append(values)
    else:
        if not cols:
            tab = [[""] * len(names)]

        # First position of every requested name, mirroring list.index.
        wanted = {}
        for position, name in enumerate(names):
            wanted.setdefault(name, position)

        category = ""
        row = ["", ""]
        while star_read_data_row(f, row, in_loop, has_blocks, peeked):
            cat, name, _ = star_item_parse(row[0])
            if category == "":
                category = cat
            elif category != cat:
                # A different category starts here: seek back to the start of
                # the last line read so the next reader sees this item again.
                # NOTE(review): for a multi-line ";" value the last line read
                # is the closing ";" line, not the item line -- confirm that
                # this case cannot occur at a category boundary.
                advance(f, peeked)
                break
            idx = wanted.get(name)
            if idx is None:
                continue
            if cols:
                tab[idx].append(row[1])
            else:
                tab[0][idx] = row[1]
    return tab
def star_read_data_row(
    f, row: list, in_loop: bool, has_blocks: bool, peeked: PeekedLine = None
):
    """Fill ``row`` in place with the next ``len(row)`` tokens read from ``f``.

    Tokens may be spread over several lines. Lines starting with ``"#"`` are
    skipped. A line starting with ``";"`` opens a multi-line value that runs
    up to the next line starting with ``";"``; its lines are concatenated
    without separators and stored as a single token.

    Args:
        f: Seekable text stream to read from.
        row: List whose slots are overwritten with the tokens read.
        in_loop: When true, a line starting with ``"_"`` also ends the data.
        has_blocks: When true, lines are tokenised with ``shlex.split``;
            otherwise with plain whitespace splitting.
        peeked: Optional record that receives the state of the last line
            read; a private one is created when omitted.

    Returns:
        ``True`` once ``row`` is full. ``False`` if, before any token was
        read, the stream ended or a ``loop_`` / ``data_`` line (or, with
        ``in_loop``, a line starting with ``"_"``) was met; in the latter
        case the stream is moved back to the start of that line.

    Raises:
        Exception: If the stream or the data block ends after only part of
            the row was read, or if a line carries more tokens than fit.
    """
    if peeked is None:
        peeked = PeekedLine("", 0)

    wanted = len(row)
    filled = 0
    while filled < wanted:
        if not peek_line(f, peeked, rewind=False):
            if filled == 0 and peeked.line == "":
                return False
            raise Exception(f"read {filled} tokens when {len(row)} were requested: {row}")

        current = peeked.line
        ends_block = current.startswith(("loop_", "data_")) or (
            in_loop and current.startswith("_")
        )
        if ends_block:
            if filled != 0:
                raise Exception(
                    f"data block ended while reading requested number of tokens: {len(row)}"
                )
            # Put the terminating line back for whoever reads next.
            advance(f, peeked)
            return False

        if current.startswith(";"):
            # Multi-line value: gather lines up to the closing ";" line.
            pieces = [current[1:]]
            while peek_line(f, peeked, rewind=False):
                if peeked.line.startswith(";"):
                    break
                pieces.append(peeked.line)
            row[filled] = "".join(pieces)
            filled += 1
            continue

        if current.startswith("#"):
            continue

        stripped = current.strip()
        tokens = shlex.split(stripped) if has_blocks else stripped.split()
        if filled + len(tokens) > wanted:
            raise Exception(
                f"too many elements when trying to read {len(row)} tokens; last read: {tokens}, row was: {row}, i = {filled}"
            )
        for slot, token in enumerate(tokens, start=filled):
            row[slot] = token
        filled += len(tokens)

    return True
def star_string_escape(text):
    """Quote ``text`` so that it can be written as a single STAR/CIF value.

    Args:
        text: The raw string value.

    Returns:
        * ``text`` unchanged when it needs no quoting;
        * ``text`` in single quotes when it is empty, contains a space,
          starts with ``"_"`` or contains a double quote (but no single
          quote);
        * ``text`` in double quotes when it contains a single quote (but no
          double quote);
        * ``text`` wrapped in newline + ``";"`` markers when it contains both
          kinds of quote.
    """
    # NOTE: needs_quoting designates whether the string really should be
    # quoted, not based on having quote characters within it, but just because
    # of some other reason (it has spaces, is empty, or starts with an
    # underscore, which can have special meaning in CIF).
    needs_quoting = text == "" or " " in text or text.startswith("_")
    has_single = "'" in text
    has_double = '"' in text

    if has_single and has_double:
        # Neither quote character can delimit the value, so fall back to the
        # ";"-delimited form.
        return "\n;" + text + "\n;"
    if has_single:
        return '"' + text + '"'
    if has_double or needs_quoting:
        return f"'{text}'"
    return text
def star_loop_header_write(f, category, names):
    """Write a STAR loop header to ``f``.

    Emits a ``loop_`` line followed by one ``<category>.<name>`` line per
    entry of ``names``; each of those lines ends with a space and a newline.

    Args:
        f: Object with a ``write`` method accepting text.
        category: Text placed before the ``"."`` on every header line.
        names: Iterable of item names placed after the ``"."``.
    """
    header_lines = ["loop_\n"]
    header_lines.extend(f"{category}.{name} \n" for name in names)
    for text in header_lines:
        f.write(text)
def star_value_defined(val):
    """Return ``True`` unless ``val`` is one of the placeholders ``"."`` or ``"?"``."""
    return val not in (".", "?")
def star_value(val, default):
    """Return ``val`` if ``star_value_defined`` accepts it, else ``default``."""
    return val if star_value_defined(val) else default
def atom_site_token(value):
    """Return ``"."`` for a value that is exactly one space, else ``value`` unchanged.

    ``"."`` is one of the placeholders that ``star_value_defined`` treats as
    undefined.
    """
    if value != " ":
        return value
    return "."
|