import json
import logging
import os
from pathlib import Path
from typing import Any, List, Optional, Set
import anndata
import dill
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from anndata import AnnData
from Bio.SeqIO.FastaIO import SimpleFastaParser
logger = logging.getLogger(__name__)

def create_dirs(paths: List[str]) -> None:
    """Create each directory in `paths` that does not exist yet."""
    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)

def save(path: Path, data: object, ignore_ext: bool = False) -> Path:
    """Saves data to this path. The extension, and thus the saving function, is
    determined from the type. If the correct extension is already part of the
    path, that is also fine.
    At the moment we handle:
    - pyplot figures -> .pdf
    - dictionaries -> .yaml
    - lists -> .yaml
    - numpy arrays -> .npy
    - pandas dataframes -> .tsv
    - anndata objects -> .h5ad
    - strings -> .txt
    - _anything else_ -> .p (pickled with `dill`)
    Parameters
    ----------
    path : Path
        The full path to save to
    data : object
        Data to save
    ignore_ext : bool
        Whether to skip adding the expected extension
    Returns
    -------
    Path
        The final path to the file
    """
    if not isinstance(path, Path):
        path = Path(path)
    # Make sure the folder exists:
    path.parent.mkdir(parents=True, exist_ok=True)
    # Load the column groupings; only needed for the AnnData branch below:
    annotation_path = os.path.dirname(os.path.abspath(__file__))
    with open(annotation_path + "/tcga_anndata_groupings.yaml", "r") as stream:
        tcga_annotations = yaml.safe_load(stream)

    def make_path(p: Path, ext: str) -> Path:
        """If the path doesn't end with the given extension, append it.
        Parameters
        ----------
        p : Path
            The path
        ext : str
            The expected extension
        Returns
        -------
        Path
            The fixed path
        """
        if not ignore_ext and not p.name.endswith(ext):
            return p.parent.joinpath(f"{p.name}{ext}")
        return p

    # PyPlot figure -> PDF
    if isinstance(data, mpl.figure.Figure):
        path = make_path(path, ".pdf")
        data.savefig(path)
        plt.close(data)
    # Dict -> YAML file
    elif isinstance(data, dict):
        path = make_path(path, ".yaml")
        with open(path, "w") as fp:
            yaml.dump(data, fp)
    # List -> YAML file
    elif isinstance(data, list):
        path = make_path(path, ".yaml")
        with open(path, "w") as fp:
            yaml.dump(data, fp)
    # NumPy array
    elif isinstance(data, np.ndarray):
        path = make_path(path, ".npy")
        np.save(path, data)
    # DataFrame -> TSV
    elif isinstance(data, pd.DataFrame):
        path = make_path(path, ".tsv")
        data.to_csv(path, sep="\t")
    # AnnData
    elif isinstance(data, anndata.AnnData):
        path = make_path(path, ".h5ad")
        # h5ad cannot store datetime columns, so format them as date strings first:
        for date_col in set(tcga_annotations["anndata"]["obs"]["datetime_columns"]) & set(data.obs.columns):
            if "datetime" in data.obs[date_col].dtype.name:
                data.obs[date_col] = data.obs[date_col].dt.strftime("%Y-%m-%d")
            else:
                logger.info(f"Column {date_col} in obs should be a date but isn't formatted as one.")
        data.write(path)
    # Strings -> plain text files
    elif isinstance(data, str):
        path = make_path(path, ".txt")
        with open(path, "w") as fp:
            fp.write(data)
    # Everything else -> pickle (via dill)
    else:
        path = make_path(path, ".p")
        with open(path, "wb") as fp:
            dill.dump(data, fp)
    return path
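
# A minimal usage sketch for `save` (the paths and objects are hypothetical,
# not part of this module). The extension is inferred from the object's type:
#
#     save(Path("results/params"), {"lr": 1e-3})              # -> results/params.yaml
#     save(Path("results/counts"), np.zeros((3, 3)))          # -> results/counts.npy
#     save(Path("results/table"), pd.DataFrame({"a": [1]}))   # -> results/table.tsv
#     save(Path("results/run.log"), "done", ignore_ext=True)  # -> results/run.log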

def _resolve_path(path: Path) -> Path:
    """Given a path, will try to resolve it in multiple ways:
    1. Is it a global/local file that exists?
    2. Is it a path that is the prefix of a unique file?
    Parameters
    ----------
    path : Path
        The path
    Returns
    -------
    Path
        The resolved global file path.
    Raises
    ------
    FileNotFoundError
        If the file doesn't exist, or if multiple files match the glob.
    """
    if not path.is_absolute():
        path = path.expanduser().resolve()
    # If it exists we'll take it:
    if path.exists():
        return path
    # But mostly we load files without the extension, so we glob for a unique file:
    glob_name = path.name if path.name.endswith("*") else path.name + "*"
    paths = list(path.parent.glob(glob_name))
    if len(paths) == 1:
        return paths[0]  # the glob was unique
    raise FileNotFoundError(
        f"Was trying to resolve path\n\t{path}*\nbut it was ambiguous: no or multiple files fit the glob."
    )
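
# A hypothetical example of the prefix resolution: with a single file
# `/data/runs/LC__final__21.7.2.tsv` on disk,
#
#     _resolve_path(Path("/data/runs/LC__final"))
#
# globs for `LC__final*`, finds exactly one match, and returns its full path.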

def _to_int_string(element: Any) -> str:
    """Casts a number to a fixed-format string that is nicely categorizable.
    Parameters
    ----------
    element : Any
        The number, float or int
    Returns
    -------
    str
        Either the number formatted as a string, or the original input if the
        cast did not work
    """
    try:
        fl = float(element)
        return f"{fl:0.0f}"
    except (ValueError, TypeError):
        return element
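
# For example, `_to_int_string(3.0)` and `_to_int_string("3")` both return "3",
# while a non-numeric input such as "stage iv" is returned unchanged.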

def cast_anndata(ad: AnnData) -> None:
    """Fixes the data types in the `.obs` and `.var` DataFrame columns of an
    AnnData object. __Works in-place__. Currently does the following:
    1.1. Enforces numerical-categorical `.obs` columns
    1.2. Makes all other string `.obs` columns categorical
    1.3. Makes date-time `.obs` columns non-categorical pandas `datetime64`
    1.4. Enforces real string `.obs` columns to be strings, not categoricals
    1.5. Enforces some numerical `.obs` columns
    Which column belongs to which group is configured in
    `tcga_anndata_groupings.yaml`, located next to this module.
    Parameters
    ----------
    ad : AnnData
        The AnnData object
    """
    # 1. Fix obs-annotation dtypes
    # 1.1. Force numerical-looking columns to be actual categorical variables
    annotation_path = os.path.dirname(os.path.abspath(__file__))
    with open(annotation_path + "/tcga_anndata_groupings.yaml", "r") as stream:
        tcga_annotations = yaml.safe_load(stream)
    numerical_categorical_columns: Set[str] = set(
        tcga_annotations["anndata"]["obs"]["numerical_categorical_columns"]
    ) & set(ad.obs.columns)
    for column in numerical_categorical_columns:
        ad.obs[column] = ad.obs[column].apply(_to_int_string).astype("U").astype("category")
    # 1.2. Force string and mixed columns to be categoricals
    ad.strings_to_categoricals()
    # 1.3. DateTime: parse dates from strings
    datetime_columns: Set[str] = set(tcga_annotations["anndata"]["obs"]["datetime_columns"]) & set(ad.obs.columns)
    for column in datetime_columns:
        try:
            ad.obs[column] = pd.to_datetime(ad.obs[column]).astype("datetime64[ns]")
        except ValueError as e:
            logger.warning(
                f'to_datetime error (parsing "unparseable"):\n {e}\n'
                f"Column {column} will be set as string, not as datetime."
            )
            ad.obs[column] = ad.obs[column].astype("string")
    # 1.4. Make _real_ string columns strings again, reversing step 1.2.
    # These are columns that contain actual text, e.g. a description, or IDs,
    # which are identifiers rather than categories.
    string_columns: Set[str] = set(tcga_annotations["anndata"]["obs"]["string_columns"]) & set(ad.obs.columns)
    for column in string_columns:
        ad.obs[column] = ad.obs[column].astype("string")
    # 1.5. Force numerical columns to be numerical; this is necessary for some
    # invalid inputs or NaNs
    numerical_columns: Set[str] = set(tcga_annotations["anndata"]["obs"]["numerical_columns"]) & set(ad.obs.columns)
    for column in numerical_columns:
        ad.obs[column] = pd.to_numeric(ad.obs[column], errors="coerce")
    # 2. Fix var-annotation dtypes
    # 2.1. Enforce boolean columns to be real Python bools; NaNs would become
    # True when casting, so turn them into False first.
    boolean_columns: Set[str] = set(tcga_annotations["anndata"]["var"]["boolean_columns"]) & set(ad.var.columns)
    for column in boolean_columns:
        ad.var[column] = ad.var[column].fillna(False).astype(bool)
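
# The expected shape of `tcga_anndata_groupings.yaml`, inferred from the keys
# accessed above (the column names are hypothetical examples):
#
#     anndata:
#       obs:
#         numerical_categorical_columns: [age_bucket]
#         datetime_columns: [date_of_diagnosis]
#         string_columns: [sample_id]
#         numerical_columns: [days_to_death]
#       var:
#         boolean_columns: [is_mirna]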

def load(path: str, ext: Optional[str] = None, **kwargs):
    """Loads the given filepath.
    The extension of the filename determines what is used for reading (unless
    overwritten). At the moment we handle:
    - pickled objects (.p)
    - numpy arrays (.npy)
    - dataframes (.csv, .tsv)
    - json files (.json)
    - yaml files (.yaml)
    - anndata files (.h5ad)
    - excel files (.xlsx)
    - text files (.txt)
    - fasta files (.fa)
    Parameters
    ----------
    path : str
        The file name of the cached file, without extension (or a full path).
        The file name can also be a glob pattern, e.g.
        `/data/something/LC__*__21.7.2.*`, which matches anything with the
        stars filled in. This only works if there is exactly one match, so it
        is a shortcut for when you do not know the full name but know it is
        unique.
    ext : str, optional
        The extension to assume, ignoring the actual extension. E.g. pass
        "tsv" for a tab-delimited "something.csv" file. By default None.
    Returns
    -------
    Whatever is in the saved file.
    Raises
    ------
    FileNotFoundError
        If the given path doesn't exist or doesn't resolve to a unique file.
    NotImplementedError
        If the file has an extension we do not have loading code for.
    """
    path = _resolve_path(Path(path))
    # If the extension is not overwritten, take the one from the path:
    if ext is None:
        ext = path.suffix[1:]
    # Pickle files (dill, to match `save`)
    if ext == "p":
        with open(path, "rb") as fp:
            return dill.load(fp)
    # NumPy arrays
    elif ext == "npy":
        return np.load(path)
    # TSV -> DataFrame
    elif ext == "tsv":
        return pd.read_csv(path, sep="\t", **kwargs)
    # CSV -> DataFrame
    elif ext == "csv":
        return pd.read_csv(path, **kwargs)
    # JSON -> dict
    elif ext == "json":
        with open(path) as fp:
            return json.load(fp)
    # YAML -> dict
    elif ext == "yaml":
        with open(path) as fp:
            return yaml.safe_load(fp)
    # AnnData
    elif ext == "h5ad":
        ad = anndata.read_h5ad(path)
        cast_anndata(ad)
        return ad
    # Excel files -> DataFrame
    elif ext == "xlsx":
        return pd.read_excel(path, **kwargs)
    # General text files -> string
    elif ext == "txt":
        with open(path, "r") as text_file:
            return text_file.read()
    # FASTA -> DataFrame
    elif ext == "fa":
        # Load the sequences; the record identifiers are parsed as well, but
        # only the sequences are returned.
        with open(path) as fasta_file:
            identifiers = []
            seqs = []
            for title, sequence in SimpleFastaParser(fasta_file):
                identifiers.append(title.split(None, 1)[0])
                seqs.append(sequence)
        # Convert the sequences to a dataframe
        return pd.DataFrame({"Sequences": seqs})
    else:
        raise NotImplementedError(f"No loading code for extension: {ext!r}")
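
# A minimal round-trip sketch (hypothetical paths, not part of this module):
#
#     saved = save(Path("cache/metrics"), {"auc": 0.9})  # writes cache/metrics.yaml
#     metrics = load("cache/metrics")                    # globs cache/metrics*, reads YAML
#     table = load("data/counts.csv", ext="tsv")         # force tab-delimited parsing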