from pathlib import Path import numpy as np import pandas as pd from ase.db import connect from scipy import stats from mlip_arena.models import REGISTRY, MLIPEnum DATA_DIR = Path(__file__).parent.absolute() def load_wbm_structures(): """ Load the WBM structures from a ASE DB file. """ with connect(DATA_DIR.parent / "wbm_structures.db") as db: for row in db.select(): yield row.toatoms(add_additional_information=True) def gather_results(): for model in MLIPEnum: if "eos_bulk" not in REGISTRY[model.name].get("gpu-tasks", []): continue if (DATA_DIR / f"{model.name}.parquet").exists(): continue all_data = [] for atoms in load_wbm_structures(): fpath = Path(model.name) / f"{atoms.info['key_value_pairs']['wbm_id']}.pkl" if not fpath.exists(): continue all_data.append(pd.read_pickle(fpath)) df = pd.concat(all_data, ignore_index=True) df.to_parquet(DATA_DIR / f"{model.name}.parquet") def summarize(): summary_table = pd.DataFrame( columns=[ "model", "energy-diff-flip-times", "tortuosity", "spearman-compression-energy", "spearman-compression-derivative", "spearman-tension-energy", "missing", ] ) for model in MLIPEnum: fpath = DATA_DIR / f"{model.name}.parquet" if not fpath.exists(): continue df_raw_results = pd.read_parquet(fpath) df_analyzed = pd.DataFrame( columns=[ "model", "structure", "formula", "volume-ratio", "energy-delta-per-atom", "energy-diff-flip-times", "energy-delta-per-volume-b0", "tortuosity", "spearman-compression-energy", "spearman-compression-derivative", "spearman-tension-energy", "missing", ] ) for wbm_struct in load_wbm_structures(): structure_id = wbm_struct.info["key_value_pairs"]["wbm_id"] try: results = df_raw_results.loc[df_raw_results["id"] == structure_id] b0 = results["b0"].values[0] # vol0 = results["v0"].values[0] results = results["eos"].values[0] es = np.array(results["energies"]) vols = np.array(results["volumes"]) indices = np.argsort(vols) vols = vols[indices] es = es[indices] imine = len(es) // 2 # min_center_val = np.min(es[imid - 1 : imid + 2]) # imine = np.where(es == min_center_val)[0][0] emin = es[imine] vol0 = vols[imine] interpolated_volumes = [ (vols[i] + vols[i + 1]) / 2 for i in range(len(vols) - 1) ] ediff = np.diff(es) ediff_sign = np.sign(ediff) mask = ediff_sign != 0 ediff = ediff[mask] ediff_sign = ediff_sign[mask] ediff_flip = np.diff(ediff_sign) != 0 etv = np.sum(np.abs(np.diff(es))) data = { "model": model.name, "structure": structure_id, "formula": wbm_struct.get_chemical_formula(), "missing": False, "volume-ratio": vols / vol0, "energy-delta-per-atom": (es - emin) / len(wbm_struct), "energy-diff-flip-times": np.sum(ediff_flip).astype(int), "energy-delta-per-volume-b0": (es - emin) / (b0*vol0), "tortuosity": etv / (abs(es[0] - emin) + abs(es[-1] - emin)), "spearman-compression-energy": stats.spearmanr( vols[:imine], es[:imine] ).statistic, "spearman-compression-derivative": stats.spearmanr( interpolated_volumes[:imine], ediff[:imine] ).statistic, "spearman-tension-energy": stats.spearmanr( vols[imine:], es[imine:] ).statistic, } except Exception as e: print(e) data = { "model": model.name, "structure": structure_id, "formula": wbm_struct.get_chemical_formula(), "missing": True, "volume-ratio": None, "energy-delta-per-atom": None, "energy-delta-per-volume-b0": None, "energy-diff-flip-times": None, "tortuosity": None, "spearman-compression-energy": None, "spearman-compression-derivative": None, "spearman-tension-energy": None, } df_analyzed = pd.concat([df_analyzed, pd.DataFrame([data])], ignore_index=True) df_analyzed.to_parquet(DATA_DIR / f"{model.name}_processed.parquet") # json_fpath = DATA_DIR / f"EV_scan_analyzed_{model.name}.json" # df_analyzed.to_json(json_fpath, orient="records") valid_results = df_analyzed[df_analyzed["missing"] == False] analysis_summary = { "model": model.name, "energy-diff-flip-times": valid_results["energy-diff-flip-times"].mean(), "energy-diff-flip-times-std": valid_results["energy-diff-flip-times"].std(), "tortuosity": valid_results["tortuosity"].mean(), "tortuosity-std": valid_results["tortuosity"].std(), "spearman-compression-energy": valid_results[ "spearman-compression-energy" ].mean(), "spearman-compression-energy-std": valid_results["spearman-compression-energy"].std(), "spearman-compression-derivative": valid_results[ "spearman-compression-derivative" ].mean(), "spearman-compression-derivative-std": valid_results[ "spearman-compression-derivative" ].std(), "spearman-tension-energy": valid_results["spearman-tension-energy"].mean(), "spearman-tension-energy-std": valid_results["spearman-tension-energy"].std(), "missing": len(df_analyzed[df_analyzed["missing"] == True]), } summary_table = pd.concat( [summary_table, pd.DataFrame([analysis_summary])], ignore_index=True ) flip_rank = ( (summary_table["energy-diff-flip-times"] - 1) .abs() .rank(ascending=True, method="min") ) tortuosity_rank = summary_table["tortuosity"].rank(ascending=True, method="min") spearman_compression_energy_rank = summary_table["spearman-compression-energy"].rank( method="min" ) spearman_compression_derivative_rank = summary_table[ "spearman-compression-derivative" ].rank(ascending=False, method="min") spearman_tension_energy_rank = summary_table["spearman-tension-energy"].rank( ascending=False, method="min" ) missing_rank = summary_table["missing"].rank(ascending=True, method="min") rank_aggr = ( flip_rank + tortuosity_rank + spearman_compression_energy_rank + spearman_compression_derivative_rank + spearman_tension_energy_rank + missing_rank ) rank = rank_aggr.rank(method="min") summary_table.insert(1, "rank", rank.astype(int)) summary_table.insert(2, "rank-aggregation", rank_aggr.astype(int)) summary_table = summary_table.sort_values(by="rank", ascending=True) summary_table = summary_table.reset_index(drop=True) summary_table.to_csv(DATA_DIR / "summary.csv", index=False) summary_table.to_latex(DATA_DIR / "summary.tex", index=False, float_format="%.3f") return summary_table if __name__ == "__main__": gather_results() summarize()