Spaces:
Running
Running
Yuan (Cyrus) Chiang
Christine Zhang
committed on
Add energy conservation benchmark (#64)
Browse files* add entropy task script
* soft import, use logger, prefect future
---------
Co-authored-by: Christine Zhang <[email protected]>
- benchmarks/energy_conservation/run.py +214 -0
- mlip_arena/tasks/phonon.py +13 -7
benchmarks/energy_conservation/run.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Task for running MD simulations and computing the differential entropy
|
3 |
+
of the simulated structures with respect to a reference dataset.
|
4 |
+
|
5 |
+
See https://github.com/dskoda/quests for differential entropy details.
|
6 |
+
"""
|
7 |
+
|
8 |
+
from __future__ import annotations
|
9 |
+
|
10 |
+
import os
|
11 |
+
from datetime import datetime
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
from ase.io import read
|
15 |
+
from prefect import task
|
16 |
+
from prefect.cache_policies import INPUTS, TASK_SOURCE
|
17 |
+
from prefect.runtime import task_run
|
18 |
+
|
19 |
+
from mlip_arena.models import MLIPEnum
|
20 |
+
from mlip_arena.tasks.md import run as MD
|
21 |
+
from mlip_arena.tasks.utils import logger
|
22 |
+
|
23 |
+
try:
|
24 |
+
from quests.descriptor import get_descriptors
|
25 |
+
from quests.entropy import delta_entropy
|
26 |
+
except ImportError as e:
|
27 |
+
logger.warning(e)
|
28 |
+
logger.warning(
|
29 |
+
"quests is not installed. Please install it using `pip install quests` or following the instructions at https://github.com/dskoda/quests to use this module."
|
30 |
+
)
|
31 |
+
|
32 |
+
|
33 |
+
def get_entropy_from_path(
    subset_path, dataset_path, dataset_desc_out_path, k=32, cutoff=5.0, h=0.015
):
    """
    Compute the differential entropy of a subset of structures with respect
    to a reference dataset.

    Arguments:
        subset_path (str): Path to the file containing the subset of structures.
        dataset_path (str): Path to the file containing the full dataset of structures without the subset.
        dataset_desc_out_path (str): Path to save the descriptors of the full dataset.
        k (int, optional): Number of nearest neighbors used for descriptor calculation. Default is 32.
        cutoff (float, optional): Cutoff distance for descriptor calculation. Default is 5.0.
        h (float, optional): Bandwidth for the Gaussian kernel. Default is 0.015.

    Returns:
        np.ndarray: The differential entropy of the subset with respect to the dataset.
    """
    # Reference set: descriptors are persisted to disk so later runs can reuse them.
    reference_structures = read(dataset_path, index=":")
    reference_desc = get_descriptors(reference_structures, k=k, cutoff=cutoff)
    np.save(dataset_desc_out_path, reference_desc)

    # Query set: the entropy of these structures is evaluated against the reference.
    query_desc = get_descriptors(read(subset_path, index=":"), k=k, cutoff=cutoff)

    return delta_entropy(query_desc, reference_desc, h=h)
|
61 |
+
|
62 |
+
|
63 |
+
def natural_name_key(filename):
    """Sort key that orders purely numeric file stems by their integer value.

    run_simulations writes trajectories as "<structure index>.traj", so a
    plain lexicographic sort would order them 0, 1, 10, 11, ..., 2, ... and
    misalign the returned entropy values with the structure indices.
    Numeric stems sort first, by integer value; anything else follows in
    lexicographic order.
    """
    stem = os.path.splitext(filename)[0]
    return (0, int(stem), filename) if stem.isdigit() else (1, 0, filename)


def get_trajectory_entropy(
    trajectory_dir,
    start_idx,
    end_idx,
    step,
    dataset_desc_path,
    k=32,
    cutoff=5.0,
    h=0.015,
):
    """
    Compute the differential entropy of a subset of structures in a trajectory
    with respect to a reference dataset.

    Arguments:
        trajectory_dir (str): Path to the directory containing the trajectory files.
        start_idx (int): Starting index of the subset of structures to select from each trajectory.
        end_idx (int): Ending index of the subset of structures to select from each trajectory.
        step (int): Step size of the subset of structures to select from each trajectory.
        dataset_desc_path (str): Path to the file containing the descriptors of the full dataset of structures without the subset.
        k (int, optional): Number of nearest neighbors used for descriptor calculation. Default is 32.
        cutoff (float, optional): Cutoff distance for descriptor calculation. Default is 5.0.
        h (float, optional): Bandwidth for the Gaussian kernel. Default is 0.015.

    Choose start_idx, end_idx, step to select which structures to compute the
    differential entropy for, based on what sliding window is chosen.
    e.g. window of size 5 with stride 2 means we select every other structure
    starting at index 2 (middle of the first window) to the -2 index (middle
    of the last window).

    Returns:
        np.ndarray: The differential entropy of the subset of structures in the trajectory with respect to the dataset.
    """
    structures = []
    # Sort numerically (not lexicographically) so frame order matches the
    # integer-indexed filenames produced by run_simulations.
    for traj_file in sorted(os.listdir(trajectory_dir), key=natural_name_key):
        traj = read(os.path.join(trajectory_dir, traj_file), index=":")
        structures.extend(traj[start_idx:end_idx:step])

    desc = get_descriptors(structures, k=k, cutoff=cutoff)
    x_desc = np.load(dataset_desc_path)
    dH = delta_entropy(desc, x_desc, h=h)
    return dH
|
103 |
+
|
104 |
+
|
105 |
+
def run_simulations(model_names, structures, out_dir):
    """
    Run NVE MD simulations on a list of structures.

    Parameters:
        model_names (list[str]): List of models to use (keys of MLIPEnum).
        structures (list[ase.Atoms]): List of structures to simulate.
        out_dir (str): Directory to save the simulation trajectories to.

    Returns:
        list: One prefect result per submitted MD run; failures are returned
        as states rather than raised (raise_on_failure=False).

    Notes:
        Structures are replicated to have at least 100 atoms; replications
        exceeding 500 atoms are skipped.
        Structures are simulated with NVE MD at 1000 K for 5 ps.
        Simulation trajectories are saved to files in out_dir, with each file
        named according to the index of the structure in the list.
        NOTE(review): the trajectory filename depends only on the structure
        index, so running more than one model overwrites the previous model's
        trajectories in out_dir — confirm whether multi-model runs should use
        per-model subdirectories.
    """
    min_atoms = 100
    max_atoms = 500

    # Loop-invariant: the output directory is the same for every model.
    os.makedirs(out_dir, exist_ok=True)

    futures = []

    for model_name in model_names:
        model = MLIPEnum[model_name]
        calc = model.value()

        for i, atoms in enumerate(structures):
            logger.info(
                f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Running {model_name} on structure number {i}"
            )

            # Replicate the structure until it holds at least min_atoms;
            # cube root because replication happens along all three axes.
            rep_factor = int(np.ceil((min_atoms / len(atoms)) ** (1 / 3)))
            supercell_atoms = atoms.repeat((rep_factor, rep_factor, rep_factor))
            if len(supercell_atoms) > max_atoms:
                logger.info(
                    f"Skipping structure {i} because it has too many atoms ({len(supercell_atoms)} > {max_atoms})"
                )
                continue  # skip if it becomes too large

            # Run NVE MD @ 1000K for 5 ps
            future = MD.submit(
                supercell_atoms,
                calculator=calc,
                ensemble="nve",
                dynamics="velocityverlet",
                time_step=1.0,  # fs
                total_time=5000,  # 5 ps = 5000 fs
                temperature=1000.0,
                traj_file=f"{out_dir}/{i}.traj",
                traj_interval=100,
                zero_linear_momentum=True,
                zero_angular_momentum=True,
            )
            futures.append(future)

    return [f.result(raise_on_failure=False) for f in futures]
|
163 |
+
|
164 |
+
|
165 |
+
def _generate_task_run_name():
    """Build a readable prefect task-run name from the task's parameters."""
    params = task_run.parameters
    return (
        f"{task_run.task_name}: "
        f"{params['trajectory_dir']} - {params['dataset_desc_path']}"
    )
|
173 |
+
|
174 |
+
|
175 |
+
@task(
    name="Entropy along trajectory",
    task_run_name=_generate_task_run_name,
    cache_policy=TASK_SOURCE + INPUTS,
)
def run(
    dataset_path,
    model_names,
    structures,
    trajectory_dir,
    start_idx,
    end_idx,
    step,
    dataset_desc_path,
    dH_out_path,
    k=32,
    cutoff=5.0,
    h=0.015,
):
    """End-to-end entropy benchmark: reference descriptors, MD runs, then
    the differential entropy of the sampled trajectory frames.
    """
    # Descriptors for the reference dataset. This should exclude the subset
    # of structures used for simulations. This may take a while if the
    # dataset is large - in that case, would recommend splitting the
    # structures into separate chunks.
    reference_structures = read(dataset_path, index=":")
    reference_desc = get_descriptors(reference_structures, k=k, cutoff=cutoff)
    np.save(dataset_desc_path, reference_desc)

    # MD simulations: trajectories are written into trajectory_dir.
    run_simulations(model_names, structures, trajectory_dir)

    # Differential entropy of the selected trajectory frames relative to
    # the cached reference descriptors.
    dH = get_trajectory_entropy(
        trajectory_dir,
        start_idx,
        end_idx,
        step,
        dataset_desc_path,
        k=k,
        cutoff=cutoff,
        h=h,
    )
    np.save(dH_out_path, dH)
|
mlip_arena/tasks/phonon.py
CHANGED
@@ -36,14 +36,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
36 |
from pathlib import Path
|
37 |
|
38 |
import numpy as np
|
39 |
-
from
|
40 |
-
from
|
41 |
from prefect import task
|
42 |
from prefect.cache_policies import INPUTS, TASK_SOURCE
|
43 |
from prefect.runtime import task_run
|
44 |
|
45 |
-
from
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
|
49 |
@task(cache_policy=TASK_SOURCE + INPUTS)
|
@@ -151,9 +159,7 @@ def run(
|
|
151 |
filename=Path(outdir, "band.yaml") if outdir is not None else "band.yaml",
|
152 |
)
|
153 |
if outdir:
|
154 |
-
phonon.save(
|
155 |
-
Path(outdir, "phonopy.yaml"), settings={"force_constants": True}
|
156 |
-
)
|
157 |
|
158 |
return {
|
159 |
"phonon": phonon,
|
|
|
36 |
from pathlib import Path
|
37 |
|
38 |
import numpy as np
|
39 |
+
from ase import Atoms
|
40 |
+
from ase.calculators.calculator import BaseCalculator
|
41 |
from prefect import task
|
42 |
from prefect.cache_policies import INPUTS, TASK_SOURCE
|
43 |
from prefect.runtime import task_run
|
44 |
|
45 |
+
from mlip_arena.tasks.utils import logger
|
46 |
+
|
47 |
+
try:
|
48 |
+
from phonopy import Phonopy
|
49 |
+
from phonopy.structure.atoms import PhonopyAtoms
|
50 |
+
except ImportError as e:
|
51 |
+
logger.warning(e)
|
52 |
+
logger.warning(
|
53 |
+
"Phonopy is not installed. Please install it following the instructions at https://phonopy.github.io/phonopy/install.html to use this module."
|
54 |
+
)
|
55 |
|
56 |
|
57 |
@task(cache_policy=TASK_SOURCE + INPUTS)
|
|
|
159 |
filename=Path(outdir, "band.yaml") if outdir is not None else "band.yaml",
|
160 |
)
|
161 |
if outdir:
|
162 |
+
phonon.save(Path(outdir, "phonopy.yaml"), settings={"force_constants": True})
|
|
|
|
|
163 |
|
164 |
return {
|
165 |
"phonon": phonon,
|