"""
Vanilla DFO and EBM are adapted from https://github.com/kevinzakka/ibc.
MCMC is adapted from https://github.com/google-research/ibc.
"""
from typing import Callable, Tuple
from functools import wraps
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from abc import ABC, abstractmethod
from ding.utils import MODEL_REGISTRY, STOCHASTIC_OPTIMIZER_REGISTRY
from ding.torch_utils import unsqueeze_repeat
from ding.model.wrapper import IModelWrapper
from ding.model.common import RegressionHead
def create_stochastic_optimizer(device: str, stochastic_optimizer_config: dict):
    """
    Overview:
        Create a stochastic optimizer from its registered ``type`` key.
    Arguments:
        - device (:obj:`str`): Device the optimizer should operate on, e.g. ``'cpu'`` or ``'cuda'``.
        - stochastic_optimizer_config (:obj:`dict`): Stochastic optimizer config. Must contain a \
            ``type`` key naming a registered optimizer; all remaining items are forwarded as kwargs.
    Returns:
        - optimizer (:obj:`StochasticOptimizer`): The constructed stochastic optimizer.
    """
    # Work on a shallow copy so the caller's dict is left intact: the previous
    # implementation popped "type" from the caller's config, which made a second
    # call with the same config dict raise KeyError.
    config = dict(stochastic_optimizer_config)
    return STOCHASTIC_OPTIMIZER_REGISTRY.build(config.pop("type"), device=device, **config)
def no_ebm_grad():
    """
    Overview:
        Decorator factory that freezes the energy based model's parameters around the wrapped call.
        The wrapped function must receive the EBM as its *last* positional argument.
    Returns:
        - ebm_disable_grad_wrapper (:obj:`Callable`): The actual decorator.
    """

    def ebm_disable_grad_wrapper(func: Callable):

        # ``functools.wraps`` preserves the wrapped function's metadata
        # (it was imported at the top of the file but never used before).
        @wraps(func)
        def wrapper(*args, **kwargs):
            ebm = args[-1]
            assert isinstance(ebm, (IModelWrapper, nn.Module)),\
                'Make sure ebm is the last positional arguments.'
            # Temporarily disable gradients on the EBM so the wrapped call cannot
            # accumulate parameter gradients, then restore them afterwards.
            ebm.requires_grad_(False)
            result = func(*args, **kwargs)
            ebm.requires_grad_(True)
            return result

        return wrapper

    return ebm_disable_grad_wrapper
class StochasticOptimizer(ABC):
    """
    Overview:
        Base class for stochastic optimizers.
    Interface:
        ``__init__``, ``_sample``, ``_get_best_action_sample``, ``set_action_bounds``, ``sample``, ``infer``
    """

    def _sample(self, obs: torch.Tensor, num_samples: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Overview:
            Drawing action samples from the uniform random distribution \
            and tiling observations to the same shape as action samples.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observation.
            - num_samples (:obj:`int`): The number of negative samples.
        Returns:
            - tiled_obs (:obj:`torch.Tensor`): Observations tiled.
            - action (:obj:`torch.Tensor`): Action sampled.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - num_samples (:obj:`int`): :math:`N`.
            - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> opt = StochasticOptimizer()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> tiled_obs, action = opt._sample(obs, 8)
        """
        size = (obs.shape[0], num_samples, self.action_bounds.shape[1])
        low, high = self.action_bounds[0, :], self.action_bounds[1, :]
        # Uniform samples inside the per-dimension action box.
        action_samples = low + (high - low) * torch.rand(size).to(self.device)
        tiled_obs = unsqueeze_repeat(obs, num_samples, 1)
        return tiled_obs, action_samples

    def _get_best_action_sample(self, obs: torch.Tensor, action_samples: torch.Tensor, ebm: nn.Module):
        """
        Overview:
            Return one action for each batch with highest probability (lowest energy).
        Arguments:
            - obs (:obj:`torch.Tensor`): Observation.
            - action_samples (:obj:`torch.Tensor`): Action from uniform distributions.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - best_action_samples (:obj:`torch.Tensor`): Best action.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`.
            - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> action_samples = torch.randn(2, 8, 5)
            >>> ebm = EBM(4, 5)
            >>> opt = StochasticOptimizer()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> best_action_samples = opt._get_best_action_sample(obs, action_samples, ebm)
        """
        # NOTE: ``self`` was previously missing from this signature although the
        # method is invoked as ``self._get_best_action_sample(...)`` by subclasses,
        # which raised a TypeError at call time.
        # (B, N)
        energies = ebm.forward(obs, action_samples)
        probs = F.softmax(-1.0 * energies, dim=-1)
        # (B, ): argmax of softmax(-E) == argmin of the energies.
        best_idxs = probs.argmax(dim=-1)
        return action_samples[torch.arange(action_samples.size(0)), best_idxs]

    def set_action_bounds(self, action_bounds: np.ndarray):
        """
        Overview:
            Set action bounds calculated from the dataset statistics.
        Arguments:
            - action_bounds (:obj:`np.ndarray`): Array of shape (2, A), \
                where action_bounds[0] is lower bound and action_bounds[1] is upper bound.
        Shapes:
            - action_bounds (:obj:`np.ndarray`): :math:`(2, A)`.
        Examples:
            >>> opt = StochasticOptimizer()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
        """
        self.action_bounds = torch.as_tensor(action_bounds, dtype=torch.float32).to(self.device)

    def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Overview:
            Create tiled observations and sample counter-negatives for InfoNCE loss.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - tiled_obs (:obj:`torch.Tensor`): Tiled observations.
            - action (:obj:`torch.Tensor`): Actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N, A)`.

        .. note:: In the case of derivative-free optimization, this function will simply call _sample.
        """
        raise NotImplementedError

    def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Optimize for the best action conditioned on the current observation.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - best_action_samples (:obj:`torch.Tensor`): Best actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`.
        """
        raise NotImplementedError
class DFO(StochasticOptimizer):
    """
    Overview:
        Derivative-Free Optimizer in paper Implicit Behavioral Cloning.
        https://arxiv.org/abs/2109.00137
    Interface:
        ``init``, ``sample``, ``infer``
    """

    def __init__(
        self,
        noise_scale: float = 0.33,
        noise_shrink: float = 0.5,
        iters: int = 3,
        train_samples: int = 8,
        inference_samples: int = 16384,
        device: str = 'cpu',
    ):
        """
        Overview:
            Initialize the Derivative-Free Optimizer
        Arguments:
            - noise_scale (:obj:`float`): Initial noise scale.
            - noise_shrink (:obj:`float`): Noise scale shrink rate.
            - iters (:obj:`int`): Number of iterations.
            - train_samples (:obj:`int`): Number of samples for training.
            - inference_samples (:obj:`int`): Number of samples for inference.
            - device (:obj:`str`): Device.
        """
        self.device = device
        self.noise_scale = noise_scale
        self.noise_shrink = noise_shrink
        self.iters = iters
        self.train_samples = train_samples
        self.inference_samples = inference_samples
        # Filled in later via ``set_action_bounds``.
        self.action_bounds = None

    def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Overview:
            Drawing action samples from the uniform random distribution \
            and tiling observations to the same shape as action samples.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model (unused here).
        Returns:
            - tiled_obs (:obj:`torch.Tensor`): Tiled observation.
            - action_samples (:obj:`torch.Tensor`): Action samples.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> ebm = EBM(4, 5)
            >>> opt = DFO()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> tiled_obs, action_samples = opt.sample(obs, ebm)
        """
        # DFO negatives are plain uniform samples; the EBM is not consulted.
        return self._sample(obs, self.train_samples)

    def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Optimize for the best action conditioned on the current observation
            via iterative resampling with shrinking exploration noise.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - best_action_samples (:obj:`torch.Tensor`): Actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> ebm = EBM(4, 5)
            >>> opt = DFO()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> best_action_samples = opt.infer(obs, ebm)
        """
        sigma = self.noise_scale
        # Tile observations and draw uniform candidates: (B, N, O) / (B, N, A).
        obs, candidates = self._sample(obs, self.inference_samples)
        for _ in range(self.iters):
            # Candidate energies: (B, N).
            energies = ebm.forward(obs, candidates)
            weights = F.softmax(-1.0 * energies, dim=-1)
            # Resample candidates with replacement, favouring low-energy ones.
            picked = torch.multinomial(weights, self.inference_samples, replacement=True)
            rows = torch.arange(candidates.size(0)).unsqueeze(-1)
            candidates = candidates[rows, picked]
            # Perturb, then project back into the valid action box.
            candidates = candidates + torch.randn_like(candidates) * sigma
            candidates = candidates.clamp(min=self.action_bounds[0, :], max=self.action_bounds[1, :])
            sigma *= self.noise_shrink
        # Hand back the single lowest-energy candidate per batch element.
        return self._get_best_action_sample(obs, candidates, ebm)
class AutoRegressiveDFO(DFO):
    """
    Overview:
        AutoRegressive Derivative-Free Optimizer in paper Implicit Behavioral Cloning.
        https://arxiv.org/abs/2109.00137
    Interface:
        ``__init__``, ``infer``
    """

    def __init__(
        self,
        noise_scale: float = 0.33,
        noise_shrink: float = 0.5,
        iters: int = 3,
        train_samples: int = 8,
        inference_samples: int = 4096,
        device: str = 'cpu',
    ):
        """
        Overview:
            Initialize the AutoRegressive Derivative-Free Optimizer
        Arguments:
            - noise_scale (:obj:`float`): Initial noise scale.
            - noise_shrink (:obj:`float`): Noise scale shrink rate.
            - iters (:obj:`int`): Number of iterations.
            - train_samples (:obj:`int`): Number of samples for training.
            - inference_samples (:obj:`int`): Number of samples for inference.
            - device (:obj:`str`): Device.
        """
        # Same state as DFO; only the default ``inference_samples`` differs.
        super().__init__(noise_scale, noise_shrink, iters, train_samples, inference_samples, device)

    def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Optimize for the best action conditioned on the current observation.
            Unlike ``DFO.infer``, candidates are refined one action dimension at a
            time using the per-dimension energies of an autoregressive EBM
            (whose forward returns shape (B, N, A), e.g. ``AutoregressiveEBM``).
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - best_action_samples (:obj:`torch.Tensor`): Actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> ebm = EBM(4, 5)
            >>> opt = AutoRegressiveDFO()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> best_action_samples = opt.infer(obs, ebm)
        """
        noise_scale = self.noise_scale
        # (B, N, O), (B, N, A)
        obs, action_samples = self._sample(obs, self.inference_samples)
        for i in range(self.iters):
            # j: action_dim index
            for j in range(action_samples.shape[-1]):
                # Energies of dimension j only: (B, N).
                energies = ebm.forward(obs, action_samples)[..., j]
                probs = F.softmax(-1.0 * energies, dim=-1)
                # Resample with replacement; note that the *whole* candidate rows
                # are re-drawn based on dimension j's energies.
                idxs = torch.multinomial(probs, self.inference_samples, replacement=True)
                action_samples = action_samples[torch.arange(action_samples.size(0)).unsqueeze(-1), idxs]
                # Add noise to dimension j only and clip to its target bounds (in place).
                action_samples[..., j] = action_samples[..., j] + torch.randn_like(action_samples[..., j]) * noise_scale
                action_samples[..., j] = action_samples[..., j].clamp(
                    min=self.action_bounds[0, j], max=self.action_bounds[1, j]
                )
            noise_scale *= self.noise_shrink
        # Final selection uses the last dimension's energies: (B, N).
        energies = ebm.forward(obs, action_samples)[..., -1]
        probs = F.softmax(-1.0 * energies, dim=-1)
        # (B, )
        best_idxs = probs.argmax(dim=-1)
        return action_samples[torch.arange(action_samples.size(0)), best_idxs]
class MCMC(StochasticOptimizer):
    """
    Overview:
        MCMC method as stochastic optimizers in paper Implicit Behavioral Cloning.
        https://arxiv.org/abs/2109.00137
    Interface:
        ``__init__``, ``sample``, ``infer``, ``grad_penalty``
    """

    class BaseScheduler(ABC):
        """
        Overview:
            Base class for learning rate scheduler.
        Interface:
            ``get_rate``
        """

        @abstractmethod
        def get_rate(self, index):
            """
            Overview:
                Abstract method for getting learning rate.
            """
            raise NotImplementedError

    class ExponentialScheduler(BaseScheduler):
        """
        Overview:
            Exponential learning rate schedule for Langevin sampler.
        Interface:
            ``__init__``, ``get_rate``
        """

        def __init__(self, init, decay):
            """
            Overview:
                Initialize the ExponentialScheduler.
            Arguments:
                - init (:obj:`float`): Initial learning rate.
                - decay (:obj:`float`): Decay rate applied once per ``get_rate`` call.
            """
            self._decay = decay
            self._latest_lr = init

        def get_rate(self, index):
            """
            Overview:
                Get learning rate. Assumes calling sequentially.
            Arguments:
                - index (:obj:`int`): Current iteration (unused, kept for interface compatibility).
            """
            del index
            lr = self._latest_lr
            self._latest_lr *= self._decay
            return lr

    class PolynomialScheduler(BaseScheduler):
        """
        Overview:
            Polynomial learning rate schedule for Langevin sampler.
        Interface:
            ``__init__``, ``get_rate``
        """

        def __init__(self, init, final, power, num_steps):
            """
            Overview:
                Initialize the PolynomialScheduler.
            Arguments:
                - init (:obj:`float`): Initial learning rate.
                - final (:obj:`float`): Final learning rate.
                - power (:obj:`float`): Power of polynomial.
                - num_steps (:obj:`int`): Number of steps.
            """
            self._init = init
            self._final = final
            self._power = power
            self._num_steps = num_steps

        def get_rate(self, index):
            """
            Overview:
                Get learning rate for index.
            Arguments:
                - index (:obj:`int`): Current iteration; ``-1`` returns the initial rate.
            """
            if index == -1:
                return self._init
            return (
                (self._init - self._final) * ((1 - (float(index) / float(self._num_steps - 1))) ** (self._power))
            ) + self._final

    def __init__(
        self,
        iters: int = 100,
        use_langevin_negative_samples: bool = True,
        train_samples: int = 8,
        inference_samples: int = 512,
        stepsize_scheduler: dict = None,
        optimize_again: bool = True,
        again_stepsize_scheduler: dict = None,
        device: str = 'cpu',
        # langevin_step
        noise_scale: float = 0.5,
        grad_clip=None,
        delta_action_clip: float = 0.5,
        add_grad_penalty: bool = True,
        grad_norm_type: str = 'inf',
        grad_margin: float = 1.0,
        grad_loss_weight: float = 1.0,
        **kwargs,
    ):
        """
        Overview:
            Initialize the MCMC.
        Arguments:
            - iters (:obj:`int`): Number of iterations.
            - use_langevin_negative_samples (:obj:`bool`): Whether to use Langevin sampler.
            - train_samples (:obj:`int`): Number of samples for training.
            - inference_samples (:obj:`int`): Number of samples for inference.
            - stepsize_scheduler (:obj:`dict`): Step size scheduler config for Langevin sampler. \
                Defaults to ``dict(init=0.5, final=1e-5, power=2.0)``; ``num_steps`` is filled in later.
            - optimize_again (:obj:`bool`): Whether to run a second optimization.
            - again_stepsize_scheduler (:obj:`dict`): Step size scheduler config for the second \
                optimization. Defaults to ``dict(init=1e-5, final=1e-5, power=2.0)``.
            - device (:obj:`str`): Device.
            - noise_scale (:obj:`float`): Initial noise scale.
            - grad_clip (:obj:`float`): Gradient clip.
            - delta_action_clip (:obj:`float`): Action clip.
            - add_grad_penalty (:obj:`bool`): Whether to add gradient penalty.
            - grad_norm_type (:obj:`str`): Gradient norm type ('1', '2' or 'inf').
            - grad_margin (:obj:`float`): Gradient margin.
            - grad_loss_weight (:obj:`float`): Gradient loss weight.
        """
        # ``None`` sentinels avoid the mutable-default-argument pitfall: these
        # dicts are later mutated (``['num_steps'] = self.iters``), which would
        # otherwise corrupt the shared default across all MCMC instances.
        if stepsize_scheduler is None:
            stepsize_scheduler = dict(init=0.5, final=1e-5, power=2.0)
        if again_stepsize_scheduler is None:
            again_stepsize_scheduler = dict(init=1e-5, final=1e-5, power=2.0)
        self.iters = iters
        self.use_langevin_negative_samples = use_langevin_negative_samples
        self.train_samples = train_samples
        self.inference_samples = inference_samples
        self.stepsize_scheduler = stepsize_scheduler
        self.optimize_again = optimize_again
        self.again_stepsize_scheduler = again_stepsize_scheduler
        self.device = device
        self.noise_scale = noise_scale
        self.grad_clip = grad_clip
        self.delta_action_clip = delta_action_clip
        self.add_grad_penalty = add_grad_penalty
        self.grad_norm_type = grad_norm_type
        self.grad_margin = grad_margin
        self.grad_loss_weight = grad_loss_weight

    @staticmethod
    def _gradient_wrt_act(
        obs: torch.Tensor,
        action: torch.Tensor,
        ebm: nn.Module,
        create_graph: bool = False,
    ) -> torch.Tensor:
        """
        Overview:
            Calculate gradient w.r.t action. This is a static method (it was always
            invoked as ``MCMC._gradient_wrt_act``); the decorator makes that explicit.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - action (:obj:`torch.Tensor`): Actions.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
            - create_graph (:obj:`bool`): Whether to create graph.
        Returns:
            - grad (:obj:`torch.Tensor`): Gradient w.r.t action.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N, A)`.
            - grad (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        """
        action.requires_grad_(True)
        energy = ebm.forward(obs, action).sum()
        # `create_graph` set to `True` when second order derivative
        # is needed i.e, d(de/da)/d_param
        grad = torch.autograd.grad(energy, action, create_graph=create_graph)[0]
        action.requires_grad_(False)
        return grad

    def grad_penalty(self, obs: torch.Tensor, action: torch.Tensor, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Calculate gradient penalty.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - action (:obj:`torch.Tensor`): Actions.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - loss (:obj:`torch.Tensor`): Gradient penalty (scalar), or ``0.`` when disabled.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, N+1, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N+1, A)`.
        """
        if not self.add_grad_penalty:
            return 0.
        # (B, N+1, A), this gradient is differentiable w.r.t model parameters
        de_dact = MCMC._gradient_wrt_act(obs, action, ebm, create_graph=True)

        def compute_grad_norm(grad_norm_type, de_dact) -> torch.Tensor:
            # de_dact: (B, N+1, A) -> (B, N+1)
            grad_norm_type_to_ord = {
                '1': 1,
                '2': 2,
                'inf': float('inf'),
            }
            # ``norm_ord`` avoids shadowing the builtin ``ord``.
            norm_ord = grad_norm_type_to_ord[grad_norm_type]
            return torch.linalg.norm(de_dact, norm_ord, dim=-1)

        # (B, N+1): hinge on the gradient norm margin, then square.
        grad_norms = compute_grad_norm(self.grad_norm_type, de_dact)
        grad_norms = grad_norms - self.grad_margin
        grad_norms = grad_norms.clamp(min=0., max=1e10)
        grad_norms = grad_norms.pow(2)

        grad_loss = grad_norms.mean()
        return grad_loss * self.grad_loss_weight

    # can not use @torch.no_grad() during the inference
    # because we need to calculate gradient w.r.t inputs as MCMC updates.
    def _langevin_step(self, obs: torch.Tensor, action: torch.Tensor, stepsize: float, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Run one langevin MCMC step.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - action (:obj:`torch.Tensor`): Actions.
            - stepsize (:obj:`float`): Step size.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - action (:obj:`torch.Tensor`): Updated actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        """
        l_lambda = 1.0
        de_dact = MCMC._gradient_wrt_act(obs, action, ebm)

        if self.grad_clip:
            de_dact = de_dact.clamp(min=-self.grad_clip, max=self.grad_clip)

        gradient_scale = 0.5
        # Langevin dynamics: descend the energy gradient plus injected Gaussian noise.
        de_dact = (gradient_scale * l_lambda * de_dact + torch.randn_like(de_dact) * l_lambda * self.noise_scale)

        delta_action = stepsize * de_dact
        # Per-dimension cap on the update, proportional to half the action range.
        delta_action_clip = self.delta_action_clip * 0.5 * (self.action_bounds[1] - self.action_bounds[0])
        delta_action = delta_action.clamp(min=-delta_action_clip, max=delta_action_clip)

        action = action - delta_action
        action = action.clamp(min=self.action_bounds[0], max=self.action_bounds[1])

        return action

    def _langevin_action_given_obs(
        self,
        obs: torch.Tensor,
        action: torch.Tensor,
        ebm: nn.Module,
        scheduler: BaseScheduler = None
    ) -> torch.Tensor:
        """
        Overview:
            Run langevin MCMC for `self.iters` steps.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - action (:obj:`torch.Tensor`): Actions.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
            - scheduler (:obj:`BaseScheduler`): Learning rate scheduler; a \
                ``PolynomialScheduler`` built from ``self.stepsize_scheduler`` is used by default.
        Returns:
            - action (:obj:`torch.Tensor`): Actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        """
        if not scheduler:
            self.stepsize_scheduler['num_steps'] = self.iters
            scheduler = MCMC.PolynomialScheduler(**self.stepsize_scheduler)
        stepsize = scheduler.get_rate(-1)
        for i in range(self.iters):
            action = self._langevin_step(obs, action, stepsize, ebm)
            stepsize = scheduler.get_rate(i)
        return action

    def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Overview:
            Create tiled observations and sample counter-negatives for InfoNCE loss.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - tiled_obs (:obj:`torch.Tensor`): Tiled observations.
            - action_samples (:obj:`torch.Tensor`): Action samples.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`.
            - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> ebm = EBM(4, 5)
            >>> opt = MCMC()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> tiled_obs, action_samples = opt.sample(obs, ebm)
        """
        obs, uniform_action_samples = self._sample(obs, self.train_samples)
        if not self.use_langevin_negative_samples:
            return obs, uniform_action_samples
        # Refine the uniform negatives with Langevin dynamics for harder negatives.
        langevin_action_samples = self._langevin_action_given_obs(obs, uniform_action_samples, ebm)
        return obs, langevin_action_samples

    def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor:
        """
        Overview:
            Optimize for the best action conditioned on the current observation.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observations.
            - ebm (:obj:`torch.nn.Module`): Energy based model.
        Returns:
            - best_action_samples (:obj:`torch.Tensor`): Actions.
        Shapes:
            - obs (:obj:`torch.Tensor`): :math:`(B, O)`.
            - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`.
        Examples:
            >>> obs = torch.randn(2, 4)
            >>> ebm = EBM(4, 5)
            >>> opt = MCMC()
            >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0))
            >>> best_action_samples = opt.infer(obs, ebm)
        """
        # (B, N, O), (B, N, A)
        obs, uniform_action_samples = self._sample(obs, self.inference_samples)
        action_samples = self._langevin_action_given_obs(
            obs,
            uniform_action_samples,
            ebm,
        )

        # Run a second optimization, a trick for more precise inference
        if self.optimize_again:
            self.again_stepsize_scheduler['num_steps'] = self.iters
            action_samples = self._langevin_action_given_obs(
                obs,
                action_samples,
                ebm,
                scheduler=MCMC.PolynomialScheduler(**self.again_stepsize_scheduler),
            )

        # action_samples: B, N, A
        return self._get_best_action_sample(obs, action_samples, ebm)
class EBM(nn.Module):
    """
    Overview:
        Energy based model.
    Interface:
        ``__init__``, ``forward``
    """

    def __init__(
        self,
        obs_shape: int,
        action_shape: int,
        hidden_size: int = 512,
        hidden_layer_num: int = 4,
        **kwargs,
    ):
        """
        Overview:
            Initialize the EBM.
        Arguments:
            - obs_shape (:obj:`int`): Observation shape.
            - action_shape (:obj:`int`): Action shape.
            - hidden_size (:obj:`int`): Hidden size.
            - hidden_layer_num (:obj:`int`): Number of hidden layers.
        """
        super().__init__()
        # The energy head consumes the concatenated (obs, action) vector.
        self.net = nn.Sequential(
            nn.Linear(obs_shape + action_shape, hidden_size),
            nn.ReLU(),
            RegressionHead(
                hidden_size,
                1,
                hidden_layer_num,
                final_tanh=False,
            ),
        )

    def forward(self, obs, action):
        """
        Overview:
            Forward computation graph of EBM.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observation of shape (B, N, O).
            - action (:obj:`torch.Tensor`): Action of shape (B, N, A).
        Returns:
            - pred (:obj:`torch.Tensor`): Energy of shape (B, N).
        Examples:
            >>> obs = torch.randn(2, 3, 4)
            >>> action = torch.randn(2, 3, 5)
            >>> ebm = EBM(4, 5)
            >>> pred = ebm(obs, action)
        """
        joint = torch.cat([obs, action], dim=-1)
        # RegressionHead returns a dict; 'pred' carries the scalar energy per sample.
        return self.net(joint)['pred']
class AutoregressiveEBM(nn.Module):
    """
    Overview:
        Autoregressive energy based model: one EBM per action dimension, where the
        ``i``-th sub-model scores the first ``i + 1`` action dimensions.
    Interface:
        ``__init__``, ``forward``
    """

    def __init__(
        self,
        obs_shape: int,
        action_shape: int,
        hidden_size: int = 512,
        hidden_layer_num: int = 4,
    ):
        """
        Overview:
            Initialize the AutoregressiveEBM.
        Arguments:
            - obs_shape (:obj:`int`): Observation shape.
            - action_shape (:obj:`int`): Action shape.
            - hidden_size (:obj:`int`): Hidden size.
            - hidden_layer_num (:obj:`int`): Number of hidden layers.
        """
        super().__init__()
        self.ebm_list = nn.ModuleList()
        # Sub-model i sees action dims [0, i], hence action input size i + 1.
        for i in range(action_shape):
            self.ebm_list.append(EBM(obs_shape, i + 1, hidden_size, hidden_layer_num))

    def forward(self, obs, action):
        """
        Overview:
            Forward computation graph of AutoregressiveEBM.
        Arguments:
            - obs (:obj:`torch.Tensor`): Observation of shape (B, N, O).
            - action (:obj:`torch.Tensor`): Action of shape (B, N, A).
        Returns:
            - pred (:obj:`torch.Tensor`): Energy of shape (B, N, A), one energy per action dimension.
        Examples:
            >>> obs = torch.randn(2, 3, 4)
            >>> action = torch.randn(2, 3, 5)
            >>> arebm = AutoregressiveEBM(4, 5)
            >>> pred = arebm(obs, action)
        """
        output_list = []
        for i, ebm in enumerate(self.ebm_list):
            output_list.append(ebm(obs, action[..., :i + 1]))
        # ``dim`` is the canonical torch keyword; ``axis`` is only a numpy-compat alias.
        return torch.stack(output_list, dim=-1)