evotorch.distributions
Distribution (TensorMakerMixin, Serializable)
¶
Base class for any search distribution.
Source code in evotorch/distributions.py
class Distribution(TensorMakerMixin, Serializable):
    """
    Base class for any search distribution.

    A distribution keeps its parameters in a dictionary, together with a
    solution length, a dtype, and a device. Inheriting classes declare the
    parameter names they accept via `MANDATORY_PARAMETERS` and
    `OPTIONAL_PARAMETERS`, and are expected to provide their own
    implementations of `_fill(...)`, `_compute_gradients(...)` and
    `update_parameters(...)`.
    """

    # Parameter names which must be present in the `parameters` dictionary.
    MANDATORY_PARAMETERS = set()

    # Parameter names which are accepted but not required.
    OPTIONAL_PARAMETERS = set()

    def __init__(
        self, *, solution_length: int, parameters: dict, dtype: Optional[DType] = None, device: Optional[Device] = None
    ):
        """
        `__init__(...)`: Initialize the Distribution.

        It is expected that one of these two conditions is met:
        (i) the inheriting search distribution class does not implement its
        own `__init__(...)` method; or
        (ii) the inheriting search distribution class has its own
        `__init__(...)` method, and calls `Distribution.__init__(...)`
        from there, during its initialization phase.

        Args:
            solution_length: Expected as an integer, this argument represents
                the solution length.
            parameters: Expected as a dictionary, this argument stores
                the parameters of the search distribution.
                For example, for a Gaussian distribution where `mu`
                represents the mean, and `sigma` represents the coverage
                area, this dictionary would have the keys "mu" and "sigma",
                and each of these keys would map to a PyTorch tensor.
            dtype: The dtype of the search distribution (e.g. torch.float32).
            device: The device of the search distribution (e.g. "cpu").
        Raises:
            ValueError: if `parameters` contains an unrecognized name or
                lacks a mandatory one.
        """
        self.__solution_length: int = int(solution_length)

        self.__parameters: dict
        self.__dtype: torch.dtype
        self.__device: torch.device

        self.__check_correctness(parameters)

        # Collect only the casts which were explicitly requested
        cast_kwargs = {}
        if dtype is not None:
            cast_kwargs["dtype"] = to_torch_dtype(dtype)
        if device is not None:
            cast_kwargs["device"] = torch.device(device)

        if not cast_kwargs:
            # Nothing to cast: keep a shallow copy so that the caller's
            # dictionary object is not shared with this distribution.
            self.__parameters = copy(parameters)
        else:
            self.__parameters = cast_tensors_in_container(parameters, **cast_kwargs)

        # Whatever was not requested explicitly is inferred from `parameters`
        self.__dtype = cast_kwargs.get("dtype", dtype_of_container(parameters))
        self.__device = cast_kwargs.get("device", device_of_container(parameters))

    def __check_correctness(self, parameters: dict):
        """
        Validate the names found in the given parameter dictionary.

        Args:
            parameters: The parameter dictionary to be checked.
        Raises:
            ValueError: if a name is neither in `MANDATORY_PARAMETERS` nor
                in `OPTIONAL_PARAMETERS`, or if not all mandatory
                parameters are present.
        """
        found_mandatory = 0
        for param_name in parameters:
            if param_name in self.MANDATORY_PARAMETERS:
                found_mandatory += 1
            elif param_name in self.OPTIONAL_PARAMETERS:
                pass  # nothing to do
            else:
                raise ValueError(f"Unrecognized parameter: {repr(param_name)}")
        if found_mandatory < len(self.MANDATORY_PARAMETERS):
            raise ValueError(
                f"Not all mandatory parameters of this Distribution were specified."
                f" Mandatory parameters of this distribution: {self.MANDATORY_PARAMETERS};"
                f" optional parameters of this distribution: {self.OPTIONAL_PARAMETERS};"
                f" encountered parameters: {set(parameters.keys())}."
            )

    def to(self, device: Device) -> "Distribution":
        """
        Bring the Distribution onto a computational device.

        If the given device is already the device of this Distribution,
        then the Distribution itself will be returned.
        If the given device is different than the device of this
        Distribution, a copy of this Distribution on the given device
        will be created and returned.

        Args:
            device: The computation device onto which the Distribution
                will be brought.
        Returns:
            The Distribution on the target device.
        """
        if torch.device(self.device) == torch.device(device):
            return self
        else:
            cls = self.__class__
            return cls(solution_length=self.solution_length, parameters=self.parameters, device=device)

    def _fill(self, out: torch.Tensor, *, generator: Optional[torch.Generator] = None):
        """
        Fill the given tensor with samples from this search distribution.

        It is expected that the inheriting search distribution class
        has its own implementation for this method.

        Args:
            out: The PyTorch tensor that will be filled with the samples.
                This tensor is expected as 2-dimensional with its number
                of columns equal to the solution length declared by this
                distribution.
            generator: Optionally a PyTorch generator, to be used for
                sampling. None means that the global generator of PyTorch
                is to be used.
        """
        raise NotImplementedError

    def sample(
        self,
        num_solutions: Optional[int] = None,
        *,
        out: Optional[torch.Tensor] = None,
        generator: Any = None,
    ) -> torch.Tensor:
        """
        Sample solutions from this search distribution.

        Args:
            num_solutions: How many solutions will be sampled.
                If this argument is given as an integer and the argument
                `out` is left as None, then a new PyTorch tensor, filled
                with the samples from this distribution, will be generated
                and returned. The number of rows of this new tensor will
                be equal to the given `num_solutions`.
                If the argument `num_solutions` is provided as an integer,
                then the argument `out` is expected as None.
            out: The PyTorch tensor that will be filled with the samples
                of this distribution. This tensor is expected as a
                2-dimensional tensor with its number of columns equal to
                the solution length declared by this distribution.
                If the argument `out` is provided as a tensor, then the
                argument `num_solutions` is expected as None.
            generator: Optionally a PyTorch generator or any object which
                has a `generator` attribute (e.g. a Problem instance).
                If left as None, the global generator of PyTorch will be
                used. The object is forwarded as-is to `_fill(...)`.
        Returns:
            A 2-dimensional PyTorch tensor which stores the sampled solutions.
        Raises:
            ValueError: if both or neither of `num_solutions` and `out` are
                given, or if `out` is not a 2-dimensional tensor whose
                number of columns equals the solution length.
        """
        if (num_solutions is not None) and (out is not None):
            raise ValueError(
                "Received both `num_solutions` and `out` with values other than None."
                " Please provide only one of these arguments with a value other than None, not both of them."
            )
        elif (num_solutions is not None) and (out is None):
            num_solutions = int(num_solutions)
            out = self.make_empty(num_solutions=num_solutions)
        elif (num_solutions is None) and (out is not None):
            if out.ndim != 2:
                raise ValueError(
                    f"The `sample(...)` method can fill only 2-dimensional tensors."
                    f" However, the provided `out` tensor has {out.ndim} dimensions, its shape being {out.shape}."
                )
            _, num_cols = out.shape
            if num_cols != self.solution_length:
                raise ValueError(
                    f"The solution length declared by this distribution is {self.solution_length}."
                    f" However, the provided `out` tensor has {num_cols} columns."
                    f" The `sample(...)` method can only work with tensors whose number of columns are equal"
                    f" to the declared solution length."
                )
        else:
            raise ValueError(
                "Received both `num_solutions` and `out` as None."
                " Please provide one of these arguments with a value other than None."
            )

        self._fill(out, generator=generator)
        return out

    def _compute_gradients(self, samples: torch.Tensor, weights: torch.Tensor, ranking_used: Optional[str]) -> dict:
        """
        Compute the gradients out of the samples (sampled solutions)
        and weights (i.e. weights or ranks of the solutions, better
        solutions having numerically higher weights).

        It is expected that the inheriting class implements this method.

        Args:
            samples: The sampled solutions, as a 2-dimensional tensor.
            weights: Solution weights, as a 1-dimensional tensor of length
                `n`, `n` being the number of sampled solutions.
            ranking_used: Ranking that was used to obtain the weights.
        Returns:
            The gradient(s) in a dictionary.
        """
        raise NotImplementedError

    def compute_gradients(
        self,
        samples: torch.Tensor,
        fitnesses: torch.Tensor,
        *,
        objective_sense: str,
        ranking_method: Optional[str] = None,
    ) -> dict:
        """
        Compute and return gradients.

        Args:
            samples: The solutions that were sampled from this Distribution.
                The tensor passed via this argument is expected to have
                the same dtype and device with this Distribution.
            fitnesses: The evaluation results of the sampled solutions.
                If fitnesses are given with a different dtype (maybe because
                the eval_dtype of the Problem object is different than its
                decision variable dtype), then this method will first
                create an internal copy of the fitnesses with the correct
                dtype, and then will use those copied fitnesses for
                computing the gradients.
            objective_sense: The objective sense, expected as "min" or "max".
                In the case of "min", lower fitness values will be regarded
                as better (therefore, in this case, one can alternatively
                refer to fitnesses as 'unfitnesses' or 'solution costs').
                In the case of "max", higher fitness values will be regarded
                as better.
            ranking_method: The ranking method to be used.
                Can be: "linear" (where ranks linearly go from 0 to 1);
                "centered" (where ranks linearly go from -0.5 to +0.5);
                "normalized" (where the standard-normalized fitnesses
                serve as ranks); or "raw" (where the fitnesses themselves
                serve as ranks).
                The default is "raw".
        Returns:
            A dictionary which contains the gradient for each parameter of the
            distribution.
        Raises:
            ValueError: if `objective_sense` is neither "min" nor "max", or
                if the number of samples differs from the number of
                fitnesses.
        """
        if objective_sense == "max":
            higher_is_better = True
        elif objective_sense == "min":
            higher_is_better = False
        else:
            raise ValueError(
                f'`objective_sense` was expected as "min" or as "max".'
                f" However, it was encountered as {repr(objective_sense)}."
            )

        if ranking_method is None:
            ranking_method = "raw"

        # Make sure that the fitnesses are in the correct dtype
        fitnesses = torch.as_tensor(fitnesses, dtype=self.dtype)

        # These unpackings also enforce that `samples` is 2-dimensional
        # and that `fitnesses` is 1-dimensional.
        [num_samples, _] = samples.shape
        [num_fitnesses] = fitnesses.shape
        if num_samples != num_fitnesses:
            raise ValueError(
                f"The number of samples and the number of fitnesses do not match: {num_samples} != {num_fitnesses}."
            )

        weights = rank(fitnesses, ranking_method=ranking_method, higher_is_better=higher_is_better)
        return self._compute_gradients(samples, weights, ranking_method)

    def update_parameters(
        self,
        gradients: dict,
        *,
        learning_rates: Optional[dict] = None,
        optimizers: Optional[dict] = None,
    ) -> "Distribution":
        """
        Do an update on the distribution by following the given gradients.

        It is expected that the inheriting class has its own implementation
        for this method.

        Args:
            gradients: Gradients, as a dictionary, which will be used for
                computing the necessary updates.
            learning_rates: A dictionary which contains learning rates
                for parameters that will be updated using a learning rate
                coefficient.
            optimizers: A dictionary which contains optimizer objects
                for parameters that will be updated using an adaptive
                optimizer.
        Returns:
            The updated copy of the distribution.
        """
        raise NotImplementedError

    def modified_copy(
        self, *, dtype: Optional[DType] = None, device: Optional[Device] = None, **parameters
    ) -> "Distribution":
        """
        Return a modified copy of this distribution.

        Args:
            dtype: The new dtype of the distribution.
                If left as None, the dtype of this distribution is kept.
            device: The new device of the distribution.
                If left as None, the device of this distribution is kept.
            parameters: Expected in the form of extra keyword arguments.
                Each of these keyword arguments will cause the new distribution
                to have a modified value for the specified parameter.
        Returns:
            The modified copy of the distribution.
        """
        cls = self.__class__
        if device is None:
            device = self.device
        if dtype is None:
            dtype = self.dtype
        # Shallow-copy first, so that the overrides do not leak into the
        # parameter dictionary of this (original) distribution.
        new_parameters = copy(self.parameters)
        new_parameters.update(parameters)
        # `solution_length` is a required keyword of `Distribution.__init__`,
        # therefore it is forwarded explicitly (just like `to(...)` does).
        return cls(
            solution_length=self.solution_length,
            parameters=new_parameters,
            dtype=dtype,
            device=device,
        )

    def relative_entropy(dist_0: "Distribution", dist_1: "Distribution") -> float:
        """
        Relative entropy between two distributions.

        When called on an instance, `dist_0` takes the place of `self`.
        The base class provides no implementation.
        """
        raise NotImplementedError

    @property
    def solution_length(self) -> int:
        """The solution length declared at initialization."""
        return self.__solution_length

    @property
    def device(self) -> torch.device:
        """The device of this distribution."""
        return self.__device

    @property
    def dtype(self) -> torch.dtype:
        """The dtype of this distribution."""
        return self.__dtype

    @property
    def parameters(self) -> dict:
        """The parameter dictionary stored by this distribution (not a copy)."""
        return self.__parameters

    def _follow_gradient(
        self,
        param_name: str,
        x: torch.Tensor,
        *,
        learning_rates: Optional[dict] = None,
        optimizers: Optional[dict] = None,
    ) -> torch.Tensor:
        """
        Turn a gradient into an update step for the named parameter.

        Args:
            param_name: Name under which a learning rate or an optimizer
                is looked up.
            x: The gradient. It is converted to the dtype and the device
                of this distribution.
            learning_rates: Optional dictionary of learning rates.
            optimizers: Optional dictionary of optimizer objects.
        Returns:
            `x` itself if neither a learning rate nor an optimizer exists
            for `param_name`; `learning_rate * x` if only a learning rate
            exists; `optimizer.ascent(x)` if only an optimizer exists.
        Raises:
            ValueError: if both a learning rate and an optimizer exist
                for `param_name`.
        """
        x = torch.as_tensor(x, dtype=self.dtype, device=self.device)
        learning_rate, optimizer = self._get_learning_rate_and_optimizer(param_name, learning_rates, optimizers)
        if (learning_rate is None) and (optimizer is None):
            return x
        elif (learning_rate is not None) and (optimizer is None):
            return learning_rate * x
        elif (learning_rate is None) and (optimizer is not None):
            return optimizer.ascent(x)
        else:
            raise ValueError(
                "Encountered both `learning_rate` and `optimizer` as values other than None."
                " This method can only work if both of them are None or only one of them is not None."
            )

    @staticmethod
    def _get_learning_rate_and_optimizer(
        param_name: str, learning_rates: Optional[dict], optimizers: Optional[dict]
    ) -> tuple:
        """
        Look up the learning rate and the optimizer of a parameter.

        Each dictionary may be given as None, in which case it is treated
        as empty.

        Returns:
            A tuple `(learning_rate, optimizer)`, where a missing entry is
            represented by None.
        """
        if learning_rates is None:
            learning_rates = {}
        if optimizers is None:
            optimizers = {}
        return learning_rates.get(param_name, None), optimizers.get(param_name, None)

    @torch.no_grad()
    def _get_cloned_state(self, *, memo: dict) -> dict:
        """
        Return a clone of this object's `__dict__`, made via `deep_clone`.

        NOTE(review): presumably a hook used by the cloning machinery of
        `Serializable` -- confirm against that class.
        """
        return deep_clone(
            self.__dict__,
            otherwise_deepcopy=True,
            memo=memo,
        )
__init__(self, *, solution_length, parameters, dtype=None, device=None)
special
¶
__init__(...)
: Initialize the Distribution.
It is expected that one of these two conditions is met:
(i) the inheriting search distribution class does not implement its
own __init__(...)
method; or
(ii) the inheriting search distribution class has its own
__init__(...)
method, and calls Distribution.__init__(...)
from there, during its initialization phase.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
solution_length |
int |
Expected as an integer, this argument represents the solution length. |
required |
parameters |
dict |
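Expected as a dictionary, this argument stores
the parameters of the search distribution.
For example, for a Gaussian distribution where `mu` represents the mean, and `sigma` represents the coverage area, this dictionary would have the keys "mu" and "sigma", and each of these keys would map to a PyTorch tensor. |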
required |
dtype |
Union[str, torch.dtype, numpy.dtype, Type] |
The dtype of the search distribution (e.g. torch.float32). |
None |
device |
Union[str, torch.device] |
The device of the search distribution (e.g. "cpu"). |
None |
Source code in evotorch/distributions.py
def __init__(
    self, *, solution_length: int, parameters: dict, dtype: Optional[DType] = None, device: Optional[Device] = None
):
    """
    Set up the state shared by every search distribution.

    An inheriting class either relies on this initializer directly
    (by not defining its own `__init__(...)`), or defines its own
    `__init__(...)` and calls `Distribution.__init__(...)` from it.

    Args:
        solution_length: Length of a solution; converted via `int(...)`.
        parameters: Dictionary holding the parameters of the search
            distribution (e.g. the keys "mu" and "sigma" for a Gaussian
            distribution, each mapping to a PyTorch tensor).
        dtype: The dtype of the search distribution (e.g. torch.float32).
            If omitted, it is inferred from `parameters`.
        device: The device of the search distribution (e.g. "cpu").
            If omitted, it is inferred from `parameters`.
    """
    self.__solution_length: int = int(solution_length)

    # Declared here, assigned further below
    self.__parameters: dict
    self.__dtype: torch.dtype
    self.__device: torch.device

    # Reject unknown or missing parameter names before storing anything
    self.__check_correctness(parameters)

    # Only the explicitly requested conversions end up in this dictionary
    requested_casts = {}
    if dtype is not None:
        requested_casts["dtype"] = to_torch_dtype(dtype)
    if device is not None:
        requested_casts["device"] = torch.device(device)

    if requested_casts:
        self.__parameters = cast_tensors_in_container(parameters, **requested_casts)
    else:
        # No conversion requested: a shallow copy is stored
        self.__parameters = copy(parameters)

    # Anything not requested explicitly is read off the given container
    self.__dtype = requested_casts.get("dtype", dtype_of_container(parameters))
    self.__device = requested_casts.get("device", device_of_container(parameters))
compute_gradients(self, samples, fitnesses, *, objective_sense, ranking_method=None)
¶
Compute and return gradients.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
samples |
Tensor |
The solutions that were sampled from this Distribution. The tensor passed via this argument is expected to have the same dtype and device with this Distribution. |
required |
fitnesses |
Tensor |
The evaluation results of the sampled solutions. If fitnesses are given with a different dtype (maybe because the eval_dtype of the Problem object is different than its decision variable dtype), then this method will first create an internal copy of the fitnesses with the correct dtype, and then will use those copied fitnesses for computing the gradients. |
required |
objective_sense |
str |
The objective sense, expected as "min" or "max". In the case of "min", lower fitness values will be regarded as better (therefore, in this case, one can alternatively refer to fitnesses as 'unfitnesses' or 'solution costs'). In the case of "max", higher fitness values will be regarded as better. |
required |
ranking_method |
Optional[str] |
The ranking method to be used. Can be: "linear" (where ranks linearly go from 0 to 1); "centered" (where ranks linearly go from -0.5 to +0.5); "normalized" (where the standard-normalized fitnesses serve as ranks); or "raw" (where the fitnesses themselves serve as ranks). The default is "raw". |
None |
Returns:
Type | Description |
---|---|
dict |
A dictionary which contains the gradient for each parameter of the distribution. |
Source code in evotorch/distributions.py
def compute_gradients(
    self,
    samples: torch.Tensor,
    fitnesses: torch.Tensor,
    *,
    objective_sense: str,
    ranking_method: Optional[str] = None,
) -> dict:
    """
    Rank the given fitnesses and compute the gradients from them.

    Args:
        samples: The solutions that were sampled from this Distribution,
            as a 2-dimensional tensor. It is expected to have the same
            dtype and device with this Distribution.
        fitnesses: The evaluation results of the sampled solutions, as a
            1-dimensional sequence. They are converted to the dtype of
            this Distribution before being used.
        objective_sense: Either "min" (lower fitnesses are better) or
            "max" (higher fitnesses are better).
        ranking_method: The ranking method to be used: "linear" (ranks
            go linearly from 0 to 1), "centered" (ranks go linearly from
            -0.5 to +0.5), "normalized" (standard-normalized fitnesses
            serve as ranks), or "raw" (the fitnesses themselves serve as
            ranks). None means "raw".
    Returns:
        A dictionary which contains the gradient for each parameter of the
        distribution.
    Raises:
        ValueError: if `objective_sense` is neither "min" nor "max", or
            if the number of samples and of fitnesses differ.
    """
    # Guard clause: refuse anything other than the two known senses
    if objective_sense not in ("max", "min"):
        raise ValueError(
            f'`objective_sense` was expected as "min" or as "max".'
            f" However, it was encountered as {repr(objective_sense)}."
        )
    higher_is_better = bool(objective_sense == "max")

    ranking_method = "raw" if ranking_method is None else ranking_method

    # Bring the fitnesses to the dtype of this distribution
    fitnesses = torch.as_tensor(fitnesses, dtype=self.dtype)

    # Unpacking the shapes also enforces the expected dimensionalities
    num_samples, _ = samples.shape
    (num_fitnesses,) = fitnesses.shape
    if num_samples != num_fitnesses:
        raise ValueError(
            f"The number of samples and the number of fitnesses do not match: {num_samples} != {num_fitnesses}."
        )

    weights = rank(fitnesses, ranking_method=ranking_method, higher_is_better=higher_is_better)
    return self._compute_gradients(samples, weights, ranking_method)
modified_copy(self, *, dtype=None, device=None, **parameters)
¶
Return a modified copy of this distribution.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dtype |
Union[str, torch.dtype, numpy.dtype, Type] |
The new dtype of the distribution. |
None |
device |
Union[str, torch.device] |
The new device of the distribution. |
None |
parameters |
Expected in the form of extra keyword arguments. Each of these keyword arguments will cause the new distribution to have a modified value for the specified parameter. |
{} |
Returns:
Type | Description |
---|---|
Distribution |
The modified copy of the distribution. |
Source code in evotorch/distributions.py
def modified_copy(
    self, *, dtype: Optional[DType] = None, device: Optional[Device] = None, **parameters
) -> "Distribution":
    """
    Return a modified copy of this distribution.

    Args:
        dtype: The new dtype of the distribution.
            If left as None, the dtype of this distribution is kept.
        device: The new device of the distribution.
            If left as None, the device of this distribution is kept.
        parameters: Expected in the form of extra keyword arguments.
            Each of these keyword arguments will cause the new distribution
            to have a modified value for the specified parameter.
    Returns:
        The modified copy of the distribution.
    """
    cls = self.__class__
    if device is None:
        device = self.device
    if dtype is None:
        dtype = self.dtype
    # Shallow-copy first, so that the overrides do not leak into the
    # parameter dictionary of this (original) distribution.
    new_parameters = copy(self.parameters)
    new_parameters.update(parameters)
    # `solution_length` is a required keyword of `Distribution.__init__`,
    # therefore it is forwarded explicitly (just like `to(...)` does).
    # Without it, any class relying on the inherited initializer would
    # fail here with a TypeError.
    return cls(
        solution_length=self.solution_length,
        parameters=new_parameters,
        dtype=dtype,
        device=device,
    )
sample(self, num_solutions=None, *, out=None, generator=None)
¶
Sample solutions from this search distribution.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
num_solutions |
Optional[int] |
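How many solutions will be sampled.
If this argument is given as an integer and the argument `out` is left as None, then a new PyTorch tensor, filled with the samples from this distribution, will be generated and returned. The number of rows of this new tensor will be equal to the given `num_solutions`. If the argument `num_solutions` is provided as an integer, then the argument `out` is expected as None. |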
None |
out |
Optional[torch.Tensor] |
The PyTorch tensor that will be filled with the samples
of this distribution. This tensor is expected as a
2-dimensional tensor with its number of columns equal to
the solution length declared by this distribution.
If the argument `out` is provided as a tensor, then the argument `num_solutions` is expected as None. |
None |
generator |
Any |
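Optionally a PyTorch generator or any object which
has a `generator` attribute (e.g. a Problem instance). If left as None, the global generator of PyTorch will be used. |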
None |
Returns:
Type | Description |
---|---|
Tensor |
A 2-dimensional PyTorch tensor which stores the sampled solutions. |
Source code in evotorch/distributions.py
def sample(
    self,
    num_solutions: Optional[int] = None,
    *,
    out: Optional[torch.Tensor] = None,
    generator: Any = None,
) -> torch.Tensor:
    """
    Sample solutions from this search distribution.

    Args:
        num_solutions: How many solutions will be sampled.
            If this argument is given as an integer and the argument
            `out` is left as None, then a new PyTorch tensor, filled
            with the samples from this distribution, will be generated
            and returned. The number of rows of this new tensor will
            be equal to the given `num_solutions`.
            If the argument `num_solutions` is provided as an integer,
            then the argument `out` is expected as None.
        out: The PyTorch tensor that will be filled with the samples
            of this distribution. This tensor is expected as a
            2-dimensional tensor with its number of columns equal to
            the solution length declared by this distribution.
            If the argument `out` is provided as a tensor, then the
            argument `num_solutions` is expected as None.
        generator: Optionally a PyTorch generator or any object which
            has a `generator` attribute (e.g. a Problem instance).
            If left as None, the global generator of PyTorch will be
            used. The object is forwarded as-is to `_fill(...)`.
    Returns:
        A 2-dimensional PyTorch tensor which stores the sampled solutions.
    Raises:
        ValueError: if both or neither of `num_solutions` and `out` are
            given, or if `out` is not a 2-dimensional tensor whose
            number of columns equals the solution length.
    """
    if (num_solutions is not None) and (out is not None):
        # The leading space of the second literal keeps the two sentences apart
        raise ValueError(
            "Received both `num_solutions` and `out` with values other than None."
            " Please provide only one of these arguments with a value other than None, not both of them."
        )
    elif (num_solutions is not None) and (out is None):
        num_solutions = int(num_solutions)
        out = self.make_empty(num_solutions=num_solutions)
    elif (num_solutions is None) and (out is not None):
        if out.ndim != 2:
            raise ValueError(
                f"The `sample(...)` method can fill only 2-dimensional tensors."
                f" However, the provided `out` tensor has {out.ndim} dimensions, its shape being {out.shape}."
            )
        _, num_cols = out.shape
        if num_cols != self.solution_length:
            raise ValueError(
                f"The solution length declared by this distribution is {self.solution_length}."
                f" However, the provided `out` tensor has {num_cols} columns."
                f" The `sample(...)` method can only work with tensors whose number of columns are equal"
                f" to the declared solution length."
            )
    else:
        raise ValueError(
            "Received both `num_solutions` and `out` as None."
            " Please provide one of these arguments with a value other than None."
        )

    # The actual sampling is delegated to the inheriting class
    self._fill(out, generator=generator)
    return out
to(self, device)
¶
Bring the Distribution onto a computational device.
If the given device is already the device of this Distribution, then the Distribution itself will be returned. If the given device is different than the device of this Distribution, a copy of this Distribution on the given device will be created and returned.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
device |
Union[str, torch.device] |
The computation device onto which the Distribution will be brought. |
required |
Returns:
Type | Description |
---|---|
Distribution |
The Distribution on the target device. |
Source code in evotorch/distributions.py
def to(self, device: Device) -> "Distribution":
    """
    Return this Distribution on the requested computational device.

    When the requested device equals the device this Distribution already
    lives on, the Distribution itself is returned (no copy is made).
    Otherwise, a new instance of the same class is constructed on the
    requested device, from the solution length and the parameters of this
    Distribution.

    Args:
        device: The computation device onto which the Distribution
            will be brought.
    Returns:
        The Distribution on the target device.
    """
    # Normalize both sides, so that e.g. "cpu" and torch.device("cpu") compare equal
    current_device = torch.device(self.device)
    requested_device = torch.device(device)

    if current_device != requested_device:
        make_distribution = self.__class__
        return make_distribution(
            solution_length=self.solution_length,
            parameters=self.parameters,
            device=device,
        )

    return self
update_parameters(self, gradients, *, learning_rates=None, optimizers=None)
¶
Do an update on the distribution by following the given gradients.
It is expected that the inheriting class has its own implementation for this method.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
gradients |
dict |
Gradients, as a dictionary, which will be used for computing the necessary updates. |
required |
learning_rates |
Optional[dict] |
A dictionary which contains learning rates for parameters that will be updated using a learning rate coefficient. |
None |
optimizers |
Optional[dict] |
A dictionary which contains optimizer objects for parameters that will be updated using an adaptive optimizer. |
None |
Returns:
Type | Description |
---|---|
Distribution |
The updated copy of the distribution. |
Source code in evotorch/distributions.py
def update_parameters(
    self,
    gradients: dict,
    *,
    learning_rates: Optional[dict] = None,
    optimizers: Optional[dict] = None,
) -> "Distribution":
    """
    Follow the given gradients and produce the updated distribution.

    This base implementation is only a placeholder: the inheriting
    class is expected to override it.

    Args:
        gradients: Dictionary of gradients from which the updates are
            to be computed.
        learning_rates: Dictionary of learning rates, for the parameters
            which are to be updated using a learning rate coefficient.
        optimizers: Dictionary of optimizer objects, for the parameters
            which are to be updated using an adaptive optimizer.
    Returns:
        The updated copy of the distribution.
    Raises:
        NotImplementedError: always, unless overridden.
    """
    raise NotImplementedError()
ExpGaussian (Distribution)
¶
Exponential multivariate Gaussian distribution, as used by XNES.
Source code in evotorch/distributions.py
class ExpGaussian(Distribution):
"""exponential Multivariate Gaussian, as used by XNES"""
# Corresponding to mu and A in symbols used in xNES paper
MANDATORY_PARAMETERS = {"mu", "sigma"}
# Inverse of sigma, numerically more stable to track this independently to sigma
OPTIONAL_PARAMETERS = {"sigma_inv"}
def __init__(
self,
parameters: dict,
*,
solution_length: Optional[int] = None,
device: Optional[Device] = None,
dtype: Optional[DType] = None,
):
[mu_length] = parameters["mu"].shape
# Make sigma 2D
if len(parameters["sigma"].shape) == 1:
parameters["sigma"] = torch.diag(parameters["sigma"])
# Automatically generate sigma_inv if not provided
if "sigma_inv" not in parameters:
parameters["sigma_inv"] = torch.inverse(parameters["sigma"])
[sigma_length, _] = parameters["sigma"].shape
if solution_length is None:
solution_length = mu_length
else:
if solution_length != mu_length:
raise ValueError(
f"The argument `solution_length` does not match the length of `mu` provided in `parameters`."
f" solution_length={solution_length},"
f' parameters["mu"]={mu_length}.'
)
if mu_length != sigma_length:
raise ValueError(
f"The tensors `mu` and `sigma` provided within `parameters` have mismatching lengths."
f' parameters["mu"]={mu_length},'
f' parameters["sigma"]={sigma_length}.'
)
super().__init__(
solution_length=solution_length,
parameters=parameters,
device=device,
dtype=dtype,
)
# Make identity matrix as this is used throughout in gradient computation
self.eye = self.make_zeros((solution_length, solution_length))
self.eye[range(self.solution_length), range(self.solution_length)] = 1.0
@property
def mu(self) -> torch.Tensor:
"""Getter for mu
Returns:
mu (torch.Tensor): The center of the search distribution
"""
return self.parameters["mu"]
@mu.setter
def mu(self, new_mu: Iterable):
"""Setter for mu
Args:
new_mu (torch.Tensor): The new value of mu
"""
self.parameters["mu"] = torch.as_tensor(new_mu, dtype=self.dtype, device=self.device)
@property
def cov(self) -> torch.Tensor:
"""The covariance matrix A^T A"""
return self.sigma.transpose(0, 1) @ self.sigma
@property
def sigma(self) -> torch.Tensor:
"""Getter for sigma
Returns:
sigma (torch.Tensor): The square root of the covariance matrix
"""
return self.parameters["sigma"]
@property
def sigma_inv(self) -> torch.Tensor:
"""Getter for sigma_inv
Returns:
sigma_inv (torch.Tensor): The inverse square root of the covariance matrix
"""
if "sigma_inv" in self.parameters:
return self.parameters["sigma_inv"]
else:
return torch.inverse(self.parameters["sigma"])
@property
def A(self) -> torch.Tensor:
"""Alias for self.sigma, for notational consistency with paper"""
return self.sigma
@property
def A_inv(self) -> torch.Tensor:
"""Alias for self.sigma_inv, for notational consistency with paper"""
return self.sigma_inv
@sigma.setter
def sigma(self, new_sigma: Iterable):
"""Setter for sigma
Args:
new_sigma (torch.Tensor): The new value of sigma, the square root of the covariance matrix
"""
self.parameters["sigma"] = torch.as_tensor(new_sigma, dtype=self.dtype, device=self.device)
def to_global_coordinates(self, local_coordinates: torch.Tensor) -> torch.Tensor:
"""Map samples from local coordinate space N(0, I_d) to global coordinate space N(mu, A^T A)
This function is the inverse of to_local_coordinates
Args:
local_coordinates (torch.Tensor): The local coordinates sampled from N(0, I_d)
Returns:
global_coordinates (torch.Tensor): The global coordinates sampled from N(mu, A^T A)
"""
# Global samples are constructed as x = mu + A z where z is local coordinate
# We use transpose here to simplify the batched application of A
return self.mu.unsqueeze(0) + (self.A @ local_coordinates.T).T
def to_local_coordinates(self, global_coordinates: torch.Tensor) -> torch.Tensor:
"""Map samples from global coordinate space N(mu, A^T A) to local coordinate space N(0, I_d)
This function is the inverse of to_global_coordinates
Args:
global_coordinates (torch.Tensor): The global coordinates sampled from N(mu, A^T A)
Returns:
local_coordinates (torch.Tensor): The local coordinates sampled from N(0, I_d)
"""
# Global samples are constructed as x = mu + A z where z is local coordinate
# Therefore, we can recover z according to z = A_inv (x - mu)
return (self.A_inv @ (global_coordinates - self.mu.unsqueeze(0)).T).T
def _fill(self, out: torch.Tensor, *, generator: Optional[torch.Generator] = None):
"""Fill a tensor with samples from N(mu, A^T A)
Args:
out (torch.Tensor): The tensor to fill
generator (Optional[torch.Generator]): A generator to use to generate random values
"""
# Fill with local coordinates from N(0, I_d)
self.make_gaussian(out=out, generator=generator)
# Map local coordinates to global coordinate system
out[:] = self.to_global_coordinates(out)
def _compute_gradients(self, samples: torch.Tensor, weights: torch.Tensor, ranking_used: Optional[str]) -> dict:
"""Compute the gradients with respect to a given set of samples and weights
Args:
samples (torch.Tensor): Samples drawn from N(mu, A^T A), ideally using self._fill
weights (torch.Tensor): Weights e.g. fitnesses or utilities assigned to samples
ranking_used (optional[str]): The ranking method used to compute weights
Returns:
grads (dict): A dictionary containing the approximated natural gradient on d and M
"""
# Compute the local coordinates
local_coordinates = self.to_local_coordinates(samples)
# Make sure that the weights (utilities) are 0-centered
# (Otherwise the formulations would have to consider a bias term)
if ranking_used not in ("centered", "normalized"):
weights = weights - torch.mean(weights)
d_grad = total(dot(weights, local_coordinates))
local_coordinates_outer = local_coordinates.unsqueeze(1) * local_coordinates.unsqueeze(2)
M_grad = torch.sum(
weights.unsqueeze(-1).unsqueeze(-1) * (local_coordinates_outer - self.eye.unsqueeze(0)), dim=0
)
return {
"d": d_grad,
"M": M_grad,
}
def update_parameters(
    self,
    gradients: dict,
    *,
    learning_rates: Optional[dict] = None,
    optimizers: Optional[dict] = None,
) -> "ExpGaussian":
    """Return an updated copy of the distribution, following the given gradients.

    Args:
        gradients (dict): Gradients as a dictionary with the keys "d" and "M".
        learning_rates (Optional[dict]): Learning rates, keyed by parameter
            name. When "d" is missing, the entry for "mu" (if any) is used
            in its place; likewise "sigma" stands in for a missing "M".
            The dictionary given by the caller is never modified.
        optimizers (Optional[dict]): Optimizer objects, keyed by parameter
            name, forwarded untouched to `_follow_gradient`.
    Returns:
        ExpGaussian: The updated copy of the distribution.
    """
    d_grad = gradients["d"]
    M_grad = gradients["M"]

    # Resolve the learning rates on a private copy. This tolerates the
    # default `learning_rates=None` and keeps the caller's dictionary intact.
    rates = {} if learning_rates is None else dict(learning_rates)
    for local_name, global_name in (("d", "mu"), ("M", "sigma")):
        # Only alias when the fallback key actually exists; otherwise the
        # decision (e.g. using an optimizer) is left to `_follow_gradient`.
        if local_name not in rates and global_name in rates:
            rates[local_name] = rates[global_name]

    # Follow gradients for d, and M
    update_d = self._follow_gradient("d", d_grad, learning_rates=rates, optimizers=optimizers)
    update_M = self._follow_gradient("M", M_grad, learning_rates=rates, optimizers=optimizers)

    # Fold into parameters mu, A and A inv
    half_step = 0.5 * update_M
    new_mu = self.mu + torch.mv(self.A, update_d)
    new_A = self.A @ torch.matrix_exp(half_step)
    new_A_inv = torch.matrix_exp(-half_step) @ self.A_inv

    # Return modified distribution
    return self.modified_copy(mu=new_mu, sigma=new_A, sigma_inv=new_A_inv)
A: Tensor
property
readonly
¶
Alias for self.sigma, for notational consistency with paper
A_inv: Tensor
property
readonly
¶
Alias for self.sigma_inv, for notational consistency with paper
cov: Tensor
property
readonly
¶
The covariance matrix A^T A
mu: Tensor
property
writable
¶
Getter for mu
Returns:
Type | Description |
---|---|
mu (torch.Tensor) |
The center of the search distribution |
sigma: Tensor
property
writable
¶
Getter for sigma
Returns:
Type | Description |
---|---|
sigma (torch.Tensor) |
The square root of the covariance matrix |
sigma_inv: Tensor
property
readonly
¶
Getter for sigma_inv
Returns:
Type | Description |
---|---|
sigma_inv (torch.Tensor) |
The inverse square root of the covariance matrix |
to_global_coordinates(self, local_coordinates)
¶
Map samples from local coordinate space N(0, I_d) to global coordinate space N(mu, A^T A). This function is the inverse of to_local_coordinates.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
local_coordinates |
torch.Tensor |
The local coordinates sampled from N(0, I_d) |
required |
Returns:
Type | Description |
---|---|
global_coordinates (torch.Tensor) |
The global coordinates sampled from N(mu, A^T A) |
Source code in evotorch/distributions.py
def to_global_coordinates(self, local_coordinates: torch.Tensor) -> torch.Tensor:
    """Map samples from local coordinate space N(0, I_d) to global coordinate space N(mu, A^T A).

    This function is the inverse of to_local_coordinates.

    Args:
        local_coordinates (torch.Tensor): The local coordinates sampled from N(0, I_d),
            one sample per row
    Returns:
        global_coordinates (torch.Tensor): The global coordinates sampled from N(mu, A^T A),
            one sample per row
    """
    # Each local sample z becomes x = mu + A z. Transposing places the
    # samples in columns, so that one matrix product applies A to the batch.
    transformed = torch.matmul(self.A, local_coordinates.T)
    center_row = self.mu.unsqueeze(0)
    return center_row + transformed.T
to_local_coordinates(self, global_coordinates)
¶
Map samples from global coordinate space N(mu, A^T A) to local coordinate space N(0, I_d). This function is the inverse of to_global_coordinates.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
global_coordinates |
torch.Tensor |
The global coordinates sampled from N(mu, A^T A) |
required |
Returns:
Type | Description |
---|---|
local_coordinates (torch.Tensor) |
The local coordinates sampled from N(0, I_d) |
Source code in evotorch/distributions.py
def to_local_coordinates(self, global_coordinates: torch.Tensor) -> torch.Tensor:
    """Map samples from global coordinate space N(mu, A^T A) to local coordinate space N(0, I_d).

    This function is the inverse of to_global_coordinates.

    Args:
        global_coordinates (torch.Tensor): The global coordinates sampled from N(mu, A^T A),
            one sample per row
    Returns:
        local_coordinates (torch.Tensor): The local coordinates sampled from N(0, I_d),
            one sample per row
    """
    # A global sample is x = mu + A z, hence z = A_inv (x - mu).
    # The offsets are transposed so that A_inv is applied to the whole batch
    # with a single matrix product.
    offsets = global_coordinates - self.mu.unsqueeze(0)
    recovered = torch.matmul(self.A_inv, offsets.T)
    return recovered.T
update_parameters(self, gradients, *, learning_rates=None, optimizers=None)
¶
Do an update on the distribution by following the given gradients.
It is expected that the inheriting class has its own implementation for this method.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
gradients |
dict |
Gradients, as a dictionary, which will be used for computing the necessary updates. |
required |
learning_rates |
Optional[dict] |
A dictionary which contains learning rates for parameters that will be updated using a learning rate coefficient. |
None |
optimizers |
Optional[dict] |
A dictionary which contains optimizer objects for parameters that will be updated using an adaptive optimizer. |
None |
Returns:
Type | Description |
---|---|
ExpGaussian |
The updated copy of the distribution. |
Source code in evotorch/distributions.py
def update_parameters(
    self,
    gradients: dict,
    *,
    learning_rates: Optional[dict] = None,
    optimizers: Optional[dict] = None,
) -> "ExpGaussian":
    """Return an updated copy of the distribution, following the given gradients.

    Args:
        gradients (dict): Gradients as a dictionary with the keys "d" and "M".
        learning_rates (Optional[dict]): Learning rates, keyed by parameter
            name. When "d" is missing, the entry for "mu" (if any) is used
            in its place; likewise "sigma" stands in for a missing "M".
            The dictionary given by the caller is never modified.
        optimizers (Optional[dict]): Optimizer objects, keyed by parameter
            name, forwarded untouched to `_follow_gradient`.
    Returns:
        ExpGaussian: The updated copy of the distribution.
    """
    d_grad = gradients["d"]
    M_grad = gradients["M"]

    # Resolve the learning rates on a private copy. This tolerates the
    # default `learning_rates=None` and keeps the caller's dictionary intact.
    rates = {} if learning_rates is None else dict(learning_rates)
    for local_name, global_name in (("d", "mu"), ("M", "sigma")):
        # Only alias when the fallback key actually exists; otherwise the
        # decision (e.g. using an optimizer) is left to `_follow_gradient`.
        if local_name not in rates and global_name in rates:
            rates[local_name] = rates[global_name]

    # Follow gradients for d, and M
    update_d = self._follow_gradient("d", d_grad, learning_rates=rates, optimizers=optimizers)
    update_M = self._follow_gradient("M", M_grad, learning_rates=rates, optimizers=optimizers)

    # Fold into parameters mu, A and A inv
    half_step = 0.5 * update_M
    new_mu = self.mu + torch.mv(self.A, update_d)
    new_A = self.A @ torch.matrix_exp(half_step)
    new_A_inv = torch.matrix_exp(-half_step) @ self.A_inv

    # Return modified distribution
    return self.modified_copy(mu=new_mu, sigma=new_A, sigma_inv=new_A_inv)
ExpSeparableGaussian (SeparableGaussian)
¶
Exponential separable multivariate Gaussian, as used by SNES
Source code in evotorch/distributions.py
class ExpSeparableGaussian(SeparableGaussian):
    """Exponential separable multivariate Gaussian, as used by SNES.

    Compared to its parent class, `sigma` is updated multiplicatively
    (through an exponential) instead of additively.
    """

    MANDATORY_PARAMETERS = {"mu", "sigma"}
    OPTIONAL_PARAMETERS = set()

    def _compute_gradients(self, samples: torch.Tensor, weights: torch.Tensor, ranking_used: Optional[str]) -> dict:
        """Compute the gradients for `mu` and `sigma`.

        Args:
            samples (torch.Tensor): The samples drawn from this distribution
            weights (torch.Tensor): The weights (e.g. utilities) of the samples
            ranking_used (Optional[str]): The ranking method which produced
                the weights. Unless it is "nes", the weights are divided by
                the sum of their absolute values.
        Returns:
            grads (dict): A dictionary with the keys "mu" and "sigma"
        """
        if ranking_used != "nes":
            total_abs_weight = torch.abs(weights).sum()
            weights = weights / total_abs_weight

        # Noise in the scale of the distribution, and the same noise
        # expressed in units of sigma
        offsets = samples - self.mu
        unit_noises = offsets / self.sigma

        mu_grad = total(dot(weights, offsets))
        sigma_grad = total(dot(weights, (unit_noises**2) - 1))

        return {"mu": mu_grad, "sigma": sigma_grad}

    def update_parameters(
        self,
        gradients: dict,
        *,
        learning_rates: Optional[dict] = None,
        optimizers: Optional[dict] = None,
    ) -> "ExpSeparableGaussian":
        """Return an updated copy of the distribution, following the given gradients.

        Args:
            gradients (dict): Gradients as a dictionary with the keys "mu" and "sigma"
            learning_rates (Optional[dict]): Learning rates, forwarded to `_follow_gradient`
            optimizers (Optional[dict]): Optimizers, forwarded to `_follow_gradient`
        Returns:
            ExpSeparableGaussian: The updated copy of the distribution
        """
        mu_grad, sigma_grad = gradients["mu"], gradients["sigma"]
        follow_options = {"learning_rates": learning_rates, "optimizers": optimizers}

        # The center moves additively
        mu_step = self._follow_gradient("mu", mu_grad, **follow_options)
        new_mu = self.mu + mu_step

        # The standard deviation is scaled by exp(step / 2)
        sigma_step = self._follow_gradient("sigma", sigma_grad, **follow_options)
        new_sigma = self.sigma * torch.exp(0.5 * sigma_step)

        return self.modified_copy(mu=new_mu, sigma=new_sigma)
update_parameters(self, gradients, *, learning_rates=None, optimizers=None)
¶
Do an update on the distribution by following the given gradients.
It is expected that the inheriting class has its own implementation for this method.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
gradients |
dict |
Gradients, as a dictionary, which will be used for computing the necessary updates. |
required |
learning_rates |
Optional[dict] |
A dictionary which contains learning rates for parameters that will be updated using a learning rate coefficient. |
None |
optimizers |
Optional[dict] |
A dictionary which contains optimizer objects for parameters that will be updated using an adaptive optimizer. |
None |
Returns:
Type | Description |
---|---|
ExpSeparableGaussian |
The updated copy of the distribution. |
Source code in evotorch/distributions.py
def update_parameters(
    self,
    gradients: dict,
    *,
    learning_rates: Optional[dict] = None,
    optimizers: Optional[dict] = None,
) -> "ExpSeparableGaussian":
    """Return an updated copy of the distribution, following the given gradients.

    Args:
        gradients (dict): Gradients as a dictionary with the keys "mu" and "sigma"
        learning_rates (Optional[dict]): Learning rates, forwarded to `_follow_gradient`
        optimizers (Optional[dict]): Optimizers, forwarded to `_follow_gradient`
    Returns:
        ExpSeparableGaussian: The updated copy of the distribution
    """
    mu_grad, sigma_grad = gradients["mu"], gradients["sigma"]
    follow_options = {"learning_rates": learning_rates, "optimizers": optimizers}

    # The center moves additively
    mu_step = self._follow_gradient("mu", mu_grad, **follow_options)
    new_mu = self.mu + mu_step

    # The standard deviation is scaled by exp(step / 2)
    sigma_step = self._follow_gradient("sigma", sigma_grad, **follow_options)
    new_sigma = self.sigma * torch.exp(0.5 * sigma_step)

    return self.modified_copy(mu=new_mu, sigma=new_sigma)
SeparableGaussian (Distribution)
¶
Separable Multivariate Gaussian, as used by PGPE
Source code in evotorch/distributions.py
class SeparableGaussian(Distribution):
    """Separable Multivariate Gaussian, as used by PGPE.

    The distribution is described by a center vector `mu` and by a vector
    `sigma` which is used as the per-dimension standard deviation when
    sampling. Both are stored in the parameters dictionary.
    """

    MANDATORY_PARAMETERS = {"mu", "sigma"}
    OPTIONAL_PARAMETERS = {"divide_mu_grad_by", "divide_sigma_grad_by", "parenthood_ratio"}

    def __init__(
        self,
        parameters: dict,
        *,
        solution_length: Optional[int] = None,
        device: Optional[Device] = None,
        dtype: Optional[DType] = None,
    ):
        """Initialize the SeparableGaussian.

        Args:
            parameters: Parameters dictionary, which must contain the
                1-dimensional tensors "mu" and "sigma", of equal lengths.
            solution_length: Optionally, the solution length. If given, it
                must be equal to the length of `mu`. If omitted, the length
                of `mu` is used.
            device: Forwarded to the initializer of the parent class.
            dtype: Forwarded to the initializer of the parent class.
        Raises:
            ValueError: if `solution_length` disagrees with the length of
                `mu`, or if `mu` and `sigma` have different lengths.
        """
        # Unpacking into a single target only succeeds for 1-dimensional shapes
        (mu_length,) = parameters["mu"].shape
        (sigma_length,) = parameters["sigma"].shape

        if solution_length is None:
            solution_length = mu_length
        elif solution_length != mu_length:
            raise ValueError(
                f"The argument `solution_length` does not match the length of `mu` provided in `parameters`."
                f" solution_length={solution_length},"
                f' parameters["mu"]={mu_length}.'
            )

        if mu_length != sigma_length:
            raise ValueError(
                f"The tensors `mu` and `sigma` provided within `parameters` have mismatching lengths."
                f' parameters["mu"]={mu_length},'
                f' parameters["sigma"]={sigma_length}.'
            )

        super().__init__(solution_length=solution_length, parameters=parameters, device=device, dtype=dtype)

    @property
    def mu(self) -> torch.Tensor:
        """The center of the distribution, as stored under the key "mu"."""
        return self.parameters["mu"]

    @mu.setter
    def mu(self, new_mu: Iterable):
        # Store the new center using the dtype and the device of the distribution
        as_tensor = torch.as_tensor(new_mu, dtype=self.dtype, device=self.device)
        self.parameters["mu"] = as_tensor

    @property
    def sigma(self) -> torch.Tensor:
        """The per-dimension standard deviation, as stored under the key "sigma"."""
        return self.parameters["sigma"]

    @sigma.setter
    def sigma(self, new_sigma: Iterable):
        # Store the new sigma using the dtype and the device of the distribution
        as_tensor = torch.as_tensor(new_sigma, dtype=self.dtype, device=self.device)
        self.parameters["sigma"] = as_tensor

    def _fill(self, out: torch.Tensor, *, generator: Optional[torch.Generator] = None):
        """Fill `out` with Gaussian samples centered at `mu`, with stdev `sigma`."""
        self.make_gaussian(
            out=out,
            center=self.mu,
            stdev=self.sigma,
            generator=generator,
        )

    def _divide_grad(self, param_name: str, grad: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
        """Apply the optional `divide_<param_name>_grad_by` scaling to a gradient.

        Args:
            param_name: Name of the parameter to which the gradient belongs
                (e.g. "mu" or "sigma").
            grad: The gradient to be scaled.
            weights: The 1-dimensional weights from which the divisor is derived.
        Returns:
            The gradient itself if the option is not configured; otherwise
            the gradient divided according to the configured option.
        Raises:
            ValueError: if the configured option has an unrecognized value.
        """
        option = f"divide_{param_name}_grad_by"
        if option not in self.parameters:
            return grad

        mode = self.parameters[option]
        if mode == "num_solutions":
            (divisor,) = weights.shape
        elif mode == "num_directions":
            # Each direction is assumed to consist of two solutions
            (num_solutions,) = weights.shape
            divisor = num_solutions // 2
        elif mode == "total_weight":
            divisor = torch.abs(weights).sum()
        elif mode == "weight_stdev":
            divisor = torch.std(weights)
        else:
            raise ValueError(f"The parameter {option} has an unrecognized value: {mode}")

        return grad / divisor

    def _compute_gradients_via_parenthood_ratio(self, samples: torch.Tensor, weights: torch.Tensor) -> dict:
        """Compute the gradients from the best-weighted portion of the samples.

        The `parenthood_ratio` fraction of the samples with the highest
        weights are treated as elites. The returned values are the
        differences between the elites' mean / standard deviation and the
        current `mu` / `sigma`.
        """
        num_samples, _ = samples.shape
        num_elites = math.floor(num_samples * self.parameters["parenthood_ratio"])

        best_first = torch.argsort(weights, descending=True)
        elites = samples[best_first[:num_elites], :]

        mu_grad = elites.mean(dim=0) - self.parameters["mu"]
        sigma_grad = elites.std(dim=0) - self.parameters["sigma"]
        return {"mu": mu_grad, "sigma": sigma_grad}

    def _compute_gradients(self, samples: torch.Tensor, weights: torch.Tensor, ranking_used: Optional[str]) -> dict:
        """Compute the gradients for `mu` and `sigma`.

        Args:
            samples: The samples drawn from this distribution.
            weights: The weights (e.g. utilities) of the samples.
            ranking_used: The ranking method which produced the weights.
        Returns:
            A dictionary with the keys "mu" and "sigma".
        """
        if "parenthood_ratio" in self.parameters:
            return self._compute_gradients_via_parenthood_ratio(samples, weights)

        center = self.mu
        stdev = self.sigma

        # The noise vectors which were added to the center to obtain the
        # solutions (solution = scaled_noise + center)
        scaled_noises = samples - center

        # No bias term is considered below, so the weights must be
        # 0-centered. "centered" and "normalized" rankings are taken as such.
        if ranking_used not in ("centered", "normalized"):
            weights = weights - weights.mean()

        raw_mu_grad = total(dot(weights, scaled_noises))
        mu_grad = self._divide_grad("mu", raw_mu_grad, weights)

        sigma_terms = ((scaled_noises**2) - (stdev**2)) / stdev
        raw_sigma_grad = total(dot(weights, sigma_terms))
        sigma_grad = self._divide_grad("sigma", raw_sigma_grad, weights)

        return {"mu": mu_grad, "sigma": sigma_grad}

    def update_parameters(
        self,
        gradients: dict,
        *,
        learning_rates: Optional[dict] = None,
        optimizers: Optional[dict] = None,
    ) -> "SeparableGaussian":
        """Return an updated copy of the distribution, following the given gradients.

        Args:
            gradients: Gradients as a dictionary with the keys "mu" and "sigma".
            learning_rates: Learning rates, forwarded to `_follow_gradient`.
            optimizers: Optimizers, forwarded to `_follow_gradient`.
        Returns:
            The updated copy of the distribution.
        """
        mu_grad, sigma_grad = gradients["mu"], gradients["sigma"]
        follow_options = {"learning_rates": learning_rates, "optimizers": optimizers}

        # Both parameters are updated additively
        new_mu = self.mu + self._follow_gradient("mu", mu_grad, **follow_options)
        new_sigma = self.sigma + self._follow_gradient("sigma", sigma_grad, **follow_options)

        return self.modified_copy(mu=new_mu, sigma=new_sigma)

    def relative_entropy(dist_0: "SeparableGaussian", dist_1: "SeparableGaussian") -> float:
        """Compute the relative entropy (KL divergence) of `dist_0` with respect to `dist_1`.

        Both distributions are treated as Gaussians with diagonal
        covariances, the diagonals being the squares of their `sigma`.

        Args:
            dist_0: The first distribution (the instance itself, when this
                is called as a method).
            dist_1: The second distribution.
        Returns:
            The value 0.5 * (tr(cov_1^-1 cov_0) - k
            + (mu_1 - mu_0)^T cov_1^-1 (mu_1 - mu_0) + ln(det cov_1 / det cov_0)),
            where k is the solution length of `dist_0`.
        """
        params_0, params_1 = dist_0.parameters, dist_1.parameters

        # Diagonals of the two covariance matrices
        var_0 = params_0["sigma"].pow(2.0)
        var_1 = params_1["sigma"].pow(2.0)

        center_shift = params_1["mu"] - params_0["mu"]

        trace_term = (var_0 / var_1).sum()
        dimension = dist_0.solution_length
        shift_term = (center_shift.pow(2.0) / var_1).sum()
        log_det_term = torch.log(var_1).sum() - torch.log(var_0).sum()

        return 0.5 * (trace_term - dimension + shift_term + log_det_term)
update_parameters(self, gradients, *, learning_rates=None, optimizers=None)
¶
Do an update on the distribution by following the given gradients.
It is expected that the inheriting class has its own implementation for this method.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
gradients |
dict |
Gradients, as a dictionary, which will be used for computing the necessary updates. |
required |
learning_rates |
Optional[dict] |
A dictionary which contains learning rates for parameters that will be updated using a learning rate coefficient. |
None |
optimizers |
Optional[dict] |
A dictionary which contains optimizer objects for parameters that will be updated using an adaptive optimizer. |
None |
Returns:
Type | Description |
---|---|
SeparableGaussian |
The updated copy of the distribution. |
Source code in evotorch/distributions.py
def update_parameters(
    self,
    gradients: dict,
    *,
    learning_rates: Optional[dict] = None,
    optimizers: Optional[dict] = None,
) -> "SeparableGaussian":
    """Return an updated copy of the distribution, following the given gradients.

    Args:
        gradients (dict): Gradients as a dictionary with the keys "mu" and "sigma"
        learning_rates (Optional[dict]): Learning rates, forwarded to `_follow_gradient`
        optimizers (Optional[dict]): Optimizers, forwarded to `_follow_gradient`
    Returns:
        SeparableGaussian: The updated copy of the distribution
    """
    mu_grad, sigma_grad = gradients["mu"], gradients["sigma"]
    follow_options = {"learning_rates": learning_rates, "optimizers": optimizers}

    # Both parameters are updated additively
    new_mu = self.mu + self._follow_gradient("mu", mu_grad, **follow_options)
    new_sigma = self.sigma + self._follow_gradient("sigma", sigma_grad, **follow_options)

    return self.modified_copy(mu=new_mu, sigma=new_sigma)
SymmetricSeparableGaussian (SeparableGaussian)
¶
Symmetric (antithetic) separable Gaussian distribution as used by PGPE.
Source code in evotorch/distributions.py
class SymmetricSeparableGaussian(SeparableGaussian):
    """
    Symmetric (antithetic) separable Gaussian distribution
    as used by PGPE.

    Samples are requested from `make_gaussian` with `symmetric=True`, and
    the gradient computation treats consecutive pairs of samples as the
    plus end and the minus end of a single direction.
    """

    MANDATORY_PARAMETERS = {"mu", "sigma"}
    OPTIONAL_PARAMETERS = {"divide_mu_grad_by", "divide_sigma_grad_by", "parenthood_ratio"}

    def _fill(self, out: torch.Tensor, *, generator: Optional[torch.Generator] = None):
        """Fill `out` with symmetric Gaussian samples around `mu`, with stdev `sigma`."""
        self.make_gaussian(
            out=out,
            center=self.mu,
            stdev=self.sigma,
            symmetric=True,
            generator=generator,
        )

    def _compute_gradients(
        self,
        samples: torch.Tensor,
        weights: torch.Tensor,
        ranking_used: Optional[str],
    ) -> dict:
        """Compute the gradients for `mu` and `sigma` from antithetic samples.

        Args:
            samples: The samples, stored as consecutive (plus, minus) pairs.
            weights: The 1-dimensional weights (e.g. utilities) of the samples.
            ranking_used: The ranking method which produced the weights.
        Returns:
            A dictionary with the keys "mu" and "sigma".
        """
        if "parenthood_ratio" in self.parameters:
            return self._compute_gradients_via_parenthood_ratio(samples, weights)

        center = self.mu
        stdev = self.sigma

        # No bias term is considered below, so the weights must be
        # 0-centered. "centered" and "normalized" rankings are taken as such.
        if ranking_used not in ("centered", "normalized"):
            weights = weights - weights.mean()

        # The value itself is not needed; the unpacking is kept because it
        # fails unless `weights` is 1-dimensional.
        (_num_solutions,) = weights.shape

        # The population is laid out as follows:
        #
        #   solution0: center + scaled_noise0  \
        #                                       > direction0
        #   solution1: center - scaled_noise0  /
        #
        #   solution2: center + scaled_noise1  \
        #                                       > direction1
        #   solution3: center - scaled_noise1  /
        #
        #   ...
        #
        # Therefore, the even-indexed solutions reveal the noise vector of
        # each direction (solution = scaled_noise + center), the
        # even-indexed weights are the utilities of the plus ends, and the
        # odd-indexed weights are the utilities of the minus ends.
        scaled_noises = samples[::2] - center
        plus_utils = weights[::2]
        minus_utils = weights[1::2]

        # Center gradient: driven by half the difference between the two ends
        mu_coeffs = (plus_utils - minus_utils) / 2
        grad_mu = self._divide_grad("mu", total(dot(mu_coeffs, scaled_noises)), weights)

        # Sigma gradient: driven by the average of the two ends
        sigma_coeffs = (plus_utils + minus_utils) / 2
        sigma_terms = ((scaled_noises**2) - (stdev**2)) / stdev
        grad_sigma = self._divide_grad("sigma", total(dot(sigma_coeffs, sigma_terms)), weights)

        return {"mu": grad_mu, "sigma": grad_sigma}