"""Implements the Gaussian-based approach for imputation."""
from __future__ import annotations
from typing import TYPE_CHECKING, cast
import numpy as np
from numpy.random import default_rng
if TYPE_CHECKING:
from collections.abc import Callable
import numpy.typing as npt
from shapiq.game import Game
from .base import Imputer
from .gaussian_imputer_exceptions import CategoricalFeatureError
# We disallow columns with <= 2 unique values, since they are likely either:
# - Binary features
# - One-hot encoded features (which would have at most 2 values per encoded column)
MAX_UNIQUE_VALUES_FOR_CATEGORICAL = 2
[docs]
class GaussianImputer(Imputer):
r"""Implements the Gaussian-based approach for imputation according to [Aas21]_.
This approach assumes that the features of the background data form a multivariate Gaussian distribution.
The missing values are imputed by drawing Monte Carlo samples from the conditional distribution given the values of the features present in a coalition.
Note that only continuous features are supported, meaning that this imputer can't be used for datasets containing
categorical or binary features.
References:
.. [Aas21] Aas, K., Jullum, M., and Løland, A. (2021). Explaining individual predictions when features are dependent: More accurate approximations to Shapley values. Artificial Intelligence 298, 103502. doi: https://doi.org/10.1016/j.artint.2021.103502
"""
def __init__(
self,
model: object | Game | Callable[[npt.NDArray[np.floating]], npt.NDArray[np.floating]],
data: npt.NDArray[np.floating],
x: npt.NDArray[np.floating] | None = None,
*,
sample_size: int = 100,
random_state: int | None = None,
verbose: bool = False,
) -> None:
"""Initializes the class.
Args:
model: The model to explain as a callable function expecting a data points as input and
returning the model's predictions.
data: The background data to use for the explainer as a two-dimensional array with shape
``(n_samples, n_features)``.
x: The explanation point as a ``np.ndarray`` of shape ``(1, n_features)`` or
``(n_features,)``.
sample_size: The number of Monte Carlo samples to draw from the conditional background
data for imputation.
random_state: An optional random seed for reproducibility.
verbose: A flag to enable verbose imputation, which will print a progress bar for model
evaluation. Note that this can slow down the imputation process.
Raises:
CategoricalFeatureError: If the background data contains any categorical features.
"""
if data.shape[0] == 0:
msg = "Background data must not be empty"
raise ValueError(msg)
Imputer.__init__(
self,
model=model,
data=data,
x=x,
categorical_features=[],
sample_size=sample_size,
random_state=random_state,
verbose=verbose,
)
self._check_categorical_features()
self._mean_per_feature: npt.NDArray[np.floating] | None = None
self._cov_mat: npt.NDArray[np.floating] | None = None
def _check_categorical_features(self) -> None:
"""Check if any features are categorical variables.
Raises:
CategoricalFeatureError: If any categorical features are detected.
"""
categorical_indices: list[int] = []
for i, feature_values in enumerate(self.data.T):
if any(isinstance(v, str) for v in feature_values):
categorical_indices.append(i)
continue
unique_values = len(np.unique(feature_values))
if unique_values <= MAX_UNIQUE_VALUES_FOR_CATEGORICAL:
categorical_indices.append(i)
if len(categorical_indices) > 0:
raise CategoricalFeatureError(categorical_indices)
@property
def mean_per_feature(self) -> npt.NDArray[np.floating]:
"""The mean value for each feature.
This proprety is only computed once and then cached.
"""
if self._mean_per_feature is None:
self._mean_per_feature = cast("npt.NDArray[np.floating]", np.mean(self.data, axis=0))
return self._mean_per_feature
@property
def cov_mat(self) -> npt.NDArray[np.floating]:
"""The covariance matrix of the features.
This proprety is only computed once and then cached.
"""
if self._cov_mat is None:
self._cov_mat = self._ensure_positive_definite(np.cov(self.data.T))
return self._cov_mat
def _ensure_positive_definite(
self,
cov_mat: npt.NDArray[np.floating],
min_allowed_eigen_value: float = 1e-06,
) -> npt.NDArray[np.floating]:
"""Ensure covariance matrix is positive definite by correcting eigenvalues if necessary.
Args:
cov_mat: The input covariance matrix.
min_allowed_eigen_value: The minimum allowed eigenvalue. Defaults to ``1e-06``.
Returns:
The positive definite covariance matrix.
"""
eigen_values = np.linalg.eigvalsh(cov_mat)
if np.any(eigen_values <= min_allowed_eigen_value):
# Add regularization to make the matrix positive definite
min_eigen_value = np.min(eigen_values)
cov_mat += (min_allowed_eigen_value - min_eigen_value) * np.eye(cov_mat.shape[0])
return cov_mat
def _sample_monte_carlo(
self,
x: npt.NDArray[np.floating],
coalitions: npt.NDArray[np.bool],
) -> npt.NDArray[np.floating]:
"""Generate Gaussian Monte Carlo samples for the features missing in the given coalitions.
Args:
coalitions: The coalitions for which to impute values as a boolean array of shape ``(n_coalitions, n_features)``.
x: The explanation point to use the imputer on.
Returns:
Random samples for the missing features of each coalition as an array of shape ``(n_coalitions, sample_size, n_features)``.
The columns corresponding to known features are filled with the value of ``x`` for that feature.
"""
x_explain = x.flatten()
n_coalitions, n_features = coalitions.shape
rng = default_rng(self.random_state)
samples_all_coalitions = np.zeros((n_coalitions, self.sample_size, n_features))
for i, coalition in enumerate(coalitions):
known_indices = np.where(coalition)[0]
unknown_indices = np.where(~coalition)[0]
if len(known_indices) == 0:
# No conditioning on known features, therefore sample from original data distribution
Z = rng.standard_normal((self.sample_size, len(unknown_indices)))
samples = Z @ np.linalg.cholesky(self.cov_mat).T + self.mean_per_feature
elif len(unknown_indices) == 0:
samples = np.tile(x_explain, (self.sample_size, 1))
else:
x_S_star = x_explain[known_indices]
mu_S_known = self.mean_per_feature[known_indices]
mu_S_unknown = self.mean_per_feature[unknown_indices]
cov_S_known_known = self.cov_mat[np.ix_(known_indices, known_indices)]
cov_S_known_unknown = self.cov_mat[np.ix_(known_indices, unknown_indices)]
cov_S_unknown_known = self.cov_mat[np.ix_(unknown_indices, known_indices)]
cov_S_unknown_unknown = self.cov_mat[np.ix_(unknown_indices, unknown_indices)]
cov_S_known_known_inv = np.linalg.inv(cov_S_known_known)
cond_mean = mu_S_unknown + (cov_S_unknown_known @ cov_S_known_known_inv) @ (
x_S_star - mu_S_known
)
cond_cov = (
cov_S_unknown_unknown
- (cov_S_unknown_known @ cov_S_known_known_inv) @ cov_S_known_unknown
)
# for sampling from multivariate normal distribution with Cholesky we need to make sure that
# cond_cov is symmetric (regardless - Covariances should always be symmetric: Cov(X,Y) = Cov(Y,X))
cond_cov = 0.5 * (cond_cov + cond_cov.T)
# MC samples and Cholesky to turn N(0,1) to desired Gaussian distribution
Z = rng.standard_normal((self.sample_size, len(unknown_indices)))
samples_unknown = Z @ np.linalg.cholesky(cond_cov).T + cond_mean
samples = np.tile(x_explain, (self.sample_size, 1))
samples[:, unknown_indices] = samples_unknown
samples_all_coalitions[i] = samples
return samples_all_coalitions
[docs]
def value_function(self, coalitions: npt.NDArray[np.bool]) -> npt.NDArray[np.floating]:
"""Imputes the missing values of a data point and gets predictions for all coalitions.
Args:
coalitions: A boolean array of shape ``(n_coalitions, n_features)`` indicating which features are present (``True``) and which are missing (``False``).
Returns:
The model's predictions on the imputed data points as an array of shape ``(n_coalitions,)``.
Raises:
RuntimeError: If no explanation point has been provided, neither in the constructor nor by calling ``fit()``.
"""
if self.x is None:
msg = f"Must call {self.__class__.__name__}.fit(x) first before imputing"
raise RuntimeError(msg)
n_coalitions = coalitions.shape[0]
samples = self._draw_samples(self.x.flatten(), coalitions)
predictions = np.zeros((n_coalitions, self.sample_size))
for i in range(n_coalitions):
predictions[i] = self.predict(samples[i])
return cast("npt.NDArray[np.floating]", np.mean(predictions, axis=1))
def _draw_samples(
self, x: npt.NDArray[np.floating], coalitions: npt.NDArray[np.bool]
) -> npt.NDArray[np.floating]:
"""Draw samples for the given coalitions to be used for computing the utility.
This function should be overriden by a subclass, if the Monte Carlo sampling needs to be wrapped in any kind of transformation.
Args:
x: The explanation point as an array of shape ``(n_features,)``.
coalitions: A set of coalitions as an array of shape ``(n_coalitions, n_features)``.
Returns:
Samples draw for each coalition as an array of shape ``(n_coalitions, n_samples, n_features)``.
"""
return self._sample_monte_carlo(x, coalitions)