Source code for shapiq.imputer.gaussian_imputer

"""Implements the Gaussian-based approach for imputation."""

from __future__ import annotations

from typing import TYPE_CHECKING, cast

import numpy as np
from numpy.random import default_rng

if TYPE_CHECKING:
    from collections.abc import Callable

    import numpy.typing as npt

    from shapiq.game import Game

from .base import Imputer
from .gaussian_imputer_exceptions import CategoricalFeatureError

# We disallow columns with <= 2 unique values, since they are likely either:
# - Binary features
# - One-hot encoded features (which would have at most 2 values per encoded column)
MAX_UNIQUE_VALUES_FOR_CATEGORICAL = 2


[docs] class GaussianImputer(Imputer): r"""Implements the Gaussian-based approach for imputation according to [Aas21]_. This approach assumes that the features of the background data form a multivariate Gaussian distribution. The missing values are imputed by drawing Monte Carlo samples from the conditional distribution given the values of the features present in a coalition. Note that only continuous features are supported, meaning that this imputer can't be used for datasets containing categorical or binary features. References: .. [Aas21] Aas, K., Jullum, M., and Løland, A. (2021). Explaining individual predictions when features are dependent: More accurate approximations to Shapley values. Artificial Intelligence 298, 103502. doi: https://doi.org/10.1016/j.artint.2021.103502 """ def __init__( self, model: object | Game | Callable[[npt.NDArray[np.floating]], npt.NDArray[np.floating]], data: npt.NDArray[np.floating], x: npt.NDArray[np.floating] | None = None, *, sample_size: int = 100, random_state: int | None = None, verbose: bool = False, ) -> None: """Initializes the class. Args: model: The model to explain as a callable function expecting a data points as input and returning the model's predictions. data: The background data to use for the explainer as a two-dimensional array with shape ``(n_samples, n_features)``. x: The explanation point as a ``np.ndarray`` of shape ``(1, n_features)`` or ``(n_features,)``. sample_size: The number of Monte Carlo samples to draw from the conditional background data for imputation. random_state: An optional random seed for reproducibility. verbose: A flag to enable verbose imputation, which will print a progress bar for model evaluation. Note that this can slow down the imputation process. Raises: CategoricalFeatureError: If the background data contains any categorical features. """ if data.shape[0] == 0: msg = "Background data must not be empty" raise ValueError(msg) Imputer.__init__( self, model=model, data=data, x=x, categorical_features=[], sample_size=sample_size, random_state=random_state, verbose=verbose, ) self._check_categorical_features() self._mean_per_feature: npt.NDArray[np.floating] | None = None self._cov_mat: npt.NDArray[np.floating] | None = None def _check_categorical_features(self) -> None: """Check if any features are categorical variables. Raises: CategoricalFeatureError: If any categorical features are detected. """ categorical_indices: list[int] = [] for i, feature_values in enumerate(self.data.T): if any(isinstance(v, str) for v in feature_values): categorical_indices.append(i) continue unique_values = len(np.unique(feature_values)) if unique_values <= MAX_UNIQUE_VALUES_FOR_CATEGORICAL: categorical_indices.append(i) if len(categorical_indices) > 0: raise CategoricalFeatureError(categorical_indices) @property def mean_per_feature(self) -> npt.NDArray[np.floating]: """The mean value for each feature. This proprety is only computed once and then cached. """ if self._mean_per_feature is None: self._mean_per_feature = cast("npt.NDArray[np.floating]", np.mean(self.data, axis=0)) return self._mean_per_feature @property def cov_mat(self) -> npt.NDArray[np.floating]: """The covariance matrix of the features. This proprety is only computed once and then cached. """ if self._cov_mat is None: self._cov_mat = self._ensure_positive_definite(np.cov(self.data.T)) return self._cov_mat def _ensure_positive_definite( self, cov_mat: npt.NDArray[np.floating], min_allowed_eigen_value: float = 1e-06, ) -> npt.NDArray[np.floating]: """Ensure covariance matrix is positive definite by correcting eigenvalues if necessary. Args: cov_mat: The input covariance matrix. min_allowed_eigen_value: The minimum allowed eigenvalue. Defaults to ``1e-06``. Returns: The positive definite covariance matrix. """ eigen_values = np.linalg.eigvalsh(cov_mat) if np.any(eigen_values <= min_allowed_eigen_value): # Add regularization to make the matrix positive definite min_eigen_value = np.min(eigen_values) cov_mat += (min_allowed_eigen_value - min_eigen_value) * np.eye(cov_mat.shape[0]) return cov_mat def _sample_monte_carlo( self, x: npt.NDArray[np.floating], coalitions: npt.NDArray[np.bool], ) -> npt.NDArray[np.floating]: """Generate Gaussian Monte Carlo samples for the features missing in the given coalitions. Args: coalitions: The coalitions for which to impute values as a boolean array of shape ``(n_coalitions, n_features)``. x: The explanation point to use the imputer on. Returns: Random samples for the missing features of each coalition as an array of shape ``(n_coalitions, sample_size, n_features)``. The columns corresponding to known features are filled with the value of ``x`` for that feature. """ x_explain = x.flatten() n_coalitions, n_features = coalitions.shape rng = default_rng(self.random_state) samples_all_coalitions = np.zeros((n_coalitions, self.sample_size, n_features)) for i, coalition in enumerate(coalitions): known_indices = np.where(coalition)[0] unknown_indices = np.where(~coalition)[0] if len(known_indices) == 0: # No conditioning on known features, therefore sample from original data distribution Z = rng.standard_normal((self.sample_size, len(unknown_indices))) samples = Z @ np.linalg.cholesky(self.cov_mat).T + self.mean_per_feature elif len(unknown_indices) == 0: samples = np.tile(x_explain, (self.sample_size, 1)) else: x_S_star = x_explain[known_indices] mu_S_known = self.mean_per_feature[known_indices] mu_S_unknown = self.mean_per_feature[unknown_indices] cov_S_known_known = self.cov_mat[np.ix_(known_indices, known_indices)] cov_S_known_unknown = self.cov_mat[np.ix_(known_indices, unknown_indices)] cov_S_unknown_known = self.cov_mat[np.ix_(unknown_indices, known_indices)] cov_S_unknown_unknown = self.cov_mat[np.ix_(unknown_indices, unknown_indices)] cov_S_known_known_inv = np.linalg.inv(cov_S_known_known) cond_mean = mu_S_unknown + (cov_S_unknown_known @ cov_S_known_known_inv) @ ( x_S_star - mu_S_known ) cond_cov = ( cov_S_unknown_unknown - (cov_S_unknown_known @ cov_S_known_known_inv) @ cov_S_known_unknown ) # for sampling from multivariate normal distribution with Cholesky we need to make sure that # cond_cov is symmetric (regardless - Covariances should always be symmetric: Cov(X,Y) = Cov(Y,X)) cond_cov = 0.5 * (cond_cov + cond_cov.T) # MC samples and Cholesky to turn N(0,1) to desired Gaussian distribution Z = rng.standard_normal((self.sample_size, len(unknown_indices))) samples_unknown = Z @ np.linalg.cholesky(cond_cov).T + cond_mean samples = np.tile(x_explain, (self.sample_size, 1)) samples[:, unknown_indices] = samples_unknown samples_all_coalitions[i] = samples return samples_all_coalitions
[docs] def value_function(self, coalitions: npt.NDArray[np.bool]) -> npt.NDArray[np.floating]: """Imputes the missing values of a data point and gets predictions for all coalitions. Args: coalitions: A boolean array of shape ``(n_coalitions, n_features)`` indicating which features are present (``True``) and which are missing (``False``). Returns: The model's predictions on the imputed data points as an array of shape ``(n_coalitions,)``. Raises: RuntimeError: If no explanation point has been provided, neither in the constructor nor by calling ``fit()``. """ if self.x is None: msg = f"Must call {self.__class__.__name__}.fit(x) first before imputing" raise RuntimeError(msg) n_coalitions = coalitions.shape[0] samples = self._draw_samples(self.x.flatten(), coalitions) predictions = np.zeros((n_coalitions, self.sample_size)) for i in range(n_coalitions): predictions[i] = self.predict(samples[i]) return cast("npt.NDArray[np.floating]", np.mean(predictions, axis=1))
def _draw_samples( self, x: npt.NDArray[np.floating], coalitions: npt.NDArray[np.bool] ) -> npt.NDArray[np.floating]: """Draw samples for the given coalitions to be used for computing the utility. This function should be overriden by a subclass, if the Monte Carlo sampling needs to be wrapped in any kind of transformation. Args: x: The explanation point as an array of shape ``(n_features,)``. coalitions: A set of coalitions as an array of shape ``(n_coalitions, n_features)``. Returns: Samples draw for each coalition as an array of shape ``(n_coalitions, n_samples, n_features)``. """ return self._sample_monte_carlo(x, coalitions)