Source code for shapiq.imputer.baseline_imputer

"""Implementation of the baseline imputer."""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np

from .base import Imputer

if TYPE_CHECKING:
    from shapiq.typing import Model


class BaselineImputer(Imputer):
    """The baseline imputer for the shapiq package.

    The baseline imputer is used to impute the missing values of a data point by using predefined
    values (baseline values). If no baseline values are given, the imputer uses the mean (for
    numerical features) or the mode (for categorical features) of the background data.

    Attributes:
        baseline_values: The baseline values to use for imputation.
        empty_prediction: The model's prediction on an empty data point (all features missing).

    Examples:
        >>> model = lambda x: np.sum(x, axis=1)  # some dummy model
        >>> data = np.random.rand(1000, 4)  # some background data
        >>> x_to_impute = np.array([[1, 1, 1, 1]])  # some data point to impute
        >>> imputer = BaselineImputer(model=model, data=data, x=x_to_impute)
        >>> # get the baseline values
        >>> imputer.baseline_values
        array([[0.5, 0.5, 0.5, 0.5]])  # computed from data
        >>> # set new baseline values
        >>> baseline_vector = np.array([0, 0, 0, 0])
        >>> imputer.init_background(baseline_vector)
        >>> imputer.baseline_values
        array([[0, 0, 0, 0]])  # given as input
        >>> # get the model prediction with missing values
        >>> imputer(np.array([[True, False, True, False]]))
        np.array([2.])  # model prediction with the last baseline value

    """

    def __init__(
        self,
        model: Model,
        data: np.ndarray,
        x: np.ndarray | None = None,
        *,
        categorical_features: list[int] | None = None,
        normalize: bool = True,
        random_state: int | None = None,
    ) -> None:
        """Initializes the baseline imputer.

        Args:
            model: The model to explain as a callable function expecting a data points as input
                and returning the model's predictions.

            data: The background data to use for the explainer as either a vector of baseline
                values or a two-dimensional array with shape ``(n_samples, n_features)``. If data
                is a matrix, the baseline values are calculated from the data.

            x: The explanation point to use the imputer to.

            categorical_features: A list of indices of the categorical features in the background
                data. If no categorical features are given, all features are assumed to be
                numerical or in string format (where ``np.mean`` fails) features. Defaults to
                ``None``.

            normalize: A flag to normalize the game values. If ``True``, then the game values are
                normalized and centered to be zero for the empty set of features. Defaults to
                ``True``.

            random_state: The random state to use for sampling. Defaults to ``None``.

        """
        super().__init__(
            model=model,
            data=data,
            x=x,
            sample_size=1,
            categorical_features=categorical_features,
            random_state=random_state,
        )

        # setup attributes
        self.baseline_values: np.ndarray = np.zeros((1, self.n_features))  # will be overwritten
        # init_background also computes the empty prediction for the chosen baseline
        self.init_background(self.data)

        # set empty value and normalization
        if normalize:
            self.normalization_value = self.empty_prediction

    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
        """Imputes the missing values of a data point and calls the model.

        Args:
            coalitions: A boolean array indicating which features are present (``True``) and which
                are missing (``False``). The shape of the array must be ``(n_subsets, n_features)``.

        Returns:
            The model's predictions on the imputed data points. The shape of the array is
            ``(n_subsets, n_outputs)``.

        """
        # present features keep the value of the explanation point, absent ones get the baseline
        data = np.where(coalitions, self.x, self.baseline_values)
        return self.predict(data)

    def init_background(self, data: np.ndarray) -> BaselineImputer:
        """Initializes the imputer to the background data.

        The empty prediction (and, if normalization is active, the normalization value) is
        recomputed for the new baseline values in every case, i.e. both when a baseline vector is
        given directly and when the baseline is summarized from a data matrix.

        Args:
            data: The background data to use for the imputer. Either a vector of baseline values
                of shape ``(n_features,)`` or a matrix of shape ``(n_samples, n_features)``. If the
                data is a matrix, the baseline values are calculated from the data.

        Returns:
            The initialized imputer.

        Warns:
            UserWarning: If a feature not listed as categorical cannot be averaged with
                ``np.mean``. The feature is then summarized by its mode and added to the
                categorical features.

        Examples:
            >>> import numpy as np
            >>> from shapiq.imputer.baseline_imputer import BaselineImputer
            >>> data = np.array([[1, 2, "a"], [2, 3, "a"], [2, 4, "b"]], dtype=object)
            >>> x = np.array([1, 2, 3])
            >>> imputer = BaselineImputer(model=lambda x: np.sum(x, axis=1), data=data, x=x)
            >>> imputer.baseline_values
            array([[1.66, 3, 'a']], dtype=object)  # computed from data
            >>> baseline_vector = np.array([0, 0, 0])
            >>> imputer.init_background(baseline_vector)
            >>> imputer.baseline_values
            array([[0, 0, 0]])  # given as input

        """
        if data.ndim == 1 or data.shape[0] == 1:
            # data is a vector (or a single row) -> use it directly as baseline values
            self.baseline_values = data.reshape(1, self.n_features)
        else:
            # data is a matrix -> calculate baseline values as mean or mode per feature
            self.baseline_values = np.zeros((1, self.n_features), dtype=object)
            for feature in range(self.n_features):
                feature_column = data[:, feature]
                if feature in self._cat_features:
                    summarized_feature = self._most_frequent_value(feature_column)
                else:
                    try:
                        summarized_feature = np.mean(feature_column)
                    except TypeError:
                        # fallback to mode for potentially string features
                        summarized_feature = self._most_frequent_value(feature_column)
                        warnings.warn(
                            f"Feature {feature} is not numerical. Adding it to categorical features.",
                            stacklevel=2,
                        )
                        # NOTE(review): appends in place; if the base class keeps the caller's
                        # ``categorical_features`` list without copying, that list is mutated too
                        # -- confirm against ``Imputer.__init__``.
                        self._cat_features.append(feature)
                self.baseline_values[0, feature] = summarized_feature

        # Reset the empty prediction to the new baseline values. This must happen for BOTH
        # branches; otherwise a directly supplied baseline vector would leave a stale
        # empty prediction / normalization value behind.
        self.calc_empty_prediction()
        return self

    @staticmethod
    def _most_frequent_value(feature_column: np.ndarray) -> object:
        """Returns the mode of a feature column (first of the sorted unique values on ties)."""
        values, counts = np.unique(feature_column, return_counts=True)
        return values[np.argmax(counts)]

    def calc_empty_prediction(self) -> float:
        """Runs the model on empty data points (all features missing) to get the empty prediction.

        The result is stored in ``empty_prediction``. If ``self.normalize`` is truthy, the
        normalization value is reset to the new empty prediction as well.

        Returns:
            The empty prediction.

        """
        empty_predictions = self.predict(self.baseline_values)
        empty_prediction = float(empty_predictions[0])
        self.empty_prediction = empty_prediction
        if self.normalize:  # reset the normalization value
            self.normalization_value = empty_prediction
        return empty_prediction