Source code for shapiq.imputer.baseline_imputer

"""Implementation of the baseline imputer."""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np

from .base import Imputer

if TYPE_CHECKING:
    from shapiq.typing import Model



[docs]
class BaselineImputer(Imputer):
    """The baseline imputer for the shapiq package.

    The baseline imputer is used to impute the missing values of a data point by using predefined
    values (baseline values). If no baseline values are given, the imputer uses the mean (for
    numerical features) or the mode (for categorical features) of the background data.

    Attributes:
        baseline_values: The baseline values to use for imputation, stored as an array of shape
            ``(1, n_features)``.
        empty_prediction: The model's prediction on an empty data point (all features missing).

    Examples:
        >>> model = lambda x: np.sum(x, axis=1)  # some dummy model
        >>> data = np.random.rand(1000, 4)  # some background data
        >>> x_to_impute = np.array([[1, 1, 1, 1]])  # some data point to impute
        >>> imputer = BaselineImputer(model=model, data=data, x=x_to_impute)
        >>> # get the baseline values
        >>> imputer.baseline_values
        array([[0.5, 0.5, 0.5, 0.5]])  # computed from data (approximately)
        >>> # set new baseline values
        >>> baseline_vector = np.array([0, 0, 0, 0])
        >>> imputer.init_background(baseline_vector)
        >>> imputer.baseline_values
        array([[0, 0, 0, 0]])  # given as input
        >>> # get the model prediction with missing values
        >>> imputer(np.array([[True, False, True, False]]))
        array([2.])  # model prediction using the most recently set baseline values

    """

    def __init__(
        self,
        model: Model,
        data: np.ndarray,
        x: np.ndarray | None = None,
        *,
        categorical_features: list[int] | None = None,
        normalize: bool = True,
        random_state: int | None = None,
    ) -> None:
        """Initializes the baseline imputer.

        Args:
            model: The model to explain as a callable function expecting data points as input and
                returning the model's predictions.

            data: The background data to use for the explainer as either a vector of baseline values
                or a two-dimensional array with shape ``(n_samples, n_features)``. If data is a
                matrix, the baseline values are calculated from the data.

            x: The explanation point to use the imputer on.

            categorical_features: A list of indices of the categorical features in the background
                data. If no categorical features are given, all features are assumed to be numerical
                or in string format (where ``np.mean`` fails) features. Defaults to ``None``.

            normalize: A flag to normalize the game values. If ``True``, then the game values are
                normalized and centered to be zero for the empty set of features. Defaults to
                ``True``.

            random_state: The random state to use for sampling. Defaults to ``None``.

        """
        # A baseline imputer replaces each missing feature by exactly one value, hence
        # ``sample_size=1`` is handed to the parent class.
        super().__init__(
            model=model,
            data=data,
            x=x,
            sample_size=1,
            categorical_features=categorical_features,
            random_state=random_state,
        )

        # setup attributes
        self.baseline_values: np.ndarray = np.zeros((1, self.n_features))  # will be overwritten
        self.init_background(self.data)

        # set empty value and normalization
        if normalize:
            self.normalization_value = self.empty_prediction

    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
        """Imputes the missing values of a data point and calls the model.

        Args:
            coalitions: A boolean array indicating which features are present (``True``) and which are
                missing (``False``). The shape of the array must be ``(n_subsets, n_features)``.

        Returns:
            The model's predictions on the imputed data points. The shape of the array is
               ``(n_subsets, n_outputs)``.

        """
        # Present features keep the value of the explanation point, missing features are replaced
        # by the baseline; both operands are broadcast against the coalition matrix.
        data = np.where(coalitions, self.x, self.baseline_values)
        return self.predict(data)

    @staticmethod
    def _most_frequent_value(feature_column: np.ndarray) -> object:
        """Returns the mode of a feature column.

        Ties are resolved in favor of the first entry in the (sorted) output of ``np.unique``.
        """
        values, counts = np.unique(feature_column, return_counts=True)
        return values[np.argmax(counts)]

    def init_background(self, data: np.ndarray) -> BaselineImputer:
        """Initializes the imputer to the background data.

        The empty prediction (and, if normalization is active, the normalization value) is
        recomputed for the new baseline values in every case, i.e. also when the baseline values
        are passed in directly as a vector.

        Args:
            data: The background data to use for the imputer. Either a vector of baseline values
                of shape ``(n_features,)`` or a matrix of shape ``(n_samples, n_features)``.
                If the data is a matrix, the baseline values are calculated from the data. A
                matrix with a single row is treated like a vector.

        Returns:
            The initialized imputer.

        Warns:
            UserWarning: If the mean of a feature that is not listed as categorical cannot be
                computed. The feature is then summarized by its mode and added to the categorical
                features.

        Examples:
            >>> import numpy as np
            >>> from shapiq.imputer.baseline_imputer import BaselineImputer
            >>> data = np.array([[1, 2, "a"], [2, 3, "a"], [2, 4, "b"]], dtype=object)
            >>> x = np.array([1, 2, 3])
            >>> model = lambda x: np.sum(x[:, :2].astype(float), axis=1)  # ignores the string column
            >>> imputer = BaselineImputer(model=model, data=data, x=x)
            >>> imputer.baseline_values
            array([[1.66, 3, 'a']], dtype=object)  # computed from data
            >>> baseline_vector = np.array([0, 0, 0])
            >>> imputer.init_background(baseline_vector)
            >>> imputer.baseline_values
            array([[0, 0, 0]])  # given as input

        """
        if data.ndim == 1 or data.shape[0] == 1:  # data is a vector -> use as baseline values
            self.baseline_values = data.reshape(1, self.n_features)
        else:  # data is a matrix -> calculate baseline values as mean or mode
            # object dtype so that numerical means and categorical modes can share one array
            self.baseline_values = np.zeros((1, self.n_features), dtype=object)
            for feature in range(self.n_features):
                feature_column = data[:, feature]
                if feature in self._cat_features:  # get mode for categorical features
                    summarized_feature = self._most_frequent_value(feature_column)
                else:
                    try:  # try to use mean for numerical features
                        summarized_feature = np.mean(feature_column)
                    except TypeError:  # fallback to mode for potentially string features
                        summarized_feature = self._most_frequent_value(feature_column)
                        # add feature to categorical features
                        warnings.warn(
                            f"Feature {feature} is not numerical. Adding it to categorical features.",
                            stacklevel=2,
                        )
                        self._cat_features.append(feature)
                self.baseline_values[0, feature] = summarized_feature
        # Reset the empty prediction to the new baseline values. This has to happen on the vector
        # path as well, otherwise the empty prediction and the normalization value would still
        # belong to the previous baseline.
        self.calc_empty_prediction()
        return self

    def calc_empty_prediction(self) -> float:
        """Runs the model on empty data points (all features missing) to get the empty prediction.

        The result is stored in ``empty_prediction``. If ``self.normalize`` is truthy, the
        normalization value is updated to the new empty prediction as well.

        Returns:
            The empty prediction.

        """
        # an "empty" data point has every feature replaced by its baseline value
        empty_predictions = self.predict(self.baseline_values)
        empty_prediction = float(empty_predictions[0])
        self.empty_prediction = empty_prediction
        if self.normalize:  # reset the normalization value
            self.normalization_value = empty_prediction
        return empty_prediction