% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/kernelshap.R
\name{kernelshap}
\alias{kernelshap}
\alias{kernelshap.default}
\alias{kernelshap.ranger}
\title{Kernel SHAP}
\usage{
kernelshap(object, ...)

\method{kernelshap}{default}(
  object,
  X,
  bg_X = NULL,
  pred_fun = stats::predict,
  feature_names = colnames(X),
  bg_w = NULL,
  bg_n = 200L,
  exact = length(feature_names) <= 8L,
  hybrid_degree = 1L + length(feature_names) \%in\% 4:16,
  paired_sampling = TRUE,
  m = 2L * length(feature_names) * (1L + 3L * (hybrid_degree == 0L)),
  tol = 0.005,
  max_iter = 100L,
  parallel = FALSE,
  parallel_args = NULL,
  verbose = TRUE,
  ...
)

\method{kernelshap}{ranger}(
  object,
  X,
  bg_X = NULL,
  pred_fun = NULL,
  feature_names = colnames(X),
  bg_w = NULL,
  bg_n = 200L,
  exact = length(feature_names) <= 8L,
  hybrid_degree = 1L + length(feature_names) \%in\% 4:16,
  paired_sampling = TRUE,
  m = 2L * length(feature_names) * (1L + 3L * (hybrid_degree == 0L)),
  tol = 0.005,
  max_iter = 100L,
  parallel = FALSE,
  parallel_args = NULL,
  verbose = TRUE,
  survival = c("chf", "prob"),
  ...
)
}
\arguments{
\item{object}{Fitted model object.}

\item{...}{Additional arguments passed to \code{pred_fun(object, X, ...)}.}

\item{X}{\eqn{(n \times p)} matrix or \code{data.frame} with rows to be explained.
The columns should only represent model features, not the response
(but see \code{feature_names} on how to overrule this).}

\item{bg_X}{Background data used to integrate out "switched off" features,
often a subset of the training data (typically 50 to 500 rows).
In cases with a natural "off" value (like MNIST digits),
this can also be a single row with all values set to the off value.
If no \code{bg_X} is passed (the default) and if \code{X} is sufficiently large,
a random sample of \code{bg_n} rows from \code{X} serves as background data.}

\item{pred_fun}{Prediction function of the form \verb{function(object, X, ...)},
providing \eqn{K \ge 1} predictions per row. Its first argument
represents the model \code{object}, its second argument a data structure like \code{X}.
Additional (named) arguments are passed via \code{...}.
The default, \code{\link[stats:predict]{stats::predict()}}, will work in most cases.}

\item{feature_names}{Optional vector of column names in \code{X} used to calculate
SHAP values. By default, this equals \code{colnames(X)}. Not supported if \code{X}
is a matrix.}

\item{bg_w}{Optional vector of case weights for each row of \code{bg_X}.
If \code{bg_X = NULL}, must have one entry per row of \code{X}. Set to \code{NULL} for no weights.}

\item{bg_n}{If \code{bg_X = NULL}: Size of background data to be sampled from \code{X}.}

\item{exact}{If \code{TRUE}, the algorithm will produce exact Kernel SHAP values
with respect to the background data. In this case, the arguments \code{hybrid_degree},
\code{m}, \code{paired_sampling}, \code{tol}, and \code{max_iter} are ignored.
The default is \code{TRUE} up to eight features, and \code{FALSE} otherwise.}

\item{hybrid_degree}{Integer controlling the exactness of the hybrid strategy. For
\eqn{4 \le p \le 16}, the default is 2, otherwise it is 1.
Ignored if \code{exact = TRUE}.
\itemize{
\item \code{0}: Pure sampling strategy not involving any exact part. It is strictly
worse than the hybrid strategy and should therefore only be used for
studying properties of the Kernel SHAP algorithm.
\item \code{1}: Uses all \eqn{2p} on-off vectors \eqn{z} with \eqn{\sum z \in \{1, p-1\}}
for the exact part, which covers at least 75\% of the mass of the Kernel weight
distribution. The remaining mass is covered by random sampling.
\item \code{2}: Uses all \eqn{p(p+1)} on-off vectors \eqn{z} with
\eqn{\sum z \in \{1, 2, p-2, p-1\}}. This covers at least 92\% of the mass of the
Kernel weight distribution. The remaining mass is covered by sampling.
Convergence usually happens within two iterations, the minimal possible number.
\item \code{k>2}: Uses all on-off vectors with
\eqn{\sum z \in \{1, \dots, k, p-k, \dots, p-1\}}.
}}

\item{paired_sampling}{Logical flag indicating whether to do the sampling in a paired
manner. This means that with every on-off vector \eqn{z}, also \eqn{1-z} is
considered. CL21 shows its superiority compared to standard sampling, therefore the
default (\code{TRUE}) should usually not be changed except for studying properties
of Kernel SHAP algorithms. Ignored if \code{exact = TRUE}.}

\item{m}{Even number of on-off vectors sampled during one iteration.
The default is \eqn{2p}, except when \code{hybrid_degree == 0}.
Then it is set to \eqn{8p}. Ignored if \code{exact = TRUE}.}

\item{tol}{Tolerance determining when to stop. Following CL21, the algorithm keeps
iterating until \eqn{\textrm{max}(\sigma_n)/(\textrm{max}(\beta_n) - \textrm{min}(\beta_n)) < \textrm{tol}},
where the \eqn{\beta_n} are the SHAP values of a given observation,
and \eqn{\sigma_n} their standard errors.
For multidimensional predictions, the criterion must be satisfied for each
dimension separately. The stopping criterion uses the fact that standard errors
and SHAP values are all on the same scale. Ignored if \code{exact = TRUE}.}

\item{max_iter}{If the stopping criterion (see \code{tol}) is not reached after
\code{max_iter} iterations, the algorithm stops. Ignored if \code{exact = TRUE}.}

\item{parallel}{If \code{TRUE}, use parallel \code{\link[foreach:foreach]{foreach::foreach()}} to loop over rows
to be explained. Must register backend beforehand, e.g., via 'doFuture' package,
see README for an example. Parallelization automatically disables the progress bar.}

\item{parallel_args}{Named list of arguments passed to \code{\link[foreach:foreach]{foreach::foreach()}}.
Ideally, this is \code{NULL} (default). Only relevant if \code{parallel = TRUE}.
Example on Windows: if \code{object} is a GAM fitted with package 'mgcv',
then one might need to set \code{parallel_args = list(.packages = "mgcv")}.}

\item{verbose}{Set to \code{FALSE} to suppress messages and the progress bar.}

\item{survival}{Should cumulative hazards ("chf", default) or survival
probabilities ("prob") per time be predicted? Only in \code{ranger()} survival models.}
}
\value{
An object of class "kernelshap" with the following components:
\itemize{
\item \code{S}: \eqn{(n \times p)} matrix with SHAP values or, if the model output has
dimension \eqn{K > 1}, a list of \eqn{K} such matrices.
\item \code{X}: Same as input argument \code{X}.
\item \code{baseline}: Vector of length \eqn{K} representing the average prediction on the
background data.
\item \code{bg_X}: The background data.
\item \code{bg_w}: The background case weights.
\item \code{SE}: Standard errors corresponding to \code{S} (and organized like \code{S}).
\item \code{n_iter}: Integer vector of length \eqn{n} providing the number of iterations
per row of \code{X}.
\item \code{converged}: Logical vector of length \eqn{n} indicating convergence per row of \code{X}.
\item \code{m}: Integer providing the effective number of sampled on-off vectors used
per iteration.
\item \code{m_exact}: Integer providing the effective number of exact on-off vectors used
per iteration.
\item \code{prop_exact}: Proportion of the Kernel SHAP weight distribution covered by
exact calculations.
\item \code{exact}: Logical flag indicating whether calculations are exact or not.
\item \code{txt}: Summary text.
\item \code{predictions}: \eqn{(n \times K)} matrix with predictions of \code{X}.
\item \code{algorithm}: "kernelshap".
}
}
\description{
Efficient implementation of Kernel SHAP, see Lundberg and Lee (2017), and
Covert and Lee (2021), abbreviated by CL21.
For up to \eqn{p=8} features, the resulting Kernel SHAP values are exact regarding
the selected background data. For larger \eqn{p}, an almost exact
hybrid algorithm involving iterative sampling is used, see Details.
For up to eight features, however, we recommend using \code{\link[=permshap]{permshap()}}.
}
\details{
Pure iterative Kernel SHAP sampling as in Covert and Lee (2021) works like this:
\enumerate{
\item A binary "on-off" vector \eqn{z} is drawn from \eqn{\{0, 1\}^p}
such that its sum follows the SHAP Kernel weight distribution
(normalized to the range \eqn{\{1, \dots, p-1\}}).
\item For each \eqn{j} with \eqn{z_j = 1}, the \eqn{j}-th column of the
original background data is replaced by the corresponding feature value \eqn{x_j}
of the observation to be explained.
\item The average prediction \eqn{v_z} on the data of Step 2 is calculated, and the
average prediction \eqn{v_0} on the background data is subtracted.
\item Steps 1 to 3 are repeated \eqn{m} times. This produces a binary \eqn{m \times p}
matrix \eqn{Z} (each row equals one of the \eqn{z}) and a vector \eqn{v} of
shifted predictions.
\item \eqn{v} is regressed onto \eqn{Z} under the constraint that the sum of the
coefficients equals \eqn{v_1 - v_0}, where \eqn{v_1} is the prediction of the
observation to be explained. The resulting coefficients are the Kernel SHAP values.
}

This is repeated multiple times until convergence, see CL21 for details.

A drawback of this strategy is that many (at least 75\%) of the \eqn{z} vectors will
have \eqn{\sum z \in \{1, p-1\}}, producing many duplicates. Similarly, at least 92\%
of the mass will be used for the \eqn{p(p+1)} possible vectors with
\eqn{\sum z \in \{1, 2, p-2, p-1\}}.
This inefficiency can be fixed by a hybrid strategy, combining exact calculations
with sampling.

The hybrid algorithm has two steps:
\enumerate{
\item Step 1 (exact part): There are \eqn{2p} different on-off vectors \eqn{z} with
\eqn{\sum z \in \{1, p-1\}}, covering a large proportion of the Kernel SHAP
distribution. The degree 1 hybrid will list those vectors and use them according
to their weights in the upcoming calculations. Depending on \eqn{p}, we can also go
a step further to a degree 2 hybrid by adding all \eqn{p(p-1)} vectors with
\eqn{\sum z \in \{2, p-2\}} to the process etc. The necessary predictions are
obtained along with other calculations similar to those described in CL21.
\item Step 2 (sampling part): The remaining weight is filled by sampling vectors \eqn{z}
according to Kernel SHAP weights renormalized to the values not yet covered by Step 1.
Together with the results from Step 1 - correctly weighted - this now forms a
complete iteration as in CL21. The difference is that most mass is covered by exact
calculations. Afterwards, the algorithm iterates until convergence.
The output of Step 1 is reused in every iteration, leading to an extremely
efficient strategy.
}

If \eqn{p} is sufficiently small, all possible \eqn{2^p-2} on-off vectors \eqn{z} can be
evaluated. In this case, no sampling is required and the algorithm returns exact
Kernel SHAP values with respect to the given background data.
Since \code{\link[=kernelshap]{kernelshap()}} calculates predictions on data with \eqn{MN} rows
(\eqn{N} is the background data size and \eqn{M} the number of \eqn{z} vectors), \eqn{p}
should not be much higher than 10 for exact calculations.
For similar reasons, degree 2 hybrids should not use \eqn{p} much larger than 40.
}
\section{Methods (by class)}{
\itemize{
\item \code{kernelshap(default)}: Default Kernel SHAP method.

\item \code{kernelshap(ranger)}: Kernel SHAP method for "ranger" models, see README for an example.

}}
\examples{
# MODEL ONE: Linear regression
fit <- lm(Sepal.Length ~ ., data = iris)

# Select rows to explain (only feature columns)
X_explain <- iris[-1]

# Calculate SHAP values
s <- kernelshap(fit, X_explain)
s

# MODEL TWO: Multi-response linear regression
fit <- lm(as.matrix(iris[, 1:2]) ~ Petal.Length + Petal.Width + Species, data = iris)
s <- kernelshap(fit, iris[3:5])
s

# Note 1: Feature columns can also be selected via 'feature_names'
# Note 2: Especially when X is small, pass a sufficiently large background data bg_X
s <- kernelshap(
  fit,
  iris[1:4, ],
  bg_X = iris,
  feature_names = c("Petal.Length", "Petal.Width", "Species")
)
s
}
\references{
\enumerate{
\item Scott M. Lundberg and Su-In Lee. A unified approach to interpreting model
predictions. Proceedings of the 31st International Conference on Neural
Information Processing Systems, 2017.
\item Ian Covert and Su-In Lee. Improving KernelSHAP: Practical Shapley Value
Estimation Using Linear Regression. Proceedings of The 24th International
Conference on Artificial Intelligence and Statistics, PMLR 130:3457-3465, 2021.
}
}
