% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/divfor.R
\encoding{UTF-8}
\name{divfor}
\alias{divfor}
\title{Construct a Diversity Forest prediction rule}
\usage{
divfor(formula = NULL, data = NULL, num.trees = 500, mtry = NULL,
  importance = "none", write.forest = TRUE, probability = FALSE,
  min.node.size = NULL, max.depth = NULL, replace = TRUE,
  sample.fraction = ifelse(replace, 1, 0.632), case.weights = NULL,
  class.weights = NULL, splitrule = NULL, num.random.splits = 1,
  alpha = 0.5, minprop = 0.1, split.select.weights = NULL,
  always.split.variables = NULL, respect.unordered.factors = NULL,
  scale.permutation.importance = FALSE, keep.inbag = FALSE,
  inbag = NULL, holdout = FALSE, quantreg = FALSE,
  oob.error = TRUE, num.threads = NULL, save.memory = FALSE,
  verbose = TRUE, seed = NULL, dependent.variable.name = NULL,
  status.variable.name = NULL, classification = NULL, nsplits = 30,
  proptry = 1)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables.}

\item{data}{Training data of class \code{data.frame}, \code{matrix}, \code{dgCMatrix} (Matrix) or \code{gwaa.data} (GenABEL).}

\item{num.trees}{Number of trees. Default is 500.}

\item{mtry}{Artefact from 'ranger'. NOT needed for diversity forests.}

\item{importance}{Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'. The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival. NOTE: Currently, only "permutation" (and "none") work for diversity forests.}

\item{write.forest}{Save \code{divfor.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended.}

\item{probability}{Grow a probability forest as in Malley et al. (2012).}

\item{min.node.size}{Minimal node size. Default 1 for classification.}

\item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree).}

\item{replace}{Sample with replacement.}

\item{sample.fraction}{Fraction of observations to sample. Default is 1 for sampling with replacement and 0.632 for sampling without replacement. For classification, this can be a vector of class-specific values.}

\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}

\item{class.weights}{Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.}

\item{splitrule}{Splitting rule. For classification and probability estimation "gini" or "extratrees" with default "gini". For regression "variance", "extratrees" or "maxstat" with default "variance". For survival "logrank", "extratrees", "C" or "maxstat" with default "logrank". NOTE: For diversity forests currently only the default splitting rules are supported.}

\item{num.random.splits}{Artefact from 'ranger'. NOT needed for diversity forests.}

\item{alpha}{For "maxstat" splitrule: Significance threshold to allow splitting. NOT needed for diversity forests.}

\item{minprop}{For "maxstat" splitrule: Lower quantile of covariate distribution to be considered for splitting. NOT needed for diversity forests.}

\item{split.select.weights}{Numeric vector with weights between 0 and 1, representing the probability to select variables for splitting. Alternatively, a list of size num.trees, containing split select weight vectors for each tree can be used.}

\item{always.split.variables}{Currently not useable. Character vector with variable names to be always selected.}

\item{respect.unordered.factors}{Handling of unordered factor covariates. One of 'ignore' and 'order' (the option 'partition' possible in 'ranger' is not (yet) possible with diversity forests). Default is 'ignore'. Alternatively TRUE (='order') or FALSE (='ignore') can be used.}

\item{scale.permutation.importance}{Scale permutation importance by standard error as in (Breiman 2001). Only applicable if permutation variable importance mode selected.}

\item{keep.inbag}{Save how often observations are in-bag in each tree.}

\item{inbag}{Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling.}

\item{holdout}{Hold-out mode. Hold-out all samples with case weight 0 and use these for variable importance and prediction error.}

\item{quantreg}{Prepare quantile prediction as in quantile regression forests (Meinshausen 2006). Regression only. Set \code{keep.inbag = TRUE} to prepare out-of-bag quantile prediction.}

\item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time, e.g. for large survival forests.}

\item{num.threads}{Number of threads. Default is number of CPUs available.}

\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. NOT needed for diversity forests.}

\item{verbose}{Show computation status and estimated runtime.}

\item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.}

\item{dependent.variable.name}{Name of dependent variable, needed if no formula given. For survival forests this is the time variable. NOTE: Currently, diversity forests are only possible using the formula interface; thus, \code{dependent.variable.name} must not be specified.}

\item{status.variable.name}{Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring. NOTE: Currently, diversity forests are only possible using the formula interface; thus, \code{status.variable.name} must not be specified.}

\item{classification}{Only needed if data is a matrix. Set to \code{TRUE} to grow a classification forest.}

\item{nsplits}{Number of candidate splits to sample for each split. Default is 30.}

\item{proptry}{parameter that restricts the number of candidate splits considered for small nodes. If \code{nsplits} is larger than \code{proptry} times the number of all possible splits, the number of candidate splits to draw is reduced to the largest integer smaller than \code{proptry} times the number of all possible splits. Default is 1, which corresponds to always using \code{nsplits} candidate splits.}
}
\value{
Object of class \code{divfor} with elements
  \item{\code{forest}}{Saved forest (If write.forest set to TRUE). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.}
  \item{\code{predictions}}{Predicted classes/values, based on out-of-bag samples (classification and regression only).}
  \item{\code{variable.importance}}{Variable importance for each independent variable.}
  \item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is the fraction of missclassified samples, for probability estimation the Brier score, for regression the mean squared error and for survival one minus Harrell's C-index.}
  \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (regression only). Computed on out-of-bag data.}
  \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).}
  \item{\code{unique.death.times}}{Unique death times (survival only).}
  \item{\code{chf}}{Estimated cumulative hazard function for each sample (survival only).}
  \item{\code{survival}}{Estimated survival function for each sample (survival only).}
  \item{\code{call}}{Function call.}
  \item{\code{num.trees}}{Number of trees.}
  \item{\code{num.independent.variables}}{Number of independent variables.}
  \item{\code{min.node.size}}{Value of minimal node size used.}
  \item{\code{treetype}}{Type of forest/tree. classification, regression or survival.}
  \item{\code{importance.mode}}{Importance mode used.}
  \item{\code{num.samples}}{Number of samples.}
  \item{\code{splitrule}}{Splitting rule.}
  \item{\code{replace}}{Sample with replacement.}
  \item{\code{nsplits}}{Value of \code{nsplits} used.}
  \item{\code{proptry}}{Value of \code{proptry} used.}
}
\description{
Implements diversity forests as presented in Hornung (inprep).
Currently, classification, regression and survival prediction are possible.
Moreover, the current version of the package only supports univariate, binary splitting, but future
versions will allow using specific other split procedures. Because 'diversityForest' is a fork of the 
'ranger' R package (see below for details), the documentation is largely taken from
'ranger', where large parts of the documentation do not apply to (the current version of) the 'diversityForest' package.
Moreover, \code{divfor} contains function arguments that are currently not supported, but will be
in future versions of the package. However, the package is fully functional with respect to applying 
diversity forest using univariate, binary splitting. See the example section
for all basic application scenarios.
}
\details{
As noted above, 'diversityForest' is a fork of the R package 'ranger' that implements random forests using an
efficient C++ implementation. More precisely, 'diversityForest' was written by modifying
the code of 'ranger', version 0.11.0. Therefore, details on further functionalities
of the code that are not presented in the help pages of 'diversityForest' are found
in the help pages of 'ranger' (version 0.11.0). The code in the example sections of \code{\link{divfor}} and \code{\link{tunedivfor}}
can be used as a template for all basic application scenarios with respect to classification, regression and survival prediction
using univariate, binary splitting. Some function arguments adopted from the 'ranger' 
package are not useable with diversity forests (for the current package version).
}
\examples{
library("diversityForest")

## Set seed to obtain reproducible results:
set.seed(1234)

## Diversity forest with default settings (NOT recommended)
# Classification:
divfor(Species ~ ., data = iris, num.trees = 20)
# Regression:
iris2 <- iris; iris2$Species <- NULL; iris2$Y <- rnorm(nrow(iris2))
divfor(Y ~ ., data = iris2, num.trees = 20)
# Survival:
library("survival")
divfor(Surv(time, status) ~ ., data = veteran, num.trees = 20, respect.unordered.factors = "order")
# NOTE: num.trees = 20 is specified too small for practical 
# purposes - the prediction performance of the resulting 
# forest will be suboptimal!!
# In practice, num.trees = 500 (default value) or a 
# larger number should be used.

## Diversity forest with specified values for nsplits and proptry (NOT recommended)
divfor(Species ~ ., data = iris, nsplits = 10, proptry = 0.4, num.trees = 20)
# NOTE again: num.trees = 20 is specified too small for practical purposes.

## Applying diversity forest after optimizing the values of nsplits and proptry (recommended)
tuneres <- tunedivfor(formula = Species ~ ., data = iris, num.trees.pre = 20)
# NOTE: num.trees.pre = 20 is specified too small for practical 
# purposes - the out-of-bag error estimates of the forests 
# constructed during optimization will be much too variable!!
# In practice, num.trees.pre = 500 (default value) or a 
# larger number should be used.
divfor(Species ~ ., data = iris, nsplits = tuneres$nsplitsopt, 
  proptry = tuneres$proptryopt, num.trees = 20)
# NOTE again: num.trees = 20 is specified too small for practical purposes.

## Prediction
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]
tuneres <- tunedivfor(formula = Species ~ ., data = iris.train, num.trees.pre = 20)
# NOTE again: num.trees.pre = 20 is specified too small for practical purposes.
rg.iris <- divfor(Species ~ ., data = iris.train, nsplits = tuneres$nsplitsopt, 
  proptry = tuneres$proptryopt, num.trees = 20)
# NOTE again: num.trees = 20 is specified too small for practical purposes.
pred.iris <- predict(rg.iris, data = iris.test)
table(iris.test$Species, pred.iris$predictions)

## Variable importance
rg.iris <- divfor(Species ~ ., data = iris, importance = "permutation", num.trees = 20)
# NOTE again: num.trees = 20 is specified too small for practical purposes.
rg.iris$variable.importance

}
\references{
\itemize{
  \item Wright, M. N. & Ziegler, A. (2017). "ranger: A fast implementation of random forests for high dimensional data in C++ and R". J Stat Softw 77:1-17, <\doi{10.18637/jss.v077.i01}>.
  \item Breiman, L. (2001). "Random forests". Mach Learn, 45:5-32, <\doi{10.1023/A:1010933404324}>.
  \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). "Probability machines: consistent probability estimation using nonparametric learning machines". Methods Inf Med 51:74-81, <\doi{10.3414/ME00-01-0052}>.
  \item Meinshausen (2006). "Quantile Regression Forests". J Mach Learn Res 7:983-999. \url{http://www.jmlr.org/papers/v7/meinshausen06a.html}.
  }
}
\seealso{
\code{\link{predict.divfor}}
}
\author{
Roman Hornung, Marvin N. Wright
}
